src/5013007/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1               /* tainted information used? */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 /* Valid for non-utf8 strings only: avoids the reginclass call if there are no
  99  * complications: i.e., if everything matchable is straight forward in the
 100  * bitmap */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102                                               : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 110
 111 #define HOPc(pos,off) \
 112         (char *)(PL_reg_match_utf8 \
 113             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 114             : (U8*)(pos + off))
 115 #define HOPBACKc(pos, off) \
 116         (char*)(PL_reg_match_utf8\
 117             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 118             : (pos - off >= PL_bostr)           \
 119                 ? (U8*)pos - off                \
 120                 : NULL)
 121
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defined */
 126 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 127     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 128
 129 /* Doesn't do an assert to verify that is correct */
 130 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 131     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 132
 133 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 134 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 135 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 136
 137 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 138         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 139         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 140         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 141             * assert should likely and hopefully fail on an EBCDIC machine */ \
 142         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 143                                                                             \
 144         /* No asserts are done for these, in case called on an early        \
 145             * Unicode version in which they map to nothing */               \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 152         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 153
 154 /*
 155    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 156    so that it is possible to override the option here without having to
 157    rebuild the entire core. as we are required to do if we change regcomp.h
 158    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 159 */
 160 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 161 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 162 #endif
 163
 164 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 165 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 166 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 167 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 168 #define RE_utf8_perl_word   PL_utf8_alnum
 169 #define RE_utf8_perl_space  PL_utf8_space
 170 #define RE_utf8_posix_digit PL_utf8_digit
 171 #define perl_word  alnum
 172 #define perl_space space
 173 #define posix_digit digit
 174 #else
 175 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 176 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 177 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 178 #define RE_utf8_perl_word   PL_utf8_perl_word
 179 #define RE_utf8_perl_space  PL_utf8_perl_space
 180 #define RE_utf8_posix_digit PL_utf8_posix_digit
 181 #endif
 182
 183
 184 #define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
 185         case NAMEL:                                                         \
 186             PL_reg_flags |= RF_tainted;                                     \
 187             /* FALL THROUGH */                                              \
 188         case NAME:                                                          \
 189             if (!nextchr)                                                   \
 190                 sayNO;                                                      \
 191             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
 192                 if (!CAT2(PL_utf8_,CLASS)) {                                \
 193                     bool ok;                                                \
 194                     ENTER;                                                  \
 195                     save_re_context();                                      \
 196                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                \
 197                     assert(ok);                                             \
 198                     LEAVE;                                                  \
 199                 }                                                           \
 200                 if (!(OP(scan) == NAME                                      \
 201                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 202                     : LCFUNC_utf8((U8*)locinput)))                          \
 203                 {                                                           \
 204                     sayNO;                                                  \
 205                 }                                                           \
 206                 locinput += PL_utf8skip[nextchr];                           \
 207                 nextchr = UCHARAT(locinput);                                \
 208                 break;                                                      \
 209             }                                                               \
 210             /* Drops through to the macro that calls this one */
 211
 212 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
 213     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
 214             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
 215                 sayNO;                                                      \
 216             nextchr = UCHARAT(++locinput);                                  \
 217             break
 218
 219 /* Almost identical to the above, but has a case for a node that matches chars
 220  * between 128 and 255 using Unicode (latin1) semantics. */
 221 #define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 222     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 223             if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 224                 sayNO;                                                       \
 225             nextchr = UCHARAT(++locinput);                                   \
 226             break
 227
 228 #define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
 229         case NAMEL:                                                          \
 230             PL_reg_flags |= RF_tainted;                                      \
 231             /* FALL THROUGH */                                               \
 232         case NAME :                                                          \
 233             if (!nextchr && locinput >= PL_regeol)                           \
 234                 sayNO;                                                       \
 235             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
 236                 if (!CAT2(PL_utf8_,CLASS)) {                                 \
 237                     bool ok;                                                 \
 238                     ENTER;                                                   \
 239                     save_re_context();                                       \
 240                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                 \
 241                     assert(ok);                                              \
 242                     LEAVE;                                                   \
 243                 }                                                            \
 244                 if ((OP(scan) == NAME                                        \
 245                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 246                     : LCFUNC_utf8((U8*)locinput)))                           \
 247                 {                                                            \
 248                     sayNO;                                                   \
 249                 }                                                            \
 250                 locinput += PL_utf8skip[nextchr];                            \
 251                 nextchr = UCHARAT(locinput);                                 \
 252                 break;                                                       \
 253             }
 254
 255 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
 256     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 257             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
 258                 sayNO;                                                       \
 259             nextchr = UCHARAT(++locinput);                                   \
 260             break
 261
 262
 263 #define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 264     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
 265             if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 266                 sayNO;                                                       \
 267             nextchr = UCHARAT(++locinput);                                   \
 268             break
 269
 270
 271
 272 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 273
 274 /* for use after a quantifier and before an EXACT-like node -- japhy */
 275 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 276  *
 277  * NOTE that *nothing* that affects backtracking should be in here, specifically
 278  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 279  * node that is in between two EXACT like nodes when ascertaining what the required
 280  * "follow" character is. This should probably be moved to regex compile time
 281  * although it may be done at run time beause of the REF possibility - more
 282  * investigation required. -- demerphq
 283 */
 284 #define JUMPABLE(rn) (      \
 285     OP(rn) == OPEN ||       \
 286     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 287     OP(rn) == EVAL ||   \
 288     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 289     OP(rn) == PLUS || OP(rn) == MINMOD || \
 290     OP(rn) == KEEPS || \
 291     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 292 )
 293 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 294
 295 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 296
 297 #if 0
 298 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 299    we don't need this definition. */
 300 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 301 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
 302 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 303
 304 #else
 305 /* ... so we use this as its faster. */
 306 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 307 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 308 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 309
 310 #endif
 311
 312 /*
 313   Search for mandatory following text node; for lookahead, the text must
 314   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 315 */
 316 #define FIND_NEXT_IMPT(rn) STMT_START { \
 317     while (JUMPABLE(rn)) { \
 318         const OPCODE type = OP(rn); \
 319         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 320             rn = NEXTOPER(NEXTOPER(rn)); \
 321         else if (type == PLUS) \
 322             rn = NEXTOPER(rn); \
 323         else if (type == IFMATCH) \
 324             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 325         else rn += NEXT_OFF(rn); \
 326     } \
 327 } STMT_END
 328
 329
 330 static void restore_pos(pTHX_ void *arg);
 331
 332 #define REGCP_PAREN_ELEMS 4
 333 #define REGCP_OTHER_ELEMS 5
 334 #define REGCP_FRAME_ELEMS 1
 335 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 336  * are needed for the regexp context stack bookkeeping. */
 337
 338 STATIC CHECKPOINT
 339 S_regcppush(pTHX_ I32 parenfloor)
 340 {
 341     dVAR;
 342     const int retval = PL_savestack_ix;
 343     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 344     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 345     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 346     int p;
 347     GET_RE_DEBUG_FLAGS_DECL;
 348
 349     if (paren_elems_to_push < 0)
 350         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 351
 352     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 353         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 354                    " out of range (%lu-%ld)",
 355                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 356
 357     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 358
 359     for (p = PL_regsize; p > parenfloor; p--) {
 360 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 361         SSPUSHINT(PL_regoffs[p].end);
 362         SSPUSHINT(PL_regoffs[p].start);
 363         SSPUSHPTR(PL_reg_start_tmp[p]);
 364         SSPUSHINT(p);
 365         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 366           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 367                       (UV)p, (IV)PL_regoffs[p].start,
 368                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 369                       (IV)PL_regoffs[p].end
 370         ));
 371     }
 372 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 373     SSPUSHPTR(PL_regoffs);
 374     SSPUSHINT(PL_regsize);
 375     SSPUSHINT(*PL_reglastparen);
 376     SSPUSHINT(*PL_reglastcloseparen);
 377     SSPUSHPTR(PL_reginput);
 378     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 379
 380     return retval;
 381 }
 382
 383 /* These are needed since we do not localize EVAL nodes: */
 384 #define REGCP_SET(cp)                                           \
 385     DEBUG_STATE_r(                                              \
 386             PerlIO_printf(Perl_debug_log,                       \
 387                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 388                 (IV)PL_savestack_ix));                          \
 389     cp = PL_savestack_ix
 390
 391 #define REGCP_UNWIND(cp)                                        \
 392     DEBUG_STATE_r(                                              \
 393         if (cp != PL_savestack_ix)                              \
 394             PerlIO_printf(Perl_debug_log,                       \
 395                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 396                 (IV)(cp), (IV)PL_savestack_ix));                \
 397     regcpblow(cp)
 398
 399 STATIC char *
 400 S_regcppop(pTHX_ const regexp *rex)
 401 {
 402     dVAR;
 403     UV i;
 404     char *input;
 405     GET_RE_DEBUG_FLAGS_DECL;
 406
 407     PERL_ARGS_ASSERT_REGCPPOP;
 408
 409     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 410     i = SSPOPUV;
 411     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 412     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 413     input = (char *) SSPOPPTR;
 414     *PL_reglastcloseparen = SSPOPINT;
 415     *PL_reglastparen = SSPOPINT;
 416     PL_regsize = SSPOPINT;
 417     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 418
 419     i -= REGCP_OTHER_ELEMS;
 420     /* Now restore the parentheses context. */
 421     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 422         I32 tmps;
 423         U32 paren = (U32)SSPOPINT;
 424         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 425         PL_regoffs[paren].start = SSPOPINT;
 426         tmps = SSPOPINT;
 427         if (paren <= *PL_reglastparen)
 428             PL_regoffs[paren].end = tmps;
 429         DEBUG_BUFFERS_r(
 430             PerlIO_printf(Perl_debug_log,
 431                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 432                           (UV)paren, (IV)PL_regoffs[paren].start,
 433                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 434                           (IV)PL_regoffs[paren].end,
 435                           (paren > *PL_reglastparen ? "(no)" : ""));
 436         );
 437     }
 438     DEBUG_BUFFERS_r(
 439         if (*PL_reglastparen + 1 <= rex->nparens) {
 440             PerlIO_printf(Perl_debug_log,
 441                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 442                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 443         }
 444     );
 445 #if 1
 446     /* It would seem that the similar code in regtry()
 447      * already takes care of this, and in fact it is in
 448      * a better location to since this code can #if 0-ed out
 449      * but the code in regtry() is needed or otherwise tests
 450      * requiring null fields (pat.t#187 and split.t#{13,14}
 451      * (as of patchlevel 7877)  will fail.  Then again,
 452      * this code seems to be necessary or otherwise
 453      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 454      * --jhi updated by dapm */
 455     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 456         if (i > PL_regsize)
 457             PL_regoffs[i].start = -1;
 458         PL_regoffs[i].end = -1;
 459     }
 460 #endif
 461     return input;
 462 }
 463
 464 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 465
 466 /*
 467  * pregexec and friends
 468  */
 469
 470 #ifndef PERL_IN_XSUB_RE
 471 /*
 472  - pregexec - match a regexp against a string
 473  */
 474 I32
 475 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 476          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 477 /* strend: pointer to null at end of string */
 478 /* strbeg: real beginning of string */
 479 /* minend: end of match must be >=minend after stringarg. */
 480 /* nosave: For optimizations. */
 481 {
 482     PERL_ARGS_ASSERT_PREGEXEC;
 483
 484     return
 485         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 486                       nosave ? 0 : REXEC_COPY_STR);
 487 }
 488 #endif
 489
 490 /*
 491  * Need to implement the following flags for reg_anch:
 492  *
 493  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 494  * USE_INTUIT_ML
 495  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 496  * INTUIT_AUTORITATIVE_ML
 497  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 498  * INTUIT_ONCE_ML
 499  *
 500  * Another flag for this function: SECOND_TIME (so that float substrs
 501  * with giant delta may be not rechecked).
 502  */
 503
 504 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 505
 506 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 507    Otherwise, only SvCUR(sv) is used to get strbeg. */
 508
 509 /* XXXX We assume that strpos is strbeg unless sv. */
 510
 511 /* XXXX Some places assume that there is a fixed substring.
 512         An update may be needed if optimizer marks as "INTUITable"
 513         RExen without fixed substrings.  Similarly, it is assumed that
 514         lengths of all the strings are no more than minlen, thus they
 515         cannot come from lookahead.
 516         (Or minlen should take into account lookahead.)
 517   NOTE: Some of this comment is not correct. minlen does now take account
 518   of lookahead/behind. Further research is required. -- demerphq
 519
 520 */
 521
 522 /* A failure to find a constant substring means that there is no need to make
 523    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 524    finding a substring too deep into the string means that less calls to
 525    regtry() should be needed.
 526
 527    REx compiler's optimizer found 4 possible hints:
 528         a) Anchored substring;
 529         b) Fixed substring;
 530         c) Whether we are anchored (beginning-of-line or \G);
 531         d) First node (of those at offset 0) which may distingush positions;
 532    We use a)b)d) and multiline-part of c), and try to find a position in the
 533    string which does not contradict any of them.
 534  */
 535
 536 /* Most of decisions we do here should have been done at compile time.
 537    The nodes of the REx which we used for the search should have been
 538    deleted from the finite automaton. */
 539
 540 char *
 541 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 542                      char *strend, const U32 flags, re_scream_pos_data *data)
 543 {
 544     dVAR;
 545     struct regexp *const prog = (struct regexp *)SvANY(rx);
 546     register I32 start_shift = 0;
 547     /* Should be nonnegative! */
 548     register I32 end_shift   = 0;
 549     register char *s;
 550     register SV *check;
 551     char *strbeg;
 552     char *t;
 553     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 554     I32 ml_anch;
 555     register char *other_last = NULL;   /* other substr checked before this */
 556     char *check_at = NULL;              /* check substr found at this pos */
 557     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 558     RXi_GET_DECL(prog,progi);
 559 #ifdef DEBUGGING
 560     const char * const i_strpos = strpos;
 561 #endif
 562     GET_RE_DEBUG_FLAGS_DECL;
 563
 564     PERL_ARGS_ASSERT_RE_INTUIT_START;
 565
 566     RX_MATCH_UTF8_set(rx,utf8_target);
 567
 568     if (RX_UTF8(rx)) {
 569         PL_reg_flags |= RF_utf8;
 570     }
 571     DEBUG_EXECUTE_r(
 572         debug_start_match(rx, utf8_target, strpos, strend,
 573             sv ? "Guessing start of match in sv for"
 574                : "Guessing start of match in string for");
 575               );
 576
 577     /* CHR_DIST() would be more correct here but it makes things slow. */
 578     if (prog->minlen > strend - strpos) {
 579         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 580                               "String too short... [re_intuit_start]\n"));
 581         goto fail;
 582     }
 583
 584     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 585     PL_regeol = strend;
 586     if (utf8_target) {
 587         if (!prog->check_utf8 && prog->check_substr)
 588             to_utf8_substr(prog);
 589         check = prog->check_utf8;
 590     } else {
 591         if (!prog->check_substr && prog->check_utf8)
 592             to_byte_substr(prog);
 593         check = prog->check_substr;
 594     }
 595     if (check == &PL_sv_undef) {
 596         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 597                 "Non-utf8 string cannot match utf8 check string\n"));
 598         goto fail;
 599     }
 600     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 601         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 602                      || ( (prog->extflags & RXf_ANCH_BOL)
 603                           && !multiline ) );    /* Check after \n? */
 604
 605         if (!ml_anch) {
 606           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 607                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 608                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 609                && sv && !SvROK(sv)
 610                && (strpos != strbeg)) {
 611               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 612               goto fail;
 613           }
 614           if (prog->check_offset_min == prog->check_offset_max &&
 615               !(prog->extflags & RXf_CANY_SEEN)) {
 616             /* Substring at constant offset from beg-of-str... */
 617             I32 slen;
 618
 619             s = HOP3c(strpos, prog->check_offset_min, strend);
 620
 621             if (SvTAIL(check)) {
 622                 slen = SvCUR(check);    /* >= 1 */
 623
 624                 if ( strend - s > slen || strend - s < slen - 1
 625                      || (strend - s == slen && strend[-1] != '\n')) {
 626                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 627                     goto fail_finish;
 628                 }
 629                 /* Now should match s[0..slen-2] */
 630                 slen--;
 631                 if (slen && (*SvPVX_const(check) != *s
 632                              || (slen > 1
 633                                  && memNE(SvPVX_const(check), s, slen)))) {
 634                   report_neq:
 635                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 636                     goto fail_finish;
 637                 }
 638             }
 639             else if (*SvPVX_const(check) != *s
 640                      || ((slen = SvCUR(check)) > 1
 641                          && memNE(SvPVX_const(check), s, slen)))
 642                 goto report_neq;
 643             check_at = s;
 644             goto success_at_start;
 645           }
 646         }
 647         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 648         s = strpos;
 649         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 650         end_shift = prog->check_end_shift;
 651
 652         if (!ml_anch) {
 653             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 654                                          - (SvTAIL(check) != 0);
 655             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 656
 657             if (end_shift < eshift)
 658                 end_shift = eshift;
 659         }
 660     }
 661     else {                              /* Can match at random position */
 662         ml_anch = 0;
 663         s = strpos;
 664         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 665         end_shift = prog->check_end_shift;
 666
 667         /* end shift should be non negative here */
 668     }
 669
 670 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 671     if (end_shift < 0)
 672         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 673                    (IV)end_shift, RX_PRECOMP(prog));
 674 #endif
 675
 676   restart:
 677     /* Find a possible match in the region s..strend by looking for
 678        the "check" substring in the region corrected by start/end_shift. */
 679
 680     {
 681         I32 srch_start_shift = start_shift;
 682         I32 srch_end_shift = end_shift;
 683         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 684             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 685             srch_start_shift = strbeg - s;
 686         }
 687     DEBUG_OPTIMISE_MORE_r({
 688         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 689             (IV)prog->check_offset_min,
 690             (IV)srch_start_shift,
 691             (IV)srch_end_shift,
 692             (IV)prog->check_end_shift);
 693     });
 694
 695     if (flags & REXEC_SCREAM) {
 696         I32 p = -1;                     /* Internal iterator of scream. */
 697         I32 * const pp = data ? data->scream_pos : &p;
 698
 699         if (PL_screamfirst[BmRARE(check)] >= 0
 700             || ( BmRARE(check) == '\n'
 701                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 702                  && SvTAIL(check) ))
 703             s = screaminstr(sv, check,
 704                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 705         else
 706             goto fail_finish;
 707         /* we may be pointing at the wrong string */
 708         if (s && RXp_MATCH_COPIED(prog))
 709             s = strbeg + (s - SvPVX_const(sv));
 710         if (data)
 711             *data->scream_olds = s;
 712     }
 713     else {
 714         U8* start_point;
 715         U8* end_point;
 716         if (prog->extflags & RXf_CANY_SEEN) {
 717             start_point= (U8*)(s + srch_start_shift);
 718             end_point= (U8*)(strend - srch_end_shift);
 719         } else {
 720             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 721             end_point= HOP3(strend, -srch_end_shift, strbeg);
 722         }
 723         DEBUG_OPTIMISE_MORE_r({
 724             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 725                 (int)(end_point - start_point),
 726                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 727                 start_point);
 728         });
 729
 730         s = fbm_instr( start_point, end_point,
 731                       check, multiline ? FBMrf_MULTILINE : 0);
 732     }
 733     }
 734     /* Update the count-of-usability, remove useless subpatterns,
 735         unshift s.  */
 736
 737     DEBUG_EXECUTE_r({
 738         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 739             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 740         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 741                           (s ? "Found" : "Did not find"),
 742             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 743                 ? "anchored" : "floating"),
 744             quoted,
 745             RE_SV_TAIL(check),
 746             (s ? " at offset " : "...\n") );
 747     });
 748
 749     if (!s)
 750         goto fail_finish;
 751     /* Finish the diagnostic message */
 752     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 753
 754     /* XXX dmq: first branch is for positive lookbehind...
 755        Our check string is offset from the beginning of the pattern.
 756        So we need to do any stclass tests offset forward from that
 757        point. I think. :-(
 758      */
 759
 760
 761
 762     check_at=s;
 763
 764
 765     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 766        Start with the other substr.
 767        XXXX no SCREAM optimization yet - and a very coarse implementation
 768        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 769                 *always* match.  Probably should be marked during compile...
 770        Probably it is right to do no SCREAM here...
 771      */
 772
 773     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 774                 : (prog->float_substr && prog->anchored_substr))
 775     {
 776         /* Take into account the "other" substring. */
 777         /* XXXX May be hopelessly wrong for UTF... */
 778         if (!other_last)
 779             other_last = strpos;
 780         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 781           do_other_anchored:
 782             {
 783                 char * const last = HOP3c(s, -start_shift, strbeg);
 784                 char *last1, *last2;
 785                 char * const saved_s = s;
 786                 SV* must;
 787
 788                 t = s - prog->check_offset_max;
 789                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 790                     && (!utf8_target
 791                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 792                             && t > strpos)))
 793                     NOOP;
 794                 else
 795                     t = strpos;
 796                 t = HOP3c(t, prog->anchored_offset, strend);
 797                 if (t < other_last)     /* These positions already checked */
 798                     t = other_last;
 799                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 800                 if (last < last1)
 801                     last1 = last;
 802                 /* XXXX It is not documented what units *_offsets are in.
 803                    We assume bytes, but this is clearly wrong.
 804                    Meaning this code needs to be carefully reviewed for errors.
 805                    dmq.
 806                   */
 807
 808                 /* On end-of-str: see comment below. */
 809                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 810                 if (must == &PL_sv_undef) {
 811                     s = (char*)NULL;
 812                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 813                 }
 814                 else
 815                     s = fbm_instr(
 816                         (unsigned char*)t,
 817                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 818                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 819                         must,
 820                         multiline ? FBMrf_MULTILINE : 0
 821                     );
 822                 DEBUG_EXECUTE_r({
 823                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 824                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 825                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 826                         (s ? "Found" : "Contradicts"),
 827                         quoted, RE_SV_TAIL(must));
 828                 });
 829
 830
 831                 if (!s) {
 832                     if (last1 >= last2) {
 833                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 834                                                 ", giving up...\n"));
 835                         goto fail_finish;
 836                     }
 837                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 838                         ", trying floating at offset %ld...\n",
 839                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 840                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 841                     s = HOP3c(last, 1, strend);
 842                     goto restart;
 843                 }
 844                 else {
 845                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 846                           (long)(s - i_strpos)));
 847                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 848                     other_last = HOP3c(s, 1, strend);
 849                     s = saved_s;
 850                     if (t == strpos)
 851                         goto try_at_start;
 852                     goto try_at_offset;
 853                 }
 854             }
 855         }
 856         else {          /* Take into account the floating substring. */
 857             char *last, *last1;
 858             char * const saved_s = s;
 859             SV* must;
 860
 861             t = HOP3c(s, -start_shift, strbeg);
 862             last1 = last =
 863                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 864             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 865                 last = HOP3c(t, prog->float_max_offset, strend);
 866             s = HOP3c(t, prog->float_min_offset, strend);
 867             if (s < other_last)
 868                 s = other_last;
 869  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 870             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 871             /* fbm_instr() takes into account exact value of end-of-str
 872                if the check is SvTAIL(ed).  Since false positives are OK,
 873                and end-of-str is not later than strend we are OK. */
 874             if (must == &PL_sv_undef) {
 875                 s = (char*)NULL;
 876                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 877             }
 878             else
 879                 s = fbm_instr((unsigned char*)s,
 880                               (unsigned char*)last + SvCUR(must)
 881                                   - (SvTAIL(must)!=0),
 882                               must, multiline ? FBMrf_MULTILINE : 0);
 883             DEBUG_EXECUTE_r({
 884                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 885                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 886                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 887                     (s ? "Found" : "Contradicts"),
 888                     quoted, RE_SV_TAIL(must));
 889             });
 890             if (!s) {
 891                 if (last1 == last) {
 892                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 893                                             ", giving up...\n"));
 894                     goto fail_finish;
 895                 }
 896                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 897                     ", trying anchored starting at offset %ld...\n",
 898                     (long)(saved_s + 1 - i_strpos)));
 899                 other_last = last;
 900                 s = HOP3c(t, 1, strend);
 901                 goto restart;
 902             }
 903             else {
 904                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 905                       (long)(s - i_strpos)));
 906                 other_last = s; /* Fix this later. --Hugo */
 907                 s = saved_s;
 908                 if (t == strpos)
 909                     goto try_at_start;
 910                 goto try_at_offset;
 911             }
 912         }
 913     }
 914
 915
 916     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 917
 918     DEBUG_OPTIMISE_MORE_r(
 919         PerlIO_printf(Perl_debug_log,
 920             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 921             (IV)prog->check_offset_min,
 922             (IV)prog->check_offset_max,
 923             (IV)(s-strpos),
 924             (IV)(t-strpos),
 925             (IV)(t-s),
 926             (IV)(strend-strpos)
 927         )
 928     );
 929
 930     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 931         && (!utf8_target
 932             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 933                  && t > strpos)))
 934     {
 935         /* Fixed substring is found far enough so that the match
 936            cannot start at strpos. */
 937       try_at_offset:
 938         if (ml_anch && t[-1] != '\n') {
 939             /* Eventually fbm_*() should handle this, but often
 940                anchored_offset is not 0, so this check will not be wasted. */
 941             /* XXXX In the code below we prefer to look for "^" even in
 942                presence of anchored substrings.  And we search even
 943                beyond the found float position.  These pessimizations
 944                are historical artefacts only.  */
 945           find_anchor:
 946             while (t < strend - prog->minlen) {
 947                 if (*t == '\n') {
 948                     if (t < check_at - prog->check_offset_min) {
 949                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 950                             /* Since we moved from the found position,
 951                                we definitely contradict the found anchored
 952                                substr.  Due to the above check we do not
 953                                contradict "check" substr.
 954                                Thus we can arrive here only if check substr
 955                                is float.  Redo checking for "other"=="fixed".
 956                              */
 957                             strpos = t + 1;
 958                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 959                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 960                             goto do_other_anchored;
 961                         }
 962                         /* We don't contradict the found floating substring. */
 963                         /* XXXX Why not check for STCLASS? */
 964                         s = t + 1;
 965                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 966                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 967                         goto set_useful;
 968                     }
 969                     /* Position contradicts check-string */
 970                     /* XXXX probably better to look for check-string
 971                        than for "\n", so one should lower the limit for t? */
 972                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 973                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 974                     other_last = strpos = s = t + 1;
 975                     goto restart;
 976                 }
 977                 t++;
 978             }
 979             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 980                         PL_colors[0], PL_colors[1]));
 981             goto fail_finish;
 982         }
 983         else {
 984             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 985                         PL_colors[0], PL_colors[1]));
 986         }
 987         s = t;
 988       set_useful:
 989         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 990     }
 991     else {
 992         /* The found string does not prohibit matching at strpos,
 993            - no optimization of calling REx engine can be performed,
 994            unless it was an MBOL and we are not after MBOL,
 995            or a future STCLASS check will fail this. */
 996       try_at_start:
 997         /* Even in this situation we may use MBOL flag if strpos is offset
 998            wrt the start of the string. */
 999         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1000             && (strpos != strbeg) && strpos[-1] != '\n'
1001             /* May be due to an implicit anchor of m{.*foo}  */
1002             && !(prog->intflags & PREGf_IMPLICIT))
1003         {
1004             t = strpos;
1005             goto find_anchor;
1006         }
1007         DEBUG_EXECUTE_r( if (ml_anch)
1008             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1009                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1010         );
1011       success_at_start:
1012         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1013             && (utf8_target ? (
1014                 prog->check_utf8                /* Could be deleted already */
1015                 && --BmUSEFUL(prog->check_utf8) < 0
1016                 && (prog->check_utf8 == prog->float_utf8)
1017             ) : (
1018                 prog->check_substr              /* Could be deleted already */
1019                 && --BmUSEFUL(prog->check_substr) < 0
1020                 && (prog->check_substr == prog->float_substr)
1021             )))
1022         {
1023             /* If flags & SOMETHING - do not do it many times on the same match */
1024             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1025             /* XXX Does the destruction order has to change with utf8_target? */
1026             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1027             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1028             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1029             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1030             check = NULL;                       /* abort */
1031             s = strpos;
1032             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevent flag
1033                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1034             if (prog->intflags & PREGf_IMPLICIT)
1035                 prog->extflags &= ~RXf_ANCH_MBOL;
1036             /* XXXX This is a remnant of the old implementation.  It
1037                     looks wasteful, since now INTUIT can use many
1038                     other heuristics. */
1039             prog->extflags &= ~RXf_USE_INTUIT;
1040             /* XXXX What other flags might need to be cleared in this branch? */
1041         }
1042         else
1043             s = strpos;
1044     }
1045
1046     /* Last resort... */
1047     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1048     /* trie stclasses are too expensive to use here, we are better off to
1049        leave it to regmatch itself */
1050     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1051         /* minlen == 0 is possible if regstclass is \b or \B,
1052            and the fixed substr is ''$.
1053            Since minlen is already taken into account, s+1 is before strend;
1054            accidentally, minlen >= 1 guaranties no false positives at s + 1
1055            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1056            regstclass does not come from lookahead...  */
1057         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1058            This leaves EXACTF only, which is dealt with in find_byclass().  */
1059         const U8* const str = (U8*)STRING(progi->regstclass);
1060         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1061                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1062                     : 1);
1063         char * endpos;
1064         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1065             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1066         else if (prog->float_substr || prog->float_utf8)
1067             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1068         else
1069             endpos= strend;
1070
1071         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1072                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1073
1074         t = s;
1075         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1076         if (!s) {
1077 #ifdef DEBUGGING
1078             const char *what = NULL;
1079 #endif
1080             if (endpos == strend) {
1081                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1082                                 "Could not match STCLASS...\n") );
1083                 goto fail;
1084             }
1085             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1086                                    "This position contradicts STCLASS...\n") );
1087             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1088                 goto fail;
1089             /* Contradict one of substrings */
1090             if (prog->anchored_substr || prog->anchored_utf8) {
1091                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1092                     DEBUG_EXECUTE_r( what = "anchored" );
1093                   hop_and_restart:
1094                     s = HOP3c(t, 1, strend);
1095                     if (s + start_shift + end_shift > strend) {
1096                         /* XXXX Should be taken into account earlier? */
1097                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1098                                                "Could not match STCLASS...\n") );
1099                         goto fail;
1100                     }
1101                     if (!check)
1102                         goto giveup;
1103                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1104                                 "Looking for %s substr starting at offset %ld...\n",
1105                                  what, (long)(s + start_shift - i_strpos)) );
1106                     goto restart;
1107                 }
1108                 /* Have both, check_string is floating */
1109                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1110                     goto retry_floating_check;
1111                 /* Recheck anchored substring, but not floating... */
1112                 s = check_at;
1113                 if (!check)
1114                     goto giveup;
1115                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1116                           "Looking for anchored substr starting at offset %ld...\n",
1117                           (long)(other_last - i_strpos)) );
1118                 goto do_other_anchored;
1119             }
1120             /* Another way we could have checked stclass at the
1121                current position only: */
1122             if (ml_anch) {
1123                 s = t = t + 1;
1124                 if (!check)
1125                     goto giveup;
1126                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1127                           "Looking for /%s^%s/m starting at offset %ld...\n",
1128                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1129                 goto try_at_offset;
1130             }
1131             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1132                 goto fail;
1133             /* Check is floating subtring. */
1134           retry_floating_check:
1135             t = check_at - start_shift;
1136             DEBUG_EXECUTE_r( what = "floating" );
1137             goto hop_and_restart;
1138         }
1139         if (t != s) {
1140             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1141                         "By STCLASS: moving %ld --> %ld\n",
1142                                   (long)(t - i_strpos), (long)(s - i_strpos))
1143                    );
1144         }
1145         else {
1146             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1147                                   "Does not contradict STCLASS...\n");
1148                    );
1149         }
1150     }
1151   giveup:
1152     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1153                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1154                           PL_colors[5], (long)(s - i_strpos)) );
1155     return s;
1156
1157   fail_finish:                          /* Substring not found */
1158     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1159         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1160   fail:
1161     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1162                           PL_colors[4], PL_colors[5]));
1163     return NULL;
1164 }
1165
1166 #define DECL_TRIE_TYPE(scan) \
1167     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1168                     trie_type = (scan->flags != EXACT) \
1169                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1170                               : (utf8_target ? trie_utf8 : trie_plain)
1171
1172 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1173 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1174     switch (trie_type) {                                                    \
1175     case trie_utf8_fold:                                                    \
1176         if ( foldlen>0 ) {                                                  \
1177             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1178             foldlen -= len;                                                 \
1179             uscan += len;                                                   \
1180             len=0;                                                          \
1181         } else {                                                            \
1182             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1183             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1184             foldlen -= UNISKIP( uvc );                                      \
1185             uscan = foldbuf + UNISKIP( uvc );                               \
1186         }                                                                   \
1187         break;                                                              \
1188     case trie_latin_utf8_fold:                                              \
1189         if ( foldlen>0 ) {                                                  \
1190             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1191             foldlen -= len;                                                 \
1192             uscan += len;                                                   \
1193             len=0;                                                          \
1194         } else {                                                            \
1195             len = 1;                                                        \
1196             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1197             foldlen -= UNISKIP( uvc );                                      \
1198             uscan = foldbuf + UNISKIP( uvc );                               \
1199         }                                                                   \
1200         break;                                                              \
1201     case trie_utf8:                                                         \
1202         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1203         break;                                                              \
1204     case trie_plain:                                                        \
1205         uvc = (UV)*uc;                                                      \
1206         len = 1;                                                            \
1207     }                                                                       \
1208     if (uvc < 256) {                                                        \
1209         charid = trie->charmap[ uvc ];                                      \
1210     }                                                                       \
1211     else {                                                                  \
1212         charid = 0;                                                         \
1213         if (widecharmap) {                                                  \
1214             SV** const svpp = hv_fetch(widecharmap,                         \
1215                         (char*)&uvc, sizeof(UV), 0);                        \
1216             if (svpp)                                                       \
1217                 charid = (U16)SvIV(*svpp);                                  \
1218         }                                                                   \
1219     }                                                                       \
1220 } STMT_END
1221
1222 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1223 {                                                      \
1224     char *my_strend= (char *)strend;                   \
1225     if ( (CoNd)                                        \
1226          && (ln == len ||                              \
1227              foldEQ_utf8(s, &my_strend, 0,  utf8_target,   \
1228                         m, NULL, ln, cBOOL(UTF_PATTERN)))      \
1229          && (!reginfo || regtry(reginfo, &s)) )        \
1230         goto got_it;                                   \
1231     else {                                             \
1232          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1233          uvchr_to_utf8(tmpbuf, c);                     \
1234          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1235          if ( f != c                                   \
1236               && (f == c1 || f == c2)                  \
1237               && (ln == len ||                         \
1238                 foldEQ_utf8(s, &my_strend, 0,  utf8_target,\
1239                               m, NULL, ln, cBOOL(UTF_PATTERN)))\
1240               && (!reginfo || regtry(reginfo, &s)) )   \
1241               goto got_it;                             \
1242     }                                                  \
1243 }                                                      \
1244 s += len
1245
1246 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1247 STMT_START {                                              \
1248     while (s <= e) {                                      \
1249         if ( (CoNd)                                       \
1250              && (ln == 1 || (OP(c) == EXACTF             \
1251                               ? foldEQ(s, m, ln)           \
1252                               : foldEQ_locale(s, m, ln)))  \
1253              && (!reginfo || regtry(reginfo, &s)) )        \
1254             goto got_it;                                  \
1255         s++;                                              \
1256     }                                                     \
1257 } STMT_END
1258
1259 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1260 STMT_START {                                          \
1261     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1262         CoDe                                          \
1263         s += uskip;                                   \
1264     }                                                 \
1265 } STMT_END
1266
1267 #define REXEC_FBC_SCAN(CoDe)                          \
1268 STMT_START {                                          \
1269     while (s < strend) {                              \
1270         CoDe                                          \
1271         s++;                                          \
1272     }                                                 \
1273 } STMT_END
1274
1275 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1276 REXEC_FBC_UTF8_SCAN(                                  \
1277     if (CoNd) {                                       \
1278         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1279             goto got_it;                              \
1280         else                                          \
1281             tmp = doevery;                            \
1282     }                                                 \
1283     else                                              \
1284         tmp = 1;                                      \
1285 )
1286
1287 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1288 REXEC_FBC_SCAN(                                       \
1289     if (CoNd) {                                       \
1290         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1291             goto got_it;                              \
1292         else                                          \
1293             tmp = doevery;                            \
1294     }                                                 \
1295     else                                              \
1296         tmp = 1;                                      \
1297 )
1298
1299 #define REXEC_FBC_TRYIT               \
1300 if ((!reginfo || regtry(reginfo, &s))) \
1301     goto got_it
1302
1303 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1304     if (utf8_target) {                                             \
1305         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1306     }                                                          \
1307     else {                                                     \
1308         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1309     }                                                          \
1310     break
1311
1312 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1313     if (utf8_target) {                                             \
1314         UtFpReLoAd;                                            \
1315         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1316     }                                                          \
1317     else {                                                     \
1318         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1319     }                                                          \
1320     break
1321
1322 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1323     PL_reg_flags |= RF_tainted;                                \
1324     if (utf8_target) {                                             \
1325         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1326     }                                                          \
1327     else {                                                     \
1328         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1329     }                                                          \
1330     break
1331
1332 #define DUMP_EXEC_POS(li,s,doutf8) \
1333     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1334
1335 /* We know what class REx starts with.  Try to find this position... */
1336 /* if reginfo is NULL, its a dryrun */
1337 /* annoyingly all the vars in this routine have different names from their counterparts
1338    in regmatch. /grrr */
1339
1340 STATIC char *
1341 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1342     const char *strend, regmatch_info *reginfo)
1343 {
1344         dVAR;
1345         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1346         char *m;
1347         STRLEN ln;
1348         STRLEN lnc;
1349         register STRLEN uskip;
1350         unsigned int c1;
1351         unsigned int c2;
1352         char *e;
1353         register I32 tmp = 1;   /* Scratch variable? */
1354         register const bool utf8_target = PL_reg_match_utf8;
1355         RXi_GET_DECL(prog,progi);
1356
1357         PERL_ARGS_ASSERT_FIND_BYCLASS;
1358
1359         /* We know what class it must start with. */
1360         switch (OP(c)) {
1361         case ANYOF:
1362             if (utf8_target) {
1363                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1364                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1365                           reginclass(prog, c, (U8*)s, 0, utf8_target) :
1366                           REGINCLASS(prog, c, (U8*)s));
1367             }
1368             else {
1369                  while (s < strend) {
1370                       STRLEN skip = 1;
1371
1372                       if (REGINCLASS(prog, c, (U8*)s) ||
1373                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1374                            /* The assignment of 2 is intentional:
1375                             * for the folded sharp s, the skip is 2. */
1376                            (skip = SHARP_S_SKIP))) {
1377                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1378                                 goto got_it;
1379                            else
1380                                 tmp = doevery;
1381                       }
1382                       else
1383                            tmp = 1;
1384                       s += skip;
1385                  }
1386             }
1387             break;
1388         case CANY:
1389             REXEC_FBC_SCAN(
1390                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1391                     goto got_it;
1392                 else
1393                     tmp = doevery;
1394             );
1395             break;
1396         case EXACTF:
1397             m   = STRING(c);
1398             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1399             lnc = (I32) ln;     /* length to match in characters */
1400             if (UTF_PATTERN) {
1401                 STRLEN ulen1, ulen2;
1402                 U8 *sm = (U8 *) m;
1403                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1404                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1405                 /* used by commented-out code below */
1406                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1407
1408                 /* XXX: Since the node will be case folded at compile
1409                    time this logic is a little odd, although im not
1410                    sure that its actually wrong. --dmq */
1411
1412                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1413                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1414
1415                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1416                    codepoint of the first character in the converted
1417                    form, yet originally we did the extra step.
1418                    No tests fail by commenting this code out however
1419                    so Ive left it out. -- dmq.
1420
1421                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1422                                     0, uniflags);
1423                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1424                                     0, uniflags);
1425                 */
1426
1427                 lnc = 0;
1428                 while (sm < ((U8 *) m + ln)) {
1429                     lnc++;
1430                     sm += UTF8SKIP(sm);
1431                 }
1432             }
1433             else {
1434                 c1 = *(U8*)m;
1435                 c2 = PL_fold[c1];
1436             }
1437             goto do_exactf;
1438         case EXACTFL:
1439             m   = STRING(c);
1440             ln  = STR_LEN(c);
1441             lnc = (I32) ln;
1442             c1 = *(U8*)m;
1443             c2 = PL_fold_locale[c1];
1444           do_exactf:
1445             e = HOP3c(strend, -((I32)lnc), s);
1446
1447             if (!reginfo && e < s)
1448                 e = s;                  /* Due to minlen logic of intuit() */
1449
1450             /* The idea in the EXACTF* cases is to first find the
1451              * first character of the EXACTF* node and then, if
1452              * necessary, case-insensitively compare the full
1453              * text of the node.  The c1 and c2 are the first
1454              * characters (though in Unicode it gets a bit
1455              * more complicated because there are more cases
1456              * than just upper and lower: one needs to use
1457              * the so-called folding case for case-insensitive
1458              * matching (called "loose matching" in Unicode).
1459              * foldEQ_utf8() will do just that. */
1460
1461             if (utf8_target || UTF_PATTERN) {
1462                 UV c, f;
1463                 U8 tmpbuf [UTF8_MAXBYTES+1];
1464                 STRLEN len = 1;
1465                 STRLEN foldlen;
1466                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1467                 if (c1 == c2) {
1468                     /* Upper and lower of 1st char are equal -
1469                      * probably not a "letter". */
1470                     while (s <= e) {
1471                         if (utf8_target) {
1472                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1473                                            uniflags);
1474                         } else {
1475                             c = *((U8*)s);
1476                         }
1477                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1478                     }
1479                 }
1480                 else {
1481                     while (s <= e) {
1482                         if (utf8_target) {
1483                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1484                                            uniflags);
1485                         } else {
1486                             c = *((U8*)s);
1487                         }
1488
1489                         /* Handle some of the three Greek sigmas cases.
1490                          * Note that not all the possible combinations
1491                          * are handled here: some of them are handled
1492                          * by the standard folding rules, and some of
1493                          * them (the character class or ANYOF cases)
1494                          * are handled during compiletime in
1495                          * regexec.c:S_regclass(). */
1496                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1497                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1498                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1499
1500                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1501                     }
1502                 }
1503             }
1504             else {
1505                 /* Neither pattern nor string are UTF8 */
1506                 if (c1 == c2)
1507                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1508                 else
1509                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1510             }
1511             break;
1512         case BOUNDL:
1513             PL_reg_flags |= RF_tainted;
1514             /* FALL THROUGH */
1515         case BOUND:
1516             if (utf8_target) {
1517                 if (s == PL_bostr)
1518                     tmp = '\n';
1519                 else {
1520                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1521                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1522                 }
1523                 tmp = ((OP(c) == BOUND ?
1524                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1525                 LOAD_UTF8_CHARCLASS_ALNUM();
1526                 REXEC_FBC_UTF8_SCAN(
1527                     if (tmp == !(OP(c) == BOUND ?
1528                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1529                                  isALNUM_LC_utf8((U8*)s)))
1530                     {
1531                         tmp = !tmp;
1532                         REXEC_FBC_TRYIT;
1533                 }
1534                 );
1535             }
1536             else {  /* Not utf8 */
1537                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1538                 tmp = cBOOL((OP(c) == BOUNDL)
1539                             ? isALNUM_LC(tmp)
1540                             : (isWORDCHAR_L1(tmp)
1541                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1542                 REXEC_FBC_SCAN(
1543                     if (tmp ==
1544                         !((OP(c) == BOUNDL)
1545                           ? isALNUM_LC(*s)
1546                           : (isWORDCHAR_L1((U8) *s)
1547                              && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1548                     {
1549                         tmp = !tmp;
1550                         REXEC_FBC_TRYIT;
1551                 }
1552                 );
1553             }
1554             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1555                 goto got_it;
1556             break;
1557         case NBOUNDL:
1558             PL_reg_flags |= RF_tainted;
1559             /* FALL THROUGH */
1560         case NBOUND:
1561             if (utf8_target) {
1562                 if (s == PL_bostr)
1563                     tmp = '\n';
1564                 else {
1565                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1566                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1567                 }
1568                 tmp = ((OP(c) == NBOUND ?
1569                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1570                 LOAD_UTF8_CHARCLASS_ALNUM();
1571                 REXEC_FBC_UTF8_SCAN(
1572                     if (tmp == !(OP(c) == NBOUND ?
1573                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1574                                  isALNUM_LC_utf8((U8*)s)))
1575                         tmp = !tmp;
1576                     else REXEC_FBC_TRYIT;
1577                 );
1578             }
1579             else {
1580                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1581                 tmp = cBOOL((OP(c) == NBOUNDL)
1582                             ? isALNUM_LC(tmp)
1583                             : (isWORDCHAR_L1(tmp)
1584                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1585                 REXEC_FBC_SCAN(
1586                     if (tmp == ! cBOOL(
1587                             (OP(c) == NBOUNDL)
1588                             ? isALNUM_LC(*s)
1589                             : (isWORDCHAR_L1((U8) *s)
1590                                && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1591                     {
1592                         tmp = !tmp;
1593                     }
1594                     else REXEC_FBC_TRYIT;
1595                 );
1596             }
1597             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1598                 goto got_it;
1599             break;
1600         case ALNUM:
1601             REXEC_FBC_CSCAN_PRELOAD(
1602                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1603                 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1604                 (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
1605             );
1606         case ALNUML:
1607             REXEC_FBC_CSCAN_TAINT(
1608                 isALNUM_LC_utf8((U8*)s),
1609                 isALNUM_LC(*s)
1610             );
1611         case NALNUM:
1612             REXEC_FBC_CSCAN_PRELOAD(
1613                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1614                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1615                 ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
1616             );
1617         case NALNUML:
1618             REXEC_FBC_CSCAN_TAINT(
1619                 !isALNUM_LC_utf8((U8*)s),
1620                 !isALNUM_LC(*s)
1621             );
1622         case SPACE:
1623             REXEC_FBC_CSCAN_PRELOAD(
1624                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1625                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1626                 isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
1627             );
1628         case SPACEL:
1629             REXEC_FBC_CSCAN_TAINT(
1630                 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1631                 isSPACE_LC(*s)
1632             );
1633         case NSPACE:
1634             REXEC_FBC_CSCAN_PRELOAD(
1635                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1636                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1637                 !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
1638             );
1639         case NSPACEL:
1640             REXEC_FBC_CSCAN_TAINT(
1641                 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1642                 !isSPACE_LC(*s)
1643             );
1644         case DIGIT:
1645             REXEC_FBC_CSCAN_PRELOAD(
1646                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1647                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1648                 isDIGIT(*s)
1649             );
1650         case DIGITL:
1651             REXEC_FBC_CSCAN_TAINT(
1652                 isDIGIT_LC_utf8((U8*)s),
1653                 isDIGIT_LC(*s)
1654             );
1655         case NDIGIT:
1656             REXEC_FBC_CSCAN_PRELOAD(
1657                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1658                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1659                 !isDIGIT(*s)
1660             );
1661         case NDIGITL:
1662             REXEC_FBC_CSCAN_TAINT(
1663                 !isDIGIT_LC_utf8((U8*)s),
1664                 !isDIGIT_LC(*s)
1665             );
1666         case LNBREAK:
1667             REXEC_FBC_CSCAN(
1668                 is_LNBREAK_utf8(s),
1669                 is_LNBREAK_latin1(s)
1670             );
1671         case VERTWS:
1672             REXEC_FBC_CSCAN(
1673                 is_VERTWS_utf8(s),
1674                 is_VERTWS_latin1(s)
1675             );
1676         case NVERTWS:
1677             REXEC_FBC_CSCAN(
1678                 !is_VERTWS_utf8(s),
1679                 !is_VERTWS_latin1(s)
1680             );
1681         case HORIZWS:
1682             REXEC_FBC_CSCAN(
1683                 is_HORIZWS_utf8(s),
1684                 is_HORIZWS_latin1(s)
1685             );
1686         case NHORIZWS:
1687             REXEC_FBC_CSCAN(
1688                 !is_HORIZWS_utf8(s),
1689                 !is_HORIZWS_latin1(s)
1690             );
1691         case AHOCORASICKC:
1692         case AHOCORASICK:
1693             {
1694                 DECL_TRIE_TYPE(c);
1695                 /* what trie are we using right now */
1696                 reg_ac_data *aho
1697                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1698                 reg_trie_data *trie
1699                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1700                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1701
1702                 const char *last_start = strend - trie->minlen;
1703 #ifdef DEBUGGING
1704                 const char *real_start = s;
1705 #endif
1706                 STRLEN maxlen = trie->maxlen;
1707                 SV *sv_points;
1708                 U8 **points; /* map of where we were in the input string
1709                                 when reading a given char. For ASCII this
1710                                 is unnecessary overhead as the relationship
1711                                 is always 1:1, but for Unicode, especially
1712                                 case folded Unicode this is not true. */
1713                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1714                 U8 *bitmap=NULL;
1715
1716
1717                 GET_RE_DEBUG_FLAGS_DECL;
1718
1719                 /* We can't just allocate points here. We need to wrap it in
1720                  * an SV so it gets freed properly if there is a croak while
1721                  * running the match */
1722                 ENTER;
1723                 SAVETMPS;
1724                 sv_points=newSV(maxlen * sizeof(U8 *));
1725                 SvCUR_set(sv_points,
1726                     maxlen * sizeof(U8 *));
1727                 SvPOK_on(sv_points);
1728                 sv_2mortal(sv_points);
1729                 points=(U8**)SvPV_nolen(sv_points );
1730                 if ( trie_type != trie_utf8_fold
1731                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1732                 {
1733                     if (trie->bitmap)
1734                         bitmap=(U8*)trie->bitmap;
1735                     else
1736                         bitmap=(U8*)ANYOF_BITMAP(c);
1737                 }
1738                 /* this is the Aho-Corasick algorithm modified a touch
1739                    to include special handling for long "unknown char"
1740                    sequences. The basic idea being that we use AC as long
1741                    as we are dealing with a possible matching char, when
1742                    we encounter an unknown char (and we have not encountered
1743                    an accepting state) we scan forward until we find a legal
1744                    starting char.
1745                    AC matching is basically that of trie matching, except
1746                    that when we encounter a failing transition, we fall back
1747                    to the current states "fail state", and try the current char
1748                    again, a process we repeat until we reach the root state,
1749                    state 1, or a legal transition. If we fail on the root state
1750                    then we can either terminate if we have reached an accepting
1751                    state previously, or restart the entire process from the beginning
1752                    if we have not.
1753
1754                  */
1755                 while (s <= last_start) {
1756                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1757                     U8 *uc = (U8*)s;
1758                     U16 charid = 0;
1759                     U32 base = 1;
1760                     U32 state = 1;
1761                     UV uvc = 0;
1762                     STRLEN len = 0;
1763                     STRLEN foldlen = 0;
1764                     U8 *uscan = (U8*)NULL;
1765                     U8 *leftmost = NULL;
1766 #ifdef DEBUGGING
1767                     U32 accepted_word= 0;
1768 #endif
1769                     U32 pointpos = 0;
1770
1771                     while ( state && uc <= (U8*)strend ) {
1772                         int failed=0;
1773                         U32 word = aho->states[ state ].wordnum;
1774
1775                         if( state==1 ) {
1776                             if ( bitmap ) {
1777                                 DEBUG_TRIE_EXECUTE_r(
1778                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1779                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1780                                             (char *)uc, utf8_target );
1781                                         PerlIO_printf( Perl_debug_log,
1782                                             " Scanning for legal start char...\n");
1783                                     }
1784                                 );
1785                                 if (utf8_target) {
1786                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1787                                         uc += UTF8SKIP(uc);
1788                                     }
1789                                 } else {
1790                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1791                                         uc++;
1792                                     }
1793                                 }
1794                                 s= (char *)uc;
1795                             }
1796                             if (uc >(U8*)last_start) break;
1797                         }
1798
1799                         if ( word ) {
1800                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1801                             if (!leftmost || lpos < leftmost) {
1802                                 DEBUG_r(accepted_word=word);
1803                                 leftmost= lpos;
1804                             }
1805                             if (base==0) break;
1806
1807                         }
1808                         points[pointpos++ % maxlen]= uc;
1809                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1810                                              uscan, len, uvc, charid, foldlen,
1811                                              foldbuf, uniflags);
1812                         DEBUG_TRIE_EXECUTE_r({
1813                             dump_exec_pos( (char *)uc, c, strend, real_start,
1814                                 s,   utf8_target );
1815                             PerlIO_printf(Perl_debug_log,
1816                                 " Charid:%3u CP:%4"UVxf" ",
1817                                  charid, uvc);
1818                         });
1819
1820                         do {
1821 #ifdef DEBUGGING
1822                             word = aho->states[ state ].wordnum;
1823 #endif
1824                             base = aho->states[ state ].trans.base;
1825
1826                             DEBUG_TRIE_EXECUTE_r({
1827                                 if (failed)
1828                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1829                                         s,   utf8_target );
1830                                 PerlIO_printf( Perl_debug_log,
1831                                     "%sState: %4"UVxf", word=%"UVxf,
1832                                     failed ? " Fail transition to " : "",
1833                                     (UV)state, (UV)word);
1834                             });
1835                             if ( base ) {
1836                                 U32 tmp;
1837                                 I32 offset;
1838                                 if (charid &&
1839                                      ( ((offset = base + charid
1840                                         - 1 - trie->uniquecharcount)) >= 0)
1841                                      && ((U32)offset < trie->lasttrans)
1842                                      && trie->trans[offset].check == state
1843                                      && (tmp=trie->trans[offset].next))
1844                                 {
1845                                     DEBUG_TRIE_EXECUTE_r(
1846                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1847                                     state = tmp;
1848                                     break;
1849                                 }
1850                                 else {
1851                                     DEBUG_TRIE_EXECUTE_r(
1852                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1853                                     failed = 1;
1854                                     state = aho->fail[state];
1855                                 }
1856                             }
1857                             else {
1858                                 /* we must be accepting here */
1859                                 DEBUG_TRIE_EXECUTE_r(
1860                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1861                                 failed = 1;
1862                                 break;
1863                             }
1864                         } while(state);
1865                         uc += len;
1866                         if (failed) {
1867                             if (leftmost)
1868                                 break;
1869                             if (!state) state = 1;
1870                         }
1871                     }
1872                     if ( aho->states[ state ].wordnum ) {
1873                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1874                         if (!leftmost || lpos < leftmost) {
1875                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1876                             leftmost = lpos;
1877                         }
1878                     }
1879                     if (leftmost) {
1880                         s = (char*)leftmost;
1881                         DEBUG_TRIE_EXECUTE_r({
1882                             PerlIO_printf(
1883                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1884                                 (UV)accepted_word, (IV)(s - real_start)
1885                             );
1886                         });
1887                         if (!reginfo || regtry(reginfo, &s)) {
1888                             FREETMPS;
1889                             LEAVE;
1890                             goto got_it;
1891                         }
1892                         s = HOPc(s,1);
1893                         DEBUG_TRIE_EXECUTE_r({
1894                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1895                         });
1896                     } else {
1897                         DEBUG_TRIE_EXECUTE_r(
1898                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1899                         break;
1900                     }
1901                 }
1902                 FREETMPS;
1903                 LEAVE;
1904             }
1905             break;
1906         default:
1907             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1908             break;
1909         }
1910         return 0;
1911       got_it:
1912         return s;
1913 }
1914
1915
1916 /*
1917  - regexec_flags - match a regexp against a string
1918  */
1919 I32
1920 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1921               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1922 /* strend: pointer to null at end of string */
1923 /* strbeg: real beginning of string */
1924 /* minend: end of match must be >=minend after stringarg. */
1925 /* data: May be used for some additional optimizations.
1926          Currently its only used, with a U32 cast, for transmitting
1927          the ganch offset when doing a /g match. This will change */
1928 /* nosave: For optimizations. */
1929 {
1930     dVAR;
1931     struct regexp *const prog = (struct regexp *)SvANY(rx);
1932     /*register*/ char *s;
1933     register regnode *c;
1934     /*register*/ char *startpos = stringarg;
1935     I32 minlen;         /* must match at least this many chars */
1936     I32 dontbother = 0; /* how many characters not to try at end */
1937     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1938     I32 scream_pos = -1;                /* Internal iterator of scream. */
1939     char *scream_olds = NULL;
1940     const bool utf8_target = cBOOL(DO_UTF8(sv));
1941     I32 multiline;
1942     RXi_GET_DECL(prog,progi);
1943     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1944     regexp_paren_pair *swap = NULL;
1945     GET_RE_DEBUG_FLAGS_DECL;
1946
1947     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1948     PERL_UNUSED_ARG(data);
1949
1950     /* Be paranoid... */
1951     if (prog == NULL || startpos == NULL) {
1952         Perl_croak(aTHX_ "NULL regexp parameter");
1953         return 0;
1954     }
1955
1956     multiline = prog->extflags & RXf_PMf_MULTILINE;
1957     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1958
1959     RX_MATCH_UTF8_set(rx, utf8_target);
1960     DEBUG_EXECUTE_r(
1961         debug_start_match(rx, utf8_target, startpos, strend,
1962         "Matching");
1963     );
1964
1965     minlen = prog->minlen;
1966
1967     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1968         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1969                               "String too short [regexec_flags]...\n"));
1970         goto phooey;
1971     }
1972
1973
1974     /* Check validity of program. */
1975     if (UCHARAT(progi->program) != REG_MAGIC) {
1976         Perl_croak(aTHX_ "corrupted regexp program");
1977     }
1978
1979     PL_reg_flags = 0;
1980     PL_reg_eval_set = 0;
1981     PL_reg_maxiter = 0;
1982
1983     if (RX_UTF8(rx))
1984         PL_reg_flags |= RF_utf8;
1985
1986     /* Mark beginning of line for ^ and lookbehind. */
1987     reginfo.bol = startpos; /* XXX not used ??? */
1988     PL_bostr  = strbeg;
1989     reginfo.sv = sv;
1990
1991     /* Mark end of line for $ (and such) */
1992     PL_regeol = strend;
1993
1994     /* see how far we have to get to not match where we matched before */
1995     reginfo.till = startpos+minend;
1996
1997     /* If there is a "must appear" string, look for it. */
1998     s = startpos;
1999
2000     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2001         MAGIC *mg;
2002         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2003             reginfo.ganch = startpos + prog->gofs;
2004             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2005               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2006         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2007                   && SvMAGIC(sv)
2008                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2009                   && mg->mg_len >= 0) {
2010             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2011             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2012                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2013
2014             if (prog->extflags & RXf_ANCH_GPOS) {
2015                 if (s > reginfo.ganch)
2016                     goto phooey;
2017                 s = reginfo.ganch - prog->gofs;
2018                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2019                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2020                 if (s < strbeg)
2021                     goto phooey;
2022             }
2023         }
2024         else if (data) {
2025             reginfo.ganch = strbeg + PTR2UV(data);
2026             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2027                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2028
2029         } else {                                /* pos() not defined */
2030             reginfo.ganch = strbeg;
2031             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2032                  "GPOS: reginfo.ganch = strbeg\n"));
2033         }
2034     }
2035     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2036         /* We have to be careful. If the previous successful match
2037            was from this regex we don't want a subsequent partially
2038            successful match to clobber the old results.
2039            So when we detect this possibility we add a swap buffer
2040            to the re, and switch the buffer each match. If we fail
2041            we switch it back, otherwise we leave it swapped.
2042         */
2043         swap = prog->offs;
2044         /* do we need a save destructor here for eval dies? */
2045         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2046     }
2047     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2048         re_scream_pos_data d;
2049
2050         d.scream_olds = &scream_olds;
2051         d.scream_pos = &scream_pos;
2052         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2053         if (!s) {
2054             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2055             goto phooey;        /* not present */
2056         }
2057     }
2058
2059
2060
2061     /* Simplest case:  anchored match need be tried only once. */
2062     /*  [unless only anchor is BOL and multiline is set] */
2063     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2064         if (s == startpos && regtry(&reginfo, &startpos))
2065             goto got_it;
2066         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2067                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2068         {
2069             char *end;
2070
2071             if (minlen)
2072                 dontbother = minlen - 1;
2073             end = HOP3c(strend, -dontbother, strbeg) - 1;
2074             /* for multiline we only have to try after newlines */
2075             if (prog->check_substr || prog->check_utf8) {
2076                 /* because of the goto we can not easily reuse the macros for bifurcating the
2077                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2078                 if (utf8_target) {
2079                     if (s == startpos)
2080                         goto after_try_utf8;
2081                     while (1) {
2082                         if (regtry(&reginfo, &s)) {
2083                             goto got_it;
2084                         }
2085                       after_try_utf8:
2086                         if (s > end) {
2087                             goto phooey;
2088                         }
2089                         if (prog->extflags & RXf_USE_INTUIT) {
2090                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2091                             if (!s) {
2092                                 goto phooey;
2093                             }
2094                         }
2095                         else {
2096                             s += UTF8SKIP(s);
2097                         }
2098                     }
2099                 } /* end search for check string in unicode */
2100                 else {
2101                     if (s == startpos) {
2102                         goto after_try_latin;
2103                     }
2104                     while (1) {
2105                         if (regtry(&reginfo, &s)) {
2106                             goto got_it;
2107                         }
2108                       after_try_latin:
2109                         if (s > end) {
2110                             goto phooey;
2111                         }
2112                         if (prog->extflags & RXf_USE_INTUIT) {
2113                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2114                             if (!s) {
2115                                 goto phooey;
2116                             }
2117                         }
2118                         else {
2119                             s++;
2120                         }
2121                     }
2122                 } /* end search for check string in latin*/
2123             } /* end search for check string */
2124             else { /* search for newline */
2125                 if (s > startpos) {
2126                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2127                     s--;
2128                 }
2129                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2130                 while (s < end) {
2131                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2132                         if (regtry(&reginfo, &s))
2133                             goto got_it;
2134                     }
2135                 }
2136             } /* end search for newline */
2137         } /* end anchored/multiline check string search */
2138         goto phooey;
2139     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2140     {
2141         /* the warning about reginfo.ganch being used without intialization
2142            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2143            and we only enter this block when the same bit is set. */
2144         char *tmp_s = reginfo.ganch - prog->gofs;
2145
2146         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2147             goto got_it;
2148         goto phooey;
2149     }
2150
2151     /* Messy cases:  unanchored match. */
2152     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2153         /* we have /x+whatever/ */
2154         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2155         char ch;
2156 #ifdef DEBUGGING
2157         int did_match = 0;
2158 #endif
2159         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2160             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2161         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2162
2163         if (utf8_target) {
2164             REXEC_FBC_SCAN(
2165                 if (*s == ch) {
2166                     DEBUG_EXECUTE_r( did_match = 1 );
2167                     if (regtry(&reginfo, &s)) goto got_it;
2168                     s += UTF8SKIP(s);
2169                     while (s < strend && *s == ch)
2170                         s += UTF8SKIP(s);
2171                 }
2172             );
2173         }
2174         else {
2175             REXEC_FBC_SCAN(
2176                 if (*s == ch) {
2177                     DEBUG_EXECUTE_r( did_match = 1 );
2178                     if (regtry(&reginfo, &s)) goto got_it;
2179                     s++;
2180                     while (s < strend && *s == ch)
2181                         s++;
2182                 }
2183             );
2184         }
2185         DEBUG_EXECUTE_r(if (!did_match)
2186                 PerlIO_printf(Perl_debug_log,
2187                                   "Did not find anchored character...\n")
2188                );
2189     }
2190     else if (prog->anchored_substr != NULL
2191               || prog->anchored_utf8 != NULL
2192               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2193                   && prog->float_max_offset < strend - s)) {
2194         SV *must;
2195         I32 back_max;
2196         I32 back_min;
2197         char *last;
2198         char *last1;            /* Last position checked before */
2199 #ifdef DEBUGGING
2200         int did_match = 0;
2201 #endif
2202         if (prog->anchored_substr || prog->anchored_utf8) {
2203             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2204                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2205             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2206             back_max = back_min = prog->anchored_offset;
2207         } else {
2208             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2209                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2210             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2211             back_max = prog->float_max_offset;
2212             back_min = prog->float_min_offset;
2213         }
2214
2215
2216         if (must == &PL_sv_undef)
2217             /* could not downgrade utf8 check substring, so must fail */
2218             goto phooey;
2219
2220         if (back_min<0) {
2221             last = strend;
2222         } else {
2223             last = HOP3c(strend,        /* Cannot start after this */
2224                   -(I32)(CHR_SVLEN(must)
2225                          - (SvTAIL(must) != 0) + back_min), strbeg);
2226         }
2227         if (s > PL_bostr)
2228             last1 = HOPc(s, -1);
2229         else
2230             last1 = s - 1;      /* bogus */
2231
2232         /* XXXX check_substr already used to find "s", can optimize if
2233            check_substr==must. */
2234         scream_pos = -1;
2235         dontbother = end_shift;
2236         strend = HOPc(strend, -dontbother);
2237         while ( (s <= last) &&
2238                 ((flags & REXEC_SCREAM)
2239                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2240                                     end_shift, &scream_pos, 0))
2241                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2242                                   (unsigned char*)strend, must,
2243                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2244             /* we may be pointing at the wrong string */
2245             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2246                 s = strbeg + (s - SvPVX_const(sv));
2247             DEBUG_EXECUTE_r( did_match = 1 );
2248             if (HOPc(s, -back_max) > last1) {
2249                 last1 = HOPc(s, -back_min);
2250                 s = HOPc(s, -back_max);
2251             }
2252             else {
2253                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2254
2255                 last1 = HOPc(s, -back_min);
2256                 s = t;
2257             }
2258             if (utf8_target) {
2259                 while (s <= last1) {
2260                     if (regtry(&reginfo, &s))
2261                         goto got_it;
2262                     s += UTF8SKIP(s);
2263                 }
2264             }
2265             else {
2266                 while (s <= last1) {
2267                     if (regtry(&reginfo, &s))
2268                         goto got_it;
2269                     s++;
2270                 }
2271             }
2272         }
2273         DEBUG_EXECUTE_r(if (!did_match) {
2274             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2275                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2276             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2277                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2278                                ? "anchored" : "floating"),
2279                 quoted, RE_SV_TAIL(must));
2280         });
2281         goto phooey;
2282     }
2283     else if ( (c = progi->regstclass) ) {
2284         if (minlen) {
2285             const OPCODE op = OP(progi->regstclass);
2286             /* don't bother with what can't match */
2287             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2288                 strend = HOPc(strend, -(minlen - 1));
2289         }
2290         DEBUG_EXECUTE_r({
2291             SV * const prop = sv_newmortal();
2292             regprop(prog, prop, c);
2293             {
2294                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2295                     s,strend-s,60);
2296                 PerlIO_printf(Perl_debug_log,
2297                     "Matching stclass %.*s against %s (%d bytes)\n",
2298                     (int)SvCUR(prop), SvPVX_const(prop),
2299                      quoted, (int)(strend - s));
2300             }
2301         });
2302         if (find_byclass(prog, c, s, strend, &reginfo))
2303             goto got_it;
2304         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2305     }
2306     else {
2307         dontbother = 0;
2308         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2309             /* Trim the end. */
2310             char *last;
2311             SV* float_real;
2312
2313             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2314                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2315             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2316
2317             if (flags & REXEC_SCREAM) {
2318                 last = screaminstr(sv, float_real, s - strbeg,
2319                                    end_shift, &scream_pos, 1); /* last one */
2320                 if (!last)
2321                     last = scream_olds; /* Only one occurrence. */
2322                 /* we may be pointing at the wrong string */
2323                 else if (RXp_MATCH_COPIED(prog))
2324                     s = strbeg + (s - SvPVX_const(sv));
2325             }
2326             else {
2327                 STRLEN len;
2328                 const char * const little = SvPV_const(float_real, len);
2329
2330                 if (SvTAIL(float_real)) {
2331                     if (memEQ(strend - len + 1, little, len - 1))
2332                         last = strend - len + 1;
2333                     else if (!multiline)
2334                         last = memEQ(strend - len, little, len)
2335                             ? strend - len : NULL;
2336                     else
2337                         goto find_last;
2338                 } else {
2339                   find_last:
2340                     if (len)
2341                         last = rninstr(s, strend, little, little + len);
2342                     else
2343                         last = strend;  /* matching "$" */
2344                 }
2345             }
2346             if (last == NULL) {
2347                 DEBUG_EXECUTE_r(
2348                     PerlIO_printf(Perl_debug_log,
2349                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2350                         PL_colors[4], PL_colors[5]));
2351                 goto phooey; /* Should not happen! */
2352             }
2353             dontbother = strend - last + prog->float_min_offset;
2354         }
2355         if (minlen && (dontbother < minlen))
2356             dontbother = minlen - 1;
2357         strend -= dontbother;              /* this one's always in bytes! */
2358         /* We don't know much -- general case. */
2359         if (utf8_target) {
2360             for (;;) {
2361                 if (regtry(&reginfo, &s))
2362                     goto got_it;
2363                 if (s >= strend)
2364                     break;
2365                 s += UTF8SKIP(s);
2366             };
2367         }
2368         else {
2369             do {
2370                 if (regtry(&reginfo, &s))
2371                     goto got_it;
2372             } while (s++ < strend);
2373         }
2374     }
2375
2376     /* Failure. */
2377     goto phooey;
2378
2379 got_it:
2380     Safefree(swap);
2381     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2382
2383     if (PL_reg_eval_set)
2384         restore_pos(aTHX_ prog);
2385     if (RXp_PAREN_NAMES(prog))
2386         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2387
2388     /* make sure $`, $&, $', and $digit will work later */
2389     if ( !(flags & REXEC_NOT_FIRST) ) {
2390         RX_MATCH_COPY_FREE(rx);
2391         if (flags & REXEC_COPY_STR) {
2392             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2393 #ifdef PERL_OLD_COPY_ON_WRITE
2394             if ((SvIsCOW(sv)
2395                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2396                 if (DEBUG_C_TEST) {
2397                     PerlIO_printf(Perl_debug_log,
2398                                   "Copy on write: regexp capture, type %d\n",
2399                                   (int) SvTYPE(sv));
2400                 }
2401                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2402                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2403                 assert (SvPOKp(prog->saved_copy));
2404             } else
2405 #endif
2406             {
2407                 RX_MATCH_COPIED_on(rx);
2408                 s = savepvn(strbeg, i);
2409                 prog->subbeg = s;
2410             }
2411             prog->sublen = i;
2412         }
2413         else {
2414             prog->subbeg = strbeg;
2415             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2416         }
2417     }
2418
2419     return 1;
2420
2421 phooey:
2422     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2423                           PL_colors[4], PL_colors[5]));
2424     if (PL_reg_eval_set)
2425         restore_pos(aTHX_ prog);
2426     if (swap) {
2427         /* we failed :-( roll it back */
2428         Safefree(prog->offs);
2429         prog->offs = swap;
2430     }
2431
2432     return 0;
2433 }
2434
2435
2436 /*
2437  - regtry - try match at specific point
2438  */
2439 STATIC I32                      /* 0 failure, 1 success */
2440 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2441 {
2442     dVAR;
2443     CHECKPOINT lastcp;
2444     REGEXP *const rx = reginfo->prog;
2445     regexp *const prog = (struct regexp *)SvANY(rx);
2446     RXi_GET_DECL(prog,progi);
2447     GET_RE_DEBUG_FLAGS_DECL;
2448
2449     PERL_ARGS_ASSERT_REGTRY;
2450
2451     reginfo->cutpoint=NULL;
2452
2453     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2454         MAGIC *mg;
2455
2456         PL_reg_eval_set = RS_init;
2457         DEBUG_EXECUTE_r(DEBUG_s(
2458             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2459                           (IV)(PL_stack_sp - PL_stack_base));
2460             ));
2461         SAVESTACK_CXPOS();
2462         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2463         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2464         SAVETMPS;
2465         /* Apparently this is not needed, judging by wantarray. */
2466         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2467            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2468
2469         if (reginfo->sv) {
2470             /* Make $_ available to executed code. */
2471             if (reginfo->sv != DEFSV) {
2472                 SAVE_DEFSV;
2473                 DEFSV_set(reginfo->sv);
2474             }
2475
2476             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2477                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2478                 /* prepare for quick setting of pos */
2479 #ifdef PERL_OLD_COPY_ON_WRITE
2480                 if (SvIsCOW(reginfo->sv))
2481                     sv_force_normal_flags(reginfo->sv, 0);
2482 #endif
2483                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2484                                  &PL_vtbl_mglob, NULL, 0);
2485                 mg->mg_len = -1;
2486             }
2487             PL_reg_magic    = mg;
2488             PL_reg_oldpos   = mg->mg_len;
2489             SAVEDESTRUCTOR_X(restore_pos, prog);
2490         }
2491         if (!PL_reg_curpm) {
2492             Newxz(PL_reg_curpm, 1, PMOP);
2493 #ifdef USE_ITHREADS
2494             {
2495                 SV* const repointer = &PL_sv_undef;
2496                 /* this regexp is also owned by the new PL_reg_curpm, which
2497                    will try to free it.  */
2498                 av_push(PL_regex_padav, repointer);
2499                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2500                 PL_regex_pad = AvARRAY(PL_regex_padav);
2501             }
2502 #endif
2503         }
2504 #ifdef USE_ITHREADS
2505         /* It seems that non-ithreads works both with and without this code.
2506            So for efficiency reasons it seems best not to have the code
2507            compiled when it is not needed.  */
2508         /* This is safe against NULLs: */
2509         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2510         /* PM_reg_curpm owns a reference to this regexp.  */
2511         ReREFCNT_inc(rx);
2512 #endif
2513         PM_SETRE(PL_reg_curpm, rx);
2514         PL_reg_oldcurpm = PL_curpm;
2515         PL_curpm = PL_reg_curpm;
2516         if (RXp_MATCH_COPIED(prog)) {
2517             /*  Here is a serious problem: we cannot rewrite subbeg,
2518                 since it may be needed if this match fails.  Thus
2519                 $` inside (?{}) could fail... */
2520             PL_reg_oldsaved = prog->subbeg;
2521             PL_reg_oldsavedlen = prog->sublen;
2522 #ifdef PERL_OLD_COPY_ON_WRITE
2523             PL_nrs = prog->saved_copy;
2524 #endif
2525             RXp_MATCH_COPIED_off(prog);
2526         }
2527         else
2528             PL_reg_oldsaved = NULL;
2529         prog->subbeg = PL_bostr;
2530         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2531     }
2532     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2533     prog->offs[0].start = *startpos - PL_bostr;
2534     PL_reginput = *startpos;
2535     PL_reglastparen = &prog->lastparen;
2536     PL_reglastcloseparen = &prog->lastcloseparen;
2537     prog->lastparen = 0;
2538     prog->lastcloseparen = 0;
2539     PL_regsize = 0;
2540     PL_regoffs = prog->offs;
2541     if (PL_reg_start_tmpl <= prog->nparens) {
2542         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2543         if(PL_reg_start_tmp)
2544             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2545         else
2546             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2547     }
2548
2549     /* XXXX What this code is doing here?!!!  There should be no need
2550        to do this again and again, PL_reglastparen should take care of
2551        this!  --ilya*/
2552
2553     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2554      * Actually, the code in regcppop() (which Ilya may be meaning by
2555      * PL_reglastparen), is not needed at all by the test suite
2556      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2557      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2558      * Meanwhile, this code *is* needed for the
2559      * above-mentioned test suite tests to succeed.  The common theme
2560      * on those tests seems to be returning null fields from matches.
2561      * --jhi updated by dapm */
2562 #if 1
2563     if (prog->nparens) {
2564         regexp_paren_pair *pp = PL_regoffs;
2565         register I32 i;
2566         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2567             ++pp;
2568             pp->start = -1;
2569             pp->end = -1;
2570         }
2571     }
2572 #endif
2573     REGCP_SET(lastcp);
2574     if (regmatch(reginfo, progi->program + 1)) {
2575         PL_regoffs[0].end = PL_reginput - PL_bostr;
2576         return 1;
2577     }
2578     if (reginfo->cutpoint)
2579         *startpos= reginfo->cutpoint;
2580     REGCP_UNWIND(lastcp);
2581     return 0;
2582 }
2583
2584
2585 #define sayYES goto yes
2586 #define sayNO goto no
2587 #define sayNO_SILENT goto no_silent
2588
2589 /* we dont use STMT_START/END here because it leads to
2590    "unreachable code" warnings, which are bogus, but distracting. */
2591 #define CACHEsayNO \
2592     if (ST.cache_mask) \
2593        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2594     sayNO
2595
2596 /* this is used to determine how far from the left messages like
2597    'failed...' are printed. It should be set such that messages
2598    are inline with the regop output that created them.
2599 */
2600 #define REPORT_CODE_OFF 32
2601
2602
2603 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2604 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2605
2606 #define SLAB_FIRST(s) (&(s)->states[0])
2607 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2608
2609 /* grab a new slab and return the first slot in it */
2610
2611 STATIC regmatch_state *
2612 S_push_slab(pTHX)
2613 {
2614 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2615     dMY_CXT;
2616 #endif
2617     regmatch_slab *s = PL_regmatch_slab->next;
2618     if (!s) {
2619         Newx(s, 1, regmatch_slab);
2620         s->prev = PL_regmatch_slab;
2621         s->next = NULL;
2622         PL_regmatch_slab->next = s;
2623     }
2624     PL_regmatch_slab = s;
2625     return SLAB_FIRST(s);
2626 }
2627
2628
2629 /* push a new state then goto it */
2630
2631 #define PUSH_STATE_GOTO(state, node) \
2632     scan = node; \
2633     st->resume_state = state; \
2634     goto push_state;
2635
2636 /* push a new state with success backtracking, then goto it */
2637
2638 #define PUSH_YES_STATE_GOTO(state, node) \
2639     scan = node; \
2640     st->resume_state = state; \
2641     goto push_yes_state;
2642
2643
2644
2645 /*
2646
2647 regmatch() - main matching routine
2648
2649 This is basically one big switch statement in a loop. We execute an op,
2650 set 'next' to point the next op, and continue. If we come to a point which
2651 we may need to backtrack to on failure such as (A|B|C), we push a
2652 backtrack state onto the backtrack stack. On failure, we pop the top
2653 state, and re-enter the loop at the state indicated. If there are no more
2654 states to pop, we return failure.
2655
2656 Sometimes we also need to backtrack on success; for example /A+/, where
2657 after successfully matching one A, we need to go back and try to
2658 match another one; similarly for lookahead assertions: if the assertion
2659 completes successfully, we backtrack to the state just before the assertion
2660 and then carry on.  In these cases, the pushed state is marked as
2661 'backtrack on success too'. This marking is in fact done by a chain of
2662 pointers, each pointing to the previous 'yes' state. On success, we pop to
2663 the nearest yes state, discarding any intermediate failure-only states.
2664 Sometimes a yes state is pushed just to force some cleanup code to be
2665 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2666 it to free the inner regex.
2667
2668 Note that failure backtracking rewinds the cursor position, while
2669 success backtracking leaves it alone.
2670
2671 A pattern is complete when the END op is executed, while a subpattern
2672 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2673 ops trigger the "pop to last yes state if any, otherwise return true"
2674 behaviour.
2675
2676 A common convention in this function is to use A and B to refer to the two
2677 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2678 the subpattern to be matched possibly multiple times, while B is the entire
2679 rest of the pattern. Variable and state names reflect this convention.
2680
2681 The states in the main switch are the union of ops and failure/success of
2682 substates associated with with that op.  For example, IFMATCH is the op
2683 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2684 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2685 successfully matched A and IFMATCH_A_fail is a state saying that we have
2686 just failed to match A. Resume states always come in pairs. The backtrack
2687 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2688 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2689 on success or failure.
2690
2691 The struct that holds a backtracking state is actually a big union, with
2692 one variant for each major type of op. The variable st points to the
2693 top-most backtrack struct. To make the code clearer, within each
2694 block of code we #define ST to alias the relevant union.
2695
2696 Here's a concrete example of a (vastly oversimplified) IFMATCH
2697 implementation:
2698
2699     switch (state) {
2700     ....
2701
2702 #define ST st->u.ifmatch
2703
2704     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2705         ST.foo = ...; // some state we wish to save
2706         ...
2707         // push a yes backtrack state with a resume value of
2708         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2709         // first node of A:
2710         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2711         // NOTREACHED
2712
2713     case IFMATCH_A: // we have successfully executed A; now continue with B
2714         next = B;
2715         bar = ST.foo; // do something with the preserved value
2716         break;
2717
2718     case IFMATCH_A_fail: // A failed, so the assertion failed
2719         ...;   // do some housekeeping, then ...
2720         sayNO; // propagate the failure
2721
2722 #undef ST
2723
2724     ...
2725     }
2726
2727 For any old-timers reading this who are familiar with the old recursive
2728 approach, the code above is equivalent to:
2729
2730     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2731     {
2732         int foo = ...
2733         ...
2734         if (regmatch(A)) {
2735             next = B;
2736             bar = foo;
2737             break;
2738         }
2739         ...;   // do some housekeeping, then ...
2740         sayNO; // propagate the failure
2741     }
2742
2743 The topmost backtrack state, pointed to by st, is usually free. If you
2744 want to claim it, populate any ST.foo fields in it with values you wish to
2745 save, then do one of
2746
2747         PUSH_STATE_GOTO(resume_state, node);
2748         PUSH_YES_STATE_GOTO(resume_state, node);
2749
2750 which sets that backtrack state's resume value to 'resume_state', pushes a
2751 new free entry to the top of the backtrack stack, then goes to 'node'.
2752 On backtracking, the free slot is popped, and the saved state becomes the
2753 new free state. An ST.foo field in this new top state can be temporarily
2754 accessed to retrieve values, but once the main loop is re-entered, it
2755 becomes available for reuse.
2756
2757 Note that the depth of the backtrack stack constantly increases during the
2758 left-to-right execution of the pattern, rather than going up and down with
2759 the pattern nesting. For example the stack is at its maximum at Z at the
2760 end of the pattern, rather than at X in the following:
2761
2762     /(((X)+)+)+....(Y)+....Z/
2763
2764 The only exceptions to this are lookahead/behind assertions and the cut,
2765 (?>A), which pop all the backtrack states associated with A before
2766 continuing.
2767
2768 Bascktrack state structs are allocated in slabs of about 4K in size.
2769 PL_regmatch_state and st always point to the currently active state,
2770 and PL_regmatch_slab points to the slab currently containing
2771 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2772 allocated, and is never freed until interpreter destruction. When the slab
2773 is full, a new one is allocated and chained to the end. At exit from
2774 regmatch(), slabs allocated since entry are freed.
2775
2776 */
2777
2778
2779 #define DEBUG_STATE_pp(pp)                                  \
2780     DEBUG_STATE_r({                                         \
2781         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2782         PerlIO_printf(Perl_debug_log,                       \
2783             "    %*s"pp" %s%s%s%s%s\n",                     \
2784             depth*2, "",                                    \
2785             PL_reg_name[st->resume_state],                     \
2786             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2787             ((st==yes_state) ? "Y" : ""),                   \
2788             ((st==mark_state) ? "M" : ""),                  \
2789             ((st==yes_state||st==mark_state) ? "]" : "")    \
2790         );                                                  \
2791     });
2792
2793
2794 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2795
2796 #ifdef DEBUGGING
2797
2798 STATIC void
2799 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2800     const char *start, const char *end, const char *blurb)
2801 {
2802     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2803
2804     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2805
2806     if (!PL_colorset)
2807             reginitcolors();
2808     {
2809         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2810             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2811
2812         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2813             start, end - start, 60);
2814
2815         PerlIO_printf(Perl_debug_log,
2816             "%s%s REx%s %s against %s\n",
2817                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2818
2819         if (utf8_target||utf8_pat)
2820             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2821                 utf8_pat ? "pattern" : "",
2822                 utf8_pat && utf8_target ? " and " : "",
2823                 utf8_target ? "string" : ""
2824             );
2825     }
2826 }
2827
2828 STATIC void
2829 S_dump_exec_pos(pTHX_ const char *locinput,
2830                       const regnode *scan,
2831                       const char *loc_regeol,
2832                       const char *loc_bostr,
2833                       const char *loc_reg_starttry,
2834                       const bool utf8_target)
2835 {
2836     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2837     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2838     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2839     /* The part of the string before starttry has one color
2840        (pref0_len chars), between starttry and current
2841        position another one (pref_len - pref0_len chars),
2842        after the current position the third one.
2843        We assume that pref0_len <= pref_len, otherwise we
2844        decrease pref0_len.  */
2845     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2846         ? (5 + taill) - l : locinput - loc_bostr;
2847     int pref0_len;
2848
2849     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2850
2851     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2852         pref_len++;
2853     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2854     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2855         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2856               ? (5 + taill) - pref_len : loc_regeol - locinput);
2857     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2858         l--;
2859     if (pref0_len < 0)
2860         pref0_len = 0;
2861     if (pref0_len > pref_len)
2862         pref0_len = pref_len;
2863     {
2864         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2865
2866         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2867             (locinput - pref_len),pref0_len, 60, 4, 5);
2868
2869         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2870                     (locinput - pref_len + pref0_len),
2871                     pref_len - pref0_len, 60, 2, 3);
2872
2873         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2874                     locinput, loc_regeol - locinput, 10, 0, 1);
2875
2876         const STRLEN tlen=len0+len1+len2;
2877         PerlIO_printf(Perl_debug_log,
2878                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2879                     (IV)(locinput - loc_bostr),
2880                     len0, s0,
2881                     len1, s1,
2882                     (docolor ? "" : "> <"),
2883                     len2, s2,
2884                     (int)(tlen > 19 ? 0 :  19 - tlen),
2885                     "");
2886     }
2887 }
2888
2889 #endif
2890
2891 /* reg_check_named_buff_matched()
2892  * Checks to see if a named buffer has matched. The data array of
2893  * buffer numbers corresponding to the buffer is expected to reside
2894  * in the regexp->data->data array in the slot stored in the ARG() of
2895  * node involved. Note that this routine doesn't actually care about the
2896  * name, that information is not preserved from compilation to execution.
2897  * Returns the index of the leftmost defined buffer with the given name
2898  * or 0 if non of the buffers matched.
2899  */
2900 STATIC I32
2901 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2902 {
2903     I32 n;
2904     RXi_GET_DECL(rex,rexi);
2905     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2906     I32 *nums=(I32*)SvPVX(sv_dat);
2907
2908     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2909
2910     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2911         if ((I32)*PL_reglastparen >= nums[n] &&
2912             PL_regoffs[nums[n]].end != -1)
2913         {
2914             return nums[n];
2915         }
2916     }
2917     return 0;
2918 }
2919
2920
2921 /* free all slabs above current one  - called during LEAVE_SCOPE */
2922
2923 STATIC void
2924 S_clear_backtrack_stack(pTHX_ void *p)
2925 {
2926     regmatch_slab *s = PL_regmatch_slab->next;
2927     PERL_UNUSED_ARG(p);
2928
2929     if (!s)
2930         return;
2931     PL_regmatch_slab->next = NULL;
2932     while (s) {
2933         regmatch_slab * const osl = s;
2934         s = s->next;
2935         Safefree(osl);
2936     }
2937 }
2938
2939
2940 #define SETREX(Re1,Re2) \
2941     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2942     Re1 = (Re2)
2943
2944 STATIC I32                      /* 0 failure, 1 success */
2945 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2946 {
2947 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2948     dMY_CXT;
2949 #endif
2950     dVAR;
2951     register const bool utf8_target = PL_reg_match_utf8;
2952     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2953     REGEXP *rex_sv = reginfo->prog;
2954     regexp *rex = (struct regexp *)SvANY(rex_sv);
2955     RXi_GET_DECL(rex,rexi);
2956     I32 oldsave;
2957     /* the current state. This is a cached copy of PL_regmatch_state */
2958     register regmatch_state *st;
2959     /* cache heavy used fields of st in registers */
2960     register regnode *scan;
2961     register regnode *next;
2962     register U32 n = 0; /* general value; init to avoid compiler warning */
2963     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2964     register char *locinput = PL_reginput;
2965     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2966
2967     bool result = 0;        /* return value of S_regmatch */
2968     int depth = 0;          /* depth of backtrack stack */
2969     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2970     const U32 max_nochange_depth =
2971         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2972         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2973     regmatch_state *yes_state = NULL; /* state to pop to on success of
2974                                                             subpattern */
2975     /* mark_state piggy backs on the yes_state logic so that when we unwind
2976        the stack on success we can update the mark_state as we go */
2977     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2978     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2979     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2980     U32 state_num;
2981     bool no_final = 0;      /* prevent failure from backtracking? */
2982     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
2983     char *startpoint = PL_reginput;
2984     SV *popmark = NULL;     /* are we looking for a mark? */
2985     SV *sv_commit = NULL;   /* last mark name seen in failure */
2986     SV *sv_yes_mark = NULL; /* last mark name we have seen
2987                                during a successfull match */
2988     U32 lastopen = 0;       /* last open we saw */
2989     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2990     SV* const oreplsv = GvSV(PL_replgv);
2991     /* these three flags are set by various ops to signal information to
2992      * the very next op. They have a useful lifetime of exactly one loop
2993      * iteration, and are not preserved or restored by state pushes/pops
2994      */
2995     bool sw = 0;            /* the condition value in (?(cond)a|b) */
2996     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
2997     int logical = 0;        /* the following EVAL is:
2998                                 0: (?{...})
2999                                 1: (?(?{...})X|Y)
3000                                 2: (??{...})
3001                                or the following IFMATCH/UNLESSM is:
3002                                 false: plain (?=foo)
3003                                 true:  used as a condition: (?(?=foo))
3004                             */
3005 #ifdef DEBUGGING
3006     GET_RE_DEBUG_FLAGS_DECL;
3007 #endif
3008
3009     PERL_ARGS_ASSERT_REGMATCH;
3010
3011     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3012             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3013     }));
3014     /* on first ever call to regmatch, allocate first slab */
3015     if (!PL_regmatch_slab) {
3016         Newx(PL_regmatch_slab, 1, regmatch_slab);
3017         PL_regmatch_slab->prev = NULL;
3018         PL_regmatch_slab->next = NULL;
3019         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3020     }
3021
3022     oldsave = PL_savestack_ix;
3023     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3024     SAVEVPTR(PL_regmatch_slab);
3025     SAVEVPTR(PL_regmatch_state);
3026
3027     /* grab next free state slot */
3028     st = ++PL_regmatch_state;
3029     if (st >  SLAB_LAST(PL_regmatch_slab))
3030         st = PL_regmatch_state = S_push_slab(aTHX);
3031
3032     /* Note that nextchr is a byte even in UTF */
3033     nextchr = UCHARAT(locinput);
3034     scan = prog;
3035     while (scan != NULL) {
3036
3037         DEBUG_EXECUTE_r( {
3038             SV * const prop = sv_newmortal();
3039             regnode *rnext=regnext(scan);
3040             DUMP_EXEC_POS( locinput, scan, utf8_target );
3041             regprop(rex, prop, scan);
3042
3043             PerlIO_printf(Perl_debug_log,
3044                     "%3"IVdf":%*s%s(%"IVdf")\n",
3045                     (IV)(scan - rexi->program), depth*2, "",
3046                     SvPVX_const(prop),
3047                     (PL_regkind[OP(scan)] == END || !rnext) ?
3048                         0 : (IV)(rnext - rexi->program));
3049         });
3050
3051         next = scan + NEXT_OFF(scan);
3052         if (next == scan)
3053             next = NULL;
3054         state_num = OP(scan);
3055
3056         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3057       reenter_switch:
3058
3059         assert(PL_reglastparen == &rex->lastparen);
3060         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3061         assert(PL_regoffs == rex->offs);
3062
3063         switch (state_num) {
3064         case BOL:
3065             if (locinput == PL_bostr)
3066             {
3067                 /* reginfo->till = reginfo->bol; */
3068                 break;
3069             }
3070             sayNO;
3071         case MBOL:
3072             if (locinput == PL_bostr ||
3073                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3074             {
3075                 break;
3076             }
3077             sayNO;
3078         case SBOL:
3079             if (locinput == PL_bostr)
3080                 break;
3081             sayNO;
3082         case GPOS:
3083             if (locinput == reginfo->ganch)
3084                 break;
3085             sayNO;
3086
3087         case KEEPS:
3088             /* update the startpoint */
3089             st->u.keeper.val = PL_regoffs[0].start;
3090             PL_reginput = locinput;
3091             PL_regoffs[0].start = locinput - PL_bostr;
3092             PUSH_STATE_GOTO(KEEPS_next, next);
3093             /*NOT-REACHED*/
3094         case KEEPS_next_fail:
3095             /* rollback the start point change */
3096             PL_regoffs[0].start = st->u.keeper.val;
3097             sayNO_SILENT;
3098             /*NOT-REACHED*/
3099         case EOL:
3100                 goto seol;
3101         case MEOL:
3102             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3103                 sayNO;
3104             break;
3105         case SEOL:
3106           seol:
3107             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3108                 sayNO;
3109             if (PL_regeol - locinput > 1)
3110                 sayNO;
3111             break;
3112         case EOS:
3113             if (PL_regeol != locinput)
3114                 sayNO;
3115             break;
3116         case SANY:
3117             if (!nextchr && locinput >= PL_regeol)
3118                 sayNO;
3119             if (utf8_target) {
3120                 locinput += PL_utf8skip[nextchr];
3121                 if (locinput > PL_regeol)
3122                     sayNO;
3123                 nextchr = UCHARAT(locinput);
3124             }
3125             else
3126                 nextchr = UCHARAT(++locinput);
3127             break;
3128         case CANY:
3129             if (!nextchr && locinput >= PL_regeol)
3130                 sayNO;
3131             nextchr = UCHARAT(++locinput);
3132             break;
3133         case REG_ANY:
3134             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3135                 sayNO;
3136             if (utf8_target) {
3137                 locinput += PL_utf8skip[nextchr];
3138                 if (locinput > PL_regeol)
3139                     sayNO;
3140                 nextchr = UCHARAT(locinput);
3141             }
3142             else
3143                 nextchr = UCHARAT(++locinput);
3144             break;
3145
3146 #undef  ST
3147 #define ST st->u.trie
3148         case TRIEC:
3149             /* In this case the charclass data is available inline so
3150                we can fail fast without a lot of extra overhead.
3151              */
3152             if (scan->flags == EXACT || !utf8_target) {
3153                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3154                     DEBUG_EXECUTE_r(
3155                         PerlIO_printf(Perl_debug_log,
3156                                   "%*s  %sfailed to match trie start class...%s\n",
3157                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3158                     );
3159                     sayNO_SILENT;
3160                     /* NOTREACHED */
3161                 }
3162             }
3163             /* FALL THROUGH */
3164         case TRIE:
3165             /* the basic plan of execution of the trie is:
3166              * At the beginning, run though all the states, and
3167              * find the longest-matching word. Also remember the position
3168              * of the shortest matching word. For example, this pattern:
3169              *    1  2 3 4    5
3170              *    ab|a|x|abcd|abc
3171              * when matched against the string "abcde", will generate
3172              * accept states for all words except 3, with the longest
3173              * matching word being 4, and the shortest being 1 (with
3174              * the position being after char 1 of the string).
3175              *
3176              * Then for each matching word, in word order (i.e. 1,2,4,5),
3177              * we run the remainder of the pattern; on each try setting
3178              * the current position to the character following the word,
3179              * returning to try the next word on failure.
3180              *
3181              * We avoid having to build a list of words at runtime by
3182              * using a compile-time structure, wordinfo[].prev, which
3183              * gives, for each word, the previous accepting word (if any).
3184              * In the case above it would contain the mappings 1->2, 2->0,
3185              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3186              * the longest word (4 above), a list of all words, by
3187              * following the list of prev pointers; this gives us the
3188              * unordered list 4,5,1,2. Then given the current word we have
3189              * just tried, we can go through the list and find the
3190              * next-biggest word to try (so if we just failed on word 2,
3191              * the next in the list is 4).
3192              *
3193              * Since at runtime we don't record the matching position in
3194              * the string for each word, we have to work that out for
3195              * each word we're about to process. The wordinfo table holds
3196              * the character length of each word; given that we recorded
3197              * at the start: the position of the shortest word and its
3198              * length in chars, we just need to move the pointer the
3199              * difference between the two char lengths. Depending on
3200              * Unicode status and folding, that's cheap or expensive.
3201              *
3202              * This algorithm is optimised for the case where are only a
3203              * small number of accept states, i.e. 0,1, or maybe 2.
3204              * With lots of accepts states, and having to try all of them,
3205              * it becomes quadratic on number of accept states to find all
3206              * the next words.
3207              */
3208
3209             {
3210                 /* what type of TRIE am I? (utf8 makes this contextual) */
3211                 DECL_TRIE_TYPE(scan);
3212
3213                 /* what trie are we using right now */
3214                 reg_trie_data * const trie
3215                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3216                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3217                 U32 state = trie->startstate;
3218
3219                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3220                     !TRIE_BITMAP_TEST(trie,*locinput)
3221                 ) {
3222                     if (trie->states[ state ].wordnum) {
3223                          DEBUG_EXECUTE_r(
3224                             PerlIO_printf(Perl_debug_log,
3225                                           "%*s  %smatched empty string...%s\n",
3226                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3227                         );
3228                         if (!trie->jump)
3229                             break;
3230                     } else {
3231                         DEBUG_EXECUTE_r(
3232                             PerlIO_printf(Perl_debug_log,
3233                                           "%*s  %sfailed to match trie start class...%s\n",
3234                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3235                         );
3236                         sayNO_SILENT;
3237                    }
3238                 }
3239
3240             {
3241                 U8 *uc = ( U8* )locinput;
3242
3243                 STRLEN len = 0;
3244                 STRLEN foldlen = 0;
3245                 U8 *uscan = (U8*)NULL;
3246                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3247                 U32 charcount = 0; /* how many input chars we have matched */
3248                 U32 accepted = 0; /* have we seen any accepting states? */
3249
3250                 ST.B = next;
3251                 ST.jump = trie->jump;
3252                 ST.me = scan;
3253                 ST.firstpos = NULL;
3254                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3255                 ST.nextword = 0;
3256
3257                 /* fully traverse the TRIE; note the position of the
3258                    shortest accept state and the wordnum of the longest
3259                    accept state */
3260
3261                 while ( state && uc <= (U8*)PL_regeol ) {
3262                     U32 base = trie->states[ state ].trans.base;
3263                     UV uvc = 0;
3264                     U16 charid = 0;
3265                     U16 wordnum;
3266                     wordnum = trie->states[ state ].wordnum;
3267
3268                     if (wordnum) { /* it's an accept state */
3269                         if (!accepted) {
3270                             accepted = 1;
3271                             /* record first match position */
3272                             if (ST.longfold) {
3273                                 ST.firstpos = (U8*)locinput;
3274                                 ST.firstchars = 0;
3275                             }
3276                             else {
3277                                 ST.firstpos = uc;
3278                                 ST.firstchars = charcount;
3279                             }
3280                         }
3281                         if (!ST.nextword || wordnum < ST.nextword)
3282                             ST.nextword = wordnum;
3283                         ST.topword = wordnum;
3284                     }
3285
3286                     DEBUG_TRIE_EXECUTE_r({
3287                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3288                                 PerlIO_printf( Perl_debug_log,
3289                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3290                                     2+depth * 2, "", PL_colors[4],
3291                                     (UV)state, (accepted ? 'Y' : 'N'));
3292                     });
3293
3294                     /* read a char and goto next state */
3295                     if ( base ) {
3296                         I32 offset;
3297                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3298                                              uscan, len, uvc, charid, foldlen,
3299                                              foldbuf, uniflags);
3300                         charcount++;
3301                         if (foldlen>0)
3302                             ST.longfold = TRUE;
3303                         if (charid &&
3304                              ( ((offset =
3305                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3306
3307                              && ((U32)offset < trie->lasttrans)
3308                              && trie->trans[offset].check == state)
3309                         {
3310                             state = trie->trans[offset].next;
3311                         }
3312                         else {
3313                             state = 0;
3314                         }
3315                         uc += len;
3316
3317                     }
3318                     else {
3319                         state = 0;
3320                     }
3321                     DEBUG_TRIE_EXECUTE_r(
3322                         PerlIO_printf( Perl_debug_log,
3323                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3324                             charid, uvc, (UV)state, PL_colors[5] );
3325                     );
3326                 }
3327                 if (!accepted)
3328                    sayNO;
3329
3330                 /* calculate total number of accept states */
3331                 {
3332                     U16 w = ST.topword;
3333                     accepted = 0;
3334                     while (w) {
3335                         w = trie->wordinfo[w].prev;
3336                         accepted++;
3337                     }
3338                     ST.accepted = accepted;
3339                 }
3340
3341                 DEBUG_EXECUTE_r(
3342                     PerlIO_printf( Perl_debug_log,
3343                         "%*s  %sgot %"IVdf" possible matches%s\n",
3344                         REPORT_CODE_OFF + depth * 2, "",
3345                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3346                 );
3347                 goto trie_first_try; /* jump into the fail handler */
3348             }}
3349             /* NOTREACHED */
3350
3351         case TRIE_next_fail: /* we failed - try next alternative */
3352             if ( ST.jump) {
3353                 REGCP_UNWIND(ST.cp);
3354                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3355                     PL_regoffs[n].end = -1;
3356                 *PL_reglastparen = n;
3357             }
3358             if (!--ST.accepted) {
3359                 DEBUG_EXECUTE_r({
3360                     PerlIO_printf( Perl_debug_log,
3361                         "%*s  %sTRIE failed...%s\n",
3362                         REPORT_CODE_OFF+depth*2, "",
3363                         PL_colors[4],
3364                         PL_colors[5] );
3365                 });
3366                 sayNO_SILENT;
3367             }
3368             {
3369                 /* Find next-highest word to process.  Note that this code
3370                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3371                 register U16 min = 0;
3372                 register U16 word;
3373                 register U16 const nextword = ST.nextword;
3374                 register reg_trie_wordinfo * const wordinfo
3375                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3376                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3377                     if (word > nextword && (!min || word < min))
3378                         min = word;
3379                 }
3380                 ST.nextword = min;
3381             }
3382
3383           trie_first_try:
3384             if (do_cutgroup) {
3385                 do_cutgroup = 0;
3386                 no_final = 0;
3387             }
3388
3389             if ( ST.jump) {
3390                 ST.lastparen = *PL_reglastparen;
3391                 REGCP_SET(ST.cp);
3392             }
3393
3394             /* find start char of end of current word */
3395             {
3396                 U32 chars; /* how many chars to skip */
3397                 U8 *uc = ST.firstpos;
3398                 reg_trie_data * const trie
3399                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3400
3401                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3402                             >=  ST.firstchars);
3403                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3404                             - ST.firstchars;
3405
3406                 if (ST.longfold) {
3407                     /* the hard option - fold each char in turn and find
3408                      * its folded length (which may be different */
3409                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3410                     STRLEN foldlen;
3411                     STRLEN len;
3412                     UV uvc;
3413                     U8 *uscan;
3414
3415                     while (chars) {
3416                         if (utf8_target) {
3417                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3418                                                     uniflags);
3419                             uc += len;
3420                         }
3421                         else {
3422                             uvc = *uc;
3423                             uc++;
3424                         }
3425                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3426                         uscan = foldbuf;
3427                         while (foldlen) {
3428                             if (!--chars)
3429                                 break;
3430                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3431                                             uniflags);
3432                             uscan += len;
3433                             foldlen -= len;
3434                         }
3435                     }
3436                 }
3437                 else {
3438                     if (utf8_target)
3439                         while (chars--)
3440                             uc += UTF8SKIP(uc);
3441                     else
3442                         uc += chars;
3443                 }
3444                 PL_reginput = (char *)uc;
3445             }
3446
3447             scan = (ST.jump && ST.jump[ST.nextword])
3448                         ? ST.me + ST.jump[ST.nextword]
3449                         : ST.B;
3450
3451             DEBUG_EXECUTE_r({
3452                 PerlIO_printf( Perl_debug_log,
3453                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3454                     REPORT_CODE_OFF+depth*2, "",
3455                     PL_colors[4],
3456                     ST.nextword,
3457                     PL_colors[5]
3458                     );
3459             });
3460
3461             if (ST.accepted > 1 || has_cutgroup) {
3462                 PUSH_STATE_GOTO(TRIE_next, scan);
3463                 /* NOTREACHED */
3464             }
3465             /* only one choice left - just continue */
3466             DEBUG_EXECUTE_r({
3467                 AV *const trie_words
3468                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3469                 SV ** const tmp = av_fetch( trie_words,
3470                     ST.nextword-1, 0 );
3471                 SV *sv= tmp ? sv_newmortal() : NULL;
3472
3473                 PerlIO_printf( Perl_debug_log,
3474                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3475                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3476                     ST.nextword,
3477                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3478                             PL_colors[0], PL_colors[1],
3479                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3480                         )
3481                     : "not compiled under -Dr",
3482                     PL_colors[5] );
3483             });
3484
3485             locinput = PL_reginput;
3486             nextchr = UCHARAT(locinput);
3487             continue; /* execute rest of RE */
3488             /* NOTREACHED */
3489 #undef  ST
3490
3491         case EXACT: {
3492             char *s = STRING(scan);
3493             ln = STR_LEN(scan);
3494             if (utf8_target != UTF_PATTERN) {
3495                 /* The target and the pattern have differing utf8ness. */
3496                 char *l = locinput;
3497                 const char * const e = s + ln;
3498
3499                 if (utf8_target) {
3500                     /* The target is utf8, the pattern is not utf8. */
3501                     while (s < e) {
3502                         STRLEN ulen;
3503                         if (l >= PL_regeol)
3504                              sayNO;
3505                         if (NATIVE_TO_UNI(*(U8*)s) !=
3506                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3507                                             uniflags))
3508                              sayNO;
3509                         l += ulen;
3510                         s ++;
3511                     }
3512                 }
3513                 else {
3514                     /* The target is not utf8, the pattern is utf8. */
3515                     while (s < e) {
3516                         STRLEN ulen;
3517                         if (l >= PL_regeol)
3518                             sayNO;
3519                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3520                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3521                                            uniflags))
3522                             sayNO;
3523                         s += ulen;
3524                         l ++;
3525                     }
3526                 }
3527                 locinput = l;
3528                 nextchr = UCHARAT(locinput);
3529                 break;
3530             }
3531             /* The target and the pattern have the same utf8ness. */
3532             /* Inline the first character, for speed. */
3533             if (UCHARAT(s) != nextchr)
3534                 sayNO;
3535             if (PL_regeol - locinput < ln)
3536                 sayNO;
3537             if (ln > 1 && memNE(s, locinput, ln))
3538                 sayNO;
3539             locinput += ln;
3540             nextchr = UCHARAT(locinput);
3541             break;
3542             }
3543         case EXACTFL:
3544             PL_reg_flags |= RF_tainted;
3545             /* FALL THROUGH */
3546         case EXACTF: {
3547             char * const s = STRING(scan);
3548             ln = STR_LEN(scan);
3549
3550             if (utf8_target || UTF_PATTERN) {
3551               /* Either target or the pattern are utf8. */
3552                 const char * const l = locinput;
3553                 char *e = PL_regeol;
3554
3555                 if (! foldEQ_utf8(s, 0,  ln, cBOOL(UTF_PATTERN),
3556                                l, &e, 0,  utf8_target)) {
3557                      /* One more case for the sharp s:
3558                       * pack("U0U*", 0xDF) =~ /ss/i,
3559                       * the 0xC3 0x9F are the UTF-8
3560                       * byte sequence for the U+00DF. */
3561
3562                      if (!(utf8_target &&
3563                            toLOWER(s[0]) == 's' &&
3564                            ln >= 2 &&
3565                            toLOWER(s[1]) == 's' &&
3566                            (U8)l[0] == 0xC3 &&
3567                            e - l >= 2 &&
3568                            (U8)l[1] == 0x9F))
3569                           sayNO;
3570                 }
3571                 locinput = e;
3572                 nextchr = UCHARAT(locinput);
3573                 break;
3574             }
3575
3576             /* Neither the target and the pattern are utf8. */
3577
3578             /* Inline the first character, for speed. */
3579             if (UCHARAT(s) != nextchr &&
3580                 UCHARAT(s) != ((OP(scan) == EXACTF)
3581                                ? PL_fold : PL_fold_locale)[nextchr])
3582                 sayNO;
3583             if (PL_regeol - locinput < ln)
3584                 sayNO;
3585             if (ln > 1 && (OP(scan) == EXACTF
3586                            ? ! foldEQ(s, locinput, ln)
3587                            : ! foldEQ_locale(s, locinput, ln)))
3588                 sayNO;
3589             locinput += ln;
3590             nextchr = UCHARAT(locinput);
3591             break;
3592             }
3593         case BOUNDL:
3594         case NBOUNDL:
3595             PL_reg_flags |= RF_tainted;
3596             /* FALL THROUGH */
3597         case BOUND:
3598         case NBOUND:
3599             /* was last char in word? */
3600             if (utf8_target) {
3601                 if (locinput == PL_bostr)
3602                     ln = '\n';
3603                 else {
3604                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3605
3606                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3607                 }
3608                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3609                     ln = isALNUM_uni(ln);
3610                     LOAD_UTF8_CHARCLASS_ALNUM();
3611                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3612                 }
3613                 else {
3614                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3615                     n = isALNUM_LC_utf8((U8*)locinput);
3616                 }
3617             }
3618             else {
3619                 ln = (locinput != PL_bostr) ?
3620                     UCHARAT(locinput - 1) : '\n';
3621                 if (FLAGS(scan) & USE_UNI) {
3622
3623                     /* Here, can't be BOUNDL or NBOUNDL because they never set
3624                      * the flags to USE_UNI */
3625                     ln = isWORDCHAR_L1(ln);
3626                     n = isWORDCHAR_L1(nextchr);
3627                 }
3628                 else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3629                     ln = isALNUM(ln);
3630                     n = isALNUM(nextchr);
3631                 }
3632                 else {
3633                     ln = isALNUM_LC(ln);
3634                     n = isALNUM_LC(nextchr);
3635                 }
3636             }
3637             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3638                                     OP(scan) == BOUNDL))
3639                     sayNO;
3640             break;
3641         case ANYOF:
3642             if (utf8_target) {
3643                 STRLEN inclasslen = PL_regeol - locinput;
3644                 if (locinput >= PL_regeol)
3645                     sayNO;
3646
3647                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3648                     goto anyof_fail;
3649                 locinput += inclasslen;
3650                 nextchr = UCHARAT(locinput);
3651                 break;
3652             }
3653             else {
3654                 if (nextchr < 0)
3655                     nextchr = UCHARAT(locinput);
3656                 if (!nextchr && locinput >= PL_regeol)
3657                     sayNO;
3658                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3659                     goto anyof_fail;
3660                 nextchr = UCHARAT(++locinput);
3661                 break;
3662             }
3663         anyof_fail:
3664             /* If we might have the case of the German sharp s
3665              * in a casefolding Unicode character class. */
3666
3667             if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3668                  locinput += SHARP_S_SKIP;
3669                  nextchr = UCHARAT(locinput);
3670             }
3671             else
3672                  sayNO;
3673             break;
3674         /* Special char classes - The defines start on line 129 or so */
3675         CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3676         CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3677
3678         CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3679         CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3680
3681         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3682         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3683
3684         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3685                        a Unicode extended Grapheme Cluster */
3686             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3687               extended Grapheme Cluster is:
3688
3689                CR LF
3690                | Prepend* Begin Extend*
3691                | .
3692
3693                Begin is (Hangul-syllable | ! Control)
3694                Extend is (Grapheme_Extend | Spacing_Mark)
3695                Control is [ GCB_Control CR LF ]
3696
3697                The discussion below shows how the code for CLUMP is derived
3698                from this regex.  Note that most of these concepts are from
3699                property values of the Grapheme Cluster Boundary (GCB) property.
3700                No code point can have multiple property values for a given
3701                property.  Thus a code point in Prepend can't be in Control, but
3702                it must be in !Control.  This is why Control above includes
3703                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3704                property separately, and so can't be in GCB_Control, even though
3705                they logically are controls.  Control is not the same as gc=cc,
3706                but includes format and other characters as well.
3707
3708                The Unicode definition of Hangul-syllable is:
3709                    L+
3710                    | (L* ( ( V | LV ) V* | LVT ) T*)
3711                    | T+
3712                   )
3713                Each of these is a value for the GCB property, and hence must be
3714                disjoint, so the order they are tested is immaterial, so the
3715                above can safely be changed to
3716                    T+
3717                    | L+
3718                    | (L* ( LVT | ( V | LV ) V*) T*)
3719
3720                The last two terms can be combined like this:
3721                    L* ( L
3722                         | (( LVT | ( V | LV ) V*) T*))
3723
3724                And refactored into this:
3725                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3726
3727                That means that if we have seen any L's at all we can quit
3728                there, but if the next character is a LVT, a V or and LV we
3729                should keep going.
3730
3731                There is a subtlety with Prepend* which showed up in testing.
3732                Note that the Begin, and only the Begin is required in:
3733                 | Prepend* Begin Extend*
3734                Also, Begin contains '! Control'.  A Prepend must be a '!
3735                Control', which means it must be a Begin.  What it comes down to
3736                is that if we match Prepend* and then find no suitable Begin
3737                afterwards, that if we backtrack the last Prepend, that one will
3738                be a suitable Begin.
3739             */
3740
3741             if (locinput >= PL_regeol)
3742                 sayNO;
3743             if  (! utf8_target) {
3744
3745                 /* Match either CR LF  or '.', as all the other possibilities
3746                  * require utf8 */
3747                 locinput++;         /* Match the . or CR */
3748                 if (nextchr == '\r'
3749                     && locinput < PL_regeol
3750                     && UCHARAT(locinput) == '\n') locinput++;
3751             }
3752             else {
3753
3754                 /* Utf8: See if is ( CR LF ); already know that locinput <
3755                  * PL_regeol, so locinput+1 is in bounds */
3756                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3757                     locinput += 2;
3758                 }
3759                 else {
3760                     /* In case have to backtrack to beginning, then match '.' */
3761                     char *starting = locinput;
3762
3763                     /* In case have to backtrack the last prepend */
3764                     char *previous_prepend = 0;
3765
3766                     LOAD_UTF8_CHARCLASS_GCB();
3767
3768                     /* Match (prepend)* */
3769                     while (locinput < PL_regeol
3770                            && swash_fetch(PL_utf8_X_prepend,
3771                                           (U8*)locinput, utf8_target))
3772                     {
3773                         previous_prepend = locinput;
3774                         locinput += UTF8SKIP(locinput);
3775                     }
3776
3777                     /* As noted above, if we matched a prepend character, but
3778                      * the next thing won't match, back off the last prepend we
3779                      * matched, as it is guaranteed to match the begin */
3780                     if (previous_prepend
3781                         && (locinput >=  PL_regeol
3782                             || ! swash_fetch(PL_utf8_X_begin,
3783                                              (U8*)locinput, utf8_target)))
3784                     {
3785                         locinput = previous_prepend;
3786                     }
3787
3788                     /* Note that here we know PL_regeol > locinput, as we
3789                      * tested that upon input to this switch case, and if we
3790                      * moved locinput forward, we tested the result just above
3791                      * and it either passed, or we backed off so that it will
3792                      * now pass */
3793                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3794
3795                         /* Here did not match the required 'Begin' in the
3796                          * second term.  So just match the very first
3797                          * character, the '.' of the final term of the regex */
3798                         locinput = starting + UTF8SKIP(starting);
3799                     } else {
3800
3801                         /* Here is the beginning of a character that can have
3802                          * an extender.  It is either a hangul syllable, or a
3803                          * non-control */
3804                         if (swash_fetch(PL_utf8_X_non_hangul,
3805                                         (U8*)locinput, utf8_target))
3806                         {
3807
3808                             /* Here not a Hangul syllable, must be a
3809                              * ('!  * Control') */
3810                             locinput += UTF8SKIP(locinput);
3811                         } else {
3812
3813                             /* Here is a Hangul syllable.  It can be composed
3814                              * of several individual characters.  One
3815                              * possibility is T+ */
3816                             if (swash_fetch(PL_utf8_X_T,
3817                                             (U8*)locinput, utf8_target))
3818                             {
3819                                 while (locinput < PL_regeol
3820                                         && swash_fetch(PL_utf8_X_T,
3821                                                         (U8*)locinput, utf8_target))
3822                                 {
3823                                     locinput += UTF8SKIP(locinput);
3824                                 }
3825                             } else {
3826
3827                                 /* Here, not T+, but is a Hangul.  That means
3828                                  * it is one of the others: L, LV, LVT or V,
3829                                  * and matches:
3830                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3831
3832                                 /* Match L*           */
3833                                 while (locinput < PL_regeol
3834                                         && swash_fetch(PL_utf8_X_L,
3835                                                         (U8*)locinput, utf8_target))
3836                                 {
3837                                     locinput += UTF8SKIP(locinput);
3838                                 }
3839
3840                                 /* Here, have exhausted L*.  If the next
3841                                  * character is not an LV, LVT nor V, it means
3842                                  * we had to have at least one L, so matches L+
3843                                  * in the original equation, we have a complete
3844                                  * hangul syllable.  Are done. */
3845
3846                                 if (locinput < PL_regeol
3847                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3848                                                     (U8*)locinput, utf8_target))
3849                                 {
3850
3851                                     /* Otherwise keep going.  Must be LV, LVT
3852                                      * or V.  See if LVT */
3853                                     if (swash_fetch(PL_utf8_X_LVT,
3854                                                     (U8*)locinput, utf8_target))
3855                                     {
3856                                         locinput += UTF8SKIP(locinput);
3857                                     } else {
3858
3859                                         /* Must be  V or LV.  Take it, then
3860                                          * match V*     */
3861                                         locinput += UTF8SKIP(locinput);
3862                                         while (locinput < PL_regeol
3863                                                 && swash_fetch(PL_utf8_X_V,
3864                                                          (U8*)locinput, utf8_target))
3865                                         {
3866                                             locinput += UTF8SKIP(locinput);
3867                                         }
3868                                     }
3869
3870                                     /* And any of LV, LVT, or V can be followed
3871                                      * by T*            */
3872                                     while (locinput < PL_regeol
3873                                            && swash_fetch(PL_utf8_X_T,
3874                                                            (U8*)locinput,
3875                                                            utf8_target))
3876                                     {
3877                                         locinput += UTF8SKIP(locinput);
3878                                     }
3879                                 }
3880                             }
3881                         }
3882
3883                         /* Match any extender */
3884                         while (locinput < PL_regeol
3885                                 && swash_fetch(PL_utf8_X_extend,
3886                                                 (U8*)locinput, utf8_target))
3887                         {
3888                             locinput += UTF8SKIP(locinput);
3889                         }
3890                     }
3891                 }
3892                 if (locinput > PL_regeol) sayNO;
3893             }
3894             nextchr = UCHARAT(locinput);
3895             break;
3896
3897         case NREFFL:
3898         {
3899             char *s;
3900             char type;
3901             PL_reg_flags |= RF_tainted;
3902             /* FALL THROUGH */
3903         case NREF:
3904         case NREFF:
3905             type = OP(scan);
3906             n = reg_check_named_buff_matched(rex,scan);
3907
3908             if ( n ) {
3909                 type = REF + ( type - NREF );
3910                 goto do_ref;
3911             } else {
3912                 sayNO;
3913             }
3914             /* unreached */
3915         case REFFL:
3916             PL_reg_flags |= RF_tainted;
3917             /* FALL THROUGH */
3918         case REF:
3919         case REFF:
3920             n = ARG(scan);  /* which paren pair */
3921             type = OP(scan);
3922           do_ref:
3923             ln = PL_regoffs[n].start;
3924             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
3925             if (*PL_reglastparen < n || ln == -1)
3926                 sayNO;                  /* Do not match unless seen CLOSEn. */
3927             if (ln == PL_regoffs[n].end)
3928                 break;
3929
3930             s = PL_bostr + ln;
3931             if (utf8_target && type != REF) {   /* REF can do byte comparison */
3932                 char *l = locinput;
3933                 const char *e = PL_bostr + PL_regoffs[n].end;
3934                 /*
3935                  * Note that we can't do the "other character" lookup trick as
3936                  * in the 8-bit case (no pun intended) because in Unicode we
3937                  * have to map both upper and title case to lower case.
3938                  */
3939                 if (type == REFF) {
3940                     while (s < e) {
3941                         STRLEN ulen1, ulen2;
3942                         U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3943                         U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3944
3945                         if (l >= PL_regeol)
3946                             sayNO;
3947                         toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3948                         toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3949                         if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3950                             sayNO;
3951                         s += ulen1;
3952                         l += ulen2;
3953                     }
3954                 }
3955                 locinput = l;
3956                 nextchr = UCHARAT(locinput);
3957                 break;
3958             }
3959
3960             /* Inline the first character, for speed. */
3961             if (UCHARAT(s) != nextchr &&
3962                 (type == REF ||
3963                  (UCHARAT(s) != (type == REFF
3964                                   ? PL_fold : PL_fold_locale)[nextchr])))
3965                 sayNO;
3966             ln = PL_regoffs[n].end - ln;
3967             if (locinput + ln > PL_regeol)
3968                 sayNO;
3969             if (ln > 1 && (type == REF
3970                            ? memNE(s, locinput, ln)
3971                            : (type == REFF
3972                               ? ! foldEQ(s, locinput, ln)
3973                               : ! foldEQ_locale(s, locinput, ln))))
3974                 sayNO;
3975             locinput += ln;
3976             nextchr = UCHARAT(locinput);
3977             break;
3978         }
3979         case NOTHING:
3980         case TAIL:
3981             break;
3982         case BACK:
3983             break;
3984
3985 #undef  ST
3986 #define ST st->u.eval
3987         {
3988             SV *ret;
3989             REGEXP *re_sv;
3990             regexp *re;
3991             regexp_internal *rei;
3992             regnode *startpoint;
3993
3994         case GOSTART:
3995         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
3996             if (cur_eval && cur_eval->locinput==locinput) {
3997                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3998                     Perl_croak(aTHX_ "Infinite recursion in regex");
3999                 if ( ++nochange_depth > max_nochange_depth )
4000                     Perl_croak(aTHX_
4001                         "Pattern subroutine nesting without pos change"
4002                         " exceeded limit in regex");
4003             } else {
4004                 nochange_depth = 0;
4005             }
4006             re_sv = rex_sv;
4007             re = rex;
4008             rei = rexi;
4009             (void)ReREFCNT_inc(rex_sv);
4010             if (OP(scan)==GOSUB) {
4011                 startpoint = scan + ARG2L(scan);
4012                 ST.close_paren = ARG(scan);
4013             } else {
4014                 startpoint = rei->program+1;
4015                 ST.close_paren = 0;
4016             }
4017             goto eval_recurse_doit;
4018             /* NOTREACHED */
4019         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4020             if (cur_eval && cur_eval->locinput==locinput) {
4021                 if ( ++nochange_depth > max_nochange_depth )
4022                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4023             } else {
4024                 nochange_depth = 0;
4025             }
4026             {
4027                 /* execute the code in the {...} */
4028                 dSP;
4029                 SV ** const before = SP;
4030                 OP_4tree * const oop = PL_op;
4031                 COP * const ocurcop = PL_curcop;
4032                 PAD *old_comppad;
4033                 char *saved_regeol = PL_regeol;
4034                 struct re_save_state saved_state;
4035
4036                 /* To not corrupt the existing regex state while executing the
4037                  * eval we would normally put it on the save stack, like with
4038                  * save_re_context. However, re-evals have a weird scoping so we
4039                  * can't just add ENTER/LEAVE here. With that, things like
4040                  *
4041                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4042                  *
4043                  * would break, as they expect the localisation to be unwound
4044                  * only when the re-engine backtracks through the bit that
4045                  * localised it.
4046                  *
4047                  * What we do instead is just saving the state in a local c
4048                  * variable.
4049                  */
4050                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4051
4052                 n = ARG(scan);
4053                 PL_op = (OP_4tree*)rexi->data->data[n];
4054                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4055                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4056                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4057                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4058
4059                 if (sv_yes_mark) {
4060                     SV *sv_mrk = get_sv("REGMARK", 1);
4061                     sv_setsv(sv_mrk, sv_yes_mark);
4062                 }
4063
4064                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4065                 SPAGAIN;
4066                 if (SP == before)
4067                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4068                 else {
4069                     ret = POPs;
4070                     PUTBACK;
4071                 }
4072
4073                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4074
4075                 PL_op = oop;
4076                 PAD_RESTORE_LOCAL(old_comppad);
4077                 PL_curcop = ocurcop;
4078                 PL_regeol = saved_regeol;
4079                 if (!logical) {
4080                     /* /(?{...})/ */
4081                     sv_setsv(save_scalar(PL_replgv), ret);
4082                     break;
4083                 }
4084             }
4085             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4086                 logical = 0;
4087                 {
4088                     /* extract RE object from returned value; compiling if
4089                      * necessary */
4090                     MAGIC *mg = NULL;
4091                     REGEXP *rx = NULL;
4092
4093                     if (SvROK(ret)) {
4094                         SV *const sv = SvRV(ret);
4095
4096                         if (SvTYPE(sv) == SVt_REGEXP) {
4097                             rx = (REGEXP*) sv;
4098                         } else if (SvSMAGICAL(sv)) {
4099                             mg = mg_find(sv, PERL_MAGIC_qr);
4100                             assert(mg);
4101                         }
4102                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4103                         rx = (REGEXP*) ret;
4104                     } else if (SvSMAGICAL(ret)) {
4105                         if (SvGMAGICAL(ret)) {
4106                             /* I don't believe that there is ever qr magic
4107                                here.  */
4108                             assert(!mg_find(ret, PERL_MAGIC_qr));
4109                             sv_unmagic(ret, PERL_MAGIC_qr);
4110                         }
4111                         else {
4112                             mg = mg_find(ret, PERL_MAGIC_qr);
4113                             /* testing suggests mg only ends up non-NULL for
4114                                scalars who were upgraded and compiled in the
4115                                else block below. In turn, this is only
4116                                triggered in the "postponed utf8 string" tests
4117                                in t/op/pat.t  */
4118                         }
4119                     }
4120
4121                     if (mg) {
4122                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4123                         assert(rx);
4124                     }
4125                     if (rx) {
4126                         rx = reg_temp_copy(NULL, rx);
4127                     }
4128                     else {
4129                         U32 pm_flags = 0;
4130                         const I32 osize = PL_regsize;
4131
4132                         if (DO_UTF8(ret)) {
4133                             assert (SvUTF8(ret));
4134                         } else if (SvUTF8(ret)) {
4135                             /* Not doing UTF-8, despite what the SV says. Is
4136                                this only if we're trapped in use 'bytes'?  */
4137                             /* Make a copy of the octet sequence, but without
4138                                the flag on, as the compiler now honours the
4139                                SvUTF8 flag on ret.  */
4140                             STRLEN len;
4141                             const char *const p = SvPV(ret, len);
4142                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4143                         }
4144                         rx = CALLREGCOMP(ret, pm_flags);
4145                         if (!(SvFLAGS(ret)
4146                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4147                                  | SVs_GMG))) {
4148                             /* This isn't a first class regexp. Instead, it's
4149                                caching a regexp onto an existing, Perl visible
4150                                scalar.  */
4151                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4152                         }
4153                         PL_regsize = osize;
4154                     }
4155                     re_sv = rx;
4156                     re = (struct regexp *)SvANY(rx);
4157                 }
4158                 RXp_MATCH_COPIED_off(re);
4159                 re->subbeg = rex->subbeg;
4160                 re->sublen = rex->sublen;
4161                 rei = RXi_GET(re);
4162                 DEBUG_EXECUTE_r(
4163                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4164                         "Matching embedded");
4165                 );
4166                 startpoint = rei->program + 1;
4167                 ST.close_paren = 0; /* only used for GOSUB */
4168                 /* borrowed from regtry */
4169                 if (PL_reg_start_tmpl <= re->nparens) {
4170                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4171                     if(PL_reg_start_tmp)
4172                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4173                     else
4174                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4175                 }
4176
4177         eval_recurse_doit: /* Share code with GOSUB below this line */
4178                 /* run the pattern returned from (??{...}) */
4179                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4180                 REGCP_SET(ST.lastcp);
4181
4182                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4183
4184                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4185                 PL_reglastparen = &re->lastparen;
4186                 PL_reglastcloseparen = &re->lastcloseparen;
4187                 re->lastparen = 0;
4188                 re->lastcloseparen = 0;
4189
4190                 PL_reginput = locinput;
4191                 PL_regsize = 0;
4192
4193                 /* XXXX This is too dramatic a measure... */
4194                 PL_reg_maxiter = 0;
4195
4196                 ST.toggle_reg_flags = PL_reg_flags;
4197                 if (RX_UTF8(re_sv))
4198                     PL_reg_flags |= RF_utf8;
4199                 else
4200                     PL_reg_flags &= ~RF_utf8;
4201                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4202
4203                 ST.prev_rex = rex_sv;
4204                 ST.prev_curlyx = cur_curlyx;
4205                 SETREX(rex_sv,re_sv);
4206                 rex = re;
4207                 rexi = rei;
4208                 cur_curlyx = NULL;
4209                 ST.B = next;
4210                 ST.prev_eval = cur_eval;
4211                 cur_eval = st;
4212                 /* now continue from first node in postoned RE */
4213                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4214                 /* NOTREACHED */
4215             }
4216             /* logical is 1,   /(?(?{...})X|Y)/ */
4217             sw = cBOOL(SvTRUE(ret));
4218             logical = 0;
4219             break;
4220         }
4221
4222         case EVAL_AB: /* cleanup after a successful (??{A})B */
4223             /* note: this is called twice; first after popping B, then A */
4224             PL_reg_flags ^= ST.toggle_reg_flags;
4225             ReREFCNT_dec(rex_sv);
4226             SETREX(rex_sv,ST.prev_rex);
4227             rex = (struct regexp *)SvANY(rex_sv);
4228             rexi = RXi_GET(rex);
4229             regcpblow(ST.cp);
4230             cur_eval = ST.prev_eval;
4231             cur_curlyx = ST.prev_curlyx;
4232
4233             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4234             PL_reglastparen = &rex->lastparen;
4235             PL_reglastcloseparen = &rex->lastcloseparen;
4236             /* also update PL_regoffs */
4237             PL_regoffs = rex->offs;
4238
4239             /* XXXX This is too dramatic a measure... */
4240             PL_reg_maxiter = 0;
4241             if ( nochange_depth )
4242                 nochange_depth--;
4243             sayYES;
4244
4245
4246         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4247             /* note: this is called twice; first after popping B, then A */
4248             PL_reg_flags ^= ST.toggle_reg_flags;
4249             ReREFCNT_dec(rex_sv);
4250             SETREX(rex_sv,ST.prev_rex);
4251             rex = (struct regexp *)SvANY(rex_sv);
4252             rexi = RXi_GET(rex);
4253             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4254             PL_reglastparen = &rex->lastparen;
4255             PL_reglastcloseparen = &rex->lastcloseparen;
4256
4257             PL_reginput = locinput;
4258             REGCP_UNWIND(ST.lastcp);
4259             regcppop(rex);
4260             cur_eval = ST.prev_eval;
4261             cur_curlyx = ST.prev_curlyx;
4262             /* XXXX This is too dramatic a measure... */
4263             PL_reg_maxiter = 0;
4264             if ( nochange_depth )
4265                 nochange_depth--;
4266             sayNO_SILENT;
4267 #undef ST
4268
4269         case OPEN:
4270             n = ARG(scan);  /* which paren pair */
4271             PL_reg_start_tmp[n] = locinput;
4272             if (n > PL_regsize)
4273                 PL_regsize = n;
4274             lastopen = n;
4275             break;
4276         case CLOSE:
4277             n = ARG(scan);  /* which paren pair */
4278             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4279             PL_regoffs[n].end = locinput - PL_bostr;
4280             /*if (n > PL_regsize)
4281                 PL_regsize = n;*/
4282             if (n > *PL_reglastparen)
4283                 *PL_reglastparen = n;
4284             *PL_reglastcloseparen = n;
4285             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4286                 goto fake_end;
4287             }
4288             break;
4289         case ACCEPT:
4290             if (ARG(scan)){
4291                 regnode *cursor;
4292                 for (cursor=scan;
4293                      cursor && OP(cursor)!=END;
4294                      cursor=regnext(cursor))
4295                 {
4296                     if ( OP(cursor)==CLOSE ){
4297                         n = ARG(cursor);
4298                         if ( n <= lastopen ) {
4299                             PL_regoffs[n].start
4300                                 = PL_reg_start_tmp[n] - PL_bostr;
4301                             PL_regoffs[n].end = locinput - PL_bostr;
4302                             /*if (n > PL_regsize)
4303                             PL_regsize = n;*/
4304                             if (n > *PL_reglastparen)
4305                                 *PL_reglastparen = n;
4306                             *PL_reglastcloseparen = n;
4307                             if ( n == ARG(scan) || (cur_eval &&
4308                                 cur_eval->u.eval.close_paren == n))
4309                                 break;
4310                         }
4311                     }
4312                 }
4313             }
4314             goto fake_end;
4315             /*NOTREACHED*/
4316         case GROUPP:
4317             n = ARG(scan);  /* which paren pair */
4318             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4319             break;
4320         case NGROUPP:
4321             /* reg_check_named_buff_matched returns 0 for no match */
4322             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4323             break;
4324         case INSUBP:
4325             n = ARG(scan);
4326             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4327             break;
4328         case DEFINEP:
4329             sw = 0;
4330             break;
4331         case IFTHEN:
4332             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4333             if (sw)
4334                 next = NEXTOPER(NEXTOPER(scan));
4335             else {
4336                 next = scan + ARG(scan);
4337                 if (OP(next) == IFTHEN) /* Fake one. */
4338                     next = NEXTOPER(NEXTOPER(next));
4339             }
4340             break;
4341         case LOGICAL:
4342             logical = scan->flags;
4343             break;
4344
4345 /*******************************************************************
4346
4347 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4348 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4349 STAR/PLUS/CURLY/CURLYN are used instead.)
4350
4351 A*B is compiled as <CURLYX><A><WHILEM><B>
4352
4353 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4354 state, which contains the current count, initialised to -1. It also sets
4355 cur_curlyx to point to this state, with any previous value saved in the
4356 state block.
4357
4358 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4359 since the pattern may possibly match zero times (i.e. it's a while {} loop
4360 rather than a do {} while loop).
4361
4362 Each entry to WHILEM represents a successful match of A. The count in the
4363 CURLYX block is incremented, another WHILEM state is pushed, and execution
4364 passes to A or B depending on greediness and the current count.
4365
4366 For example, if matching against the string a1a2a3b (where the aN are
4367 substrings that match /A/), then the match progresses as follows: (the
4368 pushed states are interspersed with the bits of strings matched so far):
4369
4370     <CURLYX cnt=-1>
4371     <CURLYX cnt=0><WHILEM>
4372     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4373     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4374     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4375     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4376
4377 (Contrast this with something like CURLYM, which maintains only a single
4378 backtrack state:
4379
4380     <CURLYM cnt=0> a1
4381     a1 <CURLYM cnt=1> a2
4382     a1 a2 <CURLYM cnt=2> a3
4383     a1 a2 a3 <CURLYM cnt=3> b
4384 )
4385
4386 Each WHILEM state block marks a point to backtrack to upon partial failure
4387 of A or B, and also contains some minor state data related to that
4388 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4389 overall state, such as the count, and pointers to the A and B ops.
4390
4391 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4392 must always point to the *current* CURLYX block, the rules are:
4393
4394 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4395 and set cur_curlyx to point the new block.
4396
4397 When popping the CURLYX block after a successful or unsuccessful match,
4398 restore the previous cur_curlyx.
4399
4400 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4401 to the outer one saved in the CURLYX block.
4402
4403 When popping the WHILEM block after a successful or unsuccessful B match,
4404 restore the previous cur_curlyx.
4405
4406 Here's an example for the pattern (AI* BI)*BO
4407 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4408
4409 cur_
4410 curlyx backtrack stack
4411 ------ ---------------
4412 NULL
4413 CO     <CO prev=NULL> <WO>
4414 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4415 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4416 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4417
4418 At this point the pattern succeeds, and we work back down the stack to
4419 clean up, restoring as we go:
4420
4421 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4422 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4423 CO     <CO prev=NULL> <WO>
4424 NULL
4425
4426 *******************************************************************/
4427
4428 #define ST st->u.curlyx
4429
4430         case CURLYX:    /* start of /A*B/  (for complex A) */
4431         {
4432             /* No need to save/restore up to this paren */
4433             I32 parenfloor = scan->flags;
4434
4435             assert(next); /* keep Coverity happy */
4436             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4437                 next += ARG(next);
4438
4439             /* XXXX Probably it is better to teach regpush to support
4440                parenfloor > PL_regsize... */
4441             if (parenfloor > (I32)*PL_reglastparen)
4442                 parenfloor = *PL_reglastparen; /* Pessimization... */
4443
4444             ST.prev_curlyx= cur_curlyx;
4445             cur_curlyx = st;
4446             ST.cp = PL_savestack_ix;
4447
4448             /* these fields contain the state of the current curly.
4449              * they are accessed by subsequent WHILEMs */
4450             ST.parenfloor = parenfloor;
4451             ST.me = scan;
4452             ST.B = next;
4453             ST.minmod = minmod;
4454             minmod = 0;
4455             ST.count = -1;      /* this will be updated by WHILEM */
4456             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4457
4458             PL_reginput = locinput;
4459             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4460             /* NOTREACHED */
4461         }
4462
4463         case CURLYX_end: /* just finished matching all of A*B */
4464             cur_curlyx = ST.prev_curlyx;
4465             sayYES;
4466             /* NOTREACHED */
4467
4468         case CURLYX_end_fail: /* just failed to match all of A*B */
4469             regcpblow(ST.cp);
4470             cur_curlyx = ST.prev_curlyx;
4471             sayNO;
4472             /* NOTREACHED */
4473
4474
4475 #undef ST
4476 #define ST st->u.whilem
4477
4478         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4479         {
4480             /* see the discussion above about CURLYX/WHILEM */
4481             I32 n;
4482             int min = ARG1(cur_curlyx->u.curlyx.me);
4483             int max = ARG2(cur_curlyx->u.curlyx.me);
4484             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4485
4486             assert(cur_curlyx); /* keep Coverity happy */
4487             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4488             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4489             ST.cache_offset = 0;
4490             ST.cache_mask = 0;
4491
4492             PL_reginput = locinput;
4493
4494             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4495                   "%*s  whilem: matched %ld out of %d..%d\n",
4496                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4497             );
4498
4499             /* First just match a string of min A's. */
4500
4501             if (n < min) {
4502                 cur_curlyx->u.curlyx.lastloc = locinput;
4503                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4504                 /* NOTREACHED */
4505             }
4506
4507             /* If degenerate A matches "", assume A done. */
4508
4509             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4510                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4511                    "%*s  whilem: empty match detected, trying continuation...\n",
4512                    REPORT_CODE_OFF+depth*2, "")
4513                 );
4514                 goto do_whilem_B_max;
4515             }
4516
4517             /* super-linear cache processing */
4518
4519             if (scan->flags) {
4520
4521                 if (!PL_reg_maxiter) {
4522                     /* start the countdown: Postpone detection until we
4523                      * know the match is not *that* much linear. */
4524                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4525                     /* possible overflow for long strings and many CURLYX's */
4526                     if (PL_reg_maxiter < 0)
4527                         PL_reg_maxiter = I32_MAX;
4528                     PL_reg_leftiter = PL_reg_maxiter;
4529                 }
4530
4531                 if (PL_reg_leftiter-- == 0) {
4532                     /* initialise cache */
4533                     const I32 size = (PL_reg_maxiter + 7)/8;
4534                     if (PL_reg_poscache) {
4535                         if ((I32)PL_reg_poscache_size < size) {
4536                             Renew(PL_reg_poscache, size, char);
4537                             PL_reg_poscache_size = size;
4538                         }
4539                         Zero(PL_reg_poscache, size, char);
4540                     }
4541                     else {
4542                         PL_reg_poscache_size = size;
4543                         Newxz(PL_reg_poscache, size, char);
4544                     }
4545                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4546       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4547                               PL_colors[4], PL_colors[5])
4548                     );
4549                 }
4550
4551                 if (PL_reg_leftiter < 0) {
4552                     /* have we already failed at this position? */
4553                     I32 offset, mask;
4554                     offset  = (scan->flags & 0xf) - 1
4555                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4556                     mask    = 1 << (offset % 8);
4557                     offset /= 8;
4558                     if (PL_reg_poscache[offset] & mask) {
4559                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4560                             "%*s  whilem: (cache) already tried at this position...\n",
4561                             REPORT_CODE_OFF+depth*2, "")
4562                         );
4563                         sayNO; /* cache records failure */
4564                     }
4565                     ST.cache_offset = offset;
4566                     ST.cache_mask   = mask;
4567                 }
4568             }
4569
4570             /* Prefer B over A for minimal matching. */
4571
4572             if (cur_curlyx->u.curlyx.minmod) {
4573                 ST.save_curlyx = cur_curlyx;
4574                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4575                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4576                 REGCP_SET(ST.lastcp);
4577                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4578                 /* NOTREACHED */
4579             }
4580
4581             /* Prefer A over B for maximal matching. */
4582
4583             if (n < max) { /* More greed allowed? */
4584                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4585                 cur_curlyx->u.curlyx.lastloc = locinput;
4586                 REGCP_SET(ST.lastcp);
4587                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4588                 /* NOTREACHED */
4589             }
4590             goto do_whilem_B_max;
4591         }
4592         /* NOTREACHED */
4593
4594         case WHILEM_B_min: /* just matched B in a minimal match */
4595         case WHILEM_B_max: /* just matched B in a maximal match */
4596             cur_curlyx = ST.save_curlyx;
4597             sayYES;
4598             /* NOTREACHED */
4599
4600         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4601             cur_curlyx = ST.save_curlyx;
4602             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4603             cur_curlyx->u.curlyx.count--;
4604             CACHEsayNO;
4605             /* NOTREACHED */
4606
4607         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4608             REGCP_UNWIND(ST.lastcp);
4609             regcppop(rex);
4610             /* FALL THROUGH */
4611         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4612             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4613             cur_curlyx->u.curlyx.count--;
4614             CACHEsayNO;
4615             /* NOTREACHED */
4616
4617         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4618             REGCP_UNWIND(ST.lastcp);
4619             regcppop(rex);      /* Restore some previous $<digit>s? */
4620             PL_reginput = locinput;
4621             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4622                 "%*s  whilem: failed, trying continuation...\n",
4623                 REPORT_CODE_OFF+depth*2, "")
4624             );
4625           do_whilem_B_max:
4626             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4627                 && ckWARN(WARN_REGEXP)
4628                 && !(PL_reg_flags & RF_warned))
4629             {
4630                 PL_reg_flags |= RF_warned;
4631                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4632                      "Complex regular subexpression recursion",
4633                      REG_INFTY - 1);
4634             }
4635
4636             /* now try B */
4637             ST.save_curlyx = cur_curlyx;
4638             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4639             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4640             /* NOTREACHED */
4641
4642         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4643             cur_curlyx = ST.save_curlyx;
4644             REGCP_UNWIND(ST.lastcp);
4645             regcppop(rex);
4646
4647             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4648                 /* Maximum greed exceeded */
4649                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4650                     && ckWARN(WARN_REGEXP)
4651                     && !(PL_reg_flags & RF_warned))
4652                 {
4653                     PL_reg_flags |= RF_warned;
4654                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4655                         "%s limit (%d) exceeded",
4656                         "Complex regular subexpression recursion",
4657                         REG_INFTY - 1);
4658                 }
4659                 cur_curlyx->u.curlyx.count--;
4660                 CACHEsayNO;
4661             }
4662
4663             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4664                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4665             );
4666             /* Try grabbing another A and see if it helps. */
4667             PL_reginput = locinput;
4668             cur_curlyx->u.curlyx.lastloc = locinput;
4669             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4670             REGCP_SET(ST.lastcp);
4671             PUSH_STATE_GOTO(WHILEM_A_min,
4672                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4673             /* NOTREACHED */
4674
4675 #undef  ST
4676 #define ST st->u.branch
4677
4678         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4679             next = scan + ARG(scan);
4680             if (next == scan)
4681                 next = NULL;
4682             scan = NEXTOPER(scan);
4683             /* FALL THROUGH */
4684
4685         case BRANCH:        /*  /(...|A|...)/ */
4686             scan = NEXTOPER(scan); /* scan now points to inner node */
4687             ST.lastparen = *PL_reglastparen;
4688             ST.next_branch = next;
4689             REGCP_SET(ST.cp);
4690             PL_reginput = locinput;
4691
4692             /* Now go into the branch */
4693             if (has_cutgroup) {
4694                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4695             } else {
4696                 PUSH_STATE_GOTO(BRANCH_next, scan);
4697             }
4698             /* NOTREACHED */
4699         case CUTGROUP:
4700             PL_reginput = locinput;
4701             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4702                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4703             PUSH_STATE_GOTO(CUTGROUP_next,next);
4704             /* NOTREACHED */
4705         case CUTGROUP_next_fail:
4706             do_cutgroup = 1;
4707             no_final = 1;
4708             if (st->u.mark.mark_name)
4709                 sv_commit = st->u.mark.mark_name;
4710             sayNO;
4711             /* NOTREACHED */
4712         case BRANCH_next:
4713             sayYES;
4714             /* NOTREACHED */
4715         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4716             if (do_cutgroup) {
4717                 do_cutgroup = 0;
4718                 no_final = 0;
4719             }
4720             REGCP_UNWIND(ST.cp);
4721             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4722                 PL_regoffs[n].end = -1;
4723             *PL_reglastparen = n;
4724             /*dmq: *PL_reglastcloseparen = n; */
4725             scan = ST.next_branch;
4726             /* no more branches? */
4727             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4728                 DEBUG_EXECUTE_r({
4729                     PerlIO_printf( Perl_debug_log,
4730                         "%*s  %sBRANCH failed...%s\n",
4731                         REPORT_CODE_OFF+depth*2, "",
4732                         PL_colors[4],
4733                         PL_colors[5] );
4734                 });
4735                 sayNO_SILENT;
4736             }
4737             continue; /* execute next BRANCH[J] op */
4738             /* NOTREACHED */
4739
4740         case MINMOD:
4741             minmod = 1;
4742             break;
4743
4744 #undef  ST
4745 #define ST st->u.curlym
4746
4747         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4748
4749             /* This is an optimisation of CURLYX that enables us to push
4750              * only a single backtracking state, no matter how many matches
4751              * there are in {m,n}. It relies on the pattern being constant
4752              * length, with no parens to influence future backrefs
4753              */
4754
4755             ST.me = scan;
4756             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4757
4758             /* if paren positive, emulate an OPEN/CLOSE around A */
4759             if (ST.me->flags) {
4760                 U32 paren = ST.me->flags;
4761                 if (paren > PL_regsize)
4762                     PL_regsize = paren;
4763                 if (paren > *PL_reglastparen)
4764                     *PL_reglastparen = paren;
4765                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4766             }
4767             ST.A = scan;
4768             ST.B = next;
4769             ST.alen = 0;
4770             ST.count = 0;
4771             ST.minmod = minmod;
4772             minmod = 0;
4773             ST.c1 = CHRTEST_UNINIT;
4774             REGCP_SET(ST.cp);
4775
4776             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4777                 goto curlym_do_B;
4778
4779           curlym_do_A: /* execute the A in /A{m,n}B/  */
4780             PL_reginput = locinput;
4781             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4782             /* NOTREACHED */
4783
4784         case CURLYM_A: /* we've just matched an A */
4785             locinput = st->locinput;
4786             nextchr = UCHARAT(locinput);
4787
4788             ST.count++;
4789             /* after first match, determine A's length: u.curlym.alen */
4790             if (ST.count == 1) {
4791                 if (PL_reg_match_utf8) {
4792                     char *s = locinput;
4793                     while (s < PL_reginput) {
4794                         ST.alen++;
4795                         s += UTF8SKIP(s);
4796                     }
4797                 }
4798                 else {
4799                     ST.alen = PL_reginput - locinput;
4800                 }
4801                 if (ST.alen == 0)
4802                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4803             }
4804             DEBUG_EXECUTE_r(
4805                 PerlIO_printf(Perl_debug_log,
4806                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4807                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4808                           (IV) ST.count, (IV)ST.alen)
4809             );
4810
4811             locinput = PL_reginput;
4812
4813             if (cur_eval && cur_eval->u.eval.close_paren &&
4814                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4815                 goto fake_end;
4816
4817             {
4818                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4819                 if ( max == REG_INFTY || ST.count < max )
4820                     goto curlym_do_A; /* try to match another A */
4821             }
4822             goto curlym_do_B; /* try to match B */
4823
4824         case CURLYM_A_fail: /* just failed to match an A */
4825             REGCP_UNWIND(ST.cp);
4826
4827             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4828                 || (cur_eval && cur_eval->u.eval.close_paren &&
4829                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4830                 sayNO;
4831
4832           curlym_do_B: /* execute the B in /A{m,n}B/  */
4833             PL_reginput = locinput;
4834             if (ST.c1 == CHRTEST_UNINIT) {
4835                 /* calculate c1 and c2 for possible match of 1st char
4836                  * following curly */
4837                 ST.c1 = ST.c2 = CHRTEST_VOID;
4838                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4839                     regnode *text_node = ST.B;
4840                     if (! HAS_TEXT(text_node))
4841                         FIND_NEXT_IMPT(text_node);
4842                     /* this used to be
4843
4844                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4845
4846                         But the former is redundant in light of the latter.
4847
4848                         if this changes back then the macro for
4849                         IS_TEXT and friends need to change.
4850                      */
4851                     if (PL_regkind[OP(text_node)] == EXACT)
4852                     {
4853
4854                         ST.c1 = (U8)*STRING(text_node);
4855                         ST.c2 =
4856                             (IS_TEXTF(text_node))
4857                             ? PL_fold[ST.c1]
4858                             : (IS_TEXTFL(text_node))
4859                                 ? PL_fold_locale[ST.c1]
4860                                 : ST.c1;
4861                     }
4862                 }
4863             }
4864
4865             DEBUG_EXECUTE_r(
4866                 PerlIO_printf(Perl_debug_log,
4867                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4868                     (int)(REPORT_CODE_OFF+(depth*2)),
4869                     "", (IV)ST.count)
4870                 );
4871             if (ST.c1 != CHRTEST_VOID
4872                     && UCHARAT(PL_reginput) != ST.c1
4873                     && UCHARAT(PL_reginput) != ST.c2)
4874             {
4875                 /* simulate B failing */
4876                 DEBUG_OPTIMISE_r(
4877                     PerlIO_printf(Perl_debug_log,
4878                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4879                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4880                         (IV)ST.c1,(IV)ST.c2
4881                 ));
4882                 state_num = CURLYM_B_fail;
4883                 goto reenter_switch;
4884             }
4885
4886             if (ST.me->flags) {
4887                 /* mark current A as captured */
4888                 I32 paren = ST.me->flags;
4889                 if (ST.count) {
4890                     PL_regoffs[paren].start
4891                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4892                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4893                     /*dmq: *PL_reglastcloseparen = paren; */
4894                 }
4895                 else
4896                     PL_regoffs[paren].end = -1;
4897                 if (cur_eval && cur_eval->u.eval.close_paren &&
4898                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4899                 {
4900                     if (ST.count)
4901                         goto fake_end;
4902                     else
4903                         sayNO;
4904                 }
4905             }
4906
4907             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4908             /* NOTREACHED */
4909
4910         case CURLYM_B_fail: /* just failed to match a B */
4911             REGCP_UNWIND(ST.cp);
4912             if (ST.minmod) {
4913                 I32 max = ARG2(ST.me);
4914                 if (max != REG_INFTY && ST.count == max)
4915                     sayNO;
4916                 goto curlym_do_A; /* try to match a further A */
4917             }
4918             /* backtrack one A */
4919             if (ST.count == ARG1(ST.me) /* min */)
4920                 sayNO;
4921             ST.count--;
4922             locinput = HOPc(locinput, -ST.alen);
4923             goto curlym_do_B; /* try to match B */
4924
4925 #undef ST
4926 #define ST st->u.curly
4927
4928 #define CURLY_SETPAREN(paren, success) \
4929     if (paren) { \
4930         if (success) { \
4931             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4932             PL_regoffs[paren].end = locinput - PL_bostr; \
4933             *PL_reglastcloseparen = paren; \
4934         } \
4935         else \
4936             PL_regoffs[paren].end = -1; \
4937     }
4938
4939         case STAR:              /*  /A*B/ where A is width 1 */
4940             ST.paren = 0;
4941             ST.min = 0;
4942             ST.max = REG_INFTY;
4943             scan = NEXTOPER(scan);
4944             goto repeat;
4945         case PLUS:              /*  /A+B/ where A is width 1 */
4946             ST.paren = 0;
4947             ST.min = 1;
4948             ST.max = REG_INFTY;
4949             scan = NEXTOPER(scan);
4950             goto repeat;
4951         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
4952             ST.paren = scan->flags;     /* Which paren to set */
4953             if (ST.paren > PL_regsize)
4954                 PL_regsize = ST.paren;
4955             if (ST.paren > *PL_reglastparen)
4956                 *PL_reglastparen = ST.paren;
4957             ST.min = ARG1(scan);  /* min to match */
4958             ST.max = ARG2(scan);  /* max to match */
4959             if (cur_eval && cur_eval->u.eval.close_paren &&
4960                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4961                 ST.min=1;
4962                 ST.max=1;
4963             }
4964             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4965             goto repeat;
4966         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
4967             ST.paren = 0;
4968             ST.min = ARG1(scan);  /* min to match */
4969             ST.max = ARG2(scan);  /* max to match */
4970             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4971           repeat:
4972             /*
4973             * Lookahead to avoid useless match attempts
4974             * when we know what character comes next.
4975             *
4976             * Used to only do .*x and .*?x, but now it allows
4977             * for )'s, ('s and (?{ ... })'s to be in the way
4978             * of the quantifier and the EXACT-like node.  -- japhy
4979             */
4980
4981             if (ST.min > ST.max) /* XXX make this a compile-time check? */
4982                 sayNO;
4983             if (HAS_TEXT(next) || JUMPABLE(next)) {
4984                 U8 *s;
4985                 regnode *text_node = next;
4986
4987                 if (! HAS_TEXT(text_node))
4988                     FIND_NEXT_IMPT(text_node);
4989
4990                 if (! HAS_TEXT(text_node))
4991                     ST.c1 = ST.c2 = CHRTEST_VOID;
4992                 else {
4993                     if ( PL_regkind[OP(text_node)] != EXACT ) {
4994                         ST.c1 = ST.c2 = CHRTEST_VOID;
4995                         goto assume_ok_easy;
4996                     }
4997                     else
4998                         s = (U8*)STRING(text_node);
4999
5000                     /*  Currently we only get here when
5001
5002                         PL_rekind[OP(text_node)] == EXACT
5003
5004                         if this changes back then the macro for IS_TEXT and
5005                         friends need to change. */
5006                     if (!UTF_PATTERN) {
5007                         ST.c2 = ST.c1 = *s;
5008                         if (IS_TEXTF(text_node))
5009                             ST.c2 = PL_fold[ST.c1];
5010                         else if (IS_TEXTFL(text_node))
5011                             ST.c2 = PL_fold_locale[ST.c1];
5012                     }
5013                     else { /* UTF_PATTERN */
5014                         if (IS_TEXTF(text_node)) {
5015                              STRLEN ulen1, ulen2;
5016                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5017                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5018
5019                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5020                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5021 #ifdef EBCDIC
5022                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5023                                                     ckWARN(WARN_UTF8) ?
5024                                                     0 : UTF8_ALLOW_ANY);
5025                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5026                                                     ckWARN(WARN_UTF8) ?
5027                                                     0 : UTF8_ALLOW_ANY);
5028 #else
5029                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5030                                                     uniflags);
5031                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5032                                                     uniflags);
5033 #endif
5034                         }
5035                         else {
5036                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5037                                                      uniflags);
5038                         }
5039                     }
5040                 }
5041             }
5042             else
5043                 ST.c1 = ST.c2 = CHRTEST_VOID;
5044         assume_ok_easy:
5045
5046             ST.A = scan;
5047             ST.B = next;
5048             PL_reginput = locinput;
5049             if (minmod) {
5050                 minmod = 0;
5051                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5052                     sayNO;
5053                 ST.count = ST.min;
5054                 locinput = PL_reginput;
5055                 REGCP_SET(ST.cp);
5056                 if (ST.c1 == CHRTEST_VOID)
5057                     goto curly_try_B_min;
5058
5059                 ST.oldloc = locinput;
5060
5061                 /* set ST.maxpos to the furthest point along the
5062                  * string that could possibly match */
5063                 if  (ST.max == REG_INFTY) {
5064                     ST.maxpos = PL_regeol - 1;
5065                     if (utf8_target)
5066                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5067                             ST.maxpos--;
5068                 }
5069                 else if (utf8_target) {
5070                     int m = ST.max - ST.min;
5071                     for (ST.maxpos = locinput;
5072                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5073                         ST.maxpos += UTF8SKIP(ST.maxpos);
5074                 }
5075                 else {
5076                     ST.maxpos = locinput + ST.max - ST.min;
5077                     if (ST.maxpos >= PL_regeol)
5078                         ST.maxpos = PL_regeol - 1;
5079                 }
5080                 goto curly_try_B_min_known;
5081
5082             }
5083             else {
5084                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5085                 locinput = PL_reginput;
5086                 if (ST.count < ST.min)
5087                     sayNO;
5088                 if ((ST.count > ST.min)
5089                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5090                 {
5091                     /* A{m,n} must come at the end of the string, there's
5092                      * no point in backing off ... */
5093                     ST.min = ST.count;
5094                     /* ...except that $ and \Z can match before *and* after
5095                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5096                        We may back off by one in this case. */
5097                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5098                         ST.min--;
5099                 }
5100                 REGCP_SET(ST.cp);
5101                 goto curly_try_B_max;
5102             }
5103             /* NOTREACHED */
5104
5105
5106         case CURLY_B_min_known_fail:
5107             /* failed to find B in a non-greedy match where c1,c2 valid */
5108             if (ST.paren && ST.count)
5109                 PL_regoffs[ST.paren].end = -1;
5110
5111             PL_reginput = locinput;     /* Could be reset... */
5112             REGCP_UNWIND(ST.cp);
5113             /* Couldn't or didn't -- move forward. */
5114             ST.oldloc = locinput;
5115             if (utf8_target)
5116                 locinput += UTF8SKIP(locinput);
5117             else
5118                 locinput++;
5119             ST.count++;
5120           curly_try_B_min_known:
5121              /* find the next place where 'B' could work, then call B */
5122             {
5123                 int n;
5124                 if (utf8_target) {
5125                     n = (ST.oldloc == locinput) ? 0 : 1;
5126                     if (ST.c1 == ST.c2) {
5127                         STRLEN len;
5128                         /* set n to utf8_distance(oldloc, locinput) */
5129                         while (locinput <= ST.maxpos &&
5130                                utf8n_to_uvchr((U8*)locinput,
5131                                               UTF8_MAXBYTES, &len,
5132                                               uniflags) != (UV)ST.c1) {
5133                             locinput += len;
5134                             n++;
5135                         }
5136                     }
5137                     else {
5138                         /* set n to utf8_distance(oldloc, locinput) */
5139                         while (locinput <= ST.maxpos) {
5140                             STRLEN len;
5141                             const UV c = utf8n_to_uvchr((U8*)locinput,
5142                                                   UTF8_MAXBYTES, &len,
5143                                                   uniflags);
5144                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5145                                 break;
5146                             locinput += len;
5147                             n++;
5148                         }
5149                     }
5150                 }
5151                 else {
5152                     if (ST.c1 == ST.c2) {
5153                         while (locinput <= ST.maxpos &&
5154                                UCHARAT(locinput) != ST.c1)
5155                             locinput++;
5156                     }
5157                     else {
5158                         while (locinput <= ST.maxpos
5159                                && UCHARAT(locinput) != ST.c1
5160                                && UCHARAT(locinput) != ST.c2)
5161                             locinput++;
5162                     }
5163                     n = locinput - ST.oldloc;
5164                 }
5165                 if (locinput > ST.maxpos)
5166                     sayNO;
5167                 /* PL_reginput == oldloc now */
5168                 if (n) {
5169                     ST.count += n;
5170                     if (regrepeat(rex, ST.A, n, depth) < n)
5171                         sayNO;
5172                 }
5173                 PL_reginput = locinput;
5174                 CURLY_SETPAREN(ST.paren, ST.count);
5175                 if (cur_eval && cur_eval->u.eval.close_paren &&
5176                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5177                     goto fake_end;
5178                 }
5179                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5180             }
5181             /* NOTREACHED */
5182
5183
5184         case CURLY_B_min_fail:
5185             /* failed to find B in a non-greedy match where c1,c2 invalid */
5186             if (ST.paren && ST.count)
5187                 PL_regoffs[ST.paren].end = -1;
5188
5189             REGCP_UNWIND(ST.cp);
5190             /* failed -- move forward one */
5191             PL_reginput = locinput;
5192             if (regrepeat(rex, ST.A, 1, depth)) {
5193                 ST.count++;
5194                 locinput = PL_reginput;
5195                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5196                         ST.count > 0)) /* count overflow ? */
5197                 {
5198                   curly_try_B_min:
5199                     CURLY_SETPAREN(ST.paren, ST.count);
5200                     if (cur_eval && cur_eval->u.eval.close_paren &&
5201                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5202                         goto fake_end;
5203                     }
5204                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5205                 }
5206             }
5207             sayNO;
5208             /* NOTREACHED */
5209
5210
5211         curly_try_B_max:
5212             /* a successful greedy match: now try to match B */
5213             if (cur_eval && cur_eval->u.eval.close_paren &&
5214                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5215                 goto fake_end;
5216             }
5217             {
5218                 UV c = 0;
5219                 if (ST.c1 != CHRTEST_VOID)
5220                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5221                                            UTF8_MAXBYTES, 0, uniflags)
5222                                 : (UV) UCHARAT(PL_reginput);
5223                 /* If it could work, try it. */
5224                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5225                     CURLY_SETPAREN(ST.paren, ST.count);
5226                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5227                     /* NOTREACHED */
5228                 }
5229             }
5230             /* FALL THROUGH */
5231         case CURLY_B_max_fail:
5232             /* failed to find B in a greedy match */
5233             if (ST.paren && ST.count)
5234                 PL_regoffs[ST.paren].end = -1;
5235
5236             REGCP_UNWIND(ST.cp);
5237             /*  back up. */
5238             if (--ST.count < ST.min)
5239                 sayNO;
5240             PL_reginput = locinput = HOPc(locinput, -1);
5241             goto curly_try_B_max;
5242
5243 #undef ST
5244
5245         case END:
5246             fake_end:
5247             if (cur_eval) {
5248                 /* we've just finished A in /(??{A})B/; now continue with B */
5249                 I32 tmpix;
5250                 st->u.eval.toggle_reg_flags
5251                             = cur_eval->u.eval.toggle_reg_flags;
5252                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5253
5254                 st->u.eval.prev_rex = rex_sv;           /* inner */
5255                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5256                 rex = (struct regexp *)SvANY(rex_sv);
5257                 rexi = RXi_GET(rex);
5258                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5259                 ReREFCNT_inc(rex_sv);
5260                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5261
5262                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5263                 PL_reglastparen = &rex->lastparen;
5264                 PL_reglastcloseparen = &rex->lastcloseparen;
5265
5266                 REGCP_SET(st->u.eval.lastcp);
5267                 PL_reginput = locinput;
5268
5269                 /* Restore parens of the outer rex without popping the
5270                  * savestack */
5271                 tmpix = PL_savestack_ix;
5272                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5273                 regcppop(rex);
5274                 PL_savestack_ix = tmpix;
5275
5276                 st->u.eval.prev_eval = cur_eval;
5277                 cur_eval = cur_eval->u.eval.prev_eval;
5278                 DEBUG_EXECUTE_r(
5279                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5280                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5281                 if ( nochange_depth )
5282                     nochange_depth--;
5283
5284                 PUSH_YES_STATE_GOTO(EVAL_AB,
5285                         st->u.eval.prev_eval->u.eval.B); /* match B */
5286             }
5287
5288             if (locinput < reginfo->till) {
5289                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5290                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5291                                       PL_colors[4],
5292                                       (long)(locinput - PL_reg_starttry),
5293                                       (long)(reginfo->till - PL_reg_starttry),
5294                                       PL_colors[5]));
5295
5296                 sayNO_SILENT;           /* Cannot match: too short. */
5297             }
5298             PL_reginput = locinput;     /* put where regtry can find it */
5299             sayYES;                     /* Success! */
5300
5301         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5302             DEBUG_EXECUTE_r(
5303             PerlIO_printf(Perl_debug_log,
5304                 "%*s  %ssubpattern success...%s\n",
5305                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5306             PL_reginput = locinput;     /* put where regtry can find it */
5307             sayYES;                     /* Success! */
5308
5309 #undef  ST
5310 #define ST st->u.ifmatch
5311
5312         case SUSPEND:   /* (?>A) */
5313             ST.wanted = 1;
5314             PL_reginput = locinput;
5315             goto do_ifmatch;
5316
5317         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5318             ST.wanted = 0;
5319             goto ifmatch_trivial_fail_test;
5320
5321         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5322             ST.wanted = 1;
5323           ifmatch_trivial_fail_test:
5324             if (scan->flags) {
5325                 char * const s = HOPBACKc(locinput, scan->flags);
5326                 if (!s) {
5327                     /* trivial fail */
5328                     if (logical) {
5329                         logical = 0;
5330                         sw = 1 - cBOOL(ST.wanted);
5331                     }
5332                     else if (ST.wanted)
5333                         sayNO;
5334                     next = scan + ARG(scan);
5335                     if (next == scan)
5336                         next = NULL;
5337                     break;
5338                 }
5339                 PL_reginput = s;
5340             }
5341             else
5342                 PL_reginput = locinput;
5343
5344           do_ifmatch:
5345             ST.me = scan;
5346             ST.logical = logical;
5347             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5348
5349             /* execute body of (?...A) */
5350             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5351             /* NOTREACHED */
5352
5353         case IFMATCH_A_fail: /* body of (?...A) failed */
5354             ST.wanted = !ST.wanted;
5355             /* FALL THROUGH */
5356
5357         case IFMATCH_A: /* body of (?...A) succeeded */
5358             if (ST.logical) {
5359                 sw = cBOOL(ST.wanted);
5360             }
5361             else if (!ST.wanted)
5362                 sayNO;
5363
5364             if (OP(ST.me) == SUSPEND)
5365                 locinput = PL_reginput;
5366             else {
5367                 locinput = PL_reginput = st->locinput;
5368                 nextchr = UCHARAT(locinput);
5369             }
5370             scan = ST.me + ARG(ST.me);
5371             if (scan == ST.me)
5372                 scan = NULL;
5373             continue; /* execute B */
5374
5375 #undef ST
5376
5377         case LONGJMP:
5378             next = scan + ARG(scan);
5379             if (next == scan)
5380                 next = NULL;
5381             break;
5382         case COMMIT:
5383             reginfo->cutpoint = PL_regeol;
5384             /* FALLTHROUGH */
5385         case PRUNE:
5386             PL_reginput = locinput;
5387             if (!scan->flags)
5388                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5389             PUSH_STATE_GOTO(COMMIT_next,next);
5390             /* NOTREACHED */
5391         case COMMIT_next_fail:
5392             no_final = 1;
5393             /* FALLTHROUGH */
5394         case OPFAIL:
5395             sayNO;
5396             /* NOTREACHED */
5397
5398 #define ST st->u.mark
5399         case MARKPOINT:
5400             ST.prev_mark = mark_state;
5401             ST.mark_name = sv_commit = sv_yes_mark
5402                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5403             mark_state = st;
5404             ST.mark_loc = PL_reginput = locinput;
5405             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5406             /* NOTREACHED */
5407         case MARKPOINT_next:
5408             mark_state = ST.prev_mark;
5409             sayYES;
5410             /* NOTREACHED */
5411         case MARKPOINT_next_fail:
5412             if (popmark && sv_eq(ST.mark_name,popmark))
5413             {
5414                 if (ST.mark_loc > startpoint)
5415                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5416                 popmark = NULL; /* we found our mark */
5417                 sv_commit = ST.mark_name;
5418
5419                 DEBUG_EXECUTE_r({
5420                         PerlIO_printf(Perl_debug_log,
5421                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5422                             REPORT_CODE_OFF+depth*2, "",
5423                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5424                 });
5425             }
5426             mark_state = ST.prev_mark;
5427             sv_yes_mark = mark_state ?
5428                 mark_state->u.mark.mark_name : NULL;
5429             sayNO;
5430             /* NOTREACHED */
5431         case SKIP:
5432             PL_reginput = locinput;
5433             if (scan->flags) {
5434                 /* (*SKIP) : if we fail we cut here*/
5435                 ST.mark_name = NULL;
5436                 ST.mark_loc = locinput;
5437                 PUSH_STATE_GOTO(SKIP_next,next);
5438             } else {
5439                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5440                    otherwise do nothing.  Meaning we need to scan
5441                  */
5442                 regmatch_state *cur = mark_state;
5443                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5444
5445                 while (cur) {
5446                     if ( sv_eq( cur->u.mark.mark_name,
5447                                 find ) )
5448                     {
5449                         ST.mark_name = find;
5450                         PUSH_STATE_GOTO( SKIP_next, next );
5451                     }
5452                     cur = cur->u.mark.prev_mark;
5453                 }
5454             }
5455             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5456             break;
5457         case SKIP_next_fail:
5458             if (ST.mark_name) {
5459                 /* (*CUT:NAME) - Set up to search for the name as we
5460                    collapse the stack*/
5461                 popmark = ST.mark_name;
5462             } else {
5463                 /* (*CUT) - No name, we cut here.*/
5464                 if (ST.mark_loc > startpoint)
5465                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5466                 /* but we set sv_commit to latest mark_name if there
5467                    is one so they can test to see how things lead to this
5468                    cut */
5469                 if (mark_state)
5470                     sv_commit=mark_state->u.mark.mark_name;
5471             }
5472             no_final = 1;
5473             sayNO;
5474             /* NOTREACHED */
5475 #undef ST
5476         case FOLDCHAR:
5477             n = ARG(scan);
5478             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5479                 locinput += ln;
5480             } else if ( 0xDF == n && !utf8_target && !UTF_PATTERN ) {
5481                 sayNO;
5482             } else  {
5483                 U8 folded[UTF8_MAXBYTES_CASE+1];
5484                 STRLEN foldlen;
5485                 const char * const l = locinput;
5486                 char *e = PL_regeol;
5487                 to_uni_fold(n, folded, &foldlen);
5488
5489                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5490                                l, &e, 0,  utf8_target)) {
5491                         sayNO;
5492                 }
5493                 locinput = e;
5494             }
5495             nextchr = UCHARAT(locinput);
5496             break;
5497         case LNBREAK:
5498             if ((n=is_LNBREAK(locinput,utf8_target))) {
5499                 locinput += n;
5500                 nextchr = UCHARAT(locinput);
5501             } else
5502                 sayNO;
5503             break;
5504
5505 #define CASE_CLASS(nAmE)                              \
5506         case nAmE:                                    \
5507             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5508                 locinput += n;                        \
5509                 nextchr = UCHARAT(locinput);          \
5510             } else                                    \
5511                 sayNO;                                \
5512             break;                                    \
5513         case N##nAmE:                                 \
5514             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5515                 sayNO;                                \
5516             } else {                                  \
5517                 locinput += UTF8SKIP(locinput);       \
5518                 nextchr = UCHARAT(locinput);          \
5519             }                                         \
5520             break
5521
5522         CASE_CLASS(VERTWS);
5523         CASE_CLASS(HORIZWS);
5524 #undef CASE_CLASS
5525
5526         default:
5527             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5528                           PTR2UV(scan), OP(scan));
5529             Perl_croak(aTHX_ "regexp memory corruption");
5530
5531         } /* end switch */
5532
5533         /* switch break jumps here */
5534         scan = next; /* prepare to execute the next op and ... */
5535         continue;    /* ... jump back to the top, reusing st */
5536         /* NOTREACHED */
5537
5538       push_yes_state:
5539         /* push a state that backtracks on success */
5540         st->u.yes.prev_yes_state = yes_state;
5541         yes_state = st;
5542         /* FALL THROUGH */
5543       push_state:
5544         /* push a new regex state, then continue at scan  */
5545         {
5546             regmatch_state *newst;
5547
5548             DEBUG_STACK_r({
5549                 regmatch_state *cur = st;
5550                 regmatch_state *curyes = yes_state;
5551                 int curd = depth;
5552                 regmatch_slab *slab = PL_regmatch_slab;
5553                 for (;curd > -1;cur--,curd--) {
5554                     if (cur < SLAB_FIRST(slab)) {
5555                         slab = slab->prev;
5556                         cur = SLAB_LAST(slab);
5557                     }
5558                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5559                         REPORT_CODE_OFF + 2 + depth * 2,"",
5560                         curd, PL_reg_name[cur->resume_state],
5561                         (curyes == cur) ? "yes" : ""
5562                     );
5563                     if (curyes == cur)
5564                         curyes = cur->u.yes.prev_yes_state;
5565                 }
5566             } else
5567                 DEBUG_STATE_pp("push")
5568             );
5569             depth++;
5570             st->locinput = locinput;
5571             newst = st+1;
5572             if (newst >  SLAB_LAST(PL_regmatch_slab))
5573                 newst = S_push_slab(aTHX);
5574             PL_regmatch_state = newst;
5575
5576             locinput = PL_reginput;
5577             nextchr = UCHARAT(locinput);
5578             st = newst;
5579             continue;
5580             /* NOTREACHED */
5581         }
5582     }
5583
5584     /*
5585     * We get here only if there's trouble -- normally "case END" is
5586     * the terminating point.
5587     */
5588     Perl_croak(aTHX_ "corrupted regexp pointers");
5589     /*NOTREACHED*/
5590     sayNO;
5591
5592 yes:
5593     if (yes_state) {
5594         /* we have successfully completed a subexpression, but we must now
5595          * pop to the state marked by yes_state and continue from there */
5596         assert(st != yes_state);
5597 #ifdef DEBUGGING
5598         while (st != yes_state) {
5599             st--;
5600             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5601                 PL_regmatch_slab = PL_regmatch_slab->prev;
5602                 st = SLAB_LAST(PL_regmatch_slab);
5603             }
5604             DEBUG_STATE_r({
5605                 if (no_final) {
5606                     DEBUG_STATE_pp("pop (no final)");
5607                 } else {
5608                     DEBUG_STATE_pp("pop (yes)");
5609                 }
5610             });
5611             depth--;
5612         }
5613 #else
5614         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5615             || yes_state > SLAB_LAST(PL_regmatch_slab))
5616         {
5617             /* not in this slab, pop slab */
5618             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5619             PL_regmatch_slab = PL_regmatch_slab->prev;
5620             st = SLAB_LAST(PL_regmatch_slab);
5621         }
5622         depth -= (st - yes_state);
5623 #endif
5624         st = yes_state;
5625         yes_state = st->u.yes.prev_yes_state;
5626         PL_regmatch_state = st;
5627
5628         if (no_final) {
5629             locinput= st->locinput;
5630             nextchr = UCHARAT(locinput);
5631         }
5632         state_num = st->resume_state + no_final;
5633         goto reenter_switch;
5634     }
5635
5636     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5637                           PL_colors[4], PL_colors[5]));
5638
5639     if (PL_reg_eval_set) {
5640         /* each successfully executed (?{...}) block does the equivalent of
5641          *   local $^R = do {...}
5642          * When popping the save stack, all these locals would be undone;
5643          * bypass this by setting the outermost saved $^R to the latest
5644          * value */
5645         if (oreplsv != GvSV(PL_replgv))
5646             sv_setsv(oreplsv, GvSV(PL_replgv));
5647     }
5648     result = 1;
5649     goto final_exit;
5650
5651 no:
5652     DEBUG_EXECUTE_r(
5653         PerlIO_printf(Perl_debug_log,
5654             "%*s  %sfailed...%s\n",
5655             REPORT_CODE_OFF+depth*2, "",
5656             PL_colors[4], PL_colors[5])
5657         );
5658
5659 no_silent:
5660     if (no_final) {
5661         if (yes_state) {
5662             goto yes;
5663         } else {
5664             goto final_exit;
5665         }
5666     }
5667     if (depth) {
5668         /* there's a previous state to backtrack to */
5669         st--;
5670         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5671             PL_regmatch_slab = PL_regmatch_slab->prev;
5672             st = SLAB_LAST(PL_regmatch_slab);
5673         }
5674         PL_regmatch_state = st;
5675         locinput= st->locinput;
5676         nextchr = UCHARAT(locinput);
5677
5678         DEBUG_STATE_pp("pop");
5679         depth--;
5680         if (yes_state == st)
5681             yes_state = st->u.yes.prev_yes_state;
5682
5683         state_num = st->resume_state + 1; /* failure = success + 1 */
5684         goto reenter_switch;
5685     }
5686     result = 0;
5687
5688   final_exit:
5689     if (rex->intflags & PREGf_VERBARG_SEEN) {
5690         SV *sv_err = get_sv("REGERROR", 1);
5691         SV *sv_mrk = get_sv("REGMARK", 1);
5692         if (result) {
5693             sv_commit = &PL_sv_no;
5694             if (!sv_yes_mark)
5695                 sv_yes_mark = &PL_sv_yes;
5696         } else {
5697             if (!sv_commit)
5698                 sv_commit = &PL_sv_yes;
5699             sv_yes_mark = &PL_sv_no;
5700         }
5701         sv_setsv(sv_err, sv_commit);
5702         sv_setsv(sv_mrk, sv_yes_mark);
5703     }
5704
5705     /* clean up; in particular, free all slabs above current one */
5706     LEAVE_SCOPE(oldsave);
5707
5708     return result;
5709 }
5710
5711 /*
5712  - regrepeat - repeatedly match something simple, report how many
5713  */
5714 /*
5715  * [This routine now assumes that it will only match on things of length 1.
5716  * That was true before, but now we assume scan - reginput is the count,
5717  * rather than incrementing count on every character.  [Er, except utf8.]]
5718  */
5719 STATIC I32
5720 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5721 {
5722     dVAR;
5723     register char *scan;
5724     register I32 c;
5725     register char *loceol = PL_regeol;
5726     register I32 hardcount = 0;
5727     register bool utf8_target = PL_reg_match_utf8;
5728 #ifndef DEBUGGING
5729     PERL_UNUSED_ARG(depth);
5730 #endif
5731
5732     PERL_ARGS_ASSERT_REGREPEAT;
5733
5734     scan = PL_reginput;
5735     if (max == REG_INFTY)
5736         max = I32_MAX;
5737     else if (max < loceol - scan)
5738         loceol = scan + max;
5739     switch (OP(p)) {
5740     case REG_ANY:
5741         if (utf8_target) {
5742             loceol = PL_regeol;
5743             while (scan < loceol && hardcount < max && *scan != '\n') {
5744                 scan += UTF8SKIP(scan);
5745                 hardcount++;
5746             }
5747         } else {
5748             while (scan < loceol && *scan != '\n')
5749                 scan++;
5750         }
5751         break;
5752     case SANY:
5753         if (utf8_target) {
5754             loceol = PL_regeol;
5755             while (scan < loceol && hardcount < max) {
5756                 scan += UTF8SKIP(scan);
5757                 hardcount++;
5758             }
5759         }
5760         else
5761             scan = loceol;
5762         break;
5763     case CANY:
5764         scan = loceol;
5765         break;
5766     case EXACTFL:
5767         PL_reg_flags |= RF_tainted;
5768         /* FALL THROUGH */
5769     case EXACT:
5770     case EXACTF:
5771         /* To get here, EXACTish nodes must have *byte* length == 1.  That means
5772          * they match only characters in the string that can be expressed as a
5773          * single byte.  For non-utf8 strings, that means a simple match.  For
5774          * utf8 strings, the character matched must be an invariant, or
5775          * downgradable to a single byte.  The pattern's utf8ness is
5776          * irrelevant, as it must be a single byte, so either it isn't utf8, or
5777          * if it is it's an invariant */
5778
5779         c = (U8)*STRING(p);
5780         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5781
5782         if ((! utf8_target) || UNI_IS_INVARIANT(c)) {
5783
5784             /* Here, the string isn't utf8, or the character in the EXACT
5785              * node is the same in utf8 as not, so can just do equality.
5786              * Each matching char must be 1 byte long */
5787             switch (OP(p)) {
5788             case EXACT:
5789                 while (scan < loceol && UCHARAT(scan) == c) {
5790                     scan++;
5791                 }
5792                 break;
5793             case EXACTF:
5794                 while (scan < loceol &&
5795                     (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5796                 {
5797                     scan++;
5798                 }
5799                 break;
5800             case EXACTFL:
5801                 while (scan < loceol &&
5802                     (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5803                 {
5804                     scan++;
5805                 }
5806                 break;
5807             default:
5808                 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
5809             }
5810         }
5811         else {
5812
5813             /* Here, the string is utf8, and the pattern char is different
5814              * in utf8 than not.  */
5815
5816             switch (OP(p)) {
5817             case EXACT:
5818                 {
5819                     /* Fastest to find the two utf8 bytes that represent c, and
5820                      * then look for those in sequence in the utf8 string */
5821                     U8 high = UTF8_TWO_BYTE_HI(c);
5822                     U8 low = UTF8_TWO_BYTE_LO(c);
5823                     loceol = PL_regeol;
5824
5825                     while (hardcount < max
5826                            && scan + 1 < loceol
5827                            && UCHARAT(scan) == high
5828                            && UCHARAT(scan + 1) == low)
5829                     {
5830                         scan += 2;
5831                         hardcount++;
5832                     }
5833                 }
5834                 break;
5835             case EXACTFL:   /* Doesn't really make sense, but is best we can
5836                                do.  The documents warn against mixing locale
5837                                and utf8 */
5838             case EXACTF:
5839                 {   /* utf8 string, so use utf8 foldEQ */
5840                     char *tmpeol = loceol;
5841                     while (hardcount < max
5842                            && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
5843                                           STRING(p), NULL, 1, UTF_PATTERN))
5844                     {
5845                         scan = tmpeol;
5846                         tmpeol = loceol;
5847                         hardcount++;
5848                     }
5849
5850                     /* XXX Note that the above handles properly the German
5851                      * sharp ss in the pattern matching ss in the string.  But
5852                      * it doesn't handle properly cases where the string
5853                      * contains say 'LIGATURE ff' and the pattern is 'f+'.
5854                      * This would require, say, a new function or revised
5855                      * interface to foldEQ_utf8(), in which the maximum number
5856                      * of characters to match could be passed and it would
5857                      * return how many actually did.  This is just one of many
5858                      * cases where multi-char folds don't work properly, and so
5859                      * the fix is being deferred */
5860                 }
5861                 break;
5862             default:
5863                 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
5864             }
5865         }
5866         break;
5867     case ANYOF:
5868         if (utf8_target) {
5869             loceol = PL_regeol;
5870             while (hardcount < max && scan < loceol &&
5871                    reginclass(prog, p, (U8*)scan, 0, utf8_target)) {
5872                 scan += UTF8SKIP(scan);
5873                 hardcount++;
5874             }
5875         } else {
5876             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5877                 scan++;
5878         }
5879         break;
5880     case ALNUM:
5881         if (utf8_target) {
5882             loceol = PL_regeol;
5883             LOAD_UTF8_CHARCLASS_ALNUM();
5884             while (hardcount < max && scan < loceol &&
5885                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5886             {
5887                 scan += UTF8SKIP(scan);
5888                 hardcount++;
5889             }
5890         } else if (FLAGS(p) & USE_UNI) {
5891             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
5892                 scan++;
5893             }
5894         } else {
5895             while (scan < loceol && isALNUM((U8) *scan)) {
5896                 scan++;
5897             }
5898         }
5899         break;
5900     case ALNUML:
5901         PL_reg_flags |= RF_tainted;
5902         if (utf8_target) {
5903             loceol = PL_regeol;
5904             while (hardcount < max && scan < loceol &&
5905                    isALNUM_LC_utf8((U8*)scan)) {
5906                 scan += UTF8SKIP(scan);
5907                 hardcount++;
5908             }
5909         } else {
5910             while (scan < loceol && isALNUM_LC(*scan))
5911                 scan++;
5912         }
5913         break;
5914     case NALNUM:
5915         if (utf8_target) {
5916             loceol = PL_regeol;
5917             LOAD_UTF8_CHARCLASS_ALNUM();
5918             while (hardcount < max && scan < loceol &&
5919                    !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5920             {
5921                 scan += UTF8SKIP(scan);
5922                 hardcount++;
5923             }
5924         } else if (FLAGS(p) & USE_UNI) {
5925             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
5926                 scan++;
5927             }
5928         } else {
5929             while (scan < loceol && ! isALNUM((U8) *scan)) {
5930                 scan++;
5931             }
5932         }
5933         break;
5934     case NALNUML:
5935         PL_reg_flags |= RF_tainted;
5936         if (utf8_target) {
5937             loceol = PL_regeol;
5938             while (hardcount < max && scan < loceol &&
5939                    !isALNUM_LC_utf8((U8*)scan)) {
5940                 scan += UTF8SKIP(scan);
5941                 hardcount++;
5942             }
5943         } else {
5944             while (scan < loceol && !isALNUM_LC(*scan))
5945                 scan++;
5946         }
5947         break;
5948     case SPACE:
5949         if (utf8_target) {
5950             loceol = PL_regeol;
5951             LOAD_UTF8_CHARCLASS_SPACE();
5952             while (hardcount < max && scan < loceol &&
5953                    (*scan == ' ' ||
5954                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
5955             {
5956                 scan += UTF8SKIP(scan);
5957                 hardcount++;
5958             }
5959         } else if (FLAGS(p) & USE_UNI) {
5960             while (scan < loceol && isSPACE_L1((U8) *scan)) {
5961                 scan++;
5962             }
5963         } else {
5964             while (scan < loceol && isSPACE((U8) *scan))
5965                 scan++;
5966         }
5967         break;
5968     case SPACEL:
5969         PL_reg_flags |= RF_tainted;
5970         if (utf8_target) {
5971             loceol = PL_regeol;
5972             while (hardcount < max && scan < loceol &&
5973                    (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5974                 scan += UTF8SKIP(scan);
5975                 hardcount++;
5976             }
5977         } else {
5978             while (scan < loceol && isSPACE_LC(*scan))
5979                 scan++;
5980         }
5981         break;
5982     case NSPACE:
5983         if (utf8_target) {
5984             loceol = PL_regeol;
5985             LOAD_UTF8_CHARCLASS_SPACE();
5986             while (hardcount < max && scan < loceol &&
5987                    !(*scan == ' ' ||
5988                      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
5989             {
5990                 scan += UTF8SKIP(scan);
5991                 hardcount++;
5992             }
5993         } else if (FLAGS(p) & USE_UNI) {
5994             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
5995                 scan++;
5996             }
5997         } else {
5998             while (scan < loceol && ! isSPACE((U8) *scan)) {
5999                 scan++;
6000             }
6001         }
6002         break;
6003     case NSPACEL:
6004         PL_reg_flags |= RF_tainted;
6005         if (utf8_target) {
6006             loceol = PL_regeol;
6007             while (hardcount < max && scan < loceol &&
6008                    !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
6009                 scan += UTF8SKIP(scan);
6010                 hardcount++;
6011             }
6012         } else {
6013             while (scan < loceol && !isSPACE_LC(*scan))
6014                 scan++;
6015         }
6016         break;
6017     case DIGIT:
6018         if (utf8_target) {
6019             loceol = PL_regeol;
6020             LOAD_UTF8_CHARCLASS_DIGIT();
6021             while (hardcount < max && scan < loceol &&
6022                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6023                 scan += UTF8SKIP(scan);
6024                 hardcount++;
6025             }
6026         } else {
6027             while (scan < loceol && isDIGIT(*scan))
6028                 scan++;
6029         }
6030         break;
6031     case NDIGIT:
6032         if (utf8_target) {
6033             loceol = PL_regeol;
6034             LOAD_UTF8_CHARCLASS_DIGIT();
6035             while (hardcount < max && scan < loceol &&
6036                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6037                 scan += UTF8SKIP(scan);
6038                 hardcount++;
6039             }
6040         } else {
6041             while (scan < loceol && !isDIGIT(*scan))
6042                 scan++;
6043         }
6044     case LNBREAK:
6045         if (utf8_target) {
6046             loceol = PL_regeol;
6047             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6048                 scan += c;
6049                 hardcount++;
6050             }
6051         } else {
6052             /*
6053               LNBREAK can match two latin chars, which is ok,
6054               because we have a null terminated string, but we
6055               have to use hardcount in this situation
6056             */
6057             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6058                 scan+=c;
6059                 hardcount++;
6060             }
6061         }
6062         break;
6063     case HORIZWS:
6064         if (utf8_target) {
6065             loceol = PL_regeol;
6066             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6067                 scan += c;
6068                 hardcount++;
6069             }
6070         } else {
6071             while (scan < loceol && is_HORIZWS_latin1(scan))
6072                 scan++;
6073         }
6074         break;
6075     case NHORIZWS:
6076         if (utf8_target) {
6077             loceol = PL_regeol;
6078             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6079                 scan += UTF8SKIP(scan);
6080                 hardcount++;
6081             }
6082         } else {
6083             while (scan < loceol && !is_HORIZWS_latin1(scan))
6084                 scan++;
6085
6086         }
6087         break;
6088     case VERTWS:
6089         if (utf8_target) {
6090             loceol = PL_regeol;
6091             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6092                 scan += c;
6093                 hardcount++;
6094             }
6095         } else {
6096             while (scan < loceol && is_VERTWS_latin1(scan))
6097                 scan++;
6098
6099         }
6100         break;
6101     case NVERTWS:
6102         if (utf8_target) {
6103             loceol = PL_regeol;
6104             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6105                 scan += UTF8SKIP(scan);
6106                 hardcount++;
6107             }
6108         } else {
6109             while (scan < loceol && !is_VERTWS_latin1(scan))
6110                 scan++;
6111
6112         }
6113         break;
6114
6115     default:            /* Called on something of 0 width. */
6116         break;          /* So match right here or not at all. */
6117     }
6118
6119     if (hardcount)
6120         c = hardcount;
6121     else
6122         c = scan - PL_reginput;
6123     PL_reginput = scan;
6124
6125     DEBUG_r({
6126         GET_RE_DEBUG_FLAGS_DECL;
6127         DEBUG_EXECUTE_r({
6128             SV * const prop = sv_newmortal();
6129             regprop(prog, prop, p);
6130             PerlIO_printf(Perl_debug_log,
6131                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6132                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6133         });
6134     });
6135
6136     return(c);
6137 }
6138
6139
6140 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6141 /*
6142 - regclass_swash - prepare the utf8 swash
6143 */
6144
6145 SV *
6146 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6147 {
6148     dVAR;
6149     SV *sw  = NULL;
6150     SV *si  = NULL;
6151     SV *alt = NULL;
6152     RXi_GET_DECL(prog,progi);
6153     const struct reg_data * const data = prog ? progi->data : NULL;
6154
6155     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6156
6157     if (data && data->count) {
6158         const U32 n = ARG(node);
6159
6160         if (data->what[n] == 's') {
6161             SV * const rv = MUTABLE_SV(data->data[n]);
6162             AV * const av = MUTABLE_AV(SvRV(rv));
6163             SV **const ary = AvARRAY(av);
6164             SV **a, **b;
6165
6166             /* See the end of regcomp.c:S_regclass() for
6167              * documentation of these array elements. */
6168
6169             si = *ary;
6170             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6171             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6172
6173             if (a)
6174                 sw = *a;
6175             else if (si && doinit) {
6176                 sw = swash_init("utf8", "", si, 1, 0);
6177                 (void)av_store(av, 1, sw);
6178             }
6179             if (b)
6180                 alt = *b;
6181         }
6182     }
6183
6184     if (listsvp)
6185         *listsvp = si;
6186     if (altsvp)
6187         *altsvp  = alt;
6188
6189     return sw;
6190 }
6191 #endif
6192
6193 /*
6194  - reginclass - determine if a character falls into a character class
6195
6196   n is the ANYOF regnode
6197   p is the target string
6198   lenp is pointer to the maximum number of bytes of how far to go in p
6199     (This is assumed wthout checking to always be at least the current
6200     character's size)
6201   utf8_target tells whether p is in UTF-8.
6202
6203   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6204   from a successful match, the value it points to will be updated to how many
6205   bytes in p were matched.  If there was no match, the value is undefined,
6206   possibly changed from the input.
6207
6208  */
6209
6210 STATIC bool
6211 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6212 {
6213     dVAR;
6214     const char flags = ANYOF_FLAGS(n);
6215     bool match = FALSE;
6216     UV c = *p;
6217     STRLEN c_len = 0;
6218     STRLEN maxlen;
6219
6220     PERL_ARGS_ASSERT_REGINCLASS;
6221
6222     /* If c is not already the code point, get it */
6223     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6224         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6225                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6226                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6227                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6228                  * UTF8_ALLOW_FFFF */
6229         if (c_len == (STRLEN)-1)
6230             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6231     }
6232     else {
6233         c_len = 1;
6234     }
6235
6236     /* Use passed in max length, or one character if none passed in or less
6237      * than one character.  And assume will match just one character.  This is
6238      * overwritten later if matched more. */
6239     if (lenp) {
6240         maxlen = (*lenp > c_len) ? *lenp : c_len;
6241         *lenp = c_len;
6242
6243     }
6244     else {
6245         maxlen = c_len;
6246     }
6247
6248     /* If this character is potentially in the bitmap, check it */
6249     if (c < 256) {
6250         if (ANYOF_BITMAP_TEST(n, c))
6251             match = TRUE;
6252         else if (flags & ANYOF_FOLD) {
6253             U8 f;
6254
6255             if (flags & ANYOF_LOCALE) {
6256                 PL_reg_flags |= RF_tainted;
6257                 f = PL_fold_locale[c];
6258             }
6259             else
6260                 f = PL_fold[c];
6261             if (f != c && ANYOF_BITMAP_TEST(n, f))
6262                 match = TRUE;
6263         }
6264
6265         if (!match && (flags & ANYOF_CLASS)) {
6266             PL_reg_flags |= RF_tainted;
6267             if (
6268                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6269                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6270                 (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6271                 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6272                 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6273                 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6274                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6275                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6276                 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6277                 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6278                 (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6279                 (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6280                 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6281                 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6282                 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6283                 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6284                 (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6285                 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6286                 (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6287                 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6288                 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6289                 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6290                 (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6291                 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6292                 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6293                 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6294                 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6295                 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6296                 (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6297                 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6298                 ) /* How's that for a conditional? */
6299             {
6300                 match = TRUE;
6301             }
6302         }
6303     }
6304
6305     /* If the bitmap didn't (or couldn't) match, and something outside the
6306      * bitmap could match, try that */
6307     if (!match && (utf8_target || (flags & ANYOF_UNICODE))) {
6308         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6309             match = TRUE;
6310         }
6311         else {
6312             AV *av;
6313             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6314
6315             if (sw) {
6316                 U8 * utf8_p;
6317                 if (utf8_target) {
6318                     utf8_p = (U8 *) p;
6319                 } else {
6320                     STRLEN len = 1;
6321                     utf8_p = bytes_to_utf8(p, &len);
6322                 }
6323                 if (swash_fetch(sw, utf8_p, 1))
6324                     match = TRUE;
6325                 else if (flags & ANYOF_FOLD) {
6326                     if (!match && lenp && av) {
6327                         I32 i;
6328                         for (i = 0; i <= av_len(av); i++) {
6329                             SV* const sv = *av_fetch(av, i, FALSE);
6330                             STRLEN len;
6331                             const char * const s = SvPV_const(sv, len);
6332                             if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) {
6333                                 *lenp = len;
6334                                 match = TRUE;
6335                                 break;
6336                             }
6337                         }
6338                     }
6339                     if (!match) {
6340                         U8 folded[UTF8_MAXBYTES_CASE+1];
6341
6342                         /* See if the folded version matches */
6343                         STRLEN foldlen;
6344                         to_utf8_fold(utf8_p, folded, &foldlen);
6345                         if (swash_fetch(sw, folded, 1)) {   /* 1 => is utf8 */
6346                             match = TRUE;
6347                         }
6348                         else {
6349                             SV** listp;
6350
6351                             /* Consider "k" =~ /[K]/i.  The line above would
6352                              * have just folded the 'k' to itself, and that
6353                              * isn't going to match 'K'.  So we look through
6354                              * the closure of everything that folds to 'k'.
6355                              * That will find the 'K'.  Initialize the list, if
6356                              * necessary */
6357                             if (! PL_utf8_foldclosures) {
6358
6359                                 /* If the folds haven't been read in, call a fold
6360                              * function to force that */
6361                                 if (! PL_utf8_tofold) {
6362                                     U8 dummy[UTF8_MAXBYTES+1];
6363                                     STRLEN dummy_len;
6364                                     to_utf8_fold((U8*) "A", dummy, &dummy_len);
6365                                 }
6366                                 PL_utf8_foldclosures =
6367                                         _swash_inversion_hash(PL_utf8_tofold);
6368                             }
6369
6370                             /* The data structure is a hash with the keys every
6371                              * character that is folded to, like 'k', and the
6372                              * values each an array of everything that folds to
6373                              * its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
6374                             if ((listp = hv_fetch(PL_utf8_foldclosures,
6375                                             (char *) folded, foldlen, FALSE)))
6376                             {
6377                                 AV* list = (AV*) *listp;
6378                                 IV i;
6379                                 for (i = 0; i <= av_len(list); i++) {
6380                                     SV** try_p = av_fetch(list, i, FALSE);
6381                                     if (try_p == NULL) {
6382                                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
6383                                     }
6384                                     /* Don't have to worry about embeded nulls
6385                                      * since NULL isn't folded or foldable */
6386                                     if (swash_fetch(sw, (U8*) SvPVX(*try_p),1)) {
6387                                         match = TRUE;
6388                                         break;
6389                                     }
6390                                 }
6391                             }
6392                         }
6393                     }
6394                 }
6395
6396                 /* If we allocated a string above, free it */
6397                 if (! utf8_target) Safefree(utf8_p);
6398             }
6399         }
6400     }
6401
6402     return (flags & ANYOF_INVERT) ? !match : match;
6403 }
6404
6405 STATIC U8 *
6406 S_reghop3(U8 *s, I32 off, const U8* lim)
6407 {
6408     dVAR;
6409
6410     PERL_ARGS_ASSERT_REGHOP3;
6411
6412     if (off >= 0) {
6413         while (off-- && s < lim) {
6414             /* XXX could check well-formedness here */
6415             s += UTF8SKIP(s);
6416         }
6417     }
6418     else {
6419         while (off++ && s > lim) {
6420             s--;
6421             if (UTF8_IS_CONTINUED(*s)) {
6422                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6423                     s--;
6424             }
6425             /* XXX could check well-formedness here */
6426         }
6427     }
6428     return s;
6429 }
6430
6431 #ifdef XXX_dmq
6432 /* there are a bunch of places where we use two reghop3's that should
6433    be replaced with this routine. but since thats not done yet
6434    we ifdef it out - dmq
6435 */
6436 STATIC U8 *
6437 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6438 {
6439     dVAR;
6440
6441     PERL_ARGS_ASSERT_REGHOP4;
6442
6443     if (off >= 0) {
6444         while (off-- && s < rlim) {
6445             /* XXX could check well-formedness here */
6446             s += UTF8SKIP(s);
6447         }
6448     }
6449     else {
6450         while (off++ && s > llim) {
6451             s--;
6452             if (UTF8_IS_CONTINUED(*s)) {
6453                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6454                     s--;
6455             }
6456             /* XXX could check well-formedness here */
6457         }
6458     }
6459     return s;
6460 }
6461 #endif
6462
6463 STATIC U8 *
6464 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6465 {
6466     dVAR;
6467
6468     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6469
6470     if (off >= 0) {
6471         while (off-- && s < lim) {
6472             /* XXX could check well-formedness here */
6473             s += UTF8SKIP(s);
6474         }
6475         if (off >= 0)
6476             return NULL;
6477     }
6478     else {
6479         while (off++ && s > lim) {
6480             s--;
6481             if (UTF8_IS_CONTINUED(*s)) {
6482                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6483                     s--;
6484             }
6485             /* XXX could check well-formedness here */
6486         }
6487         if (off <= 0)
6488             return NULL;
6489     }
6490     return s;
6491 }
6492
6493 static void
6494 restore_pos(pTHX_ void *arg)
6495 {
6496     dVAR;
6497     regexp * const rex = (regexp *)arg;
6498     if (PL_reg_eval_set) {
6499         if (PL_reg_oldsaved) {
6500             rex->subbeg = PL_reg_oldsaved;
6501             rex->sublen = PL_reg_oldsavedlen;
6502 #ifdef PERL_OLD_COPY_ON_WRITE
6503             rex->saved_copy = PL_nrs;
6504 #endif
6505             RXp_MATCH_COPIED_on(rex);
6506         }
6507         PL_reg_magic->mg_len = PL_reg_oldpos;
6508         PL_reg_eval_set = 0;
6509         PL_curpm = PL_reg_oldcurpm;
6510     }
6511 }
6512
6513 STATIC void
6514 S_to_utf8_substr(pTHX_ register regexp *prog)
6515 {
6516     int i = 1;
6517
6518     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6519
6520     do {
6521         if (prog->substrs->data[i].substr
6522             && !prog->substrs->data[i].utf8_substr) {
6523             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6524             prog->substrs->data[i].utf8_substr = sv;
6525             sv_utf8_upgrade(sv);
6526             if (SvVALID(prog->substrs->data[i].substr)) {
6527                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6528                 if (flags & FBMcf_TAIL) {
6529                     /* Trim the trailing \n that fbm_compile added last
6530                        time.  */
6531                     SvCUR_set(sv, SvCUR(sv) - 1);
6532                     /* Whilst this makes the SV technically "invalid" (as its
6533                        buffer is no longer followed by "\0") when fbm_compile()
6534                        adds the "\n" back, a "\0" is restored.  */
6535                 }
6536                 fbm_compile(sv, flags);
6537             }
6538             if (prog->substrs->data[i].substr == prog->check_substr)
6539                 prog->check_utf8 = sv;
6540         }
6541     } while (i--);
6542 }
6543
6544 STATIC void
6545 S_to_byte_substr(pTHX_ register regexp *prog)
6546 {
6547     dVAR;
6548     int i = 1;
6549
6550     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6551
6552     do {
6553         if (prog->substrs->data[i].utf8_substr
6554             && !prog->substrs->data[i].substr) {
6555             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6556             if (sv_utf8_downgrade(sv, TRUE)) {
6557                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6558                     const U8 flags
6559                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6560                     if (flags & FBMcf_TAIL) {
6561                         /* Trim the trailing \n that fbm_compile added last
6562                            time.  */
6563                         SvCUR_set(sv, SvCUR(sv) - 1);
6564                     }
6565                     fbm_compile(sv, flags);
6566                 }
6567             } else {
6568                 SvREFCNT_dec(sv);
6569                 sv = &PL_sv_undef;
6570             }
6571             prog->substrs->data[i].substr = sv;
6572             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6573                 prog->check_substr = sv;
6574         }
6575     } while (i--);
6576 }
6577
6578 /*
6579  * Local variables:
6580  * c-indentation-style: bsd
6581  * c-basic-offset: 4
6582  * indent-tabs-mode: t
6583  * End:
6584  *
6585  * ex: set ts=8 sts=4 sw=4 noet:
6586  */