src/5015004/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not be
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1       /* tainted information used? e.g. locale */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
     /* RF_tainted and RF_utf8 are single-bit values that this file ORs into
      * PL_reg_flags.  RF_warned is presumably used the same way; its use is
      * not visible in this part of the file. */
  88
     /* True when the RF_utf8 bit is currently set in PL_reg_flags. */
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
     /* Fallback definition so that STATIC can be used on file-local functions
      * when nothing included earlier has defined it. */
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  99  * call if there are no complications: i.e., if everything matchable is
 100  * straight forward in the bitmap
      *
      *   prog  passed through unchanged to reginclass()
      *   p     the node to test against; when ANYOF_FLAGS(p) is zero the
      *         answer comes from ANYOF_BITMAP_TEST alone
      *   c     pointer to the character to test; it is dereferenced for the
      *         bitmap test and passed as a pointer to reginclass()
      */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102                                               : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
     /* CHR_SVLEN(sv): sv_len_utf8(sv) when the variable utf8_target is true,
      * otherwise SvCUR(sv).  A variable named utf8_target must be in scope
      * wherever this is expanded.
      *
      * CHR_DIST(a,b): utf8_distance(a,b) when PL_reg_match_utf8 is set,
      * otherwise the plain pointer difference a - b.
      * NOTE(review): a and b are not parenthesized in the byte branch, so pass
      * only simple expressions. */
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 110
     /* HOPc(pos,off): position 'off' away from 'pos', returned as char*.
      * When PL_reg_match_utf8 is set the move is done by reghop3(), limited by
      * PL_regeol for off >= 0 and by PL_bostr for negative off; otherwise it
      * is plain pointer arithmetic with no limit check.
      * NOTE(review): pos and off are used unparenthesized. */
 111 #define HOPc(pos,off) \
 112         (char *)(PL_reg_match_utf8 \
 113             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 114             : (U8*)(pos + off))
     /* HOPBACKc(pos,off): position 'off' before 'pos', returned as char*.
      * utf8 case: result of reghopmaybe3(pos, -off, PL_bostr).
      * byte case: pos - off when that is not before PL_bostr, else NULL. */
 115 #define HOPBACKc(pos, off) \
 116         (char*)(PL_reg_match_utf8\
 117             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 118             : (pos - off >= PL_bostr)           \
 119                 ? (U8*)pos - off                \
 120                 : NULL)
 121
     /* HOP3(pos,off,lim): like HOPc but the caller supplies the limit passed to
      * reghop3(); 'lim' is not used at all in the byte case.  Yields U8*.
      * HOP3c is the same thing cast to char*. */
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defines */
 126 #ifdef EBCDIC
 127     /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
 128      * skip the check on EBCDIC platforms */
 129 #   define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
 130 #else
     /* If PL_utf8_<class> is still false, call is_utf8_<class>(str) between
      * ENTER/save_re_context() and LEAVE, then assert both that the call
      * returned true and that PL_utf8_<class> is now set.  Does nothing when
      * PL_utf8_<class> is already set. */
 131 #   define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 132     if (!CAT2(PL_utf8_,class)) { \
 133         bool ok; \
 134         ENTER; save_re_context(); \
 135         ok=CAT2(is_utf8_,class)((const U8*)str); \
 136         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 137 #endif
 138
 139 /* Doesn't do an assert to verify that is correct */
     /* Same shape as LOAD_UTF8_CHARCLASS, but is_utf8_<class>() is always called
      * on the string " " and its result is stored in a variable that is never
      * read; nothing is asserted afterwards. */
 140 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 141     if (!CAT2(PL_utf8_,class)) { \
 142         bool throw_away __attribute__unused__; \
 143         ENTER; save_re_context(); \
 144         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 145         LEAVE; } } STMT_END
 146
     /* Convenience wrappers: each pairs a class with a sample string that is
      * handed to the corresponding is_utf8_<class>() check. */
 147 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 148 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 149 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 150
     /* Loads all of the X_* classes listed below.  The expansion is a sequence
      * of statements rather than a single STMT_START/STMT_END block, and the
      * last one carries no trailing semicolon, so the invocation supplies it. */
 151 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 152         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 153         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 154         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 155             * assert should likely and hopefully fail on an EBCDIC machine */ \
 156         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 157                                                                             \
 158         /* No asserts are done for these, in case called on an early        \
 159             * Unicode version in which they map to nothing */               \
 160         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 161         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 162         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 163         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 164         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 165         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 166         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 167
 168 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
     /* PLACEHOLDER expands to nothing; it is passed as POS_OR_NEG below where
      * the test must not be complemented. */
 169
 170 /* The actual code for CCC_TRY, which uses several variables from the routine
 171  * it's callable from.  It is designed to be the bulk of a case statement.
 172  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 173  *      nextchr matches the class.
 174  * UTF8_TEST is the whole test string to use for utf8 targets
 175  * LOAD is what to use to test, and if not present to load in the swash for the
 176  *      class
 177  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 178  *      UTF8_TEST test.
 179  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 180  * utf8 and a variant, load the swash if necessary and test using the utf8
 181  * test.  Advance to the next character if test is ok, otherwise fail; If not
 182  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 183  * fails, or advance to the next character
      *
      * CLASS and STR are handed to LOAD_UTF8_CHARCLASS in the utf8 branch.
      * Names the expansion expects to find at the point of use: locinput,
      * nextchr, utf8_target, PL_regeol, PL_utf8skip and sayNO.  Every path
      * through the expansion ends in sayNO or 'break'. */
 184
 185 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 186     if (locinput >= PL_regeol) {                                              \
 187         sayNO;                                                                \
 188     }                                                                         \
 189     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 190         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 191         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 192             sayNO;                                                            \
 193         }                                                                     \
 194         locinput += PL_utf8skip[nextchr];                                     \
 195         nextchr = UCHARAT(locinput);                                          \
 196         break;                                                                \
 197     }                                                                         \
 198     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 199         sayNO;                                                                \
 200     }                                                                         \
 201     nextchr = UCHARAT(++locinput);                                            \
 202     break;
 203
 204 /* Handle the non-locale cases for a character class and its complement.  It
 205  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 206  * This is because that code fails when the test succeeds, so we want to have
 207  * the test fail so that the code succeeds.  The swash is stored in a
 208  * predictable PL_ place
      *
      * NAME is the case label for the class itself and NNAME the label for its
      * complement; both use the same swash_fetch() test on PL_utf8_<CLASS>. */
 209 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 210                            CLASS, STR)                                        \
 211     case NAME:                                                                \
 212         _CCC_TRY_CODE( !, FUNC,                                               \
 213                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 214                                             (U8*)locinput, TRUE)),            \
 215                           CLASS, STR)                                         \
 216     case NNAME:                                                               \
 217         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 218                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 219                                             (U8*)locinput, TRUE)),            \
 220                           CLASS, STR)                                         \
 221
 222 /* Generate the case statements for both locale and non-locale character
 223  * classes in regmatch for classes that don't have special unicode semantics.
 224  * Locales don't use an immediate swash, but an intermediary special locale
 225  * function that is called on the pointer to the current place in the input
 226  * string.  That function will resolve to needing the same swash.  One might
 227  * think that because we don't know what the locale will match, we shouldn't
 228  * check with the swash loading function that it loaded properly; ie, that we
 229  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 230  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 231  * irrelevant here
      *
      * Cases generated, in order:
      *   NAMEL / NNAMEL  set RF_tainted in PL_reg_flags, then test with LCFUNC
      *                   (non-utf8) or LCFUNC_utf8 (utf8 variant)
      *   NAMEA / NNAMEA  test nextchr with FUNCA only; NAMEA always advances by
      *                   one byte, NNAMEA advances by PL_utf8skip[nextchr] when
      *                   utf8_target is set and by one byte otherwise
      *   NAME  / NNAME   produced by _CCC_TRY_NONLOCALE */
 232 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 233                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 234                 NAMEA, NNAMEA, FUNCA,                                         \
 235                 CLASS, STR)                                                   \
 236     case NAMEL:                                                               \
 237         PL_reg_flags |= RF_tainted;                                           \
 238         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 239     case NNAMEL:                                                              \
 240         PL_reg_flags |= RF_tainted;                                           \
 241         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 242                        CLASS, STR)                                            \
 243     case NAMEA:                                                               \
 244         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 245             sayNO;                                                            \
 246         }                                                                     \
 247         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 248         nextchr = UCHARAT(++locinput);                                        \
 249         break;                                                                \
 250     case NNAMEA:                                                              \
 251         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 252             sayNO;                                                            \
 253         }                                                                     \
 254         if (utf8_target) {                                                    \
 255             locinput += PL_utf8skip[nextchr];                                 \
 256             nextchr = UCHARAT(locinput);                                      \
 257         }                                                                     \
 258         else {                                                                \
 259             nextchr = UCHARAT(++locinput);                                    \
 260         }                                                                     \
 261         break;                                                                \
 262     /* Generate the non-locale cases */                                       \
 263     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 264
 265 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 266  * statements to handle separate Unicode semantics nodes
      *
      * It expands to everything CCC_TRY produces, followed by a second
      * _CCC_TRY_NONLOCALE for the NAMEU/NNAMEU labels using FUNCU. */
 267 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 268                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 269                   NAMEU, NNAMEU, FUNCU,                                        \
 270                   NAMEA, NNAMEA, FUNCA,                                        \
 271                   CLASS, STR)                                                  \
 272     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 273             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 274             NAMEA, NNAMEA, FUNCA,                                              \
 275             CLASS, STR)                                                        \
 276     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 277
 278 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 279
 280 /* for use after a quantifier and before an EXACT-like node -- japhy */
 281 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 282  *
 283  * NOTE that *nothing* that affects backtracking should be in here, specifically
 284  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 285  * node that is in between two EXACT like nodes when ascertaining what the required
 286  * "follow" character is. This should probably be moved to regex compile time
 287  * although it may be done at run time because of the REF possibility - more
 288  * investigation required. -- demerphq
      *
      * The expansion reads a variable named cur_eval, which must be in scope at
      * the point of use.  A CLOSE node counts as jumpable unless cur_eval is set
      * and its u.eval.close_paren equals ARG(rn).  CURLY-kind nodes count only
      * when ARG1(rn) > 0.  rn is evaluated several times.
 289 */
 290 #define JUMPABLE(rn) (      \
 291     OP(rn) == OPEN ||       \
 292     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 293     OP(rn) == EVAL ||   \
 294     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 295     OP(rn) == PLUS || OP(rn) == MINMOD || \
 296     OP(rn) == KEEPS || \
 297     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 298 )
     /* True when the node's kind, looked up in PL_regkind, is EXACT. */
 299 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 300
     /* True for EXACT-kind nodes and for REF-kind nodes. */
 301 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 302
 303 #if 0
 304 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 305    we don't need this definition. */
 306 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 307 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 308 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 309
 310 #else
 311 /* ... so we use this as it's faster. */
     /* Each of these compares the exact opcode, not the PL_regkind class.
      * Note that IS_TEXTFU exists only in this branch and that the disabled
      * IS_TEXTF above also accepted EXACTFU and EXACTFA. */
 312 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 313 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 314 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 315 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 316
 317 #endif
 318
 319 /*
 320   Search for mandatory following text node; for lookahead, the text must
 321   follow but for lookbehind (rn->flags != 0) we skip to the next step.

       rn is updated in place, so it must be a modifiable node pointer.  While
       JUMPABLE(rn) holds, rn is advanced as follows:
         SUSPEND, or any CURLY-kind node : NEXTOPER(NEXTOPER(rn))
         PLUS                            : NEXTOPER(rn)
         IFMATCH with rn->flags == 0     : NEXTOPER(NEXTOPER(rn))
         IFMATCH with rn->flags != 0     : rn + ARG(rn)
         anything else                   : rn + NEXT_OFF(rn)
       Because it expands JUMPABLE, a variable named cur_eval must be in scope.
 322 */
 323 #define FIND_NEXT_IMPT(rn) STMT_START { \
 324     while (JUMPABLE(rn)) { \
 325         const OPCODE type = OP(rn); \
 326         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 327             rn = NEXTOPER(NEXTOPER(rn)); \
 328         else if (type == PLUS) \
 329             rn = NEXTOPER(rn); \
 330         else if (type == IFMATCH) \
 331             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 332         else rn += NEXT_OFF(rn); \
 333     } \
 334 } STMT_END
 335
 336
 337 static void restore_pos(pTHX_ void *arg);   /* forward declaration; the definition is not in this part of the file */
 338
     /* Element counts for the record that S_regcppush() writes:
      *   REGCP_PAREN_ELEMS  items pushed for each saved paren
      *                      (end, start, PL_reg_start_tmp entry, paren number)
      *   REGCP_OTHER_ELEMS  items pushed once per record (PL_regoffs,
      *                      PL_regsize, *PL_reglastparen,
      *                      *PL_reglastcloseparen, PL_reginput)
      *   REGCP_FRAME_ELEMS  the single trailing cookie word */
 339 #define REGCP_PAREN_ELEMS 4
 340 #define REGCP_OTHER_ELEMS 5
 341 #define REGCP_FRAME_ELEMS 1
 342 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 343  * are needed for the regexp context stack bookkeeping. */
 344
     /* S_regcppush - save the current capture-group state on the savestack.
      *
      * parenfloor: parens numbered parenfloor+1 .. PL_regsize are saved; those
      *             at or below parenfloor are not.
      *
      * Pushes, in this order: four items per saved paren (highest paren
      * first), then PL_regoffs, PL_regsize, *PL_reglastparen,
      * *PL_reglastcloseparen and PL_reginput, and finally one UV that combines
      * SAVEt_REGCONTEXT with the element count shifted left by SAVE_TIGHT_SHIFT.
      * S_regcppop() reads the items back in the opposite order.
      *
      * Returns the value PL_savestack_ix had on entry.
      * Croaks if the paren element count is negative or if the shifted element
      * count does not survive a round trip through the shift. */
 345 STATIC CHECKPOINT
 346 S_regcppush(pTHX_ I32 parenfloor)
 347 {
 348     dVAR;
 349     const int retval = PL_savestack_ix;
 350     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 351     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 352     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 353     int p;
 354     GET_RE_DEBUG_FLAGS_DECL;
 355
         /* Negative means parenfloor is above PL_regsize. */
 356     if (paren_elems_to_push < 0)
 357         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 358
         /* Shifting back must give the original count; otherwise high bits
          * were lost and the cookie could not encode the record size. */
 359     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 360         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 361                    " out of range (%lu-%ld)",
 362                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 363
         /* Presumably reserves room for the whole record plus the cookie
          * before any SSPUSH* below -- confirm against SSGROW's definition. */
 364     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 365
 366     for (p = PL_regsize; p > parenfloor; p--) {
 367 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 368         SSPUSHINT(PL_regoffs[p].end);
 369         SSPUSHINT(PL_regoffs[p].start);
 370         SSPUSHPTR(PL_reg_start_tmp[p]);
 371         SSPUSHINT(p);
 372         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 373           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 374                       (UV)p, (IV)PL_regoffs[p].start,
 375                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 376                       (IV)PL_regoffs[p].end
 377         ));
 378     }
 379 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 380     SSPUSHPTR(PL_regoffs);
 381     SSPUSHINT(PL_regsize);
 382     SSPUSHINT(*PL_reglastparen);
 383     SSPUSHINT(*PL_reglastcloseparen);
 384     SSPUSHPTR(PL_reginput);
 385     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 386
 387     return retval;
 388 }
 389
 390 /* These are needed since we do not localize EVAL nodes: */
     /* REGCP_SET(cp): after an optional debug message, stores the current
      * PL_savestack_ix in cp.  The expansion has no trailing semicolon; the
      * invocation supplies it.
      * NOTE(review): the expansion is not wrapped in STMT_START/STMT_END, so if
      * DEBUG_STATE_r() expands to a statement this is two statements -- avoid
      * using it as the unbraced body of an if/else. */
 391 #define REGCP_SET(cp)                                           \
 392     DEBUG_STATE_r(                                              \
 393             PerlIO_printf(Perl_debug_log,                       \
 394                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 395                 (IV)PL_savestack_ix));                          \
 396     cp = PL_savestack_ix
 397
     /* REGCP_UNWIND(cp): after an optional debug message (printed only when cp
      * differs from PL_savestack_ix), calls regcpblow(cp).  The same caveat
      * about unbraced if/else bodies applies. */
 398 #define REGCP_UNWIND(cp)                                        \
 399     DEBUG_STATE_r(                                              \
 400         if (cp != PL_savestack_ix)                              \
 401             PerlIO_printf(Perl_debug_log,                       \
 402                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 403                 (IV)(cp), (IV)PL_savestack_ix));                \
 404     regcpblow(cp)
 405
     /* S_regcppop - restore the capture-group state saved by S_regcppush().
      *
      * rex: the regexp being run; only rex->nparens is read here.
      *
      * Pops one record in the reverse of the push order: the cookie UV, the
      * saved input pointer, *PL_reglastcloseparen, *PL_reglastparen,
      * PL_regsize and PL_regoffs, then one four-item group per saved paren.
      * Afterwards every paren above *PL_reglastparen, up to rex->nparens, has
      * its end offset set to -1.
      *
      * Returns the popped input pointer, i.e. the PL_reginput value that
      * S_regcppush() stored. */
 406 STATIC char *
 407 S_regcppop(pTHX_ const regexp *rex)
 408 {
 409     dVAR;
 410     UV i;
 411     char *input;
 412     GET_RE_DEBUG_FLAGS_DECL;
 413
 414     PERL_ARGS_ASSERT_REGCPPOP;
 415
 416     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 417     i = SSPOPUV;
 418     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 419     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 420     input = (char *) SSPOPPTR;
 421     *PL_reglastcloseparen = SSPOPINT;
 422     *PL_reglastparen = SSPOPINT;
 423     PL_regsize = SSPOPINT;
 424     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 425
         /* i held the total element count; what remains after this is the
          * number of per-paren items still on the stack. */
 426     i -= REGCP_OTHER_ELEMS;
 427     /* Now restore the parentheses context. */
 428     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 429         I32 tmps;
 430         U32 paren = (U32)SSPOPINT;
 431         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 432         PL_regoffs[paren].start = SSPOPINT;
 433         tmps = SSPOPINT;
             /* The saved end offset is popped unconditionally but stored only
              * for parens at or below the just-restored *PL_reglastparen. */
 434         if (paren <= *PL_reglastparen)
 435             PL_regoffs[paren].end = tmps;
 436         DEBUG_BUFFERS_r(
 437             PerlIO_printf(Perl_debug_log,
 438                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 439                           (UV)paren, (IV)PL_regoffs[paren].start,
 440                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 441                           (IV)PL_regoffs[paren].end,
 442                           (paren > *PL_reglastparen ? "(no)" : ""));
 443         );
 444     }
 445     DEBUG_BUFFERS_r(
 446         if (*PL_reglastparen + 1 <= rex->nparens) {
 447             PerlIO_printf(Perl_debug_log,
 448                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 449                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 450         }
 451     );
 452 #if 1
 453     /* It would seem that the similar code in regtry()
 454      * already takes care of this, and in fact it is in
 455      * a better location, since this code can be #if 0-ed out
 456      * but the code in regtry() is needed or otherwise tests
 457      * requiring null fields (pat.t#187 and split.t#{13,14}
 458      * (as of patchlevel 7877)  will fail.  Then again,
 459      * this code seems to be necessary or otherwise
 460      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 461      * --jhi updated by dapm */
         /* The end offset is always reset; the start offset is reset only for
          * indexes above PL_regsize. */
 462     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 463         if (i > PL_regsize)
 464             PL_regoffs[i].start = -1;
 465         PL_regoffs[i].end = -1;
 466     }
 467 #endif
 468     return input;
 469 }
 470
 471 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Expands to LEAVE_SCOPE(cp); ignores regcppush()ed data. */
 472
 473 /*
 474  * pregexec and friends
 475  */
 476
 477 #ifndef PERL_IN_XSUB_RE
 478 /*
 479  - pregexec - match a regexp against a string
      -
      - Thin wrapper: every argument except nosave is forwarded unchanged to
      - regexec_flags(), NULL is passed as that function's seventh argument,
      - and the flags argument is 0 when nosave is non-zero and REXEC_COPY_STR
      - otherwise.  The return value is whatever regexec_flags() returns.
 480  */
 481 I32
 482 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 483          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 484 /* strend: pointer to null at end of string */
 485 /* strbeg: real beginning of string */
 486 /* minend: end of match must be >=minend after stringarg. */
 487 /* nosave: For optimizations. */
 488 {
 489     PERL_ARGS_ASSERT_PREGEXEC;
 490
         /* nosave selects between no flags and REXEC_COPY_STR. */
 491     return
 492         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 493                       nosave ? 0 : REXEC_COPY_STR);
 494 }
 495 #endif
 496
 497 /*
 498  * Need to implement the following flags for reg_anch:
 499  *
 500  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 501  * USE_INTUIT_ML
 502  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 503  * INTUIT_AUTORITATIVE_ML
 504  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 505  * INTUIT_ONCE_ML
 506  *
 507  * Another flag for this function: SECOND_TIME (so that float substrs
 508  * with giant delta may be not rechecked).
 509  */
 510
 511 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 512
 513 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 514    Otherwise, only SvCUR(sv) is used to get strbeg. */
 515
 516 /* XXXX We assume that strpos is strbeg unless sv. */
 517
 518 /* XXXX Some places assume that there is a fixed substring.
 519         An update may be needed if optimizer marks as "INTUITable"
 520         RExen without fixed substrings.  Similarly, it is assumed that
 521         lengths of all the strings are no more than minlen, thus they
 522         cannot come from lookahead.
 523         (Or minlen should take into account lookahead.)
 524   NOTE: Some of this comment is not correct. minlen does now take account
 525   of lookahead/behind. Further research is required. -- demerphq
 526
 527 */
 528
 529 /* A failure to find a constant substring means that there is no need to make
 530    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 531    finding a substring too deep into the string means that fewer calls to
 532    regtry() should be needed.
 533
 534    REx compiler's optimizer found 4 possible hints:
 535         a) Anchored substring;
 536         b) Fixed substring;
 537         c) Whether we are anchored (beginning-of-line or \G);
 538         d) First node (of those at offset 0) which may distinguish positions;
 539    We use a)b)d) and multiline-part of c), and try to find a position in the
 540    string which does not contradict any of them.
 541  */
 542
 543 /* Most of the decisions we make here should have been made at compile time.
 544    The nodes of the REx which we used for the search should have been
 545    deleted from the finite automaton. */
 546
 547 char *
 548 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 549                      char *strend, const U32 flags, re_scream_pos_data *data)
 550 {
 551     dVAR;
 552     struct regexp *const prog = (struct regexp *)SvANY(rx);
 553     register I32 start_shift = 0;
 554     /* Should be nonnegative! */
 555     register I32 end_shift   = 0;
 556     register char *s;
 557     register SV *check;
 558     char *strbeg;
 559     char *t;
 560     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 561     I32 ml_anch;
 562     register char *other_last = NULL;   /* other substr checked before this */
 563     char *check_at = NULL;              /* check substr found at this pos */
 564     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 565     RXi_GET_DECL(prog,progi);
 566 #ifdef DEBUGGING
 567     const char * const i_strpos = strpos;
 568 #endif
 569     GET_RE_DEBUG_FLAGS_DECL;
 570
 571     PERL_ARGS_ASSERT_RE_INTUIT_START;
 572
 573     RX_MATCH_UTF8_set(rx,utf8_target);
 574
 575     if (RX_UTF8(rx)) {
 576         PL_reg_flags |= RF_utf8;
 577     }
 578     DEBUG_EXECUTE_r(
 579         debug_start_match(rx, utf8_target, strpos, strend,
 580             sv ? "Guessing start of match in sv for"
 581                : "Guessing start of match in string for");
 582               );
 583
 584     /* CHR_DIST() would be more correct here but it makes things slow. */
 585     if (prog->minlen > strend - strpos) {
 586         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 587                               "String too short... [re_intuit_start]\n"));
 588         goto fail;
 589     }
 590
 591     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 592     PL_regeol = strend;
 593     if (utf8_target) {
 594         if (!prog->check_utf8 && prog->check_substr)
 595             to_utf8_substr(prog);
 596         check = prog->check_utf8;
 597     } else {
 598         if (!prog->check_substr && prog->check_utf8)
 599             to_byte_substr(prog);
 600         check = prog->check_substr;
 601     }
 602     if (check == &PL_sv_undef) {
 603         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 604                 "Non-utf8 string cannot match utf8 check string\n"));
 605         goto fail;
 606     }
 607     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 608         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 609                      || ( (prog->extflags & RXf_ANCH_BOL)
 610                           && !multiline ) );    /* Check after \n? */
 611
 612         if (!ml_anch) {
 613           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 614                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 615                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 616                && sv && !SvROK(sv)
 617                && (strpos != strbeg)) {
 618               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 619               goto fail;
 620           }
 621           if (prog->check_offset_min == prog->check_offset_max &&
 622               !(prog->extflags & RXf_CANY_SEEN)) {
 623             /* Substring at constant offset from beg-of-str... */
 624             I32 slen;
 625
 626             s = HOP3c(strpos, prog->check_offset_min, strend);
 627
 628             if (SvTAIL(check)) {
 629                 slen = SvCUR(check);    /* >= 1 */
 630
 631                 if ( strend - s > slen || strend - s < slen - 1
 632                      || (strend - s == slen && strend[-1] != '\n')) {
 633                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 634                     goto fail_finish;
 635                 }
 636                 /* Now should match s[0..slen-2] */
 637                 slen--;
 638                 if (slen && (*SvPVX_const(check) != *s
 639                              || (slen > 1
 640                                  && memNE(SvPVX_const(check), s, slen)))) {
 641                   report_neq:
 642                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 643                     goto fail_finish;
 644                 }
 645             }
 646             else if (*SvPVX_const(check) != *s
 647                      || ((slen = SvCUR(check)) > 1
 648                          && memNE(SvPVX_const(check), s, slen)))
 649                 goto report_neq;
 650             check_at = s;
 651             goto success_at_start;
 652           }
 653         }
 654         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 655         s = strpos;
 656         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 657         end_shift = prog->check_end_shift;
 658
 659         if (!ml_anch) {
 660             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 661                                          - (SvTAIL(check) != 0);
 662             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 663
 664             if (end_shift < eshift)
 665                 end_shift = eshift;
 666         }
 667     }
 668     else {                              /* Can match at random position */
 669         ml_anch = 0;
 670         s = strpos;
 671         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 672         end_shift = prog->check_end_shift;
 673
 674         /* end shift should be non negative here */
 675     }
 676
 677 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 678     if (end_shift < 0)
 679         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 680                    (IV)end_shift, RX_PRECOMP(prog));
 681 #endif
 682
 683   restart:
 684     /* Find a possible match in the region s..strend by looking for
 685        the "check" substring in the region corrected by start/end_shift. */
 686
 687     {
 688         I32 srch_start_shift = start_shift;
 689         I32 srch_end_shift = end_shift;
 690         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 691             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 692             srch_start_shift = strbeg - s;
 693         }
 694     DEBUG_OPTIMISE_MORE_r({
 695         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 696             (IV)prog->check_offset_min,
 697             (IV)srch_start_shift,
 698             (IV)srch_end_shift,
 699             (IV)prog->check_end_shift);
 700     });
 701
 702     if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
 703         I32 p = -1;                     /* Internal iterator of scream. */
 704         I32 * const pp = data ? data->scream_pos : &p;
 705         const MAGIC *mg;
 706         bool found = FALSE;
 707
 708         assert(SvMAGICAL(sv));
 709         mg = mg_find(sv, PERL_MAGIC_study);
 710         assert(mg);
 711
 712         if (mg->mg_private == 1) {
 713             found = ((U8 *)mg->mg_ptr)[BmRARE(check)] != (U8)~0;
 714         } else if (mg->mg_private == 2) {
 715             found = ((U16 *)mg->mg_ptr)[BmRARE(check)] != (U16)~0;
 716         } else {
 717             assert (mg->mg_private == 4);
 718             found = ((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0;
 719         }
 720
 721         if (found
 722             || ( BmRARE(check) == '\n'
 723                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 724                  && SvTAIL(check) ))
 725             s = screaminstr(sv, check,
 726                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 727         else
 728             goto fail_finish;
 729         /* we may be pointing at the wrong string */
 730         if (s && RXp_MATCH_COPIED(prog))
 731             s = strbeg + (s - SvPVX_const(sv));
 732         if (data)
 733             *data->scream_olds = s;
 734     }
 735     else {
 736         U8* start_point;
 737         U8* end_point;
 738         if (prog->extflags & RXf_CANY_SEEN) {
 739             start_point= (U8*)(s + srch_start_shift);
 740             end_point= (U8*)(strend - srch_end_shift);
 741         } else {
 742             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 743             end_point= HOP3(strend, -srch_end_shift, strbeg);
 744         }
 745         DEBUG_OPTIMISE_MORE_r({
 746             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 747                 (int)(end_point - start_point),
 748                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 749                 start_point);
 750         });
 751
 752         s = fbm_instr( start_point, end_point,
 753                       check, multiline ? FBMrf_MULTILINE : 0);
 754     }
 755     }
 756     /* Update the count-of-usability, remove useless subpatterns,
 757         unshift s.  */
 758
 759     DEBUG_EXECUTE_r({
 760         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 761             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 762         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 763                           (s ? "Found" : "Did not find"),
 764             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 765                 ? "anchored" : "floating"),
 766             quoted,
 767             RE_SV_TAIL(check),
 768             (s ? " at offset " : "...\n") );
 769     });
 770
 771     if (!s)
 772         goto fail_finish;
 773     /* Finish the diagnostic message */
 774     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 775
 776     /* XXX dmq: first branch is for positive lookbehind...
 777        Our check string is offset from the beginning of the pattern.
 778        So we need to do any stclass tests offset forward from that
 779        point. I think. :-(
 780      */
 781
 782
 783
 784     check_at=s;
 785
 786
 787     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 788        Start with the other substr.
 789        XXXX no SCREAM optimization yet - and a very coarse implementation
 790        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 791                 *always* match.  Probably should be marked during compile...
 792        Probably it is right to do no SCREAM here...
 793      */
 794
 795     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 796                 : (prog->float_substr && prog->anchored_substr))
 797     {
 798         /* Take into account the "other" substring. */
 799         /* XXXX May be hopelessly wrong for UTF... */
 800         if (!other_last)
 801             other_last = strpos;
 802         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 803           do_other_anchored:
 804             {
 805                 char * const last = HOP3c(s, -start_shift, strbeg);
 806                 char *last1, *last2;
 807                 char * const saved_s = s;
 808                 SV* must;
 809
 810                 t = s - prog->check_offset_max;
 811                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 812                     && (!utf8_target
 813                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 814                             && t > strpos)))
 815                     NOOP;
 816                 else
 817                     t = strpos;
 818                 t = HOP3c(t, prog->anchored_offset, strend);
 819                 if (t < other_last)     /* These positions already checked */
 820                     t = other_last;
 821                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 822                 if (last < last1)
 823                     last1 = last;
 824                 /* XXXX It is not documented what units *_offsets are in.
 825                    We assume bytes, but this is clearly wrong.
 826                    Meaning this code needs to be carefully reviewed for errors.
 827                    dmq.
 828                   */
 829
 830                 /* On end-of-str: see comment below. */
 831                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 832                 if (must == &PL_sv_undef) {
 833                     s = (char*)NULL;
 834                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 835                 }
 836                 else
 837                     s = fbm_instr(
 838                         (unsigned char*)t,
 839                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 840                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 841                         must,
 842                         multiline ? FBMrf_MULTILINE : 0
 843                     );
 844                 DEBUG_EXECUTE_r({
 845                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 846                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 847                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 848                         (s ? "Found" : "Contradicts"),
 849                         quoted, RE_SV_TAIL(must));
 850                 });
 851
 852
 853                 if (!s) {
 854                     if (last1 >= last2) {
 855                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 856                                                 ", giving up...\n"));
 857                         goto fail_finish;
 858                     }
 859                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 860                         ", trying floating at offset %ld...\n",
 861                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 862                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 863                     s = HOP3c(last, 1, strend);
 864                     goto restart;
 865                 }
 866                 else {
 867                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 868                           (long)(s - i_strpos)));
 869                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 870                     other_last = HOP3c(s, 1, strend);
 871                     s = saved_s;
 872                     if (t == strpos)
 873                         goto try_at_start;
 874                     goto try_at_offset;
 875                 }
 876             }
 877         }
 878         else {          /* Take into account the floating substring. */
 879             char *last, *last1;
 880             char * const saved_s = s;
 881             SV* must;
 882
 883             t = HOP3c(s, -start_shift, strbeg);
 884             last1 = last =
 885                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 886             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 887                 last = HOP3c(t, prog->float_max_offset, strend);
 888             s = HOP3c(t, prog->float_min_offset, strend);
 889             if (s < other_last)
 890                 s = other_last;
 891  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 892             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 893             /* fbm_instr() takes into account exact value of end-of-str
 894                if the check is SvTAIL(ed).  Since false positives are OK,
 895                and end-of-str is not later than strend we are OK. */
 896             if (must == &PL_sv_undef) {
 897                 s = (char*)NULL;
 898                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 899             }
 900             else
 901                 s = fbm_instr((unsigned char*)s,
 902                               (unsigned char*)last + SvCUR(must)
 903                                   - (SvTAIL(must)!=0),
 904                               must, multiline ? FBMrf_MULTILINE : 0);
 905             DEBUG_EXECUTE_r({
 906                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 907                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 908                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 909                     (s ? "Found" : "Contradicts"),
 910                     quoted, RE_SV_TAIL(must));
 911             });
 912             if (!s) {
 913                 if (last1 == last) {
 914                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 915                                             ", giving up...\n"));
 916                     goto fail_finish;
 917                 }
 918                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 919                     ", trying anchored starting at offset %ld...\n",
 920                     (long)(saved_s + 1 - i_strpos)));
 921                 other_last = last;
 922                 s = HOP3c(t, 1, strend);
 923                 goto restart;
 924             }
 925             else {
 926                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 927                       (long)(s - i_strpos)));
 928                 other_last = s; /* Fix this later. --Hugo */
 929                 s = saved_s;
 930                 if (t == strpos)
 931                     goto try_at_start;
 932                 goto try_at_offset;
 933             }
 934         }
 935     }
 936
 937
 938     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 939
 940     DEBUG_OPTIMISE_MORE_r(
 941         PerlIO_printf(Perl_debug_log,
 942             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 943             (IV)prog->check_offset_min,
 944             (IV)prog->check_offset_max,
 945             (IV)(s-strpos),
 946             (IV)(t-strpos),
 947             (IV)(t-s),
 948             (IV)(strend-strpos)
 949         )
 950     );
 951
 952     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 953         && (!utf8_target
 954             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 955                  && t > strpos)))
 956     {
 957         /* Fixed substring is found far enough so that the match
 958            cannot start at strpos. */
 959       try_at_offset:
 960         if (ml_anch && t[-1] != '\n') {
 961             /* Eventually fbm_*() should handle this, but often
 962                anchored_offset is not 0, so this check will not be wasted. */
 963             /* XXXX In the code below we prefer to look for "^" even in
 964                presence of anchored substrings.  And we search even
 965                beyond the found float position.  These pessimizations
 966                are historical artefacts only.  */
 967           find_anchor:
 968             while (t < strend - prog->minlen) {
 969                 if (*t == '\n') {
 970                     if (t < check_at - prog->check_offset_min) {
 971                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 972                             /* Since we moved from the found position,
 973                                we definitely contradict the found anchored
 974                                substr.  Due to the above check we do not
 975                                contradict "check" substr.
 976                                Thus we can arrive here only if check substr
 977                                is float.  Redo checking for "other"=="fixed".
 978                              */
 979                             strpos = t + 1;
 980                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 981                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 982                             goto do_other_anchored;
 983                         }
 984                         /* We don't contradict the found floating substring. */
 985                         /* XXXX Why not check for STCLASS? */
 986                         s = t + 1;
 987                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 988                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 989                         goto set_useful;
 990                     }
 991                     /* Position contradicts check-string */
 992                     /* XXXX probably better to look for check-string
 993                        than for "\n", so one should lower the limit for t? */
 994                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 995                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 996                     other_last = strpos = s = t + 1;
 997                     goto restart;
 998                 }
 999                 t++;
1000             }
1001             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1002                         PL_colors[0], PL_colors[1]));
1003             goto fail_finish;
1004         }
1005         else {
1006             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1007                         PL_colors[0], PL_colors[1]));
1008         }
1009         s = t;
1010       set_useful:
1011         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1012     }
1013     else {
1014         /* The found string does not prohibit matching at strpos,
1015            - no optimization of calling REx engine can be performed,
1016            unless it was an MBOL and we are not after MBOL,
1017            or a future STCLASS check will fail this. */
1018       try_at_start:
1019         /* Even in this situation we may use MBOL flag if strpos is offset
1020            wrt the start of the string. */
1021         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1022             && (strpos != strbeg) && strpos[-1] != '\n'
1023             /* May be due to an implicit anchor of m{.*foo}  */
1024             && !(prog->intflags & PREGf_IMPLICIT))
1025         {
1026             t = strpos;
1027             goto find_anchor;
1028         }
1029         DEBUG_EXECUTE_r( if (ml_anch)
1030             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1031                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1032         );
1033       success_at_start:
1034         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1035             && (utf8_target ? (
1036                 prog->check_utf8                /* Could be deleted already */
1037                 && --BmUSEFUL(prog->check_utf8) < 0
1038                 && (prog->check_utf8 == prog->float_utf8)
1039             ) : (
1040                 prog->check_substr              /* Could be deleted already */
1041                 && --BmUSEFUL(prog->check_substr) < 0
1042                 && (prog->check_substr == prog->float_substr)
1043             )))
1044         {
1045             /* If flags & SOMETHING - do not do it many times on the same match */
1046             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1047             /* XXX Does the destruction order has to change with utf8_target? */
1048             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1049             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1050             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1051             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1052             check = NULL;                       /* abort */
1053             s = strpos;
1054             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1055                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1056             if (prog->intflags & PREGf_IMPLICIT)
1057                 prog->extflags &= ~RXf_ANCH_MBOL;
1058             /* XXXX This is a remnant of the old implementation.  It
1059                     looks wasteful, since now INTUIT can use many
1060                     other heuristics. */
1061             prog->extflags &= ~RXf_USE_INTUIT;
1062             /* XXXX What other flags might need to be cleared in this branch? */
1063         }
1064         else
1065             s = strpos;
1066     }
1067
1068     /* Last resort... */
1069     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1070     /* trie stclasses are too expensive to use here, we are better off to
1071        leave it to regmatch itself */
1072     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1073         /* minlen == 0 is possible if regstclass is \b or \B,
1074            and the fixed substr is ''$.
1075            Since minlen is already taken into account, s+1 is before strend;
1076            accidentally, minlen >= 1 guaranties no false positives at s + 1
1077            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1078            regstclass does not come from lookahead...  */
1079         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1080            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1081         const U8* const str = (U8*)STRING(progi->regstclass);
1082         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1083                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1084                     : 1);
1085         char * endpos;
1086         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1087             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1088         else if (prog->float_substr || prog->float_utf8)
1089             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1090         else
1091             endpos= strend;
1092
1093         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1094                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1095
1096         t = s;
1097         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1098         if (!s) {
1099 #ifdef DEBUGGING
1100             const char *what = NULL;
1101 #endif
1102             if (endpos == strend) {
1103                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1104                                 "Could not match STCLASS...\n") );
1105                 goto fail;
1106             }
1107             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1108                                    "This position contradicts STCLASS...\n") );
1109             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1110                 goto fail;
1111             /* Contradict one of substrings */
1112             if (prog->anchored_substr || prog->anchored_utf8) {
1113                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1114                     DEBUG_EXECUTE_r( what = "anchored" );
1115                   hop_and_restart:
1116                     s = HOP3c(t, 1, strend);
1117                     if (s + start_shift + end_shift > strend) {
1118                         /* XXXX Should be taken into account earlier? */
1119                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1120                                                "Could not match STCLASS...\n") );
1121                         goto fail;
1122                     }
1123                     if (!check)
1124                         goto giveup;
1125                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1126                                 "Looking for %s substr starting at offset %ld...\n",
1127                                  what, (long)(s + start_shift - i_strpos)) );
1128                     goto restart;
1129                 }
1130                 /* Have both, check_string is floating */
1131                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1132                     goto retry_floating_check;
1133                 /* Recheck anchored substring, but not floating... */
1134                 s = check_at;
1135                 if (!check)
1136                     goto giveup;
1137                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1138                           "Looking for anchored substr starting at offset %ld...\n",
1139                           (long)(other_last - i_strpos)) );
1140                 goto do_other_anchored;
1141             }
1142             /* Another way we could have checked stclass at the
1143                current position only: */
1144             if (ml_anch) {
1145                 s = t = t + 1;
1146                 if (!check)
1147                     goto giveup;
1148                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1149                           "Looking for /%s^%s/m starting at offset %ld...\n",
1150                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1151                 goto try_at_offset;
1152             }
1153             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1154                 goto fail;
1155             /* Check is floating substring. */
1156           retry_floating_check:
1157             t = check_at - start_shift;
1158             DEBUG_EXECUTE_r( what = "floating" );
1159             goto hop_and_restart;
1160         }
1161         if (t != s) {
1162             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1163                         "By STCLASS: moving %ld --> %ld\n",
1164                                   (long)(t - i_strpos), (long)(s - i_strpos))
1165                    );
1166         }
1167         else {
1168             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1169                                   "Does not contradict STCLASS...\n");
1170                    );
1171         }
1172     }
1173   giveup:
1174     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1175                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1176                           PL_colors[5], (long)(s - i_strpos)) );
1177     return s;
1178
1179   fail_finish:                          /* Substring not found */
1180     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1181         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1182   fail:
1183     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1184                           PL_colors[4], PL_colors[5]));
1185     return NULL;
1186 }
1187
/* Declare a constant `trie_type' that records how input characters have to
 * be decoded while walking a trie.  The value is chosen from the node's
 * flags, `utf8_target' and UTF_PATTERN, all of which must be visible at the
 * point of expansion:
 *
 *   flags != EXACT:  utf8_target        -> trie_utf8_fold
 *                    else UTF_PATTERN   -> trie_latin_utf8_fold
 *                    else               -> trie_plain
 *   flags == EXACT:  utf8_target        -> trie_utf8
 *                    else               -> trie_plain
 *
 * The expansion is a declaration without its trailing semicolon, so it must
 * be used where a declaration is permitted.  `scan' is parenthesized so that
 * any pointer-valued expression (e.g. `&node') may be passed.
 */
#define DECL_TRIE_TYPE(scan) \
    const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
                    trie_type = ((scan)->flags != EXACT) \
                              ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
                              : (utf8_target ? trie_utf8 : trie_plain)
1193
/* Fetch the next input character for a trie walk and translate it into the
 * trie's character id.
 *
 * Decoding depends on `trie_type':
 *   trie_utf8_fold / trie_latin_utf8_fold
 *       While `foldlen' is positive the character is taken from `uscan'
 *       (`uscan' is advanced, `foldlen' reduced, and `len' set to 0).
 *       Otherwise the character at `uc' is read -- decoded with
 *       utf8n_to_uvuni() for trie_utf8_fold, or taken as a single byte with
 *       len = 1 for trie_latin_utf8_fold -- and passed through
 *       to_uni_fold() into `foldbuf'; `foldlen' and `uscan' are then
 *       adjusted by UNISKIP(uvc).
 *   trie_utf8   decode one character at `uc' with utf8n_to_uvuni().
 *   trie_plain  take the single byte at `uc'; `len' is set to 1.
 *
 * Afterwards `charid' is computed: values of `uvc' below 256 index
 * trie->charmap directly; larger values are looked up in `widecharmap'
 * (keyed by the raw bytes of the UV) and yield 0 when the map is absent or
 * has no entry.
 *
 * `uscan', `len', `uvc', `charid' and `foldlen' are assigned to and must be
 * lvalues.  `uc', `trie' and `foldbuf' are parenthesized in the expansion so
 * that expressions such as `p + 1' can be passed safely.
 */
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
    switch (trie_type) {                                                    \
    case trie_utf8_fold:                                                    \
        if ( foldlen>0 ) {                                                  \
            uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
            foldlen -= len;                                                 \
            uscan += len;                                                   \
            len=0;                                                          \
        } else {                                                            \
            uvc = utf8n_to_uvuni( (U8*)(uc), UTF8_MAXLEN, &len, uniflags ); \
            uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
            foldlen -= UNISKIP( uvc );                                      \
            uscan = (foldbuf) + UNISKIP( uvc );                             \
        }                                                                   \
        break;                                                              \
    case trie_latin_utf8_fold:                                              \
        if ( foldlen>0 ) {                                                  \
            uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
            foldlen -= len;                                                 \
            uscan += len;                                                   \
            len=0;                                                          \
        } else {                                                            \
            len = 1;                                                        \
            uvc = to_uni_fold( *(U8*)(uc), foldbuf, &foldlen );             \
            foldlen -= UNISKIP( uvc );                                      \
            uscan = (foldbuf) + UNISKIP( uvc );                             \
        }                                                                   \
        break;                                                              \
    case trie_utf8:                                                         \
        uvc = utf8n_to_uvuni( (U8*)(uc), UTF8_MAXLEN, &len, uniflags );     \
        break;                                                              \
    case trie_plain:                                                        \
        uvc = (UV)*(uc);                                                    \
        len = 1;                                                            \
    }                                                                       \
    if (uvc < 256) {                                                        \
        charid = (trie)->charmap[ uvc ];                                    \
    }                                                                       \
    else {                                                                  \
        charid = 0;                                                         \
        if (widecharmap) {                                                  \
            SV** const svpp = hv_fetch(widecharmap,                         \
                        (char*)&uvc, sizeof(UV), 0);                        \
            if (svpp)                                                       \
                charid = (U16)SvIV(*svpp);                                  \
        }                                                                   \
    }                                                                       \
} STMT_END
1243
/* Step `s' forward one byte at a time, up to and including `e', looking for
 * a position that passes three tests in order:
 *   1. the caller-supplied quick test CoNd,
 *   2. either ln == 1, or folder(s, pat_string, ln) is true,
 *   3. either no `reginfo' was given, or regtry(reginfo, &s) succeeds.
 * The first position passing all three jumps to the `got_it' label; if none
 * does, the loop ends with s just past e.
 * Relies on s, e, ln, folder, pat_string, reginfo and got_it being defined in
 * the expanding function.
 */
#define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
STMT_START {                                              \
    for (; s <= e; s++) {                                 \
        if (!(CoNd))                                      \
            continue;                                     \
        if (ln != 1 && !folder(s, pat_string, ln))        \
            continue;                                     \
        if (!reginfo || regtry(reginfo, &s))              \
            goto got_it;                                  \
    }                                                     \
} STMT_END
1254
/* Run CoDe once for every position of `s' at which a whole character still
 * fits before `strend'.  On each pass `uskip' receives UTF8SKIP(s); the loop
 * stops as soon as s + uskip would go past strend, otherwise CoDe is executed
 * and s is advanced by uskip.
 * Relies on s, strend and uskip being defined in the expanding function.
 */
#define REXEC_FBC_UTF8_SCAN(CoDe)                     \
STMT_START {                                          \
    for (;;) {                                        \
        uskip = UTF8SKIP(s);                          \
        if (s + uskip > strend)                       \
            break;                                    \
        { CoDe }                                      \
        s += uskip;                                   \
    }                                                 \
} STMT_END
1262
/* REXEC_FBC_SCAN(CoDe): run the statement(s) CoDe at every byte position
 * from 's' up to, but not including, 'strend', adding one to 's' after each
 * iteration.  's' and 'strend' come from the expanding scope.  CoDe is
 * pasted in as-is, so it must supply its own terminating ';' or braces, and
 * it may leave the loop with a goto. */
1263 #define REXEC_FBC_SCAN(CoDe)                          \
1264 STMT_START {                                          \
1265     while (s < strend) {                              \
1266         CoDe                                          \
1267         s++;                                          \
1268     }                                                 \
1269 } STMT_END
1270
/* REXEC_FBC_UTF8_CLASS_SCAN(CoNd): walk a UTF-8 target one character at a
 * time and attempt a match at characters for which CoNd is true.
 *
 * 'tmp' (from the expanding scope) gates the attempt.  It is set to 1
 * whenever the current character fails CoNd, and to 'doevery' whenever a
 * character satisfies CoNd but the jump to 'got_it' is not taken.  Hence,
 * when 'doevery' is 0, only the first character of each run of characters
 * satisfying CoNd is tried; when it is non-zero, every such character is
 * tried.  A NULL 'reginfo' counts as success without calling regtry().
 *
 * The inner 'else' pairs with 'if (tmp && ...)', not with 'if (CoNd)'. */
1271 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1272 REXEC_FBC_UTF8_SCAN(                                  \
1273     if (CoNd) {                                       \
1274         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1275             goto got_it;                              \
1276         else                                          \
1277             tmp = doevery;                            \
1278     }                                                 \
1279     else                                              \
1280         tmp = 1;                                      \
1281 )
1282
/* REXEC_FBC_CLASS_SCAN(CoNd): walk the target one byte at a time and attempt
 * a match at bytes for which CoNd is true.
 *
 * 'tmp' (from the expanding scope) gates the attempt.  It is set to 1
 * whenever the current byte fails CoNd, and to 'doevery' whenever a byte
 * satisfies CoNd but the jump to 'got_it' is not taken.  Hence, when
 * 'doevery' is 0, only the first byte of each run of bytes satisfying CoNd
 * is tried; when it is non-zero, every such byte is tried.  A NULL 'reginfo'
 * counts as success without calling regtry(). */
1283 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1284 REXEC_FBC_SCAN(                                       \
1285     if (CoNd) {                                       \
1286         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1287             goto got_it;                              \
1288         else                                          \
1289             tmp = doevery;                            \
1290     }                                                 \
1291     else                                              \
1292         tmp = 1;                                      \
1293 )
1294
/* REXEC_FBC_TRYIT: attempt a match at 's' and jump to 'got_it' on success.
 * When 'reginfo' is NULL, regtry() is not called and the jump is always
 * taken.  The expansion is a bare 'if' with no trailing ';' and no
 * STMT_START/STMT_END wrapper: the user supplies the ';', and an 'else'
 * written directly after the expansion would bind to this 'if'. */
1295 #define REXEC_FBC_TRYIT               \
1296 if ((!reginfo || regtry(reginfo, &s))) \
1297     goto got_it
1298
/* REXEC_FBC_CSCAN(CoNdUtF8,CoNd): class scan that selects the test by target
 * encoding: CoNdUtF8 with the per-character scan when 'utf8_target' is true,
 * otherwise CoNd with the per-byte scan.  The expansion is a braced if/else
 * rather than a single STMT_START/STMT_END statement. */
1299 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1300     if (utf8_target) {                                             \
1301         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1302     }                                                          \
1303     else {                                                     \
1304         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1305     }
1306
/* REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd): same selection as
 * REXEC_FBC_CSCAN, but the statement UtFpReLoAd is executed first on the
 * UTF-8 branch only.  The callers in find_byclass pass a
 * LOAD_UTF8_CHARCLASS_*() call here, presumably so that the PL_utf8_* swash
 * consulted by CoNdUtF8 is available before the scan starts. */
1307 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1308     if (utf8_target) {                                             \
1309         UtFpReLoAd;                                            \
1310         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1311     }                                                          \
1312     else {                                                     \
1313         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1314     }
1315
/* REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd): same selection as REXEC_FBC_CSCAN,
 * but RF_tainted is set in PL_reg_flags before scanning.  find_byclass uses
 * it for the ALNUML, SPACEL and DIGITL nodes and their negations.
 *
 * The expansion is two statements (the flag update, then the if/else), so
 * it must not be used as the body of an unbraced 'if' or loop. */
1316 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1317     PL_reg_flags |= RF_tainted;                                \
1318     if (utf8_target) {                                             \
1319         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1320     }                                                          \
1321     else {                                                     \
1322         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1323     }
1324
/* DUMP_EXEC_POS(li,s,doutf8): shorthand for dump_exec_pos() that fills in
 * PL_regeol, PL_bostr and PL_reg_starttry as the third, fourth and fifth
 * arguments. */
1325 #define DUMP_EXEC_POS(li,s,doutf8) \
1326     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1327
1328
/* UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL): UTF-8-target half of a
 * word-boundary scan that does not call LOAD_UTF8_CHARCLASS_ALNUM().  The
 * single-byte predicate TEST_NON_UTF8 is applied both to the byte before
 * 's' (a newline is assumed when 's' is at PL_bostr) and to the first byte
 * of each character visited.
 *
 * 'tmp' holds the predicate's result for the previous character.  When the
 * current character's result differs (tested as tmp == !result), 'tmp' is
 * flipped and IF_SUCCESS runs; otherwise IF_FAIL runs.
 *
 * The last line of the definition ends in a line continuation, so the line
 * that follows it has to remain empty. */
1329 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1330         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1331         tmp = TEST_NON_UTF8(tmp);                                              \
1332         REXEC_FBC_UTF8_SCAN(                                                   \
1333             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1334                 tmp = !tmp;                                                    \
1335                 IF_SUCCESS;                                                    \
1336             }                                                                  \
1337             else {                                                             \
1338                 IF_FAIL;                                                       \
1339             }                                                                  \
1340         );                                                                     \
1341
/* UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL): UTF-8-target half
 * of a word-boundary scan that calls LOAD_UTF8_CHARCLASS_ALNUM() before
 * scanning.
 *
 * 'tmp' is first loaded with the code point of the character before 's'
 * (located with reghop3() and decoded with utf8n_to_uvchr()), or with a
 * newline when 's' is at PL_bostr.  TeSt1_UtF8 is an expression whose value
 * then replaces 'tmp'; the callers pass a word-character test on 'tmp'.
 * TeSt2_UtF8 is evaluated once per character during the scan; the callers
 * pass a word-character test on the character at 's'.
 *
 * When the current character's result differs from 'tmp' (tested as
 * tmp == !result), 'tmp' is flipped and IF_SUCCESS runs; otherwise IF_FAIL
 * runs.
 *
 * The last line of the definition ends in a line continuation, so the line
 * that follows it has to remain empty. */
1342 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1343         if (s == PL_bostr) {                                                   \
1344             tmp = '\n';                                                        \
1345         }                                                                      \
1346         else {                                                                 \
1347             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1348             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1349         }                                                                      \
1350         tmp = TeSt1_UtF8;                                                      \
1351         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1352         REXEC_FBC_UTF8_SCAN(                                                   \
1353             if (tmp == ! (TeSt2_UtF8)) { \
1354                 tmp = !tmp;                                                    \
1355                 IF_SUCCESS;                                                    \
1356             }                                                                  \
1357             else {                                                             \
1358                 IF_FAIL;                                                       \
1359             }                                                                  \
1360         );                                                                     \
1361
1362 /* The only difference between the BOUND and NBOUND cases is that
1363  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1364  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1365  * with the other one being empty */
/* FBC_BOUND: boundary scan.  REXEC_FBC_TRYIT is the action taken where the
 * wordness changes and PLACEHOLDER the action where it does not, for both
 * the UTF-8 code (built with UTF8_LOAD) and the byte code. */
1366 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1367     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1368
/* FBC_BOUND_NOLOAD: as FBC_BOUND, but the UTF-8 code is built with
 * UTF8_NOLOAD, which applies TEST_NON_UTF8 to UTF-8 targets as well.
 * TEST1_UTF8 and TEST2_UTF8 are accepted so that the argument list matches
 * FBC_BOUND, but they are not used in the expansion. */
1369 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1370     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1371
/* FBC_NBOUND: non-boundary scan.  The two actions are swapped relative to
 * FBC_BOUND: PLACEHOLDER where the wordness changes and REXEC_FBC_TRYIT
 * where it does not, for both the UTF-8 code (built with UTF8_LOAD) and the
 * byte code. */
1372 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1373     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1374
/* FBC_NBOUND_NOLOAD: as FBC_NBOUND, but the UTF-8 code is built with
 * UTF8_NOLOAD, which applies TEST_NON_UTF8 to UTF-8 targets as well.
 * TEST1_UTF8 and TEST2_UTF8 are accepted so that the argument list matches
 * FBC_NBOUND, but they are not used in the expansion. */
1375 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1376     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1377
1378
1379 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1380  * be passed in completely with the variable name being tested, which isn't
1381  * such a clean interface, but this is easier to read than it was before.  We
1382  * are looking for the boundary (or non-boundary) between a word and non-word
1383  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1384  * must be different.  Find the "wordness" of the character just prior to this
1385  * one, and compare it with the wordness of this one.  If they differ, we have
1386  * a boundary.  At the beginning of the string, pretend that the previous
1387  * character was a new-line */
/* Parameters:
 *   UTF8_CODE      the complete scan to run for a UTF-8 target (the callers
 *                  pass an expansion of UTF8_LOAD or UTF8_NOLOAD)
 *   TEST_NON_UTF8  single-byte wordness predicate for the non-UTF-8 scan
 *   IF_SUCCESS     action when the wordness changes (non-UTF-8 scan)
 *   IF_FAIL        action when the wordness does not change (non-UTF-8 scan)
 *
 * After either scan has run to completion, one further match is attempted
 * at the final 's', but only when prog->minlen is 0 and the wordness most
 * recently recorded in 'tmp' is true.
 *
 * The expansion is an if/else followed by a second 'if' that already ends
 * in ';', so it is not a single statement. */
1388 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1389     if (utf8_target) {                                                         \
1390                 UTF8_CODE \
1391     }                                                                          \
1392     else {  /* Not utf8 */                                                     \
1393         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1394         tmp = TEST_NON_UTF8(tmp);                                              \
1395         REXEC_FBC_SCAN(                                                        \
1396             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1397                 tmp = !tmp;                                                    \
1398                 IF_SUCCESS;                                                    \
1399             }                                                                  \
1400             else {                                                             \
1401                 IF_FAIL;                                                       \
1402             }                                                                  \
1403         );                                                                     \
1404     }                                                                          \
1405     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1406         goto got_it;
1407
1408 /* We know what class REx starts with.  Try to find this position... */
1409 /* if reginfo is NULL, it's a dry run */
1410 /* annoyingly all the vars in this routine have different names from their counterparts
1411    in regmatch. /grrr */
1412
1413 STATIC char *
1414 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1415     const char *strend, regmatch_info *reginfo)
1416 {
1417         dVAR;
1418         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1419         char *pat_string;   /* The pattern's exactish string */
1420         char *pat_end;      /* ptr to end char of pat_string */
1421         re_fold_t folder;       /* Function for computing non-utf8 folds */
1422         const U8 *fold_array;   /* array for folding ords < 256 */
1423         STRLEN ln;
1424         STRLEN lnc;
1425         register STRLEN uskip;
1426         U8 c1;
1427         U8 c2;
1428         char *e;
1429         register I32 tmp = 1;   /* Scratch variable? */
1430         register const bool utf8_target = PL_reg_match_utf8;
1431         UV utf8_fold_flags = 0;
1432         RXi_GET_DECL(prog,progi);
1433
1434         PERL_ARGS_ASSERT_FIND_BYCLASS;
1435
1436         /* We know what class it must start with. */
1437         switch (OP(c)) {
1438         case ANYOFV:
1439         case ANYOF:
1440             if (utf8_target || OP(c) == ANYOFV) {
1441                 STRLEN inclasslen = strend - s;
1442                 REXEC_FBC_UTF8_CLASS_SCAN(
1443                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1444             }
1445             else {
1446                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1447             }
1448             break;
1449         case CANY:
1450             REXEC_FBC_SCAN(
1451                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1452                     goto got_it;
1453                 else
1454                     tmp = doevery;
1455             );
1456             break;
1457
1458         case EXACTFA:
1459             if (UTF_PATTERN || utf8_target) {
1460                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1461                 goto do_exactf_utf8;
1462             }
1463             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1464             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1465             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1466
1467         case EXACTF:
1468             if (UTF_PATTERN || utf8_target) {
1469
1470                 /* regcomp.c already folded this if pattern is in UTF-8 */
1471                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1472                 goto do_exactf_utf8;
1473             }
1474             fold_array = PL_fold;
1475             folder = foldEQ;
1476             goto do_exactf_non_utf8;
1477
1478         case EXACTFL:
1479             if (UTF_PATTERN || utf8_target) {
1480                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1481                 goto do_exactf_utf8;
1482             }
1483             fold_array = PL_fold_locale;
1484             folder = foldEQ_locale;
1485             goto do_exactf_non_utf8;
1486
1487         case EXACTFU:
1488             if (UTF_PATTERN || utf8_target) {
1489                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1490                 goto do_exactf_utf8;
1491             }
1492
1493             /* Any 'ss' in the pattern should have been replaced by regcomp,
1494              * so we don't have to worry here about this single special case
1495              * in the Latin1 range */
1496             fold_array = PL_fold_latin1;
1497             folder = foldEQ_latin1;
1498
1499             /* FALL THROUGH */
1500
1501         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1502
1503             /* The idea in the non-utf8 EXACTF* cases is to first find the
1504              * first character of the EXACTF* node and then, if necessary,
1505              * case-insensitively compare the full text of the node.  c1 is the
1506              * first character.  c2 is its fold.  This logic will not work for
1507              * Unicode semantics and the german sharp ss, which hence should
1508              * not be compiled into a node that gets here. */
1509             pat_string = STRING(c);
1510             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1511
1512             /* We know that we have to match at least 'ln' bytes (which is the
1513              * same as characters, since not utf8).  If we have to match 3
1514              * characters, and there are only 2 available, we know without
1515              * trying that it will fail; so don't start a match past the
1516              * required minimum number from the far end */
1517             e = HOP3c(strend, -((I32)ln), s);
1518
1519             if (!reginfo && e < s) {
1520                 e = s;                  /* Due to minlen logic of intuit() */
1521             }
1522
1523             c1 = *pat_string;
1524             c2 = fold_array[c1];
1525             if (c1 == c2) { /* If char and fold are the same */
1526                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1527             }
1528             else {
1529                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1530             }
1531             break;
1532
1533         do_exactf_utf8:
1534         {
1535             unsigned expansion;
1536
1537
1538             /* If one of the operands is in utf8, we can't use the simpler
1539              * folding above, due to the fact that many different characters
1540              * can have the same fold, or portion of a fold, or different-
1541              * length fold */
1542             pat_string = STRING(c);
1543             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1544             pat_end = pat_string + ln;
1545             lnc = (UTF_PATTERN) /* length to match in characters */
1546                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1547                     : ln;
1548
1549             /* We have 'lnc' characters to match in the pattern, but because of
1550              * multi-character folding, each character in the target can match
1551              * up to 3 characters (Unicode guarantees it will never exceed
1552              * this) if it is utf8-encoded; and up to 2 if not (based on the
1553              * fact that the Latin 1 folds are already determined, and the
1554              * only multi-char fold in that range is the sharp-s folding to
1555              * 'ss').  Thus, a pattern character can match as little as 1/3 of a
1556              * string character.  Adjust lnc accordingly, always matching at
1557              * least 1 */
1558             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1559             lnc = (lnc < expansion) ? 1 : lnc / expansion;
1560
1561             /* As in the non-UTF8 case, if we have to match 3 characters, and
1562              * only 2 are left, it's guaranteed to fail, so don't start a
1563              * match that would require us to go beyond the end of the string
1564              */
1565             e = HOP3c(strend, -((I32)lnc), s);
1566
1567             if (!reginfo && e < s) {
1568                 e = s;                  /* Due to minlen logic of intuit() */
1569             }
1570
1571             /* XXX Note that we could recalculate e every so-often through the
1572              * loop to stop earlier, as the worst case expansion above will
1573              * rarely be met, and as we go along we would usually find that e
1574              * moves further to the left.  Unclear if worth the expense */
1575
1576             while (s <= e) {
1577                 char *my_strend= (char *)strend;
1578                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1579                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1580                     && (!reginfo || regtry(reginfo, &s)) )
1581                 {
1582                     goto got_it;
1583                 }
1584                 s += UTF8SKIP(s);
1585             }
1586             break;
1587         }
1588         case BOUNDL:
1589             PL_reg_flags |= RF_tainted;
1590             FBC_BOUND(isALNUM_LC,
1591                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1592                       isALNUM_LC_utf8((U8*)s));
1593             break;
1594         case NBOUNDL:
1595             PL_reg_flags |= RF_tainted;
1596             FBC_NBOUND(isALNUM_LC,
1597                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1598                        isALNUM_LC_utf8((U8*)s));
1599             break;
1600         case BOUND:
1601             FBC_BOUND(isWORDCHAR,
1602                       isALNUM_uni(tmp),
1603                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1604             break;
1605         case BOUNDA:
1606             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1607                              isWORDCHAR_A(tmp),
1608                              isWORDCHAR_A((U8*)s));
1609             break;
1610         case NBOUND:
1611             FBC_NBOUND(isWORDCHAR,
1612                        isALNUM_uni(tmp),
1613                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1614             break;
1615         case NBOUNDA:
1616             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1617                               isWORDCHAR_A(tmp),
1618                               isWORDCHAR_A((U8*)s));
1619             break;
1620         case BOUNDU:
1621             FBC_BOUND(isWORDCHAR_L1,
1622                       isALNUM_uni(tmp),
1623                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1624             break;
1625         case NBOUNDU:
1626             FBC_NBOUND(isWORDCHAR_L1,
1627                        isALNUM_uni(tmp),
1628                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1629             break;
1630         case ALNUML:
1631             REXEC_FBC_CSCAN_TAINT(
1632                 isALNUM_LC_utf8((U8*)s),
1633                 isALNUM_LC(*s)
1634             );
1635             break;
1636         case ALNUMU:
1637             REXEC_FBC_CSCAN_PRELOAD(
1638                 LOAD_UTF8_CHARCLASS_ALNUM(),
1639                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1640                 isWORDCHAR_L1((U8) *s)
1641             );
1642             break;
1643         case ALNUM:
1644             REXEC_FBC_CSCAN_PRELOAD(
1645                 LOAD_UTF8_CHARCLASS_ALNUM(),
1646                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1647                 isWORDCHAR((U8) *s)
1648             );
1649             break;
1650         case ALNUMA:
1651             /* Don't need to worry about utf8, as it can match only a single
1652              * byte invariant character */
1653             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1654             break;
1655         case NALNUMU:
1656             REXEC_FBC_CSCAN_PRELOAD(
1657                 LOAD_UTF8_CHARCLASS_ALNUM(),
1658                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1659                 ! isWORDCHAR_L1((U8) *s)
1660             );
1661             break;
1662         case NALNUM:
1663             REXEC_FBC_CSCAN_PRELOAD(
1664                 LOAD_UTF8_CHARCLASS_ALNUM(),
1665                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1666                 ! isALNUM(*s)
1667             );
1668             break;
1669         case NALNUMA:
1670             REXEC_FBC_CSCAN(
1671                 !isWORDCHAR_A(*s),
1672                 !isWORDCHAR_A(*s)
1673             );
1674             break;
1675         case NALNUML:
1676             REXEC_FBC_CSCAN_TAINT(
1677                 !isALNUM_LC_utf8((U8*)s),
1678                 !isALNUM_LC(*s)
1679             );
1680             break;
1681         case SPACEU:
1682             REXEC_FBC_CSCAN_PRELOAD(
1683                 LOAD_UTF8_CHARCLASS_SPACE(),
1684                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1685                 isSPACE_L1((U8) *s)
1686             );
1687             break;
1688         case SPACE:
1689             REXEC_FBC_CSCAN_PRELOAD(
1690                 LOAD_UTF8_CHARCLASS_SPACE(),
1691                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1692                 isSPACE((U8) *s)
1693             );
1694             break;
1695         case SPACEA:
1696             /* Don't need to worry about utf8, as it can match only a single
1697              * byte invariant character */
1698             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1699             break;
1700         case SPACEL:
1701             REXEC_FBC_CSCAN_TAINT(
1702                 isSPACE_LC_utf8((U8*)s),
1703                 isSPACE_LC(*s)
1704             );
1705             break;
1706         case NSPACEU:
1707             REXEC_FBC_CSCAN_PRELOAD(
1708                 LOAD_UTF8_CHARCLASS_SPACE(),
1709                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1710                 ! isSPACE_L1((U8) *s)
1711             );
1712             break;
1713         case NSPACE:
1714             REXEC_FBC_CSCAN_PRELOAD(
1715                 LOAD_UTF8_CHARCLASS_SPACE(),
1716                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1717                 ! isSPACE((U8) *s)
1718             );
1719             break;
1720         case NSPACEA:
1721             REXEC_FBC_CSCAN(
1722                 !isSPACE_A(*s),
1723                 !isSPACE_A(*s)
1724             );
1725             break;
1726         case NSPACEL:
1727             REXEC_FBC_CSCAN_TAINT(
1728                 !isSPACE_LC_utf8((U8*)s),
1729                 !isSPACE_LC(*s)
1730             );
1731             break;
1732         case DIGIT:
1733             REXEC_FBC_CSCAN_PRELOAD(
1734                 LOAD_UTF8_CHARCLASS_DIGIT(),
1735                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1736                 isDIGIT(*s)
1737             );
1738             break;
1739         case DIGITA:
1740             /* Don't need to worry about utf8, as it can match only a single
1741              * byte invariant character */
1742             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1743             break;
1744         case DIGITL:
1745             REXEC_FBC_CSCAN_TAINT(
1746                 isDIGIT_LC_utf8((U8*)s),
1747                 isDIGIT_LC(*s)
1748             );
1749             break;
1750         case NDIGIT:
1751             REXEC_FBC_CSCAN_PRELOAD(
1752                 LOAD_UTF8_CHARCLASS_DIGIT(),
1753                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1754                 !isDIGIT(*s)
1755             );
1756             break;
1757         case NDIGITA:
1758             REXEC_FBC_CSCAN(
1759                 !isDIGIT_A(*s),
1760                 !isDIGIT_A(*s)
1761             );
1762             break;
1763         case NDIGITL:
1764             REXEC_FBC_CSCAN_TAINT(
1765                 !isDIGIT_LC_utf8((U8*)s),
1766                 !isDIGIT_LC(*s)
1767             );
1768             break;
1769         case LNBREAK:
1770             REXEC_FBC_CSCAN(
1771                 is_LNBREAK_utf8(s),
1772                 is_LNBREAK_latin1(s)
1773             );
1774             break;
1775         case VERTWS:
1776             REXEC_FBC_CSCAN(
1777                 is_VERTWS_utf8(s),
1778                 is_VERTWS_latin1(s)
1779             );
1780             break;
1781         case NVERTWS:
1782             REXEC_FBC_CSCAN(
1783                 !is_VERTWS_utf8(s),
1784                 !is_VERTWS_latin1(s)
1785             );
1786             break;
1787         case HORIZWS:
1788             REXEC_FBC_CSCAN(
1789                 is_HORIZWS_utf8(s),
1790                 is_HORIZWS_latin1(s)
1791             );
1792             break;
1793         case NHORIZWS:
1794             REXEC_FBC_CSCAN(
1795                 !is_HORIZWS_utf8(s),
1796                 !is_HORIZWS_latin1(s)
1797             );
1798             break;
1799         case AHOCORASICKC:
1800         case AHOCORASICK:
1801             {
1802                 DECL_TRIE_TYPE(c);
1803                 /* what trie are we using right now */
1804                 reg_ac_data *aho
1805                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1806                 reg_trie_data *trie
1807                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1808                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1809
1810                 const char *last_start = strend - trie->minlen;
1811 #ifdef DEBUGGING
1812                 const char *real_start = s;
1813 #endif
1814                 STRLEN maxlen = trie->maxlen;
1815                 SV *sv_points;
1816                 U8 **points; /* map of where we were in the input string
1817                                 when reading a given char. For ASCII this
1818                                 is unnecessary overhead as the relationship
1819                                 is always 1:1, but for Unicode, especially
1820                                 case folded Unicode this is not true. */
1821                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1822                 U8 *bitmap=NULL;
1823
1824
1825                 GET_RE_DEBUG_FLAGS_DECL;
1826
1827                 /* We can't just allocate points here. We need to wrap it in
1828                  * an SV so it gets freed properly if there is a croak while
1829                  * running the match */
1830                 ENTER;
1831                 SAVETMPS;
1832                 sv_points=newSV(maxlen * sizeof(U8 *));
1833                 SvCUR_set(sv_points,
1834                     maxlen * sizeof(U8 *));
1835                 SvPOK_on(sv_points);
1836                 sv_2mortal(sv_points);
1837                 points=(U8**)SvPV_nolen(sv_points );
1838                 if ( trie_type != trie_utf8_fold
1839                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1840                 {
1841                     if (trie->bitmap)
1842                         bitmap=(U8*)trie->bitmap;
1843                     else
1844                         bitmap=(U8*)ANYOF_BITMAP(c);
1845                 }
1846                 /* this is the Aho-Corasick algorithm modified a touch
1847                    to include special handling for long "unknown char"
1848                    sequences. The basic idea being that we use AC as long
1849                    as we are dealing with a possible matching char, when
1850                    we encounter an unknown char (and we have not encountered
1851                    an accepting state) we scan forward until we find a legal
1852                    starting char.
1853                    AC matching is basically that of trie matching, except
1854                    that when we encounter a failing transition, we fall back
1855                    to the current state's "fail state", and try the current char
1856                    again, a process we repeat until we reach the root state,
1857                    state 1, or a legal transition. If we fail on the root state
1858                    then we can either terminate if we have reached an accepting
1859                    state previously, or restart the entire process from the beginning
1860                    if we have not.
1861
1862                  */
1863                 while (s <= last_start) {
1864                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1865                     U8 *uc = (U8*)s;
1866                     U16 charid = 0;
1867                     U32 base = 1;
1868                     U32 state = 1;
1869                     UV uvc = 0;
1870                     STRLEN len = 0;
1871                     STRLEN foldlen = 0;
1872                     U8 *uscan = (U8*)NULL;
1873                     U8 *leftmost = NULL;
1874 #ifdef DEBUGGING
1875                     U32 accepted_word= 0;
1876 #endif
1877                     U32 pointpos = 0;
1878
1879                     while ( state && uc <= (U8*)strend ) {
1880                         int failed=0;
1881                         U32 word = aho->states[ state ].wordnum;
1882
1883                         if( state==1 ) {
1884                             if ( bitmap ) {
1885                                 DEBUG_TRIE_EXECUTE_r(
1886                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1887                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1888                                             (char *)uc, utf8_target );
1889                                         PerlIO_printf( Perl_debug_log,
1890                                             " Scanning for legal start char...\n");
1891                                     }
1892                                 );
1893                                 if (utf8_target) {
1894                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1895                                         uc += UTF8SKIP(uc);
1896                                     }
1897                                 } else {
1898                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1899                                         uc++;
1900                                     }
1901                                 }
1902                                 s= (char *)uc;
1903                             }
1904                             if (uc >(U8*)last_start) break;
1905                         }
1906
1907                         if ( word ) {
1908                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1909                             if (!leftmost || lpos < leftmost) {
1910                                 DEBUG_r(accepted_word=word);
1911                                 leftmost= lpos;
1912                             }
1913                             if (base==0) break;
1914
1915                         }
1916                         points[pointpos++ % maxlen]= uc;
1917                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1918                                              uscan, len, uvc, charid, foldlen,
1919                                              foldbuf, uniflags);
1920                         DEBUG_TRIE_EXECUTE_r({
1921                             dump_exec_pos( (char *)uc, c, strend, real_start,
1922                                 s,   utf8_target );
1923                             PerlIO_printf(Perl_debug_log,
1924                                 " Charid:%3u CP:%4"UVxf" ",
1925                                  charid, uvc);
1926                         });
1927
1928                         do {
1929 #ifdef DEBUGGING
1930                             word = aho->states[ state ].wordnum;
1931 #endif
1932                             base = aho->states[ state ].trans.base;
1933
1934                             DEBUG_TRIE_EXECUTE_r({
1935                                 if (failed)
1936                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1937                                         s,   utf8_target );
1938                                 PerlIO_printf( Perl_debug_log,
1939                                     "%sState: %4"UVxf", word=%"UVxf,
1940                                     failed ? " Fail transition to " : "",
1941                                     (UV)state, (UV)word);
1942                             });
1943                             if ( base ) {
1944                                 U32 tmp;
1945                                 I32 offset;
1946                                 if (charid &&
1947                                      ( ((offset = base + charid
1948                                         - 1 - trie->uniquecharcount)) >= 0)
1949                                      && ((U32)offset < trie->lasttrans)
1950                                      && trie->trans[offset].check == state
1951                                      && (tmp=trie->trans[offset].next))
1952                                 {
1953                                     DEBUG_TRIE_EXECUTE_r(
1954                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1955                                     state = tmp;
1956                                     break;
1957                                 }
1958                                 else {
1959                                     DEBUG_TRIE_EXECUTE_r(
1960                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1961                                     failed = 1;
1962                                     state = aho->fail[state];
1963                                 }
1964                             }
1965                             else {
1966                                 /* we must be accepting here */
1967                                 DEBUG_TRIE_EXECUTE_r(
1968                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1969                                 failed = 1;
1970                                 break;
1971                             }
1972                         } while(state);
1973                         uc += len;
1974                         if (failed) {
1975                             if (leftmost)
1976                                 break;
1977                             if (!state) state = 1;
1978                         }
1979                     }
1980                     if ( aho->states[ state ].wordnum ) {
1981                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1982                         if (!leftmost || lpos < leftmost) {
1983                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1984                             leftmost = lpos;
1985                         }
1986                     }
1987                     if (leftmost) {
1988                         s = (char*)leftmost;
1989                         DEBUG_TRIE_EXECUTE_r({
1990                             PerlIO_printf(
1991                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1992                                 (UV)accepted_word, (IV)(s - real_start)
1993                             );
1994                         });
1995                         if (!reginfo || regtry(reginfo, &s)) {
1996                             FREETMPS;
1997                             LEAVE;
1998                             goto got_it;
1999                         }
2000                         s = HOPc(s,1);
2001                         DEBUG_TRIE_EXECUTE_r({
2002                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2003                         });
2004                     } else {
2005                         DEBUG_TRIE_EXECUTE_r(
2006                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2007                         break;
2008                     }
2009                 }
2010                 FREETMPS;
2011                 LEAVE;
2012             }
2013             break;
2014         default:
2015             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2016             break;
2017         }
2018         return 0;
2019       got_it:
2020         return s;
2021 }
2022
2023
2024 /*
2025  - regexec_flags - match a regexp against a string
      *
      * Tries to match the compiled pattern rx against the string bounded by
      * strbeg and strend, starting the search at stringarg.
      *
      * Returns 1 on success (everything funnels through the got_it label)
      * and 0 on failure (the phooey label).  On success, unless
      * REXEC_NOT_FIRST is set in flags, prog->subbeg and prog->sublen are
      * set up, copying the string when REXEC_COPY_STR is set.
      *
      * The strategies below are tested in this order:
      *   1. anchored pattern: one try at the start, plus further candidate
      *      positions for multiline / implicit / MBOL anchors;
      *   2. RXf_GPOS_CHECK pattern: a single try at ganch - gofs;
      *   3. unanchored with PREGf_SKIP and an anchored substring: scan for
      *      its first byte;
      *   4. unanchored with an anchored or floating substring: locate the
      *      substring and try the positions it allows;
      *   5. unanchored with a start class: delegate to find_byclass();
      *   6. otherwise: try every position.
2026  */
2027 I32
2028 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2029               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2030 /* strend: pointer to null at end of string */
2031 /* strbeg: real beginning of string */
2032 /* minend: end of match must be >=minend after stringarg. */
2033 /* data: May be used for some additional optimizations.
2034          Currently it's only used (converted with PTR2UV below) for transmitting
2035          the ganch offset when doing a /g match. This will change */
2036 /* nosave: For optimizations.  NOTE(review): no parameter of this name exists in the signature above. */
2037 {
2038     dVAR;
2039     struct regexp *const prog = (struct regexp *)SvANY(rx);
2040     /*register*/ char *s;
2041     register regnode *c;
2042     /*register*/ char *startpos = stringarg;
2043     I32 minlen;         /* must match at least this many chars */
2044     I32 dontbother = 0; /* how many characters not to try at end */
2045     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2046     I32 scream_pos = -1;                /* Internal iterator of scream. */
2047     char *scream_olds = NULL; /* handed to re_intuit_start() through d.scream_olds */
2048     const bool utf8_target = cBOOL(DO_UTF8(sv));
2049     I32 multiline;
2050     RXi_GET_DECL(prog,progi);
2051     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2052     regexp_paren_pair *swap = NULL; /* previous prog->offs, kept so a failed match can be rolled back */
2053     GET_RE_DEBUG_FLAGS_DECL;
2054
2055     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
          /* NOTE(review): data is marked unused here, yet it is read in the
             "else if (data)" branch of the GPOS handling further down. */
2056     PERL_UNUSED_ARG(data);
2057
2058     /* Be paranoid... */
2059     if (prog == NULL || startpos == NULL) {
2060         Perl_croak(aTHX_ "NULL regexp parameter");
2061         return 0; /* presumably never reached if Perl_croak does not return */
2062     }
2063
2064     multiline = prog->extflags & RXf_PMf_MULTILINE;
2065     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2066
2067     RX_MATCH_UTF8_set(rx, utf8_target);
2068     DEBUG_EXECUTE_r(
2069         debug_start_match(rx, utf8_target, startpos, strend,
2070         "Matching");
2071     );
2072
2073     minlen = prog->minlen;
2074
          /* Quick rejection: fewer bytes remain than minlen, where minlen is
             first reduced by check_offset_min when that offset is negative. */
2075     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2076         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2077                               "String too short [regexec_flags]...\n"));
2078         goto phooey;
2079     }
2080
2081
2082     /* Check validity of program. */
2083     if (UCHARAT(progi->program) != REG_MAGIC) {
2084         Perl_croak(aTHX_ "corrupted regexp program");
2085     }
2086
          /* Reset the per-match interpreter globals before any regtry() call. */
2087     PL_reg_flags = 0;
2088     PL_reg_eval_set = 0;
2089     PL_reg_maxiter = 0;
2090
2091     if (RX_UTF8(rx))
2092         PL_reg_flags |= RF_utf8;
2093
2094     /* Mark beginning of line for ^ and lookbehind. */
2095     reginfo.bol = startpos; /* XXX not used ??? */
2096     PL_bostr  = strbeg;
2097     reginfo.sv = sv;
2098
2099     /* Mark end of line for $ (and such) */
2100     PL_regeol = strend;
2101
2102     /* see how far we have to get to not match where we matched before */
2103     reginfo.till = startpos+minend;
2104
2105     /* If there is a "must appear" string, look for it. */
2106     s = startpos;
2107
          /* reginfo.ganch is taken from the first source that applies:
             startpos + gofs (REXEC_IGNOREPOS), the regex_global magic on sv
             when its mg_len is >= 0, the data argument, or else strbeg. */
2108     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2109         MAGIC *mg;
2110         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2111             reginfo.ganch = startpos + prog->gofs;
2112             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2113               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2114         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2115                   && SvMAGIC(sv)
2116                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2117                   && mg->mg_len >= 0) {
2118             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2119             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2120                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2121
                  /* For a \G-anchored pattern the only start worth trying is
                     ganch - gofs; fail if the search start is already past
                     ganch or if that position lies before strbeg. */
2122             if (prog->extflags & RXf_ANCH_GPOS) {
2123                 if (s > reginfo.ganch)
2124                     goto phooey;
2125                 s = reginfo.ganch - prog->gofs;
2126                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2127                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2128                 if (s < strbeg)
2129                     goto phooey;
2130             }
2131         }
2132         else if (data) {
2133             reginfo.ganch = strbeg + PTR2UV(data);
2134             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2135                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2136
2137         } else {                                /* pos() not defined */
2138             reginfo.ganch = strbeg;
2139             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2140                  "GPOS: reginfo.ganch = strbeg\n"));
2141         }
2142     }
2143     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2144         /* We have to be careful. If the previous successful match
2145            was from this regex we don't want a subsequent partially
2146            successful match to clobber the old results.
2147            So when we detect this possibility we add a swap buffer
2148            to the re, and switch the buffer each match. If we fail
2149            we switch it back, otherwise we leave it swapped.
2150         */
2151         swap = prog->offs;
2152         /* do we need a save destructor here for eval dies? */
2153         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2154     }
          /* Unless the caller says the check was already done (REXEC_CHECKED),
             let re_intuit_start() pick the first candidate start position;
             a NULL result means the match cannot succeed. */
2155     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2156         re_scream_pos_data d;
2157
2158         d.scream_olds = &scream_olds;
2159         d.scream_pos = &scream_pos;
2160         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2161         if (!s) {
2162             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2163             goto phooey;        /* not present */
2164         }
2165     }
2166
2167
2168
2169     /* Simplest case:  anchored match need be tried only once. */
2170     /*  [unless only anchor is BOL and multiline is set] */
2171     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2172         if (s == startpos && regtry(&reginfo, &startpos))
2173             goto got_it;
2174         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2175                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2176         {
2177             char *end;
2178
2179             if (minlen)
2180                 dontbother = minlen - 1;
                  /* end: one byte before the position dontbother characters
                     back from strend (HOP3c presumably stops at strbeg). */
2181             end = HOP3c(strend, -dontbother, strbeg) - 1;
2182             /* for multiline we only have to try after newlines */
2183             if (prog->check_substr || prog->check_utf8) {
2184                 /* because of the goto we can not easily reuse the macros for bifurcating the
2185                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
                      /* Both loops below jump into the middle of the loop body
                         when s == startpos, because that position was already
                         tried by the regtry() call above. */
2186                 if (utf8_target) {
2187                     if (s == startpos)
2188                         goto after_try_utf8;
2189                     while (1) {
2190                         if (regtry(&reginfo, &s)) {
2191                             goto got_it;
2192                         }
2193                       after_try_utf8:
2194                         if (s > end) {
2195                             goto phooey;
2196                         }
2197                         if (prog->extflags & RXf_USE_INTUIT) {
2198                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2199                             if (!s) {
2200                                 goto phooey;
2201                             }
2202                         }
2203                         else {
2204                             s += UTF8SKIP(s);
2205                         }
2206                     }
2207                 } /* end search for check string in unicode */
2208                 else {
2209                     if (s == startpos) {
2210                         goto after_try_latin;
2211                     }
2212                     while (1) {
2213                         if (regtry(&reginfo, &s)) {
2214                             goto got_it;
2215                         }
2216                       after_try_latin:
2217                         if (s > end) {
2218                             goto phooey;
2219                         }
2220                         if (prog->extflags & RXf_USE_INTUIT) {
2221                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2222                             if (!s) {
2223                                 goto phooey;
2224                             }
2225                         }
2226                         else {
2227                             s++;
2228                         }
2229                     }
2230                 } /* end search for check string in latin*/
2231             } /* end search for check string */
2232             else { /* search for newline */
2233                 if (s > startpos) {
2234                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2235                     s--;
2236                 }
2237                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2238                 while (s < end) {
2239                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2240                         if (regtry(&reginfo, &s))
2241                             goto got_it;
2242                     }
2243                 }
2244             } /* end search for newline */
2245         } /* end anchored/multiline check string search */
2246         goto phooey;
2247     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2248     {
2249         /* the warning about reginfo.ganch being used without initialization
2250            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2251            and we only enter this block when the same bit is set. */
2252         char *tmp_s = reginfo.ganch - prog->gofs;
2253
2254         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2255             goto got_it;
2256         goto phooey;
2257     }
2258
2259     /* Messy cases:  unanchored match. */
2260     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2261         /* we have /x+whatever/ */
2262         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2263         char ch;
2264 #ifdef DEBUGGING
2265         int did_match = 0;
2266 #endif
              /* Make sure the substring exists in the encoding of the target
                 before reading its first byte. */
2267         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2268             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2269         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2270
              /* After a failed try at a position holding ch, the inner while
                 loops step past the following positions that also hold ch
                 before the scan resumes. */
2271         if (utf8_target) {
2272             REXEC_FBC_SCAN(
2273                 if (*s == ch) {
2274                     DEBUG_EXECUTE_r( did_match = 1 );
2275                     if (regtry(&reginfo, &s)) goto got_it;
2276                     s += UTF8SKIP(s);
2277                     while (s < strend && *s == ch)
2278                         s += UTF8SKIP(s);
2279                 }
2280             );
2281         }
2282         else {
2283             REXEC_FBC_SCAN(
2284                 if (*s == ch) {
2285                     DEBUG_EXECUTE_r( did_match = 1 );
2286                     if (regtry(&reginfo, &s)) goto got_it;
2287                     s++;
2288                     while (s < strend && *s == ch)
2289                         s++;
2290                 }
2291             );
2292         }
2293         DEBUG_EXECUTE_r(if (!did_match)
2294                 PerlIO_printf(Perl_debug_log,
2295                                   "Did not find anchored character...\n")
2296                );
2297     }
2298     else if (prog->anchored_substr != NULL
2299               || prog->anchored_utf8 != NULL
2300               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2301                   && prog->float_max_offset < strend - s)) {
2302         SV *must;         /* the substring that has to be present */
2303         I32 back_max;     /* largest offset of must from the match start */
2304         I32 back_min;     /* smallest offset of must from the match start */
2305         char *last;
2306         char *last1;            /* Last position checked before */
2307 #ifdef DEBUGGING
2308         int did_match = 0;
2309 #endif
              /* Prefer the anchored substring (a single fixed offset); fall
                 back to the floating one with its min/max offset range. */
2310         if (prog->anchored_substr || prog->anchored_utf8) {
2311             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2312                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2313             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2314             back_max = back_min = prog->anchored_offset;
2315         } else {
2316             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2317                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2318             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2319             back_max = prog->float_max_offset;
2320             back_min = prog->float_min_offset;
2321         }
2322
2323
2324         if (must == &PL_sv_undef)
2325             /* could not downgrade utf8 check substring, so must fail */
2326             goto phooey;
2327
2328         if (back_min<0) {
2329             last = strend;
2330         } else {
2331             last = HOP3c(strend,        /* Cannot start after this */
2332                   -(I32)(CHR_SVLEN(must)
2333                          - (SvTAIL(must) != 0) + back_min), strbeg);
2334         }
2335         if (s > PL_bostr)
2336             last1 = HOPc(s, -1);
2337         else
2338             last1 = s - 1;      /* bogus */
2339
2340         /* XXXX check_substr already used to find "s", can optimize if
2341            check_substr==must. */
2342         scream_pos = -1;
              /* NOTE(review): end_shift is never assigned after its
                 initialisation to 0 in this function, so dontbother is 0
                 here and strend is hopped back by zero characters. */
2343         dontbother = end_shift;
2344         strend = HOPc(strend, -dontbother);
              /* Each iteration finds the next occurrence of must at or after
                 s + back_min, using screaminstr() when REXEC_SCREAM is set
                 and sv is SvSCREAM, fbm_instr() otherwise, and then tries the
                 start positions that occurrence makes possible. */
2345         while ( (s <= last) &&
2346                 ((flags & REXEC_SCREAM) && SvSCREAM(sv)
2347                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2348                                     end_shift, &scream_pos, 0))
2349                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2350                                   (unsigned char*)strend, must,
2351                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2352             /* we may be pointing at the wrong string */
2353             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2354                 s = strbeg + (s - SvPVX_const(sv));
2355             DEBUG_EXECUTE_r( did_match = 1 );
                  /* Candidate starts run from (occurrence - back_max) to
                     (occurrence - back_min); positions up to last1 were
                     already tried, so resume just after last1 if the ranges
                     overlap. */
2356             if (HOPc(s, -back_max) > last1) {
2357                 last1 = HOPc(s, -back_min);
2358                 s = HOPc(s, -back_max);
2359             }
2360             else {
2361                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2362
2363                 last1 = HOPc(s, -back_min);
2364                 s = t;
2365             }
2366             if (utf8_target) {
2367                 while (s <= last1) {
2368                     if (regtry(&reginfo, &s))
2369                         goto got_it;
2370                     s += UTF8SKIP(s);
2371                 }
2372             }
2373             else {
2374                 while (s <= last1) {
2375                     if (regtry(&reginfo, &s))
2376                         goto got_it;
2377                     s++;
2378                 }
2379             }
2380         }
2381         DEBUG_EXECUTE_r(if (!did_match) {
2382             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2383                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2384             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2385                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2386                                ? "anchored" : "floating"),
2387                 quoted, RE_SV_TAIL(must));
2388         });
2389         goto phooey;
2390     }
2391     else if ( (c = progi->regstclass) ) {
              /* A start class is available: let find_byclass() scan for
                 positions whose first character fits it. */
2392         if (minlen) {
2393             const OPCODE op = OP(progi->regstclass);
2394             /* don't bother with what can't match */
2395             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2396                 strend = HOPc(strend, -(minlen - 1));
2397         }
2398         DEBUG_EXECUTE_r({
2399             SV * const prop = sv_newmortal();
2400             regprop(prog, prop, c);
2401             {
2402                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2403                     s,strend-s,60);
2404                 PerlIO_printf(Perl_debug_log,
2405                     "Matching stclass %.*s against %s (%d bytes)\n",
2406                     (int)SvCUR(prop), SvPVX_const(prop),
2407                      quoted, (int)(strend - s));
2408             }
2409         });
2410         if (find_byclass(prog, c, s, strend, &reginfo))
2411             goto got_it;
2412         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2413     }
2414     else {
2415         dontbother = 0;
2416         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2417             /* Trim the end. */
2418             char *last;
2419             SV* float_real;
2420
2421             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2422                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2423             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2424
2425             if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
2426                 last = screaminstr(sv, float_real, s - strbeg,
2427                                    end_shift, &scream_pos, 1); /* last one */
2428                 if (!last)
2429                     last = scream_olds; /* Only one occurrence. */
2430                 /* we may be pointing at the wrong string */
                      /* NOTE(review): the pointer just obtained from
                         screaminstr() is last, yet the adjustment below is
                         applied to s -- confirm that s (not last) is
                         intended. */
2431                 else if (RXp_MATCH_COPIED(prog))
2432                     s = strbeg + (s - SvPVX_const(sv));
2433             }
2434             else {
2435                 STRLEN len;
2436                 const char * const little = SvPV_const(float_real, len);
2437
                      /* For an SvTAIL substring, first compare it minus its
                         final byte against the very end of the string, then
                         (when not multiline) the whole substring; otherwise
                         search backwards with rninstr(). */
2438                 if (SvTAIL(float_real)) {
2439                     if (memEQ(strend - len + 1, little, len - 1))
2440                         last = strend - len + 1;
2441                     else if (!multiline)
2442                         last = memEQ(strend - len, little, len)
2443                             ? strend - len : NULL;
2444                     else
2445                         goto find_last;
2446                 } else {
2447                   find_last:
2448                     if (len)
2449                         last = rninstr(s, strend, little, little + len);
2450                     else
2451                         last = strend;  /* matching "$" */
2452                 }
2453             }
2454             if (last == NULL) {
2455                 DEBUG_EXECUTE_r(
2456                     PerlIO_printf(Perl_debug_log,
2457                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2458                         PL_colors[4], PL_colors[5]));
2459                 goto phooey; /* Should not happen! */
2460             }
2461             dontbother = strend - last + prog->float_min_offset;
2462         }
2463         if (minlen && (dontbother < minlen))
2464             dontbother = minlen - 1;
2465         strend -= dontbother;              /* this one's always in bytes! */
2466         /* We don't know much -- general case. */
              /* Try every position from s up to and including the trimmed
                 strend. */
2467         if (utf8_target) {
2468             for (;;) {
2469                 if (regtry(&reginfo, &s))
2470                     goto got_it;
2471                 if (s >= strend)
2472                     break;
2473                 s += UTF8SKIP(s);
2474             };
2475         }
2476         else {
2477             do {
2478                 if (regtry(&reginfo, &s))
2479                     goto got_it;
2480             } while (s++ < strend);
2481         }
2482     }
2483
2484     /* Failure. */
2485     goto phooey;
2486
2487 got_it:
          /* Success: the old offsets buffer (if one was set aside) is no
             longer needed, the freshly filled prog->offs stays in place. */
2488     Safefree(swap);
2489     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2490
2491     if (PL_reg_eval_set)
2492         restore_pos(aTHX_ prog);
2493     if (RXp_PAREN_NAMES(prog))
2494         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2495
2496     /* make sure $`, $&, $', and $digit will work later */
2497     if ( !(flags & REXEC_NOT_FIRST) ) {
2498         RX_MATCH_COPY_FREE(rx);
2499         if (flags & REXEC_COPY_STR) {
                  /* Number of bytes to keep; this equals PL_regeol - strbeg
                     as long as startpos is still equal to stringarg. */
2500             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2501 #ifdef PERL_OLD_COPY_ON_WRITE
2502             if ((SvIsCOW(sv)
2503                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2504                 if (DEBUG_C_TEST) {
2505                     PerlIO_printf(Perl_debug_log,
2506                                   "Copy on write: regexp capture, type %d\n",
2507                                   (int) SvTYPE(sv));
2508                 }
2509                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2510                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2511                 assert (SvPOKp(prog->saved_copy));
2512             } else
2513 #endif
2514             {
2515                 RX_MATCH_COPIED_on(rx);
2516                 s = savepvn(strbeg, i);
2517                 prog->subbeg = s;
2518             }
2519             prog->sublen = i;
2520         }
2521         else {
                  /* No copy requested: point straight at the caller's buffer. */
2522             prog->subbeg = strbeg;
2523             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2524         }
2525     }
2526
2527     return 1;
2528
2529 phooey:
2530     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2531                           PL_colors[4], PL_colors[5]));
2532     if (PL_reg_eval_set)
2533         restore_pos(aTHX_ prog);
2534     if (swap) {
2535         /* we failed :-( roll it back */
2536         Safefree(prog->offs);
2537         prog->offs = swap;
2538     }
2539
2540     return 0;
2541 }
2542
2543
2544 /*
2545  - regtry - try match at specific point
      *
      * Attempts a match of reginfo->prog starting exactly at *startpos by
      * resetting the per-attempt state and calling regmatch().
      *
      * Returns 1 on success, after recording the end of the match in
      * PL_regoffs[0].end.  Returns 0 on failure; in that case, if
      * reginfo->cutpoint was set during the attempt, *startpos is
      * overwritten with it, and the checkpoint taken before the attempt is
      * unwound.
      *
      * The first call for a pattern flagged RXf_EVAL_SEEN (while
      * PL_reg_eval_set is still false) also performs one-time setup so that
      * embedded code can run: see the first block of the body.
2546  */
2547 STATIC I32                      /* 0 failure, 1 success */
2548 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2549 {
2550     dVAR;
2551     CHECKPOINT lastcp;
2552     REGEXP *const rx = reginfo->prog;
2553     regexp *const prog = (struct regexp *)SvANY(rx);
2554     RXi_GET_DECL(prog,progi);
2555     GET_RE_DEBUG_FLAGS_DECL;
2556
2557     PERL_ARGS_ASSERT_REGTRY;
2558
          /* Cleared before every attempt; checked again after regmatch() fails. */
2559     reginfo->cutpoint=NULL;
2560
          /* One-time setup per regexec_flags() call, which resets
             PL_reg_eval_set to 0 before its first regtry(). */
2561     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2562         MAGIC *mg;
2563
2564         PL_reg_eval_set = RS_init;
2565         DEBUG_EXECUTE_r(DEBUG_s(
2566             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2567                           (IV)(PL_stack_sp - PL_stack_base));
2568             ));
2569         SAVESTACK_CXPOS();
2570         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2571         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2572         SAVETMPS;
2573         /* Apparently this is not needed, judging by wantarray. */
2574         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2575            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2576
2577         if (reginfo->sv) {
2578             /* Make $_ available to executed code. */
2579             if (reginfo->sv != DEFSV) {
2580                 SAVE_DEFSV;
2581                 DEFSV_set(reginfo->sv);
2582             }
2583
                  /* Reuse the regex_global magic on the target if it has one,
                     otherwise attach a fresh one with mg_len of -1. */
2584             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2585                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2586                 /* prepare for quick setting of pos */
2587 #ifdef PERL_OLD_COPY_ON_WRITE
2588                 if (SvIsCOW(reginfo->sv))
2589                     sv_force_normal_flags(reginfo->sv, 0);
2590 #endif
2591                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2592                                  &PL_vtbl_mglob, NULL, 0);
2593                 mg->mg_len = -1;
2594             }
                  /* Remember the magic and its current mg_len, and register
                     restore_pos as a save-stack destructor. */
2595             PL_reg_magic    = mg;
2596             PL_reg_oldpos   = mg->mg_len;
2597             SAVEDESTRUCTOR_X(restore_pos, prog);
2598         }
              /* Lazily allocate the PMOP that stands in as PL_curpm for the
                 duration of the match. */
2599         if (!PL_reg_curpm) {
2600             Newxz(PL_reg_curpm, 1, PMOP);
2601 #ifdef USE_ITHREADS
2602             {
2603                 SV* const repointer = &PL_sv_undef;
2604                 /* this regexp is also owned by the new PL_reg_curpm, which
2605                    will try to free it.  */
2606                 av_push(PL_regex_padav, repointer);
2607                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2608                 PL_regex_pad = AvARRAY(PL_regex_padav);
2609             }
2610 #endif
2611         }
2612 #ifdef USE_ITHREADS
2613         /* It seems that non-ithreads works both with and without this code.
2614            So for efficiency reasons it seems best not to have the code
2615            compiled when it is not needed.  */
2616         /* This is safe against NULLs: */
2617         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2618         /* PM_reg_curpm owns a reference to this regexp.  */
2619         (void)ReREFCNT_inc(rx);
2620 #endif
              /* Point PL_curpm at the stand-in PMOP for this regexp, keeping
                 the previous value in PL_reg_oldcurpm. */
2621         PM_SETRE(PL_reg_curpm, rx);
2622         PL_reg_oldcurpm = PL_curpm;
2623         PL_curpm = PL_reg_curpm;
2624         if (RXp_MATCH_COPIED(prog)) {
2625             /*  Here is a serious problem: we cannot rewrite subbeg,
2626                 since it may be needed if this match fails.  Thus
2627                 $` inside (?{}) could fail... */
2628             PL_reg_oldsaved = prog->subbeg;
2629             PL_reg_oldsavedlen = prog->sublen;
2630 #ifdef PERL_OLD_COPY_ON_WRITE
2631             PL_nrs = prog->saved_copy;
2632 #endif
2633             RXp_MATCH_COPIED_off(prog);
2634         }
2635         else
2636             PL_reg_oldsaved = NULL;
2637         prog->subbeg = PL_bostr;
2638         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2639     }
          /* Per-attempt state: record the start offset of the whole match and
             reset the paren bookkeeping. */
2640     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2641     prog->offs[0].start = *startpos - PL_bostr;
2642     PL_reginput = *startpos;
2643     PL_reglastparen = &prog->lastparen;
2644     PL_reglastcloseparen = &prog->lastcloseparen;
2645     prog->lastparen = 0;
2646     prog->lastcloseparen = 0;
2647     PL_regsize = 0;
2648     PL_regoffs = prog->offs;
          /* Grow PL_reg_start_tmp when its recorded length does not exceed
             the number of parens in this pattern. */
2649     if (PL_reg_start_tmpl <= prog->nparens) {
2650         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2651         if(PL_reg_start_tmp)
2652             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2653         else
2654             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2655     }
2656
2657     /* XXXX What this code is doing here?!!!  There should be no need
2658        to do this again and again, PL_reglastparen should take care of
2659        this!  --ilya*/
2660
2661     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2662      * Actually, the code in regcppop() (which Ilya may be meaning by
2663      * PL_reglastparen), is not needed at all by the test suite
2664      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2665      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2666      * Meanwhile, this code *is* needed for the
2667      * above-mentioned test suite tests to succeed.  The common theme
2668      * on those tests seems to be returning null fields from matches.
2669      * --jhi updated by dapm */
2670 #if 1
          /* *PL_reglastparen was set to 0 just above, so this loop marks
             every pair offs[1] .. offs[nparens] as unset (-1); pp is advanced
             before each store, leaving offs[0] alone. */
2671     if (prog->nparens) {
2672         regexp_paren_pair *pp = PL_regoffs;
2673         register I32 i;
2674         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2675             ++pp;
2676             pp->start = -1;
2677             pp->end = -1;
2678         }
2679     }
2680 #endif
2681     REGCP_SET(lastcp);
          /* Execution starts at program + 1, i.e. just past the first node,
             which regexec_flags() checks against REG_MAGIC. */
2682     if (regmatch(reginfo, progi->program + 1)) {
2683         PL_regoffs[0].end = PL_reginput - PL_bostr;
2684         return 1;
2685     }
          /* Failure: pass any cut point back to the caller as its new start
             position, then unwind to the checkpoint taken above. */
2686     if (reginfo->cutpoint)
2687         *startpos= reginfo->cutpoint;
2688     REGCP_UNWIND(lastcp);
2689     return 0;
2690 }
2691
2692 /* Shorthands for jumping to one of three shared labels.  Each expands to
      * a bare "goto" with no trailing semicolon, so the caller writes e.g.
      * "sayNO;".  The labels yes, no and no_silent must exist in the function
      * that uses these macros; they are not defined in this part of the file. */
2693 #define sayYES goto yes
2694 #define sayNO goto no
2695 #define sayNO_SILENT goto no_silent
2696
2697 /* CACHEsayNO: if ST.cache_mask is non-zero, OR it into
2698    PL_reg_poscache[ST.cache_offset]; then sayNO.
        We don't use STMT_START/END here because it leads to
        "unreachable code" warnings, which are bogus, but distracting.
        As a consequence the macro expands to two separate statements (an
        "if" followed by a "goto"), so it must not be used as the unbraced
        body of an if/else or loop.  ST must be #defined, at the point of
        use, to a state variant that has cache_mask and cache_offset. */
2699 #define CACHEsayNO \
2700     if (ST.cache_mask) \
2701        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2702     sayNO
2703
2704 /* Base left margin, in columns, for debug messages like 'failed...'.
2705    Callers add depth*2 to it and pass the sum as the width of an empty
2706    "%*s" field.  It should be set such that messages are in line with
        the regop output that created them.
2707 */
2708 #define REPORT_CODE_OFF 32
2709
2710 /* Sentinel values for the c1/c2 "next char" test.  Both are negative.
      * The code that stores and tests them is not in this part of the file. */
2711 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2712 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2713 /* Address of the first (index 0) and of the last (index
      * PERL_REGMATCH_SLAB_SLOTS-1) state slot in the states[] array of the
      * slab pointed to by s. */
2714 #define SLAB_FIRST(s) (&(s)->states[0])
2715 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2716
2717 /* grab a new slab and return the first slot in it
      *
      * Moves PL_regmatch_slab on to the next slab in the chain.  If the
      * current slab has no successor, one is allocated with Newx and linked
      * in after it; otherwise the existing successor is reused.  Returns a
      * pointer to slot 0 of the slab that is now current.
      *
      * PL_regmatch_state is not touched here: the caller is expected to
      * store the returned pointer itself.
      */
2718
2719 STATIC regmatch_state *
2720 S_push_slab(pTHX)
2721 {
2722 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2723     dMY_CXT;
2724 #endif
         /* a slab left over from an earlier, deeper match may already be
          * chained after the current one */
2725     regmatch_slab *s = PL_regmatch_slab->next;
2726     if (!s) {
             /* nothing to reuse: allocate a slab and append it to the chain */
2727         Newx(s, 1, regmatch_slab);
2728         s->prev = PL_regmatch_slab;
2729         s->next = NULL;
2730         PL_regmatch_slab->next = s;
2731     }
2732     PL_regmatch_slab = s;
2733     return SLAB_FIRST(s);
2734 }
2735
2736
2737 /* push a new state then goto it
      *
      * PUSH_STATE_GOTO(state, node) sets scan to node, stores state in
      * st->resume_state and jumps to the label push_state.  It expands to
      * three statements, the last already terminated by ';', so it must not
      * be used as the unbraced body of an if/else or loop.  It relies on the
      * locals scan and st and on the label push_state being present in the
      * function that uses it; the label is not in this part of the file.
      */
2738
2739 #define PUSH_STATE_GOTO(state, node) \
2740     scan = node; \
2741     st->resume_state = state; \
2742     goto push_state;
2743
2744 /* push a new state with success backtracking, then goto it
      *
      * Same as PUSH_STATE_GOTO except that it jumps to push_yes_state.
      */
2745
2746 #define PUSH_YES_STATE_GOTO(state, node) \
2747     scan = node; \
2748     st->resume_state = state; \
2749     goto push_yes_state;
2750
2751
2752
2753 /*
2754
2755 regmatch() - main matching routine
2756
2757 This is basically one big switch statement in a loop. We execute an op,
2758 set 'next' to point to the next op, and continue. If we come to a point which
2759 we may need to backtrack to on failure such as (A|B|C), we push a
2760 backtrack state onto the backtrack stack. On failure, we pop the top
2761 state, and re-enter the loop at the state indicated. If there are no more
2762 states to pop, we return failure.
2763
2764 Sometimes we also need to backtrack on success; for example /A+/, where
2765 after successfully matching one A, we need to go back and try to
2766 match another one; similarly for lookahead assertions: if the assertion
2767 completes successfully, we backtrack to the state just before the assertion
2768 and then carry on.  In these cases, the pushed state is marked as
2769 'backtrack on success too'. This marking is in fact done by a chain of
2770 pointers, each pointing to the previous 'yes' state. On success, we pop to
2771 the nearest yes state, discarding any intermediate failure-only states.
2772 Sometimes a yes state is pushed just to force some cleanup code to be
2773 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2774 it to free the inner regex.
2775
2776 Note that failure backtracking rewinds the cursor position, while
2777 success backtracking leaves it alone.
2778
2779 A pattern is complete when the END op is executed, while a subpattern
2780 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2781 ops trigger the "pop to last yes state if any, otherwise return true"
2782 behaviour.
2783
2784 A common convention in this function is to use A and B to refer to the two
2785 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2786 the subpattern to be matched possibly multiple times, while B is the entire
2787 rest of the pattern. Variable and state names reflect this convention.
2788
2789 The states in the main switch are the union of ops and failure/success of
2790 substates associated with that op.  For example, IFMATCH is the op
2791 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2792 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2793 successfully matched A and IFMATCH_A_fail is a state saying that we have
2794 just failed to match A. Resume states always come in pairs. The backtrack
2795 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2796 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2797 on success or failure.
2798
2799 The struct that holds a backtracking state is actually a big union, with
2800 one variant for each major type of op. The variable st points to the
2801 top-most backtrack struct. To make the code clearer, within each
2802 block of code we #define ST to alias the relevant union.
2803
2804 Here's a concrete example of a (vastly oversimplified) IFMATCH
2805 implementation:
2806
2807     switch (state) {
2808     ....
2809
2810 #define ST st->u.ifmatch
2811
2812     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2813         ST.foo = ...; // some state we wish to save
2814         ...
2815         // push a yes backtrack state with a resume value of
2816         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2817         // first node of A:
2818         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2819         // NOTREACHED
2820
2821     case IFMATCH_A: // we have successfully executed A; now continue with B
2822         next = B;
2823         bar = ST.foo; // do something with the preserved value
2824         break;
2825
2826     case IFMATCH_A_fail: // A failed, so the assertion failed
2827         ...;   // do some housekeeping, then ...
2828         sayNO; // propagate the failure
2829
2830 #undef ST
2831
2832     ...
2833     }
2834
2835 For any old-timers reading this who are familiar with the old recursive
2836 approach, the code above is equivalent to:
2837
2838     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2839     {
2840         int foo = ...
2841         ...
2842         if (regmatch(A)) {
2843             next = B;
2844             bar = foo;
2845             break;
2846         }
2847         ...;   // do some housekeeping, then ...
2848         sayNO; // propagate the failure
2849     }
2850
2851 The topmost backtrack state, pointed to by st, is usually free. If you
2852 want to claim it, populate any ST.foo fields in it with values you wish to
2853 save, then do one of
2854
2855         PUSH_STATE_GOTO(resume_state, node);
2856         PUSH_YES_STATE_GOTO(resume_state, node);
2857
2858 which sets that backtrack state's resume value to 'resume_state', pushes a
2859 new free entry to the top of the backtrack stack, then goes to 'node'.
2860 On backtracking, the free slot is popped, and the saved state becomes the
2861 new free state. An ST.foo field in this new top state can be temporarily
2862 accessed to retrieve values, but once the main loop is re-entered, it
2863 becomes available for reuse.
2864
2865 Note that the depth of the backtrack stack constantly increases during the
2866 left-to-right execution of the pattern, rather than going up and down with
2867 the pattern nesting. For example the stack is at its maximum at Z at the
2868 end of the pattern, rather than at X in the following:
2869
2870     /(((X)+)+)+....(Y)+....Z/
2871
2872 The only exceptions to this are lookahead/behind assertions and the cut,
2873 (?>A), which pop all the backtrack states associated with A before
2874 continuing.
2875
2876 Backtrack state structs are allocated in slabs of about 4K in size.
2877 PL_regmatch_state and st always point to the currently active state,
2878 and PL_regmatch_slab points to the slab currently containing
2879 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2880 allocated, and is never freed until interpreter destruction. When the slab
2881 is full, a new one is allocated and chained to the end. At exit from
2882 regmatch(), slabs allocated since entry are freed.
2883
2884 */
2885
2886 /* DEBUG_STATE_pp(pp): inside DEBUG_STATE_r, dump the current execution
      * position and then print one line for the state st: four spaces,
      * depth*2 columns of indentation, the text pp, and the name
      * PL_reg_name[st->resume_state].  The name is followed by "[Y]" when st
      * is yes_state, "[M]" when st is mark_state, and "[YM]" when it is both.
      *
      * pp is pasted into the format string by adjacent-literal concatenation,
      * so it must be a string literal.  The macro uses the locals locinput,
      * scan, utf8_target, depth, st, yes_state and mark_state of the function
      * it is expanded in, and its expansion already ends in ';'.
      */
2887 #define DEBUG_STATE_pp(pp)                                  \
2888     DEBUG_STATE_r({                                         \
2889         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2890         PerlIO_printf(Perl_debug_log,                       \
2891             "    %*s"pp" %s%s%s%s%s\n",                     \
2892             depth*2, "",                                    \
2893             PL_reg_name[st->resume_state],                     \
2894             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2895             ((st==yes_state) ? "Y" : ""),                   \
2896             ((st==mark_state) ? "M" : ""),                  \
2897             ((st==yes_state||st==mark_state) ? "]" : "")    \
2898         );                                                  \
2899     });
2900
2901 /* Distance of pointer x from prog, as an int, or -1 when x is NULL.
      * prog is not a macro parameter: it is whatever is named prog in the
      * scope where the macro is expanded. */
2902 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2903
2904 #ifdef DEBUGGING
2905 /* Print a debug banner to Perl_debug_log announcing a match attempt:
      *
      *     <blurb> REx <pattern> against <target>
      *
      * followed, when the pattern and/or the target is UTF-8, by a line
      * "UTF-8 pattern...", "UTF-8 string..." or "UTF-8 pattern and string...".
      *
      *   prog         regexp whose precompiled pattern text is displayed
      *   utf8_target  true if the target string is UTF-8
      *   start, end   bounds of the target text displayed (end - start bytes)
      *   blurb        text printed before "REx"
      */
2906 STATIC void
2907 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2908     const char *start, const char *end, const char *blurb)
2909 {
2910     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2911
2912     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2913
         /* PL_colors[] is used below, so make sure it has been set up */
2914     if (!PL_colorset)
2915             reginitcolors();
2916     {
             /* s0 and s1 are declared by RE_PV_QUOTED_DECL; presumably they
              * are display copies of the pattern and of the target, limited
              * by the final argument (60) -- confirm against the macro */
2917         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2918             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2919
2920         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2921             start, end - start, 60);
2922
             /* PL_colors[4] and PL_colors[5] bracket the blurb and "REx" */
2923         PerlIO_printf(Perl_debug_log,
2924             "%s%s REx%s %s against %s\n",
2925                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2926
2927         if (utf8_target||utf8_pat)
2928             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2929                 utf8_pat ? "pattern" : "",
2930                 utf8_pat && utf8_target ? " and " : "",
2931                 utf8_target ? "string" : ""
2932             );
2933     }
2934 }
2935 /* Print a picture of the current match position to Perl_debug_log: the
      * offset of locinput from loc_bostr, then '<', some text from before
      * locinput, some text from after it, '>', space padding and a closing
      * '|'.  No newline is written.
      *
      *   locinput          current position in the target string
      *   scan              a regnode; only inspected to see whether its op
      *                     is CANY
      *   loc_regeol        end of the target string
      *   loc_bostr         start of the target string
      *   loc_reg_starttry  presumably where the current match attempt began
      *                     (inferred from the name -- confirm at the caller)
      *   utf8_target       true if the target string is UTF-8
      */
2936 STATIC void
2937 S_dump_exec_pos(pTHX_ const char *locinput,
2938                       const regnode *scan,
2939                       const char *loc_regeol,
2940                       const char *loc_bostr,
2941                       const char *loc_reg_starttry,
2942                       const bool utf8_target)
2943 {
         /* colors count as "on" if any of these three color strings is
          * non-empty */
2944     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2945     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
         /* l: number of bytes after locinput to show, at most taill */
2946     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2947     /* The part of the string before starttry has one color
2948        (pref0_len chars), between starttry and current
2949        position another one (pref_len - pref0_len chars),
2950        after the current position the third one.
2951        We assume that pref0_len <= pref_len, otherwise we
2952        decrease pref0_len.  */
         /* pref_len: number of bytes before locinput to show, at most
          * (5 + taill) - l */
2953     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2954         ? (5 + taill) - l : locinput - loc_bostr;
2955     int pref0_len;
2956
2957     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2958
         /* move the start of the prefix back while it sits on a UTF-8
          * continuation byte.
          * NOTE(review): this loop is not bounded by loc_bostr; it appears to
          * rely on the target being well-formed UTF-8 -- confirm */
2959     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2960         pref_len++;
2961     pref0_len = pref_len  - (locinput - loc_reg_starttry);
         /* if the prefix came out short and there is more text after
          * locinput, let the tail use the spare room */
2962     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2963         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2964               ? (5 + taill) - pref_len : loc_regeol - locinput);
         /* shorten the tail while its end sits on a UTF-8 continuation byte.
          * NOTE(review): l is not checked against 0 here -- confirm */
2965     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2966         l--;
         /* clamp pref0_len into the range 0..pref_len */
2967     if (pref0_len < 0)
2968         pref0_len = 0;
2969     if (pref0_len > pref_len)
2970         pref0_len = pref_len;
2971     {
             /* is_uni is false for a non-UTF-8 target and for the CANY op */
2972         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2973
             /* s0/len0: the text before loc_reg_starttry */
2974         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2975             (locinput - pref_len),pref0_len, 60, 4, 5);
2976
             /* s1/len1: the text from there up to locinput */
2977         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2978                     (locinput - pref_len + pref0_len),
2979                     pref_len - pref0_len, 60, 2, 3);
2980
             /* s2/len2: the text from locinput onwards */
2981         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2982                     locinput, loc_regeol - locinput, 10, 0, 1);
2983
             /* pad with spaces so that the three segments plus padding fill
              * at least 19 columns before the '|' */
2984         const STRLEN tlen=len0+len1+len2;
2985         PerlIO_printf(Perl_debug_log,
2986                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2987                     (IV)(locinput - loc_bostr),
2988                     len0, s0,
2989                     len1, s1,
2990                     (docolor ? "" : "> <"),
2991                     len2, s2,
2992                     (int)(tlen > 19 ? 0 :  19 - tlen),
2993                     "");
2994     }
2995 }
2996
2997 #endif
2998
2999 /* reg_check_named_buff_matched()
3000  * Checks to see if a named buffer has matched. The data array of
3001  * buffer numbers corresponding to the buffer is expected to reside
3002  * in the regexp->data->data array in the slot stored in the ARG() of
3003  * node involved. Note that this routine doesn't actually care about the
3004  * name, that information is not preserved from compilation to execution.
3005  * Returns the index of the leftmost defined buffer with the given name
3006  * or 0 if none of the buffers matched.
      *
      * The candidates are tried in the order in which they are stored in the
      * data array, and the first one that qualifies is returned.
3007  */
3008 STATIC I32
3009 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3010 {
3011     I32 n;
3012     RXi_GET_DECL(rex,rexi);
         /* sv_dat's string buffer is read as an array of I32 buffer numbers,
          * and its IV slot as the number of entries in that array */
3013     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3014     I32 *nums=(I32*)SvPVX(sv_dat);
3015
3016     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3017
3018     for ( n=0; n<SvIVX(sv_dat); n++ ) {
             /* a buffer qualifies if its number does not exceed
              * *PL_reglastparen and its end offset is set (not -1) */
3019         if ((I32)*PL_reglastparen >= nums[n] &&
3020             PL_regoffs[nums[n]].end != -1)
3021         {
3022             return nums[n];
3023         }
3024     }
3025     return 0;
3026 }
3027
3028
3029 /* free all slabs above current one  - called during LEAVE_SCOPE
      *
      * "Above" means every slab chained after PL_regmatch_slab via ->next;
      * PL_regmatch_slab itself is kept.  S_regmatch registers this function
      * with SAVEDESTRUCTOR_X, passing NULL for p, which is unused.
      */
3030
3031 STATIC void
3032 S_clear_backtrack_stack(pTHX_ void *p)
3033 {
3034     regmatch_slab *s = PL_regmatch_slab->next;
3035     PERL_UNUSED_ARG(p);
3036
3037     if (!s)
3038         return;
         /* detach the tail of the chain from the current slab first, then
          * free it one slab at a time */
3039     PL_regmatch_slab->next = NULL;
3040     while (s) {
             /* read the next pointer before freeing the slab that holds it */
3041         regmatch_slab * const osl = s;
3042         s = s->next;
3043         Safefree(osl);
3044     }
3045 }
3046
3047 /* SETREX(Re1, Re2): assign Re2 to Re1; when PL_reg_eval_set is true,
      * first also install Re2 in PL_reg_curpm with PM_SETRE.  The expansion
      * is an "if" statement followed by an assignment with no trailing
      * semicolon, so it must not be used as the unbraced body of an if/else
      * or loop, and Re2 may be evaluated twice. */
3048 #define SETREX(Re1,Re2) \
3049     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3050     Re1 = (Re2)
3051
3052 STATIC I32                      /* 0 failure, 1 success */
3053 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3054 {
3055 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3056     dMY_CXT;
3057 #endif
3058     dVAR;
3059     register const bool utf8_target = PL_reg_match_utf8;
3060     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3061     REGEXP *rex_sv = reginfo->prog;
3062     regexp *rex = (struct regexp *)SvANY(rex_sv);
3063     RXi_GET_DECL(rex,rexi);
3064     I32 oldsave;
3065     /* the current state. This is a cached copy of PL_regmatch_state */
3066     register regmatch_state *st;
3067     /* cache heavy used fields of st in registers */
3068     register regnode *scan;
3069     register regnode *next;
3070     register U32 n = 0; /* general value; init to avoid compiler warning */
3071     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3072     register char *locinput = PL_reginput;
3073     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3074
3075     bool result = 0;        /* return value of S_regmatch */
3076     int depth = 0;          /* depth of backtrack stack */
3077     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3078     const U32 max_nochange_depth =
3079         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3080         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3081     regmatch_state *yes_state = NULL; /* state to pop to on success of
3082                                                             subpattern */
3083     /* mark_state piggy backs on the yes_state logic so that when we unwind
3084        the stack on success we can update the mark_state as we go */
3085     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3086     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3087     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3088     U32 state_num;
3089     bool no_final = 0;      /* prevent failure from backtracking? */
3090     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3091     char *startpoint = PL_reginput;
3092     SV *popmark = NULL;     /* are we looking for a mark? */
3093     SV *sv_commit = NULL;   /* last mark name seen in failure */
3094     SV *sv_yes_mark = NULL; /* last mark name we have seen
3095                                during a successful match */
3096     U32 lastopen = 0;       /* last open we saw */
3097     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3098     SV* const oreplsv = GvSV(PL_replgv);
3099     /* these three flags are set by various ops to signal information to
3100      * the very next op. They have a useful lifetime of exactly one loop
3101      * iteration, and are not preserved or restored by state pushes/pops
3102      */
3103     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3104     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3105     int logical = 0;        /* the following EVAL is:
3106                                 0: (?{...})
3107                                 1: (?(?{...})X|Y)
3108                                 2: (??{...})
3109                                or the following IFMATCH/UNLESSM is:
3110                                 false: plain (?=foo)
3111                                 true:  used as a condition: (?(?=foo))
3112                             */
3113 #ifdef DEBUGGING
3114     GET_RE_DEBUG_FLAGS_DECL;
3115 #endif
3116
3117     PERL_ARGS_ASSERT_REGMATCH;
3118
3119     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3120             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3121     }));
3122     /* on first ever call to regmatch, allocate first slab */
3123     if (!PL_regmatch_slab) {
3124         Newx(PL_regmatch_slab, 1, regmatch_slab);
3125         PL_regmatch_slab->prev = NULL;
3126         PL_regmatch_slab->next = NULL;
3127         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3128     }
3129
3130     oldsave = PL_savestack_ix;
3131     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3132     SAVEVPTR(PL_regmatch_slab);
3133     SAVEVPTR(PL_regmatch_state);
3134
3135     /* grab next free state slot */
3136     st = ++PL_regmatch_state;
3137     if (st >  SLAB_LAST(PL_regmatch_slab))
3138         st = PL_regmatch_state = S_push_slab(aTHX);
3139
3140     /* Note that nextchr is a byte even in UTF */
3141     nextchr = UCHARAT(locinput);
3142     scan = prog;
3143     while (scan != NULL) {
3144
3145         DEBUG_EXECUTE_r( {
3146             SV * const prop = sv_newmortal();
3147             regnode *rnext=regnext(scan);
3148             DUMP_EXEC_POS( locinput, scan, utf8_target );
3149             regprop(rex, prop, scan);
3150
3151             PerlIO_printf(Perl_debug_log,
3152                     "%3"IVdf":%*s%s(%"IVdf")\n",
3153                     (IV)(scan - rexi->program), depth*2, "",
3154                     SvPVX_const(prop),
3155                     (PL_regkind[OP(scan)] == END || !rnext) ?
3156                         0 : (IV)(rnext - rexi->program));
3157         });
3158
3159         next = scan + NEXT_OFF(scan);
3160         if (next == scan)
3161             next = NULL;
3162         state_num = OP(scan);
3163
3164         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3165       reenter_switch:
3166
3167         assert(PL_reglastparen == &rex->lastparen);
3168         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3169         assert(PL_regoffs == rex->offs);
3170
3171         switch (state_num) {
3172         case BOL:
3173             if (locinput == PL_bostr)
3174             {
3175                 /* reginfo->till = reginfo->bol; */
3176                 break;
3177             }
3178             sayNO;
3179         case MBOL:
3180             if (locinput == PL_bostr ||
3181                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3182             {
3183                 break;
3184             }
3185             sayNO;
3186         case SBOL:
3187             if (locinput == PL_bostr)
3188                 break;
3189             sayNO;
3190         case GPOS:
3191             if (locinput == reginfo->ganch)
3192                 break;
3193             sayNO;
3194
3195         case KEEPS:
3196             /* update the startpoint */
3197             st->u.keeper.val = PL_regoffs[0].start;
3198             PL_reginput = locinput;
3199             PL_regoffs[0].start = locinput - PL_bostr;
3200             PUSH_STATE_GOTO(KEEPS_next, next);
3201             /*NOT-REACHED*/
3202         case KEEPS_next_fail:
3203             /* rollback the start point change */
3204             PL_regoffs[0].start = st->u.keeper.val;
3205             sayNO_SILENT;
3206             /*NOT-REACHED*/
3207         case EOL:
3208                 goto seol;
3209         case MEOL:
3210             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3211                 sayNO;
3212             break;
3213         case SEOL:
3214           seol:
3215             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3216                 sayNO;
3217             if (PL_regeol - locinput > 1)
3218                 sayNO;
3219             break;
3220         case EOS:
3221             if (PL_regeol != locinput)
3222                 sayNO;
3223             break;
3224         case SANY:
3225             if (!nextchr && locinput >= PL_regeol)
3226                 sayNO;
3227             if (utf8_target) {
3228                 locinput += PL_utf8skip[nextchr];
3229                 if (locinput > PL_regeol)
3230                     sayNO;
3231                 nextchr = UCHARAT(locinput);
3232             }
3233             else
3234                 nextchr = UCHARAT(++locinput);
3235             break;
3236         case CANY:
3237             if (!nextchr && locinput >= PL_regeol)
3238                 sayNO;
3239             nextchr = UCHARAT(++locinput);
3240             break;
3241         case REG_ANY:
3242             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3243                 sayNO;
3244             if (utf8_target) {
3245                 locinput += PL_utf8skip[nextchr];
3246                 if (locinput > PL_regeol)
3247                     sayNO;
3248                 nextchr = UCHARAT(locinput);
3249             }
3250             else
3251                 nextchr = UCHARAT(++locinput);
3252             break;
3253
3254 #undef  ST
3255 #define ST st->u.trie
3256         case TRIEC:
3257             /* In this case the charclass data is available inline so
3258                we can fail fast without a lot of extra overhead.
3259              */
3260             if (scan->flags == EXACT || !utf8_target) {
3261                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3262                     DEBUG_EXECUTE_r(
3263                         PerlIO_printf(Perl_debug_log,
3264                                   "%*s  %sfailed to match trie start class...%s\n",
3265                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3266                     );
3267                     sayNO_SILENT;
3268                     /* NOTREACHED */
3269                 }
3270             }
3271             /* FALL THROUGH */
3272         case TRIE:
3273             /* the basic plan of execution of the trie is:
3274              * At the beginning, run though all the states, and
3275              * find the longest-matching word. Also remember the position
3276              * of the shortest matching word. For example, this pattern:
3277              *    1  2 3 4    5
3278              *    ab|a|x|abcd|abc
3279              * when matched against the string "abcde", will generate
3280              * accept states for all words except 3, with the longest
3281              * matching word being 4, and the shortest being 1 (with
3282              * the position being after char 1 of the string).
3283              *
3284              * Then for each matching word, in word order (i.e. 1,2,4,5),
3285              * we run the remainder of the pattern; on each try setting
3286              * the current position to the character following the word,
3287              * returning to try the next word on failure.
3288              *
3289              * We avoid having to build a list of words at runtime by
3290              * using a compile-time structure, wordinfo[].prev, which
3291              * gives, for each word, the previous accepting word (if any).
3292              * In the case above it would contain the mappings 1->2, 2->0,
3293              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3294              * the longest word (4 above), a list of all words, by
3295              * following the list of prev pointers; this gives us the
3296              * unordered list 4,5,1,2. Then given the current word we have
3297              * just tried, we can go through the list and find the
3298              * next-biggest word to try (so if we just failed on word 2,
3299              * the next in the list is 4).
3300              *
3301              * Since at runtime we don't record the matching position in
3302              * the string for each word, we have to work that out for
3303              * each word we're about to process. The wordinfo table holds
3304              * the character length of each word; given that we recorded
3305              * at the start: the position of the shortest word and its
3306              * length in chars, we just need to move the pointer the
3307              * difference between the two char lengths. Depending on
3308              * Unicode status and folding, that's cheap or expensive.
3309              *
3310              * This algorithm is optimised for the case where are only a
3311              * small number of accept states, i.e. 0,1, or maybe 2.
3312              * With lots of accepts states, and having to try all of them,
3313              * it becomes quadratic on number of accept states to find all
3314              * the next words.
3315              */
3316
3317             {
3318                 /* what type of TRIE am I? (utf8 makes this contextual) */
3319                 DECL_TRIE_TYPE(scan);
3320
3321                 /* what trie are we using right now */
3322                 reg_trie_data * const trie
3323                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3324                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3325                 U32 state = trie->startstate;
3326
3327                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3328                     !TRIE_BITMAP_TEST(trie,*locinput)
3329                 ) {
3330                     if (trie->states[ state ].wordnum) {
3331                          DEBUG_EXECUTE_r(
3332                             PerlIO_printf(Perl_debug_log,
3333                                           "%*s  %smatched empty string...%s\n",
3334                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3335                         );
3336                         if (!trie->jump)
3337                             break;
3338                     } else {
3339                         DEBUG_EXECUTE_r(
3340                             PerlIO_printf(Perl_debug_log,
3341                                           "%*s  %sfailed to match trie start class...%s\n",
3342                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3343                         );
3344                         sayNO_SILENT;
3345                    }
3346                 }
3347
3348             {
3349                 U8 *uc = ( U8* )locinput;
3350
3351                 STRLEN len = 0;
3352                 STRLEN foldlen = 0;
3353                 U8 *uscan = (U8*)NULL;
3354                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3355                 U32 charcount = 0; /* how many input chars we have matched */
3356                 U32 accepted = 0; /* have we seen any accepting states? */
3357
3358                 ST.B = next;
3359                 ST.jump = trie->jump;
3360                 ST.me = scan;
3361                 ST.firstpos = NULL;
3362                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3363                 ST.nextword = 0;
3364
3365                 /* fully traverse the TRIE; note the position of the
3366                    shortest accept state and the wordnum of the longest
3367                    accept state */
3368
3369                 while ( state && uc <= (U8*)PL_regeol ) {
3370                     U32 base = trie->states[ state ].trans.base;
3371                     UV uvc = 0;
3372                     U16 charid = 0;
3373                     U16 wordnum;
3374                     wordnum = trie->states[ state ].wordnum;
3375
3376                     if (wordnum) { /* it's an accept state */
3377                         if (!accepted) {
3378                             accepted = 1;
3379                             /* record first match position */
3380                             if (ST.longfold) {
3381                                 ST.firstpos = (U8*)locinput;
3382                                 ST.firstchars = 0;
3383                             }
3384                             else {
3385                                 ST.firstpos = uc;
3386                                 ST.firstchars = charcount;
3387                             }
3388                         }
3389                         if (!ST.nextword || wordnum < ST.nextword)
3390                             ST.nextword = wordnum;
3391                         ST.topword = wordnum;
3392                     }
3393
3394                     DEBUG_TRIE_EXECUTE_r({
3395                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3396                                 PerlIO_printf( Perl_debug_log,
3397                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3398                                     2+depth * 2, "", PL_colors[4],
3399                                     (UV)state, (accepted ? 'Y' : 'N'));
3400                     });
3401
3402                     /* read a char and goto next state */
3403                     if ( base ) {
3404                         I32 offset;
3405                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3406                                              uscan, len, uvc, charid, foldlen,
3407                                              foldbuf, uniflags);
3408                         charcount++;
3409                         if (foldlen>0)
3410                             ST.longfold = TRUE;
3411                         if (charid &&
3412                              ( ((offset =
3413                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3414
3415                              && ((U32)offset < trie->lasttrans)
3416                              && trie->trans[offset].check == state)
3417                         {
3418                             state = trie->trans[offset].next;
3419                         }
3420                         else {
3421                             state = 0;
3422                         }
3423                         uc += len;
3424
3425                     }
3426                     else {
3427                         state = 0;
3428                     }
3429                     DEBUG_TRIE_EXECUTE_r(
3430                         PerlIO_printf( Perl_debug_log,
3431                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3432                             charid, uvc, (UV)state, PL_colors[5] );
3433                     );
3434                 }
3435                 if (!accepted)
3436                    sayNO;
3437
3438                 /* calculate total number of accept states */
3439                 {
3440                     U16 w = ST.topword;
3441                     accepted = 0;
3442                     while (w) {
3443                         w = trie->wordinfo[w].prev;
3444                         accepted++;
3445                     }
3446                     ST.accepted = accepted;
3447                 }
3448
3449                 DEBUG_EXECUTE_r(
3450                     PerlIO_printf( Perl_debug_log,
3451                         "%*s  %sgot %"IVdf" possible matches%s\n",
3452                         REPORT_CODE_OFF + depth * 2, "",
3453                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3454                 );
3455                 goto trie_first_try; /* jump into the fail handler */
3456             }}
3457             /* NOTREACHED */
3458
3459         case TRIE_next_fail: /* we failed - try next alternative */
3460             if ( ST.jump) {
3461                 REGCP_UNWIND(ST.cp);
3462                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3463                     PL_regoffs[n].end = -1;
3464                 *PL_reglastparen = n;
3465             }
3466             if (!--ST.accepted) {
3467                 DEBUG_EXECUTE_r({
3468                     PerlIO_printf( Perl_debug_log,
3469                         "%*s  %sTRIE failed...%s\n",
3470                         REPORT_CODE_OFF+depth*2, "",
3471                         PL_colors[4],
3472                         PL_colors[5] );
3473                 });
3474                 sayNO_SILENT;
3475             }
3476             {
3477                 /* Find next-highest word to process.  Note that this code
3478                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3479                 register U16 min = 0;
3480                 register U16 word;
3481                 register U16 const nextword = ST.nextword;
3482                 register reg_trie_wordinfo * const wordinfo
3483                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3484                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3485                     if (word > nextword && (!min || word < min))
3486                         min = word;
3487                 }
3488                 ST.nextword = min;
3489             }
3490
3491           trie_first_try:
3492             if (do_cutgroup) {
3493                 do_cutgroup = 0;
3494                 no_final = 0;
3495             }
3496
3497             if ( ST.jump) {
3498                 ST.lastparen = *PL_reglastparen;
3499                 REGCP_SET(ST.cp);
3500             }
3501
3502             /* find start char of end of current word */
3503             {
3504                 U32 chars; /* how many chars to skip */
3505                 U8 *uc = ST.firstpos;
3506                 reg_trie_data * const trie
3507                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3508
3509                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3510                             >=  ST.firstchars);
3511                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3512                             - ST.firstchars;
3513
3514                 if (ST.longfold) {
3515                     /* the hard option - fold each char in turn and find
3516                      * its folded length (which may be different) */
3517                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3518                     STRLEN foldlen;
3519                     STRLEN len;
3520                     UV uvc;
3521                     U8 *uscan;
3522
3523                     while (chars) {
3524                         if (utf8_target) {
3525                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3526                                                     uniflags);
3527                             uc += len;
3528                         }
3529                         else {
3530                             uvc = *uc;
3531                             uc++;
3532                         }
3533                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3534                         uscan = foldbuf;
3535                         while (foldlen) {
3536                             if (!--chars)
3537                                 break;
3538                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3539                                             uniflags);
3540                             uscan += len;
3541                             foldlen -= len;
3542                         }
3543                     }
3544                 }
3545                 else {
3546                     if (utf8_target)
3547                         while (chars--)
3548                             uc += UTF8SKIP(uc);
3549                     else
3550                         uc += chars;
3551                 }
3552                 PL_reginput = (char *)uc;
3553             }
3554
3555             scan = (ST.jump && ST.jump[ST.nextword])
3556                         ? ST.me + ST.jump[ST.nextword]
3557                         : ST.B;
3558
3559             DEBUG_EXECUTE_r({
3560                 PerlIO_printf( Perl_debug_log,
3561                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3562                     REPORT_CODE_OFF+depth*2, "",
3563                     PL_colors[4],
3564                     ST.nextword,
3565                     PL_colors[5]
3566                     );
3567             });
3568
3569             if (ST.accepted > 1 || has_cutgroup) {
3570                 PUSH_STATE_GOTO(TRIE_next, scan);
3571                 /* NOTREACHED */
3572             }
3573             /* only one choice left - just continue */
3574             DEBUG_EXECUTE_r({
3575                 AV *const trie_words
3576                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3577                 SV ** const tmp = av_fetch( trie_words,
3578                     ST.nextword-1, 0 );
3579                 SV *sv= tmp ? sv_newmortal() : NULL;
3580
3581                 PerlIO_printf( Perl_debug_log,
3582                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3583                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3584                     ST.nextword,
3585                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3586                             PL_colors[0], PL_colors[1],
3587                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3588                         )
3589                     : "not compiled under -Dr",
3590                     PL_colors[5] );
3591             });
3592
3593             locinput = PL_reginput;
3594             nextchr = UCHARAT(locinput);
3595             continue; /* execute rest of RE */
3596             /* NOTREACHED */
3597 #undef  ST
3598
3599         case EXACT: {
3600             char *s = STRING(scan);
3601             ln = STR_LEN(scan);
3602             if (utf8_target != UTF_PATTERN) {
3603                 /* The target and the pattern have differing utf8ness. */
3604                 char *l = locinput;
3605                 const char * const e = s + ln;
3606
3607                 if (utf8_target) {
3608                     /* The target is utf8, the pattern is not utf8. */
3609                     while (s < e) {
3610                         STRLEN ulen;
3611                         if (l >= PL_regeol)
3612                              sayNO;
3613                         if (NATIVE_TO_UNI(*(U8*)s) !=
3614                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3615                                             uniflags))
3616                              sayNO;
3617                         l += ulen;
3618                         s ++;
3619                     }
3620                 }
3621                 else {
3622                     /* The target is not utf8, the pattern is utf8. */
3623                     while (s < e) {
3624                         STRLEN ulen;
3625                         if (l >= PL_regeol)
3626                             sayNO;
3627                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3628                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3629                                            uniflags))
3630                             sayNO;
3631                         s += ulen;
3632                         l ++;
3633                     }
3634                 }
3635                 locinput = l;
3636                 nextchr = UCHARAT(locinput);
3637                 break;
3638             }
3639             /* The target and the pattern have the same utf8ness. */
3640             /* Inline the first character, for speed. */
3641             if (UCHARAT(s) != nextchr)
3642                 sayNO;
3643             if (PL_regeol - locinput < ln)
3644                 sayNO;
3645             if (ln > 1 && memNE(s, locinput, ln))
3646                 sayNO;
3647             locinput += ln;
3648             nextchr = UCHARAT(locinput);
3649             break;
3650             }
3651         case EXACTFL: {
3652             re_fold_t folder;
3653             const U8 * fold_array;
3654             const char * s;
3655             U32 fold_utf8_flags;
3656
3657             PL_reg_flags |= RF_tainted;
3658             folder = foldEQ_locale;
3659             fold_array = PL_fold_locale;
3660             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3661             goto do_exactf;
3662
3663         case EXACTFU:
3664             folder = foldEQ_latin1;
3665             fold_array = PL_fold_latin1;
3666             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3667             goto do_exactf;
3668
3669         case EXACTFA:
3670             folder = foldEQ_latin1;
3671             fold_array = PL_fold_latin1;
3672             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3673             goto do_exactf;
3674
3675         case EXACTF:
3676             folder = foldEQ;
3677             fold_array = PL_fold;
3678             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3679
3680           do_exactf:
3681             s = STRING(scan);
3682             ln = STR_LEN(scan);
3683
3684             if (utf8_target || UTF_PATTERN) {
3685               /* Either target or the pattern are utf8. */
3686                 const char * const l = locinput;
3687                 char *e = PL_regeol;
3688
3689                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3690                                l, &e, 0,  utf8_target, fold_utf8_flags))
3691                 {
3692                     sayNO;
3693                 }
3694                 locinput = e;
3695                 nextchr = UCHARAT(locinput);
3696                 break;
3697             }
3698
3699             /* Neither the target nor the pattern are utf8 */
3700             if (UCHARAT(s) != nextchr &&
3701                 UCHARAT(s) != fold_array[nextchr])
3702             {
3703                 sayNO;
3704             }
3705             if (PL_regeol - locinput < ln)
3706                 sayNO;
3707             if (ln > 1 && ! folder(s, locinput, ln))
3708                 sayNO;
3709             locinput += ln;
3710             nextchr = UCHARAT(locinput);
3711             break;
3712         }
3713
3714         /* XXX Could improve efficiency by separating these all out using a
3715          * macro or in-line function.  At that point regcomp.c would no longer
3716          * have to set the FLAGS fields of these */
3717         case BOUNDL:
3718         case NBOUNDL:
3719             PL_reg_flags |= RF_tainted;
3720             /* FALL THROUGH */
3721         case BOUND:
3722         case BOUNDU:
3723         case BOUNDA:
3724         case NBOUND:
3725         case NBOUNDU:
3726         case NBOUNDA:
3727             /* was last char in word? */
3728             if (utf8_target
3729                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3730                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3731             {
3732                 if (locinput == PL_bostr)
3733                     ln = '\n';
3734                 else {
3735                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3736
3737                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3738                 }
3739                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3740                     ln = isALNUM_uni(ln);
3741                     LOAD_UTF8_CHARCLASS_ALNUM();
3742                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3743                 }
3744                 else {
3745                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3746                     n = isALNUM_LC_utf8((U8*)locinput);
3747                 }
3748             }
3749             else {
3750
3751                 /* Here the string isn't utf8, or is utf8 and only ascii
3752                  * characters are to match \w.  In the latter case looking at
3753                  * the byte just prior to the current one may be just the final
3754                  * byte of a multi-byte character.  This is ok.  There are two
3755                  * cases:
3756                  * 1) it is a single byte character, and then the test is doing
3757                  *      just what it's supposed to.
3758                  * 2) it is a multi-byte character, in which case the final
3759                  *      byte is never mistakable for ASCII, and so the test
3760                  *      will say it is not a word character, which is the
3761                  *      correct answer. */
3762                 ln = (locinput != PL_bostr) ?
3763                     UCHARAT(locinput - 1) : '\n';
3764                 switch (FLAGS(scan)) {
3765                     case REGEX_UNICODE_CHARSET:
3766                         ln = isWORDCHAR_L1(ln);
3767                         n = isWORDCHAR_L1(nextchr);
3768                         break;
3769                     case REGEX_LOCALE_CHARSET:
3770                         ln = isALNUM_LC(ln);
3771                         n = isALNUM_LC(nextchr);
3772                         break;
3773                     case REGEX_DEPENDS_CHARSET:
3774                         ln = isALNUM(ln);
3775                         n = isALNUM(nextchr);
3776                         break;
3777                     case REGEX_ASCII_RESTRICTED_CHARSET:
3778                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3779                         ln = isWORDCHAR_A(ln);
3780                         n = isWORDCHAR_A(nextchr);
3781                         break;
3782                     default:
3783                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3784                         break;
3785                 }
3786             }
3787             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3788              * regcomp.sym */
3789             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3790                     sayNO;
3791             break;
3792         case ANYOFV:
3793         case ANYOF:
3794             if (utf8_target || state_num == ANYOFV) {
3795                 STRLEN inclasslen = PL_regeol - locinput;
3796                 if (locinput >= PL_regeol)
3797                     sayNO;
3798
3799                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3800                     sayNO;
3801                 locinput += inclasslen;
3802                 nextchr = UCHARAT(locinput);
3803                 break;
3804             }
3805             else {
3806                 if (nextchr < 0)
3807                     nextchr = UCHARAT(locinput);
3808                 if (!nextchr && locinput >= PL_regeol)
3809                     sayNO;
3810                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3811                     sayNO;
3812                 nextchr = UCHARAT(++locinput);
3813                 break;
3814             }
3815             break;
3816         /* Special char classes - The defines start on line 129 or so */
3817         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3818                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3819                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3820                   ALNUMA, NALNUMA, isWORDCHAR_A,
3821                   alnum, "a");
3822
3823         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3824                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3825                   SPACEU, NSPACEU, isSPACE_L1,
3826                   SPACEA, NSPACEA, isSPACE_A,
3827                   space, " ");
3828
3829         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3830                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3831                 DIGITA, NDIGITA, isDIGIT_A,
3832                 digit, "0");
3833
3834         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3835                        a Unicode extended Grapheme Cluster */
3836             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3837               extended Grapheme Cluster is:
3838
3839                CR LF
3840                | Prepend* Begin Extend*
3841                | .
3842
3843                Begin is (Hangul-syllable | ! Control)
3844                Extend is (Grapheme_Extend | Spacing_Mark)
3845                Control is [ GCB_Control CR LF ]
3846
3847                The discussion below shows how the code for CLUMP is derived
3848                from this regex.  Note that most of these concepts are from
3849                property values of the Grapheme Cluster Boundary (GCB) property.
3850                No code point can have multiple property values for a given
3851                property.  Thus a code point in Prepend can't be in Control, but
3852                it must be in !Control.  This is why Control above includes
3853                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3854                property separately, and so can't be in GCB_Control, even though
3855                they logically are controls.  Control is not the same as gc=cc,
3856                but includes format and other characters as well.
3857
3858                The Unicode definition of Hangul-syllable is:
3859                  ( L+
3860                    | (L* ( ( V | LV ) V* | LVT ) T*)
3861                    | T+
3862                   )
3863                Each of these is a value for the GCB property, and hence must be
3864                disjoint, so the order they are tested is immaterial, so the
3865                above can safely be changed to
3866                    T+
3867                    | L+
3868                    | (L* ( LVT | ( V | LV ) V*) T*)
3869
3870                The last two terms can be combined like this:
3871                    L* ( L
3872                         | (( LVT | ( V | LV ) V*) T*))
3873
3874                And refactored into this:
3875                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3876
3877                That means that if we have seen any L's at all we can quit
3878                there, but if the next character is an LVT, a V, or an LV we
3879                should keep going.
3880
3881                There is a subtlety with Prepend* which showed up in testing.
3882                Note that the Begin, and only the Begin is required in:
3883                 | Prepend* Begin Extend*
3884                Also, Begin contains '! Control'.  A Prepend must be a
3885                '!  Control', which means it must also be a Begin.  What it
3886                comes down to is that if we match Prepend* and then find no
3887                suitable Begin afterwards, that if we backtrack the last
3888                Prepend, that one will be a suitable Begin.
3889             */
3890
3891             if (locinput >= PL_regeol)
3892                 sayNO;
3893             if  (! utf8_target) {
3894
3895                 /* Match either CR LF  or '.', as all the other possibilities
3896                  * require utf8 */
3897                 locinput++;         /* Match the . or CR */
3898                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3899                                        match the LF */
3900                     && locinput < PL_regeol
3901                     && UCHARAT(locinput) == '\n') locinput++;
3902             }
3903             else {
3904
3905                 /* Utf8: See if is ( CR LF ); already know that locinput <
3906                  * PL_regeol, so locinput+1 is in bounds */
3907                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3908                     locinput += 2;
3909                 }
3910                 else {
3911                     /* In case have to backtrack to beginning, then match '.' */
3912                     char *starting = locinput;
3913
3914                     /* In case have to backtrack the last prepend */
3915                     char *previous_prepend = 0;
3916
3917                     LOAD_UTF8_CHARCLASS_GCB();
3918
3919                     /* Match (prepend)* */
3920                     while (locinput < PL_regeol
3921                            && swash_fetch(PL_utf8_X_prepend,
3922                                           (U8*)locinput, utf8_target))
3923                     {
3924                         previous_prepend = locinput;
3925                         locinput += UTF8SKIP(locinput);
3926                     }
3927
3928                     /* As noted above, if we matched a prepend character, but
3929                      * the next thing won't match, back off the last prepend we
3930                      * matched, as it is guaranteed to match the begin */
3931                     if (previous_prepend
3932                         && (locinput >=  PL_regeol
3933                             || ! swash_fetch(PL_utf8_X_begin,
3934                                              (U8*)locinput, utf8_target)))
3935                     {
3936                         locinput = previous_prepend;
3937                     }
3938
3939                     /* Note that here we know PL_regeol > locinput, as we
3940                      * tested that upon input to this switch case, and if we
3941                      * moved locinput forward, we tested the result just above
3942                      * and it either passed, or we backed off so that it will
3943                      * now pass */
3944                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3945
3946                         /* Here did not match the required 'Begin' in the
3947                          * second term.  So just match the very first
3948                          * character, the '.' of the final term of the regex */
3949                         locinput = starting + UTF8SKIP(starting);
3950                     } else {
3951
3952                         /* Here is the beginning of a character that can have
3953                          * an extender.  It is either a hangul syllable, or a
3954                          * non-control */
3955                         if (swash_fetch(PL_utf8_X_non_hangul,
3956                                         (U8*)locinput, utf8_target))
3957                         {
3958
3959                             /* Here not a Hangul syllable, must be a
3960                              * ('! Control') */
3961                             locinput += UTF8SKIP(locinput);
3962                         } else {
3963
3964                             /* Here is a Hangul syllable.  It can be composed
3965                              * of several individual characters.  One
3966                              * possibility is T+ */
3967                             if (swash_fetch(PL_utf8_X_T,
3968                                             (U8*)locinput, utf8_target))
3969                             {
3970                                 while (locinput < PL_regeol
3971                                         && swash_fetch(PL_utf8_X_T,
3972                                                         (U8*)locinput, utf8_target))
3973                                 {
3974                                     locinput += UTF8SKIP(locinput);
3975                                 }
3976                             } else {
3977
3978                                 /* Here, not T+, but is a Hangul.  That means
3979                                  * it is one of the others: L, LV, LVT or V,
3980                                  * and matches:
3981                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3982
3983                                 /* Match L*           */
3984                                 while (locinput < PL_regeol
3985                                         && swash_fetch(PL_utf8_X_L,
3986                                                         (U8*)locinput, utf8_target))
3987                                 {
3988                                     locinput += UTF8SKIP(locinput);
3989                                 }
3990
3991                                 /* Here, have exhausted L*.  If the next
3992                                  * character is not an LV, LVT nor V, it means
3993                                  * we had to have at least one L, so matches L+
3994                                  * in the original equation, we have a complete
3995                                  * hangul syllable.  Are done. */
3996
3997                                 if (locinput < PL_regeol
3998                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3999                                                     (U8*)locinput, utf8_target))
4000                                 {
4001
4002                                     /* Otherwise keep going.  Must be LV, LVT
4003                                      * or V.  See if LVT */
4004                                     if (swash_fetch(PL_utf8_X_LVT,
4005                                                     (U8*)locinput, utf8_target))
4006                                     {
4007                                         locinput += UTF8SKIP(locinput);
4008                                     } else {
4009
4010                                         /* Must be  V or LV.  Take it, then
4011                                          * match V*     */
4012                                         locinput += UTF8SKIP(locinput);
4013                                         while (locinput < PL_regeol
4014                                                 && swash_fetch(PL_utf8_X_V,
4015                                                          (U8*)locinput, utf8_target))
4016                                         {
4017                                             locinput += UTF8SKIP(locinput);
4018                                         }
4019                                     }
4020
4021                                     /* And any of LV, LVT, or V can be followed
4022                                      * by T*            */
4023                                     while (locinput < PL_regeol
4024                                            && swash_fetch(PL_utf8_X_T,
4025                                                            (U8*)locinput,
4026                                                            utf8_target))
4027                                     {
4028                                         locinput += UTF8SKIP(locinput);
4029                                     }
4030                                 }
4031                             }
4032                         }
4033
4034                         /* Match any extender */
4035                         while (locinput < PL_regeol
4036                                 && swash_fetch(PL_utf8_X_extend,
4037                                                 (U8*)locinput, utf8_target))
4038                         {
4039                             locinput += UTF8SKIP(locinput);
4040                         }
4041                     }
4042                 }
4043                 if (locinput > PL_regeol) sayNO;
4044             }
4045             nextchr = UCHARAT(locinput);
4046             break;
4047
4048         case NREFFL:
4049         {   /* The capture buffer cases.  The ones beginning with N for the
4050                named buffers just convert to the equivalent numbered and
4051                pretend they were called as the corresponding numbered buffer
4052                op.  */
4053             /* don't initialize these in the declaration, it makes C++
4054                unhappy */
4055             char *s;
4056             char type;
4057             re_fold_t folder;
4058             const U8 *fold_array;
4059             UV utf8_fold_flags;
4060
4061             PL_reg_flags |= RF_tainted;
4062             folder = foldEQ_locale;
4063             fold_array = PL_fold_locale;
4064             type = REFFL;
4065             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4066             goto do_nref;
4067
4068         case NREFFA:
4069             folder = foldEQ_latin1;
4070             fold_array = PL_fold_latin1;
4071             type = REFFA;
4072             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4073             goto do_nref;
4074
4075         case NREFFU:
4076             folder = foldEQ_latin1;
4077             fold_array = PL_fold_latin1;
4078             type = REFFU;
4079             utf8_fold_flags = 0;
4080             goto do_nref;
4081
4082         case NREFF:
4083             folder = foldEQ;
4084             fold_array = PL_fold;
4085             type = REFF;
4086             utf8_fold_flags = 0;
4087             goto do_nref;
4088
4089         case NREF:
4090             type = REF;
4091             folder = NULL;
4092             fold_array = NULL;
4093             utf8_fold_flags = 0;
4094           do_nref:
4095
4096             /* For the named back references, find the corresponding buffer
4097              * number */
4098             n = reg_check_named_buff_matched(rex,scan);
4099
4100             if ( ! n ) {
4101                 sayNO;
4102             }
4103             goto do_nref_ref_common;
4104
4105         case REFFL:
4106             PL_reg_flags |= RF_tainted;
4107             folder = foldEQ_locale;
4108             fold_array = PL_fold_locale;
4109             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4110             goto do_ref;
4111
4112         case REFFA:
4113             folder = foldEQ_latin1;
4114             fold_array = PL_fold_latin1;
4115             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4116             goto do_ref;
4117
4118         case REFFU:
4119             folder = foldEQ_latin1;
4120             fold_array = PL_fold_latin1;
4121             utf8_fold_flags = 0;
4122             goto do_ref;
4123
4124         case REFF:
4125             folder = foldEQ;
4126             fold_array = PL_fold;
4127             utf8_fold_flags = 0;
4128             goto do_ref;
4129
4130         case REF:
4131             folder = NULL;
4132             fold_array = NULL;
4133             utf8_fold_flags = 0;
4134
4135           do_ref:
4136             type = OP(scan);
4137             n = ARG(scan);  /* which paren pair */
4138
4139           do_nref_ref_common:
4140             ln = PL_regoffs[n].start;
4141             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4142             if (*PL_reglastparen < n || ln == -1)
4143                 sayNO;                  /* Do not match unless seen CLOSEn. */
4144             if (ln == PL_regoffs[n].end)
4145                 break;
4146
4147             s = PL_bostr + ln;
4148             if (type != REF     /* REF can do byte comparison */
4149                 && (utf8_target || type == REFFU))
4150             { /* XXX handle REFFL better */
4151                 char * limit = PL_regeol;
4152
4153                 /* This call case insensitively compares the entire buffer
4154                     * at s, with the current input starting at locinput, but
4155                     * not going off the end given by PL_regeol, and returns in
4156                     * limit upon success, how much of the current input was
4157                     * matched */
4158                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4159                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4160                 {
4161                     sayNO;
4162                 }
4163                 locinput = limit;
4164                 nextchr = UCHARAT(locinput);
4165                 break;
4166             }
4167
4168             /* Not utf8:  Inline the first character, for speed. */
4169             if (UCHARAT(s) != nextchr &&
4170                 (type == REF ||
4171                  UCHARAT(s) != fold_array[nextchr]))
4172                 sayNO;
4173             ln = PL_regoffs[n].end - ln;
4174             if (locinput + ln > PL_regeol)
4175                 sayNO;
4176             if (ln > 1 && (type == REF
4177                            ? memNE(s, locinput, ln)
4178                            : ! folder(s, locinput, ln)))
4179                 sayNO;
4180             locinput += ln;
4181             nextchr = UCHARAT(locinput);
4182             break;
4183         }
4184         case NOTHING:
4185         case TAIL:
4186             break;
4187         case BACK:
4188             break;
4189
4190 #undef  ST
4191 #define ST st->u.eval
4192         {
4193             SV *ret;
4194             REGEXP *re_sv;
4195             regexp *re;
4196             regexp_internal *rei;
4197             regnode *startpoint;
4198
4199         case GOSTART:
4200         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4201             if (cur_eval && cur_eval->locinput==locinput) {
4202                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4203                     Perl_croak(aTHX_ "Infinite recursion in regex");
4204                 if ( ++nochange_depth > max_nochange_depth )
4205                     Perl_croak(aTHX_
4206                         "Pattern subroutine nesting without pos change"
4207                         " exceeded limit in regex");
4208             } else {
4209                 nochange_depth = 0;
4210             }
4211             re_sv = rex_sv;
4212             re = rex;
4213             rei = rexi;
4214             (void)ReREFCNT_inc(rex_sv);
4215             if (OP(scan)==GOSUB) {
4216                 startpoint = scan + ARG2L(scan);
4217                 ST.close_paren = ARG(scan);
4218             } else {
4219                 startpoint = rei->program+1;
4220                 ST.close_paren = 0;
4221             }
4222             goto eval_recurse_doit;
4223             /* NOTREACHED */
4224         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4225             if (cur_eval && cur_eval->locinput==locinput) {
4226                 if ( ++nochange_depth > max_nochange_depth )
4227                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4228             } else {
4229                 nochange_depth = 0;
4230             }
4231             {
4232                 /* execute the code in the {...} */
4233                 dSP;
4234                 SV ** const before = SP;
4235                 OP_4tree * const oop = PL_op;
4236                 COP * const ocurcop = PL_curcop;
4237                 PAD *old_comppad;
4238                 char *saved_regeol = PL_regeol;
4239                 struct re_save_state saved_state;
4240
4241                 /* To not corrupt the existing regex state while executing the
4242                  * eval we would normally put it on the save stack, like with
4243                  * save_re_context. However, re-evals have a weird scoping so we
4244                  * can't just add ENTER/LEAVE here. With that, things like
4245                  *
4246                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4247                  *
4248                  * would break, as they expect the localisation to be unwound
4249                  * only when the re-engine backtracks through the bit that
4250                  * localised it.
4251                  *
4252                  * What we do instead is just save the state in a local C
4253                  * variable.
4254                  */
4255                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4256
4257                 n = ARG(scan);
4258                 PL_op = (OP_4tree*)rexi->data->data[n];
4259                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4260                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4261                 /* wrap the call in two SAVECOMPPADs. This ensures that
4262                  * when the save stack is eventually unwound, all the
4263                  * accumulated SAVEt_CLEARSV's will be processed with
4264                  * interspersed SAVEt_COMPPAD's to ensure that lexicals
4265                  * are cleared in the right pad */
4266                 SAVECOMPPAD();
4267                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4268                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4269
4270                 if (sv_yes_mark) {
4271                     SV *sv_mrk = get_sv("REGMARK", 1);
4272                     sv_setsv(sv_mrk, sv_yes_mark);
4273                 }
4274
4275                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4276                 SPAGAIN;
4277                 if (SP == before)
4278                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4279                 else {
4280                     ret = POPs;
4281                     PUTBACK;
4282                 }
4283
4284                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4285
4286                 PL_op = oop;
4287                 SAVECOMPPAD();
4288                 PAD_RESTORE_LOCAL(old_comppad);
4289                 PL_curcop = ocurcop;
4290                 PL_regeol = saved_regeol;
4291                 if (!logical) {
4292                     /* /(?{...})/ */
4293                     sv_setsv(save_scalar(PL_replgv), ret);
4294                     break;
4295                 }
4296             }
4297             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4298                 logical = 0;
4299                 {
4300                     /* extract RE object from returned value; compiling if
4301                      * necessary */
4302                     MAGIC *mg = NULL;
4303                     REGEXP *rx = NULL;
4304
4305                     if (SvROK(ret)) {
4306                         SV *const sv = SvRV(ret);
4307
4308                         if (SvTYPE(sv) == SVt_REGEXP) {
4309                             rx = (REGEXP*) sv;
4310                         } else if (SvSMAGICAL(sv)) {
4311                             mg = mg_find(sv, PERL_MAGIC_qr);
4312                             assert(mg);
4313                         }
4314                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4315                         rx = (REGEXP*) ret;
4316                     } else if (SvSMAGICAL(ret)) {
4317                         if (SvGMAGICAL(ret)) {
4318                             /* I don't believe that there is ever qr magic
4319                                here.  */
4320                             assert(!mg_find(ret, PERL_MAGIC_qr));
4321                             sv_unmagic(ret, PERL_MAGIC_qr);
4322                         }
4323                         else {
4324                             mg = mg_find(ret, PERL_MAGIC_qr);
4325                             /* testing suggests mg only ends up non-NULL for
4326                                scalars who were upgraded and compiled in the
4327                                else block below. In turn, this is only
4328                                triggered in the "postponed utf8 string" tests
4329                                in t/op/pat.t  */
4330                         }
4331                     }
4332
4333                     if (mg) {
4334                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4335                         assert(rx);
4336                     }
4337                     if (rx) {
4338                         rx = reg_temp_copy(NULL, rx);
4339                     }
4340                     else {
4341                         U32 pm_flags = 0;
4342                         const I32 osize = PL_regsize;
4343
4344                         if (DO_UTF8(ret)) {
4345                             assert (SvUTF8(ret));
4346                         } else if (SvUTF8(ret)) {
4347                             /* Not doing UTF-8, despite what the SV says. Is
4348                                this only if we're trapped in use 'bytes'?  */
4349                             /* Make a copy of the octet sequence, but without
4350                                the flag on, as the compiler now honours the
4351                                SvUTF8 flag on ret.  */
4352                             STRLEN len;
4353                             const char *const p = SvPV(ret, len);
4354                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4355                         }
4356                         rx = CALLREGCOMP(ret, pm_flags);
4357                         if (!(SvFLAGS(ret)
4358                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4359                                  | SVs_GMG))) {
4360                             /* This isn't a first class regexp. Instead, it's
4361                                caching a regexp onto an existing, Perl visible
4362                                scalar.  */
4363                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4364                         }
4365                         PL_regsize = osize;
4366                     }
4367                     re_sv = rx;
4368                     re = (struct regexp *)SvANY(rx);
4369                 }
4370                 RXp_MATCH_COPIED_off(re);
4371                 re->subbeg = rex->subbeg;
4372                 re->sublen = rex->sublen;
4373                 rei = RXi_GET(re);
4374                 DEBUG_EXECUTE_r(
4375                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4376                         "Matching embedded");
4377                 );
4378                 startpoint = rei->program + 1;
4379                 ST.close_paren = 0; /* only used for GOSUB */
4380                 /* borrowed from regtry */
4381                 if (PL_reg_start_tmpl <= re->nparens) {
4382                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4383                     if(PL_reg_start_tmp)
4384                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4385                     else
4386                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4387                 }
4388
4389         eval_recurse_doit: /* Share code with GOSUB below this line */
4390                 /* run the pattern returned from (??{...}) */
4391                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4392                 REGCP_SET(ST.lastcp);
4393
4394                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4395
4396                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (I don't know why) :dmq */
4397                 PL_reglastparen = &re->lastparen;
4398                 PL_reglastcloseparen = &re->lastcloseparen;
4399                 re->lastparen = 0;
4400                 re->lastcloseparen = 0;
4401
4402                 PL_reginput = locinput;
4403                 PL_regsize = 0;
4404
4405                 /* XXXX This is too dramatic a measure... */
4406                 PL_reg_maxiter = 0;
4407
4408                 ST.toggle_reg_flags = PL_reg_flags;
4409                 if (RX_UTF8(re_sv))
4410                     PL_reg_flags |= RF_utf8;
4411                 else
4412                     PL_reg_flags &= ~RF_utf8;
4413                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4414
4415                 ST.prev_rex = rex_sv;
4416                 ST.prev_curlyx = cur_curlyx;
4417                 SETREX(rex_sv,re_sv);
4418                 rex = re;
4419                 rexi = rei;
4420                 cur_curlyx = NULL;
4421                 ST.B = next;
4422                 ST.prev_eval = cur_eval;
4423                 cur_eval = st;
4424                 /* now continue from first node in postponed RE */
4425                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4426                 /* NOTREACHED */
4427             }
4428             /* logical is 1,   /(?(?{...})X|Y)/ */
4429             sw = cBOOL(SvTRUE(ret));
4430             logical = 0;
4431             break;
4432         }
4433
4434         case EVAL_AB: /* cleanup after a successful (??{A})B */
4435             /* note: this is called twice; first after popping B, then A */
4436             PL_reg_flags ^= ST.toggle_reg_flags;
4437             ReREFCNT_dec(rex_sv);
4438             SETREX(rex_sv,ST.prev_rex);
4439             rex = (struct regexp *)SvANY(rex_sv);
4440             rexi = RXi_GET(rex);
4441             regcpblow(ST.cp);
4442             cur_eval = ST.prev_eval;
4443             cur_curlyx = ST.prev_curlyx;
4444
4445             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4446             PL_reglastparen = &rex->lastparen;
4447             PL_reglastcloseparen = &rex->lastcloseparen;
4448             /* also update PL_regoffs */
4449             PL_regoffs = rex->offs;
4450
4451             /* XXXX This is too dramatic a measure... */
4452             PL_reg_maxiter = 0;
4453             if ( nochange_depth )
4454                 nochange_depth--;
4455             sayYES;
4456
4457
4458         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4459             /* note: this is called twice; first after popping B, then A */
4460             PL_reg_flags ^= ST.toggle_reg_flags;
4461             ReREFCNT_dec(rex_sv);
4462             SETREX(rex_sv,ST.prev_rex);
4463             rex = (struct regexp *)SvANY(rex_sv);
4464             rexi = RXi_GET(rex);
4465             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4466             PL_reglastparen = &rex->lastparen;
4467             PL_reglastcloseparen = &rex->lastcloseparen;
4468
4469             PL_reginput = locinput;
4470             REGCP_UNWIND(ST.lastcp);
4471             regcppop(rex);
4472             cur_eval = ST.prev_eval;
4473             cur_curlyx = ST.prev_curlyx;
4474             /* XXXX This is too dramatic a measure... */
4475             PL_reg_maxiter = 0;
4476             if ( nochange_depth )
4477                 nochange_depth--;
4478             sayNO_SILENT;
4479 #undef ST
4480
4481         case OPEN:
4482             n = ARG(scan);  /* which paren pair */
4483             PL_reg_start_tmp[n] = locinput;
4484             if (n > PL_regsize)
4485                 PL_regsize = n;
4486             lastopen = n;
4487             break;
4488         case CLOSE:
4489             n = ARG(scan);  /* which paren pair */
4490             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4491             PL_regoffs[n].end = locinput - PL_bostr;
4492             /*if (n > PL_regsize)
4493                 PL_regsize = n;*/
4494             if (n > *PL_reglastparen)
4495                 *PL_reglastparen = n;
4496             *PL_reglastcloseparen = n;
4497             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4498                 goto fake_end;
4499             }
4500             break;
4501         case ACCEPT:
4502             if (ARG(scan)){
4503                 regnode *cursor;
4504                 for (cursor=scan;
4505                      cursor && OP(cursor)!=END;
4506                      cursor=regnext(cursor))
4507                 {
4508                     if ( OP(cursor)==CLOSE ){
4509                         n = ARG(cursor);
4510                         if ( n <= lastopen ) {
4511                             PL_regoffs[n].start
4512                                 = PL_reg_start_tmp[n] - PL_bostr;
4513                             PL_regoffs[n].end = locinput - PL_bostr;
4514                             /*if (n > PL_regsize)
4515                             PL_regsize = n;*/
4516                             if (n > *PL_reglastparen)
4517                                 *PL_reglastparen = n;
4518                             *PL_reglastcloseparen = n;
4519                             if ( n == ARG(scan) || (cur_eval &&
4520                                 cur_eval->u.eval.close_paren == n))
4521                                 break;
4522                         }
4523                     }
4524                 }
4525             }
4526             goto fake_end;
4527             /*NOTREACHED*/
4528         case GROUPP:
4529             n = ARG(scan);  /* which paren pair */
4530             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4531             break;
4532         case NGROUPP:
4533             /* reg_check_named_buff_matched returns 0 for no match */
4534             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4535             break;
4536         case INSUBP:
4537             n = ARG(scan);
4538             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4539             break;
4540         case DEFINEP:
4541             sw = 0;
4542             break;
4543         case IFTHEN:
4544             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4545             if (sw)
4546                 next = NEXTOPER(NEXTOPER(scan));
4547             else {
4548                 next = scan + ARG(scan);
4549                 if (OP(next) == IFTHEN) /* Fake one. */
4550                     next = NEXTOPER(NEXTOPER(next));
4551             }
4552             break;
4553         case LOGICAL:
4554             logical = scan->flags;
4555             break;
4556
4557 /*******************************************************************
4558
4559 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4560 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4561 STAR/PLUS/CURLY/CURLYN are used instead.)
4562
4563 A*B is compiled as <CURLYX><A><WHILEM><B>
4564
4565 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4566 state, which contains the current count, initialised to -1. It also sets
4567 cur_curlyx to point to this state, with any previous value saved in the
4568 state block.
4569
4570 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4571 since the pattern may possibly match zero times (i.e. it's a while {} loop
4572 rather than a do {} while loop).
4573
4574 Each entry to WHILEM represents a successful match of A. The count in the
4575 CURLYX block is incremented, another WHILEM state is pushed, and execution
4576 passes to A or B depending on greediness and the current count.
4577
4578 For example, if matching against the string a1a2a3b (where the aN are
4579 substrings that match /A/), then the match progresses as follows: (the
4580 pushed states are interspersed with the bits of strings matched so far):
4581
4582     <CURLYX cnt=-1>
4583     <CURLYX cnt=0><WHILEM>
4584     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4585     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4586     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4587     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4588
4589 (Contrast this with something like CURLYM, which maintains only a single
4590 backtrack state:
4591
4592     <CURLYM cnt=0> a1
4593     a1 <CURLYM cnt=1> a2
4594     a1 a2 <CURLYM cnt=2> a3
4595     a1 a2 a3 <CURLYM cnt=3> b
4596 )
4597
4598 Each WHILEM state block marks a point to backtrack to upon partial failure
4599 of A or B, and also contains some minor state data related to that
4600 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4601 overall state, such as the count, and pointers to the A and B ops.
4602
4603 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4604 must always point to the *current* CURLYX block, the rules are:
4605
4606 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4607 and set cur_curlyx to point to the new block.
4608
4609 When popping the CURLYX block after a successful or unsuccessful match,
4610 restore the previous cur_curlyx.
4611
4612 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4613 to the outer one saved in the CURLYX block.
4614
4615 When popping the WHILEM block after a successful or unsuccessful B match,
4616 restore the previous cur_curlyx.
4617
4618 Here's an example for the pattern (AI* BI)*BO
4619 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4620
4621 cur_
4622 curlyx backtrack stack
4623 ------ ---------------
4624 NULL
4625 CO     <CO prev=NULL> <WO>
4626 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4627 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4628 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4629
4630 At this point the pattern succeeds, and we work back down the stack to
4631 clean up, restoring as we go:
4632
4633 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4634 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4635 CO     <CO prev=NULL> <WO>
4636 NULL
4637
4638 *******************************************************************/
4639
4640 #define ST st->u.curlyx
4641
4642         case CURLYX:    /* start of /A*B/  (for complex A) */
4643         {
4644             /* No need to save/restore up to this paren */
4645             I32 parenfloor = scan->flags;
4646
4647             assert(next); /* keep Coverity happy */
4648             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4649                 next += ARG(next);
4650
4651             /* XXXX Probably it is better to teach regpush to support
4652                parenfloor > PL_regsize... */
4653             if (parenfloor > (I32)*PL_reglastparen)
4654                 parenfloor = *PL_reglastparen; /* Pessimization... */
4655
4656             ST.prev_curlyx= cur_curlyx;
4657             cur_curlyx = st;
4658             ST.cp = PL_savestack_ix;
4659
4660             /* these fields contain the state of the current curly.
4661              * they are accessed by subsequent WHILEMs */
4662             ST.parenfloor = parenfloor;
4663             ST.me = scan;
4664             ST.B = next;
4665             ST.minmod = minmod;
4666             minmod = 0;
4667             ST.count = -1;      /* this will be updated by WHILEM */
4668             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4669
4670             PL_reginput = locinput;
4671             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4672             /* NOTREACHED */
4673         }
4674
4675         case CURLYX_end: /* just finished matching all of A*B */
4676             cur_curlyx = ST.prev_curlyx;
4677             sayYES;
4678             /* NOTREACHED */
4679
4680         case CURLYX_end_fail: /* just failed to match all of A*B */
4681             regcpblow(ST.cp);
4682             cur_curlyx = ST.prev_curlyx;
4683             sayNO;
4684             /* NOTREACHED */
4685
4686
4687 #undef ST
4688 #define ST st->u.whilem
4689
4690         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4691         {
4692             /* see the discussion above about CURLYX/WHILEM */
4693             I32 n;
4694             int min = ARG1(cur_curlyx->u.curlyx.me);
4695             int max = ARG2(cur_curlyx->u.curlyx.me);
4696             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4697
4698             assert(cur_curlyx); /* keep Coverity happy */
4699             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4700             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4701             ST.cache_offset = 0;
4702             ST.cache_mask = 0;
4703
4704             PL_reginput = locinput;
4705
4706             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4707                   "%*s  whilem: matched %ld out of %d..%d\n",
4708                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4709             );
4710
4711             /* First just match a string of min A's. */
4712
4713             if (n < min) {
4714                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4715                 cur_curlyx->u.curlyx.lastloc = locinput;
4716                 REGCP_SET(ST.lastcp);
4717
4718                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4719                 /* NOTREACHED */
4720             }
4721
4722             /* If degenerate A matches "", assume A done. */
4723
4724             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4725                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4726                    "%*s  whilem: empty match detected, trying continuation...\n",
4727                    REPORT_CODE_OFF+depth*2, "")
4728                 );
4729                 goto do_whilem_B_max;
4730             }
4731
4732             /* super-linear cache processing */
4733
4734             if (scan->flags) {
4735
4736                 if (!PL_reg_maxiter) {
4737                     /* start the countdown: Postpone detection until we
4738                      * know the match is not *that* much linear. */
4739                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4740                     /* possible overflow for long strings and many CURLYX's */
4741                     if (PL_reg_maxiter < 0)
4742                         PL_reg_maxiter = I32_MAX;
4743                     PL_reg_leftiter = PL_reg_maxiter;
4744                 }
4745
4746                 if (PL_reg_leftiter-- == 0) {
4747                     /* initialise cache */
4748                     const I32 size = (PL_reg_maxiter + 7)/8;
4749                     if (PL_reg_poscache) {
4750                         if ((I32)PL_reg_poscache_size < size) {
4751                             Renew(PL_reg_poscache, size, char);
4752                             PL_reg_poscache_size = size;
4753                         }
4754                         Zero(PL_reg_poscache, size, char);
4755                     }
4756                     else {
4757                         PL_reg_poscache_size = size;
4758                         Newxz(PL_reg_poscache, size, char);
4759                     }
4760                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4761       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4762                               PL_colors[4], PL_colors[5])
4763                     );
4764                 }
4765
4766                 if (PL_reg_leftiter < 0) {
4767                     /* have we already failed at this position? */
4768                     I32 offset, mask;
4769                     offset  = (scan->flags & 0xf) - 1
4770                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4771                     mask    = 1 << (offset % 8);
4772                     offset /= 8;
4773                     if (PL_reg_poscache[offset] & mask) {
4774                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4775                             "%*s  whilem: (cache) already tried at this position...\n",
4776                             REPORT_CODE_OFF+depth*2, "")
4777                         );
4778                         sayNO; /* cache records failure */
4779                     }
4780                     ST.cache_offset = offset;
4781                     ST.cache_mask   = mask;
4782                 }
4783             }
4784
4785             /* Prefer B over A for minimal matching. */
4786
4787             if (cur_curlyx->u.curlyx.minmod) {
4788                 ST.save_curlyx = cur_curlyx;
4789                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4790                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4791                 REGCP_SET(ST.lastcp);
4792                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4793                 /* NOTREACHED */
4794             }
4795
4796             /* Prefer A over B for maximal matching. */
4797
4798             if (n < max) { /* More greed allowed? */
4799                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4800                 cur_curlyx->u.curlyx.lastloc = locinput;
4801                 REGCP_SET(ST.lastcp);
4802                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4803                 /* NOTREACHED */
4804             }
4805             goto do_whilem_B_max;
4806         }
4807         /* NOTREACHED */
4808
4809         case WHILEM_B_min: /* just matched B in a minimal match */
4810         case WHILEM_B_max: /* just matched B in a maximal match */
4811             cur_curlyx = ST.save_curlyx;
4812             sayYES;
4813             /* NOTREACHED */
4814
4815         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4816             cur_curlyx = ST.save_curlyx;
4817             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4818             cur_curlyx->u.curlyx.count--;
4819             CACHEsayNO;
4820             /* NOTREACHED */
4821
4822         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4823             /* FALL THROUGH */
4824         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4825             REGCP_UNWIND(ST.lastcp);
4826             regcppop(rex);
4827             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4828             cur_curlyx->u.curlyx.count--;
4829             CACHEsayNO;
4830             /* NOTREACHED */
4831
4832         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4833             REGCP_UNWIND(ST.lastcp);
4834             regcppop(rex);      /* Restore some previous $<digit>s? */
4835             PL_reginput = locinput;
4836             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4837                 "%*s  whilem: failed, trying continuation...\n",
4838                 REPORT_CODE_OFF+depth*2, "")
4839             );
4840           do_whilem_B_max:
4841             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4842                 && ckWARN(WARN_REGEXP)
4843                 && !(PL_reg_flags & RF_warned))
4844             {
4845                 PL_reg_flags |= RF_warned;
4846                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4847                      "Complex regular subexpression recursion",
4848                      REG_INFTY - 1);
4849             }
4850
4851             /* now try B */
4852             ST.save_curlyx = cur_curlyx;
4853             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4854             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4855             /* NOTREACHED */
4856
4857         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4858             cur_curlyx = ST.save_curlyx;
4859             REGCP_UNWIND(ST.lastcp);
4860             regcppop(rex);
4861
4862             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4863                 /* Maximum greed exceeded */
4864                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4865                     && ckWARN(WARN_REGEXP)
4866                     && !(PL_reg_flags & RF_warned))
4867                 {
4868                     PL_reg_flags |= RF_warned;
4869                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4870                         "%s limit (%d) exceeded",
4871                         "Complex regular subexpression recursion",
4872                         REG_INFTY - 1);
4873                 }
4874                 cur_curlyx->u.curlyx.count--;
4875                 CACHEsayNO;
4876             }
4877
4878             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4879                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4880             );
4881             /* Try grabbing another A and see if it helps. */
4882             PL_reginput = locinput;
4883             cur_curlyx->u.curlyx.lastloc = locinput;
4884             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4885             REGCP_SET(ST.lastcp);
4886             PUSH_STATE_GOTO(WHILEM_A_min,
4887                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4888             /* NOTREACHED */
4889
4890 #undef  ST
4891 #define ST st->u.branch
4892
4893         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4894             next = scan + ARG(scan);
4895             if (next == scan)
4896                 next = NULL;
4897             scan = NEXTOPER(scan);
4898             /* FALL THROUGH */
4899
4900         case BRANCH:        /*  /(...|A|...)/ */
4901             scan = NEXTOPER(scan); /* scan now points to inner node */
4902             ST.lastparen = *PL_reglastparen;
4903             ST.next_branch = next;
4904             REGCP_SET(ST.cp);
4905             PL_reginput = locinput;
4906
4907             /* Now go into the branch */
4908             if (has_cutgroup) {
4909                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4910             } else {
4911                 PUSH_STATE_GOTO(BRANCH_next, scan);
4912             }
4913             /* NOTREACHED */
4914         case CUTGROUP:
4915             PL_reginput = locinput;
4916             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4917                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4918             PUSH_STATE_GOTO(CUTGROUP_next,next);
4919             /* NOTREACHED */
4920         case CUTGROUP_next_fail:
4921             do_cutgroup = 1;
4922             no_final = 1;
4923             if (st->u.mark.mark_name)
4924                 sv_commit = st->u.mark.mark_name;
4925             sayNO;
4926             /* NOTREACHED */
4927         case BRANCH_next:
4928             sayYES;
4929             /* NOTREACHED */
4930         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4931             if (do_cutgroup) {
4932                 do_cutgroup = 0;
4933                 no_final = 0;
4934             }
4935             REGCP_UNWIND(ST.cp);
4936             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4937                 PL_regoffs[n].end = -1;
4938             *PL_reglastparen = n;
4939             /*dmq: *PL_reglastcloseparen = n; */
4940             scan = ST.next_branch;
4941             /* no more branches? */
4942             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4943                 DEBUG_EXECUTE_r({
4944                     PerlIO_printf( Perl_debug_log,
4945                         "%*s  %sBRANCH failed...%s\n",
4946                         REPORT_CODE_OFF+depth*2, "",
4947                         PL_colors[4],
4948                         PL_colors[5] );
4949                 });
4950                 sayNO_SILENT;
4951             }
4952             continue; /* execute next BRANCH[J] op */
4953             /* NOTREACHED */
4954
4955         case MINMOD:
4956             minmod = 1;
4957             break;
4958
4959 #undef  ST
4960 #define ST st->u.curlym
4961
4962         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4963
4964             /* This is an optimisation of CURLYX that enables us to push
4965              * only a single backtracking state, no matter how many matches
4966              * there are in {m,n}. It relies on the pattern being constant
4967              * length, with no parens to influence future backrefs
4968              */
4969
4970             ST.me = scan;
4971             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4972
4973             /* if paren positive, emulate an OPEN/CLOSE around A */
4974             if (ST.me->flags) {
4975                 U32 paren = ST.me->flags;
4976                 if (paren > PL_regsize)
4977                     PL_regsize = paren;
4978                 if (paren > *PL_reglastparen)
4979                     *PL_reglastparen = paren;
4980                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4981             }
4982             ST.A = scan;
4983             ST.B = next;
4984             ST.alen = 0;
4985             ST.count = 0;
4986             ST.minmod = minmod;
4987             minmod = 0;
4988             ST.c1 = CHRTEST_UNINIT;
4989             REGCP_SET(ST.cp);
4990
4991             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4992                 goto curlym_do_B;
4993
4994           curlym_do_A: /* execute the A in /A{m,n}B/  */
4995             PL_reginput = locinput;
4996             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4997             /* NOTREACHED */
4998
4999         case CURLYM_A: /* we've just matched an A */
5000             locinput = st->locinput;
5001             nextchr = UCHARAT(locinput);
5002
5003             ST.count++;
5004             /* after first match, determine A's length: u.curlym.alen */
5005             if (ST.count == 1) {
5006                 if (PL_reg_match_utf8) {
5007                     char *s = locinput;
5008                     while (s < PL_reginput) {
5009                         ST.alen++;
5010                         s += UTF8SKIP(s);
5011                     }
5012                 }
5013                 else {
5014                     ST.alen = PL_reginput - locinput;
5015                 }
5016                 if (ST.alen == 0)
5017                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5018             }
5019             DEBUG_EXECUTE_r(
5020                 PerlIO_printf(Perl_debug_log,
5021                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5022                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5023                           (IV) ST.count, (IV)ST.alen)
5024             );
5025
5026             locinput = PL_reginput;
5027
5028             if (cur_eval && cur_eval->u.eval.close_paren &&
5029                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5030                 goto fake_end;
5031
5032             {
5033                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5034                 if ( max == REG_INFTY || ST.count < max )
5035                     goto curlym_do_A; /* try to match another A */
5036             }
5037             goto curlym_do_B; /* try to match B */
5038
5039         case CURLYM_A_fail: /* just failed to match an A */
5040             REGCP_UNWIND(ST.cp);
5041
5042             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5043                 || (cur_eval && cur_eval->u.eval.close_paren &&
5044                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5045                 sayNO;
5046
5047           curlym_do_B: /* execute the B in /A{m,n}B/  */
5048             PL_reginput = locinput;
5049             if (ST.c1 == CHRTEST_UNINIT) {
5050                 /* calculate c1 and c2 for possible match of 1st char
5051                  * following curly */
5052                 ST.c1 = ST.c2 = CHRTEST_VOID;
5053                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5054                     regnode *text_node = ST.B;
5055                     if (! HAS_TEXT(text_node))
5056                         FIND_NEXT_IMPT(text_node);
5057                     /* this used to be
5058
5059                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5060
5061                         But the former is redundant in light of the latter.
5062
5063                         if this changes back then the macros for
5064                         IS_TEXT and friends need to change.
5065                      */
5066                     if (PL_regkind[OP(text_node)] == EXACT)
5067                     {
5068
5069                         ST.c1 = (U8)*STRING(text_node);
5070                         switch (OP(text_node)) {
5071                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5072                             case EXACTFA:
5073                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5074                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5075                             default: ST.c2 = ST.c1;
5076                         }
5077                     }
5078                 }
5079             }
5080
5081             DEBUG_EXECUTE_r(
5082                 PerlIO_printf(Perl_debug_log,
5083                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5084                     (int)(REPORT_CODE_OFF+(depth*2)),
5085                     "", (IV)ST.count)
5086                 );
5087             if (ST.c1 != CHRTEST_VOID
5088                     && UCHARAT(PL_reginput) != ST.c1
5089                     && UCHARAT(PL_reginput) != ST.c2)
5090             {
5091                 /* simulate B failing */
5092                 DEBUG_OPTIMISE_r(
5093                     PerlIO_printf(Perl_debug_log,
5094                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5095                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5096                         (IV)ST.c1,(IV)ST.c2
5097                 ));
5098                 state_num = CURLYM_B_fail;
5099                 goto reenter_switch;
5100             }
5101
5102             if (ST.me->flags) {
5103                 /* mark current A as captured */
5104                 I32 paren = ST.me->flags;
5105                 if (ST.count) {
5106                     PL_regoffs[paren].start
5107                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5108                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5109                     /*dmq: *PL_reglastcloseparen = paren; */
5110                 }
5111                 else
5112                     PL_regoffs[paren].end = -1;
5113                 if (cur_eval && cur_eval->u.eval.close_paren &&
5114                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5115                 {
5116                     if (ST.count)
5117                         goto fake_end;
5118                     else
5119                         sayNO;
5120                 }
5121             }
5122
5123             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5124             /* NOTREACHED */
5125
5126         case CURLYM_B_fail: /* just failed to match a B */
5127             REGCP_UNWIND(ST.cp);
5128             if (ST.minmod) {
5129                 I32 max = ARG2(ST.me);
5130                 if (max != REG_INFTY && ST.count == max)
5131                     sayNO;
5132                 goto curlym_do_A; /* try to match a further A */
5133             }
5134             /* backtrack one A */
5135             if (ST.count == ARG1(ST.me) /* min */)
5136                 sayNO;
5137             ST.count--;
5138             locinput = HOPc(locinput, -ST.alen);
5139             goto curlym_do_B; /* try to match B */
5140
5141 #undef ST
5142 #define ST st->u.curly
5143
5144 #define CURLY_SETPAREN(paren, success) \
5145     if (paren) { \
5146         if (success) { \
5147             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5148             PL_regoffs[paren].end = locinput - PL_bostr; \
5149             *PL_reglastcloseparen = paren; \
5150         } \
5151         else \
5152             PL_regoffs[paren].end = -1; \
5153     }
5154
5155         case STAR:              /*  /A*B/ where A is width 1 */
5156             ST.paren = 0;
5157             ST.min = 0;
5158             ST.max = REG_INFTY;
5159             scan = NEXTOPER(scan);
5160             goto repeat;
5161         case PLUS:              /*  /A+B/ where A is width 1 */
5162             ST.paren = 0;
5163             ST.min = 1;
5164             ST.max = REG_INFTY;
5165             scan = NEXTOPER(scan);
5166             goto repeat;
5167         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5168             ST.paren = scan->flags;     /* Which paren to set */
5169             if (ST.paren > PL_regsize)
5170                 PL_regsize = ST.paren;
5171             if (ST.paren > *PL_reglastparen)
5172                 *PL_reglastparen = ST.paren;
5173             ST.min = ARG1(scan);  /* min to match */
5174             ST.max = ARG2(scan);  /* max to match */
5175             if (cur_eval && cur_eval->u.eval.close_paren &&
5176                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5177                 ST.min=1;
5178                 ST.max=1;
5179             }
5180             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5181             goto repeat;
5182         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5183             ST.paren = 0;
5184             ST.min = ARG1(scan);  /* min to match */
5185             ST.max = ARG2(scan);  /* max to match */
5186             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5187           repeat:
5188             /*
5189             * Lookahead to avoid useless match attempts
5190             * when we know what character comes next.
5191             *
5192             * Used to only do .*x and .*?x, but now it allows
5193             * for )'s, ('s and (?{ ... })'s to be in the way
5194             * of the quantifier and the EXACT-like node.  -- japhy
5195             */
5196
5197             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5198                 sayNO;
5199             if (HAS_TEXT(next) || JUMPABLE(next)) {
5200                 U8 *s;
5201                 regnode *text_node = next;
5202
5203                 if (! HAS_TEXT(text_node))
5204                     FIND_NEXT_IMPT(text_node);
5205
5206                 if (! HAS_TEXT(text_node))
5207                     ST.c1 = ST.c2 = CHRTEST_VOID;
5208                 else {
5209                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5210                         ST.c1 = ST.c2 = CHRTEST_VOID;
5211                         goto assume_ok_easy;
5212                     }
5213                     else
5214                         s = (U8*)STRING(text_node);
5215
5216                     /*  Currently we only get here when
5217
5218                         PL_regkind[OP(text_node)] == EXACT
5219
5220                         if this changes back then the macros for IS_TEXT and
5221                         friends need to change. */
5222                     if (!UTF_PATTERN) {
5223                         ST.c1 = *s;
5224                         switch (OP(text_node)) {
5225                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5226                             case EXACTFA:
5227                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5228                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5229                             default: ST.c2 = ST.c1; break;
5230                         }
5231                     }
5232                     else { /* UTF_PATTERN */
5233                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5234                              STRLEN ulen1, ulen2;
5235                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5236                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5237
5238                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5239                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5240 #ifdef EBCDIC
5241                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5242                                                     ckWARN(WARN_UTF8) ?
5243                                                     0 : UTF8_ALLOW_ANY);
5244                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5245                                                     ckWARN(WARN_UTF8) ?
5246                                                     0 : UTF8_ALLOW_ANY);
5247 #else
5248                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5249                                                     uniflags);
5250                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5251                                                     uniflags);
5252 #endif
5253                         }
5254                         else {
5255                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5256                                                      uniflags);
5257                         }
5258                     }
5259                 }
5260             }
5261             else
5262                 ST.c1 = ST.c2 = CHRTEST_VOID;
5263         assume_ok_easy:
5264
5265             ST.A = scan;
5266             ST.B = next;
5267             PL_reginput = locinput;
5268             if (minmod) {
5269                 minmod = 0;
5270                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5271                     sayNO;
5272                 ST.count = ST.min;
5273                 locinput = PL_reginput;
5274                 REGCP_SET(ST.cp);
5275                 if (ST.c1 == CHRTEST_VOID)
5276                     goto curly_try_B_min;
5277
5278                 ST.oldloc = locinput;
5279
5280                 /* set ST.maxpos to the furthest point along the
5281                  * string that could possibly match */
5282                 if  (ST.max == REG_INFTY) {
5283                     ST.maxpos = PL_regeol - 1;
5284                     if (utf8_target)
5285                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5286                             ST.maxpos--;
5287                 }
5288                 else if (utf8_target) {
5289                     int m = ST.max - ST.min;
5290                     for (ST.maxpos = locinput;
5291                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5292                         ST.maxpos += UTF8SKIP(ST.maxpos);
5293                 }
5294                 else {
5295                     ST.maxpos = locinput + ST.max - ST.min;
5296                     if (ST.maxpos >= PL_regeol)
5297                         ST.maxpos = PL_regeol - 1;
5298                 }
5299                 goto curly_try_B_min_known;
5300
5301             }
5302             else {
5303                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5304                 locinput = PL_reginput;
5305                 if (ST.count < ST.min)
5306                     sayNO;
5307                 if ((ST.count > ST.min)
5308                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5309                 {
5310                     /* A{m,n} must come at the end of the string, there's
5311                      * no point in backing off ... */
5312                     ST.min = ST.count;
5313                     /* ...except that $ and \Z can match before *and* after
5314                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5315                        We may back off by one in this case. */
5316                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5317                         ST.min--;
5318                 }
5319                 REGCP_SET(ST.cp);
5320                 goto curly_try_B_max;
5321             }
5322             /* NOTREACHED */
5323
5324
5325         case CURLY_B_min_known_fail:
5326             /* failed to find B in a non-greedy match where c1,c2 valid */
5327             if (ST.paren && ST.count)
5328                 PL_regoffs[ST.paren].end = -1;
5329
5330             PL_reginput = locinput;     /* Could be reset... */
5331             REGCP_UNWIND(ST.cp);
5332             /* Couldn't or didn't -- move forward. */
5333             ST.oldloc = locinput;
5334             if (utf8_target)
5335                 locinput += UTF8SKIP(locinput);
5336             else
5337                 locinput++;
5338             ST.count++;
5339           curly_try_B_min_known:
5340              /* find the next place where 'B' could work, then call B */
5341             {
5342                 int n;
5343                 if (utf8_target) {
5344                     n = (ST.oldloc == locinput) ? 0 : 1;
5345                     if (ST.c1 == ST.c2) {
5346                         STRLEN len;
5347                         /* set n to utf8_distance(oldloc, locinput) */
5348                         while (locinput <= ST.maxpos &&
5349                                utf8n_to_uvchr((U8*)locinput,
5350                                               UTF8_MAXBYTES, &len,
5351                                               uniflags) != (UV)ST.c1) {
5352                             locinput += len;
5353                             n++;
5354                         }
5355                     }
5356                     else {
5357                         /* set n to utf8_distance(oldloc, locinput) */
5358                         while (locinput <= ST.maxpos) {
5359                             STRLEN len;
5360                             const UV c = utf8n_to_uvchr((U8*)locinput,
5361                                                   UTF8_MAXBYTES, &len,
5362                                                   uniflags);
5363                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5364                                 break;
5365                             locinput += len;
5366                             n++;
5367                         }
5368                     }
5369                 }
5370                 else {
5371                     if (ST.c1 == ST.c2) {
5372                         while (locinput <= ST.maxpos &&
5373                                UCHARAT(locinput) != ST.c1)
5374                             locinput++;
5375                     }
5376                     else {
5377                         while (locinput <= ST.maxpos
5378                                && UCHARAT(locinput) != ST.c1
5379                                && UCHARAT(locinput) != ST.c2)
5380                             locinput++;
5381                     }
5382                     n = locinput - ST.oldloc;
5383                 }
5384                 if (locinput > ST.maxpos)
5385                     sayNO;
5386                 /* PL_reginput == oldloc now */
5387                 if (n) {
5388                     ST.count += n;
5389                     if (regrepeat(rex, ST.A, n, depth) < n)
5390                         sayNO;
5391                 }
5392                 PL_reginput = locinput;
5393                 CURLY_SETPAREN(ST.paren, ST.count);
5394                 if (cur_eval && cur_eval->u.eval.close_paren &&
5395                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5396                     goto fake_end;
5397                 }
5398                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5399             }
5400             /* NOTREACHED */
5401
5402
5403         case CURLY_B_min_fail:
5404             /* failed to find B in a non-greedy match where c1,c2 invalid */
5405             if (ST.paren && ST.count)
5406                 PL_regoffs[ST.paren].end = -1;
5407
5408             REGCP_UNWIND(ST.cp);
5409             /* failed -- move forward one */
5410             PL_reginput = locinput;
5411             if (regrepeat(rex, ST.A, 1, depth)) {
5412                 ST.count++;
5413                 locinput = PL_reginput;
5414                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5415                         ST.count > 0)) /* count overflow ? */
5416                 {
5417                   curly_try_B_min:
5418                     CURLY_SETPAREN(ST.paren, ST.count);
5419                     if (cur_eval && cur_eval->u.eval.close_paren &&
5420                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5421                         goto fake_end;
5422                     }
5423                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5424                 }
5425             }
5426             sayNO;
5427             /* NOTREACHED */
5428
5429
5430         curly_try_B_max:
5431             /* a successful greedy match: now try to match B */
5432             if (cur_eval && cur_eval->u.eval.close_paren &&
5433                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5434                 goto fake_end;
5435             }
5436             {
5437                 UV c = 0;
5438                 if (ST.c1 != CHRTEST_VOID)
5439                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5440                                            UTF8_MAXBYTES, 0, uniflags)
5441                                 : (UV) UCHARAT(PL_reginput);
5442                 /* If it could work, try it. */
5443                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5444                     CURLY_SETPAREN(ST.paren, ST.count);
5445                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5446                     /* NOTREACHED */
5447                 }
5448             }
5449             /* FALL THROUGH */
5450         case CURLY_B_max_fail:
5451             /* failed to find B in a greedy match */
5452             if (ST.paren && ST.count)
5453                 PL_regoffs[ST.paren].end = -1;
5454
5455             REGCP_UNWIND(ST.cp);
5456             /*  back up. */
5457             if (--ST.count < ST.min)
5458                 sayNO;
5459             PL_reginput = locinput = HOPc(locinput, -1);
5460             goto curly_try_B_max;
5461
5462 #undef ST
5463
5464         case END:
5465             fake_end:
5466             if (cur_eval) {
5467                 /* we've just finished A in /(??{A})B/; now continue with B */
5468                 I32 tmpix;
5469                 st->u.eval.toggle_reg_flags
5470                             = cur_eval->u.eval.toggle_reg_flags;
5471                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5472
5473                 st->u.eval.prev_rex = rex_sv;           /* inner */
5474                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5475                 rex = (struct regexp *)SvANY(rex_sv);
5476                 rexi = RXi_GET(rex);
5477                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5478                 (void)ReREFCNT_inc(rex_sv);
5479                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5480
5481                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5482                 PL_reglastparen = &rex->lastparen;
5483                 PL_reglastcloseparen = &rex->lastcloseparen;
5484
5485                 REGCP_SET(st->u.eval.lastcp);
5486                 PL_reginput = locinput;
5487
5488                 /* Restore parens of the outer rex without popping the
5489                  * savestack */
5490                 tmpix = PL_savestack_ix;
5491                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5492                 regcppop(rex);
5493                 PL_savestack_ix = tmpix;
5494
5495                 st->u.eval.prev_eval = cur_eval;
5496                 cur_eval = cur_eval->u.eval.prev_eval;
5497                 DEBUG_EXECUTE_r(
5498                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5499                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5500                 if ( nochange_depth )
5501                     nochange_depth--;
5502
5503                 PUSH_YES_STATE_GOTO(EVAL_AB,
5504                         st->u.eval.prev_eval->u.eval.B); /* match B */
5505             }
5506
5507             if (locinput < reginfo->till) {
5508                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5509                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5510                                       PL_colors[4],
5511                                       (long)(locinput - PL_reg_starttry),
5512                                       (long)(reginfo->till - PL_reg_starttry),
5513                                       PL_colors[5]));
5514
5515                 sayNO_SILENT;           /* Cannot match: too short. */
5516             }
5517             PL_reginput = locinput;     /* put where regtry can find it */
5518             sayYES;                     /* Success! */
5519
5520         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5521             DEBUG_EXECUTE_r(
5522             PerlIO_printf(Perl_debug_log,
5523                 "%*s  %ssubpattern success...%s\n",
5524                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5525             PL_reginput = locinput;     /* put where regtry can find it */
5526             sayYES;                     /* Success! */
5527
5528 #undef  ST
5529 #define ST st->u.ifmatch
5530
5531         case SUSPEND:   /* (?>A) */
5532             ST.wanted = 1;
5533             PL_reginput = locinput;
5534             goto do_ifmatch;
5535
5536         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5537             ST.wanted = 0;
5538             goto ifmatch_trivial_fail_test;
5539
5540         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5541             ST.wanted = 1;
5542           ifmatch_trivial_fail_test:
5543             if (scan->flags) {
5544                 char * const s = HOPBACKc(locinput, scan->flags);
5545                 if (!s) {
5546                     /* trivial fail */
5547                     if (logical) {
5548                         logical = 0;
5549                         sw = 1 - cBOOL(ST.wanted);
5550                     }
5551                     else if (ST.wanted)
5552                         sayNO;
5553                     next = scan + ARG(scan);
5554                     if (next == scan)
5555                         next = NULL;
5556                     break;
5557                 }
5558                 PL_reginput = s;
5559             }
5560             else
5561                 PL_reginput = locinput;
5562
5563           do_ifmatch:
5564             ST.me = scan;
5565             ST.logical = logical;
5566             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5567
5568             /* execute body of (?...A) */
5569             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5570             /* NOTREACHED */
5571
5572         case IFMATCH_A_fail: /* body of (?...A) failed */
5573             ST.wanted = !ST.wanted;
5574             /* FALL THROUGH */
5575
5576         case IFMATCH_A: /* body of (?...A) succeeded */
5577             if (ST.logical) {
5578                 sw = cBOOL(ST.wanted);
5579             }
5580             else if (!ST.wanted)
5581                 sayNO;
5582
5583             if (OP(ST.me) == SUSPEND)
5584                 locinput = PL_reginput;
5585             else {
5586                 locinput = PL_reginput = st->locinput;
5587                 nextchr = UCHARAT(locinput);
5588             }
5589             scan = ST.me + ARG(ST.me);
5590             if (scan == ST.me)
5591                 scan = NULL;
5592             continue; /* execute B */
5593
5594 #undef ST
5595
5596         case LONGJMP:
5597             next = scan + ARG(scan);
5598             if (next == scan)
5599                 next = NULL;
5600             break;
5601         case COMMIT:
5602             reginfo->cutpoint = PL_regeol;
5603             /* FALLTHROUGH */
5604         case PRUNE:
5605             PL_reginput = locinput;
5606             if (!scan->flags)
5607                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5608             PUSH_STATE_GOTO(COMMIT_next,next);
5609             /* NOTREACHED */
5610         case COMMIT_next_fail:
5611             no_final = 1;
5612             /* FALLTHROUGH */
5613         case OPFAIL:
5614             sayNO;
5615             /* NOTREACHED */
5616
5617 #define ST st->u.mark
5618         case MARKPOINT:
5619             ST.prev_mark = mark_state;
5620             ST.mark_name = sv_commit = sv_yes_mark
5621                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5622             mark_state = st;
5623             ST.mark_loc = PL_reginput = locinput;
5624             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5625             /* NOTREACHED */
5626         case MARKPOINT_next:
5627             mark_state = ST.prev_mark;
5628             sayYES;
5629             /* NOTREACHED */
5630         case MARKPOINT_next_fail:
5631             if (popmark && sv_eq(ST.mark_name,popmark))
5632             {
5633                 if (ST.mark_loc > startpoint)
5634                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5635                 popmark = NULL; /* we found our mark */
5636                 sv_commit = ST.mark_name;
5637
5638                 DEBUG_EXECUTE_r({
5639                         PerlIO_printf(Perl_debug_log,
5640                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5641                             REPORT_CODE_OFF+depth*2, "",
5642                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5643                 });
5644             }
5645             mark_state = ST.prev_mark;
5646             sv_yes_mark = mark_state ?
5647                 mark_state->u.mark.mark_name : NULL;
5648             sayNO;
5649             /* NOTREACHED */
5650         case SKIP:
5651             PL_reginput = locinput;
5652             if (scan->flags) {
5653                 /* (*SKIP) : if we fail we cut here*/
5654                 ST.mark_name = NULL;
5655                 ST.mark_loc = locinput;
5656                 PUSH_STATE_GOTO(SKIP_next,next);
5657             } else {
5658                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5659                    otherwise do nothing.  Meaning we need to scan the chain of
5660                    saved mark states for one with a matching name */
5661                 regmatch_state *cur = mark_state;
5662                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5663
5664                 while (cur) {
5665                     if ( sv_eq( cur->u.mark.mark_name,
5666                                 find ) )
5667                     {
5668                         ST.mark_name = find;
5669                         PUSH_STATE_GOTO( SKIP_next, next );
5670                     }
5671                     cur = cur->u.mark.prev_mark;
5672                 }
5673             }
5674             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5675             break;
5676         case SKIP_next_fail:
5677             if (ST.mark_name) {
5678                 /* (*SKIP:NAME) - Set up to search for the name as we
5679                    collapse the stack*/
5680                 popmark = ST.mark_name;
5681             } else {
5682                 /* (*SKIP) - No name, we cut here.*/
5683                 if (ST.mark_loc > startpoint)
5684                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5685                 /* but we set sv_commit to latest mark_name if there
5686                    is one so they can test to see how things lead to this
5687                    cut */
5688                 if (mark_state)
5689                     sv_commit=mark_state->u.mark.mark_name;
5690             }
5691             no_final = 1;
5692             sayNO;
5693             /* NOTREACHED */
5694 #undef ST
5695         case FOLDCHAR:
5696             n = ARG(scan);
5697             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5698                 locinput += ln;
5699             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5700                 sayNO;
5701             } else  {
5702                 U8 folded[UTF8_MAXBYTES_CASE+1];
5703                 STRLEN foldlen;
5704                 const char * const l = locinput;
5705                 char *e = PL_regeol;
5706                 to_uni_fold(n, folded, &foldlen);
5707
5708                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5709                                l, &e, 0,  utf8_target)) {
5710                         sayNO;
5711                 }
5712                 locinput = e;
5713             }
5714             nextchr = UCHARAT(locinput);
5715             break;
5716         case LNBREAK:
5717             if ((n=is_LNBREAK(locinput,utf8_target))) {
5718                 locinput += n;
5719                 nextchr = UCHARAT(locinput);
5720             } else
5721                 sayNO;
5722             break;
5723
5724 #define CASE_CLASS(nAmE)                              \
5725         case nAmE:                                    \
5726             if (locinput >= PL_regeol)                \
5727                 sayNO;                                \
5728             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5729                 locinput += n;                        \
5730                 nextchr = UCHARAT(locinput);          \
5731             } else                                    \
5732                 sayNO;                                \
5733             break;                                    \
5734         case N##nAmE:                                 \
5735             if (locinput >= PL_regeol)                \
5736                 sayNO;                                \
5737             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5738                 sayNO;                                \
5739             } else {                                  \
5740                 locinput += UTF8SKIP(locinput);       \
5741                 nextchr = UCHARAT(locinput);          \
5742             }                                         \
5743             break
5744
5745         CASE_CLASS(VERTWS);
5746         CASE_CLASS(HORIZWS);
5747 #undef CASE_CLASS
5748
5749         default:
5750             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5751                           PTR2UV(scan), OP(scan));
5752             Perl_croak(aTHX_ "regexp memory corruption");
5753
5754         } /* end switch */
5755
5756         /* switch break jumps here */
5757         scan = next; /* prepare to execute the next op and ... */
5758         continue;    /* ... jump back to the top, reusing st */
5759         /* NOTREACHED */
5760
5761       push_yes_state:
5762         /* push a state that backtracks on success */
5763         st->u.yes.prev_yes_state = yes_state;
5764         yes_state = st;
5765         /* FALL THROUGH */
5766       push_state:
5767         /* push a new regex state, then continue at scan  */
5768         {
5769             regmatch_state *newst;
5770
5771             DEBUG_STACK_r({
5772                 regmatch_state *cur = st;
5773                 regmatch_state *curyes = yes_state;
5774                 int curd = depth;
5775                 regmatch_slab *slab = PL_regmatch_slab;
5776                 for (;curd > -1;cur--,curd--) {
5777                     if (cur < SLAB_FIRST(slab)) {
5778                         slab = slab->prev;
5779                         cur = SLAB_LAST(slab);
5780                     }
5781                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5782                         REPORT_CODE_OFF + 2 + depth * 2,"",
5783                         curd, PL_reg_name[cur->resume_state],
5784                         (curyes == cur) ? "yes" : ""
5785                     );
5786                     if (curyes == cur)
5787                         curyes = cur->u.yes.prev_yes_state;
5788                 }
5789             } else
5790                 DEBUG_STATE_pp("push")
5791             );
5792             depth++;
5793             st->locinput = locinput;
5794             newst = st+1;
5795             if (newst >  SLAB_LAST(PL_regmatch_slab))
5796                 newst = S_push_slab(aTHX);
5797             PL_regmatch_state = newst;
5798
5799             locinput = PL_reginput;
5800             nextchr = UCHARAT(locinput);
5801             st = newst;
5802             continue;
5803             /* NOTREACHED */
5804         }
5805     }
5806
5807     /*
5808     * We get here only if there's trouble -- normally "case END" is
5809     * the terminating point.
5810     */
5811     Perl_croak(aTHX_ "corrupted regexp pointers");
5812     /*NOTREACHED*/
5813     sayNO;
5814
5815 yes:
5816     if (yes_state) {
5817         /* we have successfully completed a subexpression, but we must now
5818          * pop to the state marked by yes_state and continue from there */
5819         assert(st != yes_state);
5820 #ifdef DEBUGGING
5821         while (st != yes_state) {
5822             st--;
5823             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5824                 PL_regmatch_slab = PL_regmatch_slab->prev;
5825                 st = SLAB_LAST(PL_regmatch_slab);
5826             }
5827             DEBUG_STATE_r({
5828                 if (no_final) {
5829                     DEBUG_STATE_pp("pop (no final)");
5830                 } else {
5831                     DEBUG_STATE_pp("pop (yes)");
5832                 }
5833             });
5834             depth--;
5835         }
5836 #else
5837         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5838             || yes_state > SLAB_LAST(PL_regmatch_slab))
5839         {
5840             /* not in this slab, pop slab */
5841             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5842             PL_regmatch_slab = PL_regmatch_slab->prev;
5843             st = SLAB_LAST(PL_regmatch_slab);
5844         }
5845         depth -= (st - yes_state);
5846 #endif
5847         st = yes_state;
5848         yes_state = st->u.yes.prev_yes_state;
5849         PL_regmatch_state = st;
5850
5851         if (no_final) {
5852             locinput= st->locinput;
5853             nextchr = UCHARAT(locinput);
5854         }
5855         state_num = st->resume_state + no_final;
5856         goto reenter_switch;
5857     }
5858
5859     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5860                           PL_colors[4], PL_colors[5]));
5861
5862     if (PL_reg_eval_set) {
5863         /* each successfully executed (?{...}) block does the equivalent of
5864          *   local $^R = do {...}
5865          * When popping the save stack, all these locals would be undone;
5866          * bypass this by setting the outermost saved $^R to the latest
5867          * value */
5868         if (oreplsv != GvSV(PL_replgv))
5869             sv_setsv(oreplsv, GvSV(PL_replgv));
5870     }
5871     result = 1;
5872     goto final_exit;
5873
5874 no:
5875     DEBUG_EXECUTE_r(
5876         PerlIO_printf(Perl_debug_log,
5877             "%*s  %sfailed...%s\n",
5878             REPORT_CODE_OFF+depth*2, "",
5879             PL_colors[4], PL_colors[5])
5880         );
5881
5882 no_silent:
5883     if (no_final) {
5884         if (yes_state) {
5885             goto yes;
5886         } else {
5887             goto final_exit;
5888         }
5889     }
5890     if (depth) {
5891         /* there's a previous state to backtrack to */
5892         st--;
5893         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5894             PL_regmatch_slab = PL_regmatch_slab->prev;
5895             st = SLAB_LAST(PL_regmatch_slab);
5896         }
5897         PL_regmatch_state = st;
5898         locinput= st->locinput;
5899         nextchr = UCHARAT(locinput);
5900
5901         DEBUG_STATE_pp("pop");
5902         depth--;
5903         if (yes_state == st)
5904             yes_state = st->u.yes.prev_yes_state;
5905
5906         state_num = st->resume_state + 1; /* failure = success + 1 */
5907         goto reenter_switch;
5908     }
5909     result = 0;
5910
5911   final_exit:
5912     if (rex->intflags & PREGf_VERBARG_SEEN) {
5913         SV *sv_err = get_sv("REGERROR", 1);
5914         SV *sv_mrk = get_sv("REGMARK", 1);
5915         if (result) {
5916             sv_commit = &PL_sv_no;
5917             if (!sv_yes_mark)
5918                 sv_yes_mark = &PL_sv_yes;
5919         } else {
5920             if (!sv_commit)
5921                 sv_commit = &PL_sv_yes;
5922             sv_yes_mark = &PL_sv_no;
5923         }
5924         sv_setsv(sv_err, sv_commit);
5925         sv_setsv(sv_mrk, sv_yes_mark);
5926     }
5927
5928     /* clean up; in particular, free all slabs above current one */
5929     LEAVE_SCOPE(oldsave);
5930
5931     return result;
5932 }
5933
5934 /*
5935  - regrepeat - repeatedly match something simple, report how many
5936  */
5937 /*
5938  * [This routine now assumes that it will only match on things of length 1.
5939  * That was true before, but now we assume scan - PL_reginput is the count,
5940  * rather than incrementing count on every character.  [Except where one match
5941  * can span several bytes, e.g. utf8: there hardcount holds the count.]] */
5942 STATIC I32
5943 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5944 {
5945     dVAR;
5946     register char *scan;
5947     register I32 c;
5948     register char *loceol = PL_regeol;
5949     register I32 hardcount = 0;
5950     register bool utf8_target = PL_reg_match_utf8;
5951     UV utf8_flags;
5952 #ifndef DEBUGGING
5953     PERL_UNUSED_ARG(depth);
5954 #endif
5955
5956     PERL_ARGS_ASSERT_REGREPEAT;
5957
5958     scan = PL_reginput;
5959     if (max == REG_INFTY)
5960         max = I32_MAX;
5961     else if (max < loceol - scan)
5962         loceol = scan + max;
5963     switch (OP(p)) {
5964     case REG_ANY:
5965         if (utf8_target) {
5966             loceol = PL_regeol;
5967             while (scan < loceol && hardcount < max && *scan != '\n') {
5968                 scan += UTF8SKIP(scan);
5969                 hardcount++;
5970             }
5971         } else {
5972             while (scan < loceol && *scan != '\n')
5973                 scan++;
5974         }
5975         break;
5976     case SANY:
5977         if (utf8_target) {
5978             loceol = PL_regeol;
5979             while (scan < loceol && hardcount < max) {
5980                 scan += UTF8SKIP(scan);
5981                 hardcount++;
5982             }
5983         }
5984         else
5985             scan = loceol;
5986         break;
5987     case CANY:
5988         scan = loceol;
5989         break;
5990     case EXACT:
5991         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5992          * means they match only characters in the string that can be expressed
5993          * as a single byte.  For non-utf8 strings, that means a simple match.
5994          * For utf8 strings, the character matched must be an invariant, or
5995          * downgradable to a single byte.  The pattern's utf8ness is
5996          * irrelevant: since it's a single byte, it either isn't utf8, or if
5997          * it is, it's an invariant */
5998
5999         c = (U8)*STRING(p);
6000         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6001
6002         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6003             while (scan < loceol && UCHARAT(scan) == c) {
6004                 scan++;
6005             }
6006         }
6007         else {
6008
6009             /* Here, the string is utf8, and the pattern char is different
6010              * in utf8 than not, so can't compare them directly.  Outside the
6011              * loop, find the two utf8 bytes that represent c, and then
6012              * look for those in sequence in the utf8 string */
6013             U8 high = UTF8_TWO_BYTE_HI(c);
6014             U8 low = UTF8_TWO_BYTE_LO(c);
6015             loceol = PL_regeol;
6016
6017             while (hardcount < max
6018                     && scan + 1 < loceol
6019                     && UCHARAT(scan) == high
6020                     && UCHARAT(scan + 1) == low)
6021             {
6022                 scan += 2;
6023                 hardcount++;
6024             }
6025         }
6026         break;
6027     case EXACTFA:
6028         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6029         goto do_exactf;
6030
6031     case EXACTFL:
6032         PL_reg_flags |= RF_tainted;
6033         utf8_flags = FOLDEQ_UTF8_LOCALE;
6034         goto do_exactf;
6035
6036     case EXACTF:
6037     case EXACTFU:
6038         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6039
6040         /* The comments for the EXACT case above apply as well to these fold
6041          * ones */
6042
6043     do_exactf:
6044         c = (U8)*STRING(p);
6045         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6046
6047         if (utf8_target) { /* Use full Unicode fold matching */
6048             char *tmpeol = loceol;
6049             while (hardcount < max
6050                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6051                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6052             {
6053                 scan = tmpeol;
6054                 tmpeol = loceol;
6055                 hardcount++;
6056             }
6057
6058             /* XXX Note that the above handles properly the German sharp s in
6059              * the pattern matching ss in the string.  But it doesn't handle
6060              * properly cases where the string contains say 'LIGATURE ff' and
6061              * the pattern is 'f+'.  This would require, say, a new function or
6062              * revised interface to foldEQ_utf8(), in which the maximum number
6063              * of characters to match could be passed and it would return how
6064              * many actually did.  This is just one of many cases where
6065              * multi-char folds don't work properly, and so the fix is being
6066              * deferred */
6067         }
6068         else {
6069             U8 folded;
6070
6071             /* Here, the string isn't utf8 and c is a single byte; and either
6072              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6073              * doesn't affect c.  Can just do simple comparisons for exact or
6074              * fold matching. */
6075             switch (OP(p)) {
6076                 case EXACTF: folded = PL_fold[c]; break;
6077                 case EXACTFA:
6078                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6079                 case EXACTFL: folded = PL_fold_locale[c]; break;
6080                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6081             }
6082             while (scan < loceol &&
6083                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6084             {
6085                 scan++;
6086             }
6087         }
6088         break;
6089     case ANYOFV:
6090     case ANYOF:
6091         if (utf8_target || OP(p) == ANYOFV) {
6092             STRLEN inclasslen;
6093             loceol = PL_regeol;
6094             inclasslen = loceol - scan;
6095             while (hardcount < max
6096                    && ((inclasslen = loceol - scan) > 0)
6097                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6098             {
6099                 scan += inclasslen;
6100                 hardcount++;
6101             }
6102         } else {
6103             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6104                 scan++;
6105         }
6106         break;
6107     case ALNUMU:
6108         if (utf8_target) {
6109     utf8_wordchar:
6110             loceol = PL_regeol;
6111             LOAD_UTF8_CHARCLASS_ALNUM();
6112             while (hardcount < max && scan < loceol &&
6113                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6114             {
6115                 scan += UTF8SKIP(scan);
6116                 hardcount++;
6117             }
6118         } else {
6119             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6120                 scan++;
6121             }
6122         }
6123         break;
6124     case ALNUM:
6125         if (utf8_target)
6126             goto utf8_wordchar;
6127         while (scan < loceol && isALNUM((U8) *scan)) {
6128             scan++;
6129         }
6130         break;
6131     case ALNUMA:
6132         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6133             scan++;
6134         }
6135         break;
6136     case ALNUML:
6137         PL_reg_flags |= RF_tainted;
6138         if (utf8_target) {
6139             loceol = PL_regeol;
6140             while (hardcount < max && scan < loceol &&
6141                    isALNUM_LC_utf8((U8*)scan)) {
6142                 scan += UTF8SKIP(scan);
6143                 hardcount++;
6144             }
6145         } else {
6146             while (scan < loceol && isALNUM_LC(*scan))
6147                 scan++;
6148         }
6149         break;
6150     case NALNUMU:
6151         if (utf8_target) {
6152
6153     utf8_Nwordchar:
6154
6155             loceol = PL_regeol;
6156             LOAD_UTF8_CHARCLASS_ALNUM();
6157             while (hardcount < max && scan < loceol &&
6158                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6159             {
6160                 scan += UTF8SKIP(scan);
6161                 hardcount++;
6162             }
6163         } else {
6164             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6165                 scan++;
6166             }
6167         }
6168         break;
6169     case NALNUM:
6170         if (utf8_target)
6171             goto utf8_Nwordchar;
6172         while (scan < loceol && ! isALNUM((U8) *scan)) {
6173             scan++;
6174         }
6175         break;
6176     case NALNUMA:
6177         if (utf8_target) {
6178             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6179                 scan += UTF8SKIP(scan);
6180             }
6181         }
6182         else {
6183             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6184                 scan++;
6185             }
6186         }
6187         break;
6188     case NALNUML:
6189         PL_reg_flags |= RF_tainted;
6190         if (utf8_target) {
6191             loceol = PL_regeol;
6192             while (hardcount < max && scan < loceol &&
6193                    !isALNUM_LC_utf8((U8*)scan)) {
6194                 scan += UTF8SKIP(scan);
6195                 hardcount++;
6196             }
6197         } else {
6198             while (scan < loceol && !isALNUM_LC(*scan))
6199                 scan++;
6200         }
6201         break;
6202     case SPACEU:
6203         if (utf8_target) {
6204
6205     utf8_space:
6206
6207             loceol = PL_regeol;
6208             LOAD_UTF8_CHARCLASS_SPACE();
6209             while (hardcount < max && scan < loceol &&
6210                    (*scan == ' ' ||
6211                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6212             {
6213                 scan += UTF8SKIP(scan);
6214                 hardcount++;
6215             }
6216             break;
6217         }
6218         else {
6219             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6220                 scan++;
6221             }
6222             break;
6223         }
6224     case SPACE:
6225         if (utf8_target)
6226             goto utf8_space;
6227
6228         while (scan < loceol && isSPACE((U8) *scan)) {
6229             scan++;
6230         }
6231         break;
6232     case SPACEA:
6233         while (scan < loceol && isSPACE_A((U8) *scan)) {
6234             scan++;
6235         }
6236         break;
6237     case SPACEL:
6238         PL_reg_flags |= RF_tainted;
6239         if (utf8_target) {
6240             loceol = PL_regeol;
6241             while (hardcount < max && scan < loceol &&
6242                    isSPACE_LC_utf8((U8*)scan)) {
6243                 scan += UTF8SKIP(scan);
6244                 hardcount++;
6245             }
6246         } else {
6247             while (scan < loceol && isSPACE_LC(*scan))
6248                 scan++;
6249         }
6250         break;
6251     case NSPACEU:
6252         if (utf8_target) {
6253
6254     utf8_Nspace:
6255
6256             loceol = PL_regeol;
6257             LOAD_UTF8_CHARCLASS_SPACE();
6258             while (hardcount < max && scan < loceol &&
6259                    ! (*scan == ' ' ||
6260                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6261             {
6262                 scan += UTF8SKIP(scan);
6263                 hardcount++;
6264             }
6265             break;
6266         }
6267         else {
6268             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6269                 scan++;
6270             }
6271         }
6272         break;
6273     case NSPACE:
6274         if (utf8_target)
6275             goto utf8_Nspace;
6276
6277         while (scan < loceol && ! isSPACE((U8) *scan)) {
6278             scan++;
6279         }
6280         break;
6281     case NSPACEA:
6282         if (utf8_target) {
6283             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6284                 scan += UTF8SKIP(scan);
6285             }
6286         }
6287         else {
6288             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6289                 scan++;
6290             }
6291         }
6292         break;
6293     case NSPACEL:
6294         PL_reg_flags |= RF_tainted;
6295         if (utf8_target) {
6296             loceol = PL_regeol;
6297             while (hardcount < max && scan < loceol &&
6298                    !isSPACE_LC_utf8((U8*)scan)) {
6299                 scan += UTF8SKIP(scan);
6300                 hardcount++;
6301             }
6302         } else {
6303             while (scan < loceol && !isSPACE_LC(*scan))
6304                 scan++;
6305         }
6306         break;
6307     case DIGIT:
6308         if (utf8_target) {
6309             loceol = PL_regeol;
6310             LOAD_UTF8_CHARCLASS_DIGIT();
6311             while (hardcount < max && scan < loceol &&
6312                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6313                 scan += UTF8SKIP(scan);
6314                 hardcount++;
6315             }
6316         } else {
6317             while (scan < loceol && isDIGIT(*scan))
6318                 scan++;
6319         }
6320         break;
6321     case DIGITA:
6322         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6323             scan++;
6324         }
6325         break;
6326     case DIGITL:
6327         PL_reg_flags |= RF_tainted;
6328         if (utf8_target) {
6329             loceol = PL_regeol;
6330             while (hardcount < max && scan < loceol &&
6331                    isDIGIT_LC_utf8((U8*)scan)) {
6332                 scan += UTF8SKIP(scan);
6333                 hardcount++;
6334             }
6335         } else {
6336             while (scan < loceol && isDIGIT_LC(*scan))
6337                 scan++;
6338         }
6339         break;
6340     case NDIGIT:
6341         if (utf8_target) {
6342             loceol = PL_regeol;
6343             LOAD_UTF8_CHARCLASS_DIGIT();
6344             while (hardcount < max && scan < loceol &&
6345                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6346                 scan += UTF8SKIP(scan);
6347                 hardcount++;
6348             }
6349         } else {
6350             while (scan < loceol && !isDIGIT(*scan))
6351                 scan++;
6352         }
6353         break;
6354     case NDIGITA:
6355         if (utf8_target) {
6356             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6357                 scan += UTF8SKIP(scan);
6358             }
6359         }
6360         else {
6361             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6362                 scan++;
6363             }
6364         }
6365         break;
6366     case NDIGITL:
6367         PL_reg_flags |= RF_tainted;
6368         if (utf8_target) {
6369             loceol = PL_regeol;
6370             while (hardcount < max && scan < loceol &&
6371                    !isDIGIT_LC_utf8((U8*)scan)) {
6372                 scan += UTF8SKIP(scan);
6373                 hardcount++;
6374             }
6375         } else {
6376             while (scan < loceol && !isDIGIT_LC(*scan))
6377                 scan++;
6378         }
6379         break;
6380     case LNBREAK:
6381         if (utf8_target) {
6382             loceol = PL_regeol;
6383             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6384                 scan += c;
6385                 hardcount++;
6386             }
6387         } else {
6388             /*
6389               LNBREAK can match two latin chars, which is ok,
6390               because we have a null terminated string, but we
6391               have to use hardcount in this situation
6392             */
6393             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6394                 scan+=c;
6395                 hardcount++;
6396             }
6397         }
6398         break;
6399     case HORIZWS:
6400         if (utf8_target) {
6401             loceol = PL_regeol;
6402             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6403                 scan += c;
6404                 hardcount++;
6405             }
6406         } else {
6407             while (scan < loceol && is_HORIZWS_latin1(scan))
6408                 scan++;
6409         }
6410         break;
6411     case NHORIZWS:
6412         if (utf8_target) {
6413             loceol = PL_regeol;
6414             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6415                 scan += UTF8SKIP(scan);
6416                 hardcount++;
6417             }
6418         } else {
6419             while (scan < loceol && !is_HORIZWS_latin1(scan))
6420                 scan++;
6421
6422         }
6423         break;
6424     case VERTWS:
6425         if (utf8_target) {
6426             loceol = PL_regeol;
6427             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6428                 scan += c;
6429                 hardcount++;
6430             }
6431         } else {
6432             while (scan < loceol && is_VERTWS_latin1(scan))
6433                 scan++;
6434
6435         }
6436         break;
6437     case NVERTWS:
6438         if (utf8_target) {
6439             loceol = PL_regeol;
6440             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6441                 scan += UTF8SKIP(scan);
6442                 hardcount++;
6443             }
6444         } else {
6445             while (scan < loceol && !is_VERTWS_latin1(scan))
6446                 scan++;
6447
6448         }
6449         break;
6450
6451     default:            /* Called on something of 0 width. */
6452         break;          /* So match right here or not at all. */
6453     }
6454
6455     if (hardcount)
6456         c = hardcount;
6457     else
6458         c = scan - PL_reginput;
6459     PL_reginput = scan;
6460
6461     DEBUG_r({
6462         GET_RE_DEBUG_FLAGS_DECL;
6463         DEBUG_EXECUTE_r({
6464             SV * const prop = sv_newmortal();
6465             regprop(prog, prop, p);
6466             PerlIO_printf(Perl_debug_log,
6467                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6468                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6469         });
6470     });
6471
6472     return(c);
6473 }
6474
6475
6476 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6477 /*
6478 - regclass_swash - prepare the utf8 swash
6479 */
6480
6481 SV *
6482 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6483 {
6484     dVAR;
6485     SV *sw  = NULL;
6486     SV *si  = NULL;
6487     SV *alt = NULL;
6488     RXi_GET_DECL(prog,progi);
6489     const struct reg_data * const data = prog ? progi->data : NULL;
6490
6491     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6492
6493     assert(ANYOF_NONBITMAP(node));
6494
6495     if (data && data->count) {
6496         const U32 n = ARG(node);
6497
6498         if (data->what[n] == 's') {
6499             SV * const rv = MUTABLE_SV(data->data[n]);
6500             AV * const av = MUTABLE_AV(SvRV(rv));
6501             SV **const ary = AvARRAY(av);
6502             SV **a, **b;
6503
6504             /* See the end of regcomp.c:S_regclass() for
6505              * documentation of these array elements. */
6506
6507             si = *ary;
6508             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6509             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6510
6511             if (a)
6512                 sw = *a;
6513             else if (si && doinit) {
6514                 sw = swash_init("utf8", "", si, 1, 0);
6515                 (void)av_store(av, 1, sw);
6516             }
6517             if (b)
6518                 alt = *b;
6519         }
6520     }
6521
6522     if (listsvp)
6523         *listsvp = si;
6524     if (altsvp)
6525         *altsvp  = alt;
6526
6527     return sw;
6528 }
6529 #endif
6530
6531 /*
6532  - reginclass - determine if a character falls into a character class
6533
6534   n is the ANYOF regnode
6535   p is the target string
6536   lenp is pointer to the maximum number of bytes of how far to go in p
6537     (This is assumed wthout checking to always be at least the current
6538     character's size)
6539   utf8_target tells whether p is in UTF-8.
6540
6541   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6542   from a successful match, the value it points to will be updated to how many
6543   bytes in p were matched.  If there was no match, the value is undefined,
6544   possibly changed from the input.
6545
6546   Note that this can be a synthetic start class, a combination of various
6547   nodes, so things you think might be mutually exclusive, such as locale,
6548   aren't.  It can match both locale and non-locale
6549
6550  */
6551
6552 STATIC bool
6553 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6554 {
6555     dVAR;
6556     const char flags = ANYOF_FLAGS(n);
6557     bool match = FALSE;
6558     UV c = *p;
6559     STRLEN c_len = 0;
6560     STRLEN maxlen;
6561
6562     PERL_ARGS_ASSERT_REGINCLASS;
6563
6564     /* If c is not already the code point, get it */
6565     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6566         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6567                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6568                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6569                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6570                  * UTF8_ALLOW_FFFF */
6571         if (c_len == (STRLEN)-1)
6572             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6573     }
6574     else {
6575         c_len = 1;
6576     }
6577
6578     /* Use passed in max length, or one character if none passed in or less
6579      * than one character.  And assume will match just one character.  This is
6580      * overwritten later if matched more. */
6581     if (lenp) {
6582         maxlen = (*lenp > c_len) ? *lenp : c_len;
6583         *lenp = c_len;
6584
6585     }
6586     else {
6587         maxlen = c_len;
6588     }
6589
6590     /* If this character is potentially in the bitmap, check it */
6591     if (c < 256) {
6592         if (ANYOF_BITMAP_TEST(n, c))
6593             match = TRUE;
6594         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6595                 && ! utf8_target
6596                 && ! isASCII(c))
6597         {
6598             match = TRUE;
6599         }
6600
6601         else if (flags & ANYOF_LOCALE) {
6602             PL_reg_flags |= RF_tainted;
6603
6604             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6605                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6606             {
6607                 match = TRUE;
6608             }
6609             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6610                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6611                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6612                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6613                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6614                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6615                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6616                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6617                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6618                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6619                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6620                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6621                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6622                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6623                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6624                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6625                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6626                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6627                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6628                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6629                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6630                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6631                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6632                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6633                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6634                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6635                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6636                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6637                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6638                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6639                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6640                      ) /* How's that for a conditional? */
6641             ) {
6642                 match = TRUE;
6643             }
6644         }
6645     }
6646
6647     /* If the bitmap didn't (or couldn't) match, and something outside the
6648      * bitmap could match, try that.  Locale nodes specifiy completely the
6649      * behavior of code points in the bit map (otherwise, a utf8 target would
6650      * cause them to be treated as Unicode and not locale), except in
6651      * the very unlikely event when this node is a synthetic start class, which
6652      * could be a combination of locale and non-locale nodes.  So allow locale
6653      * to match for the synthetic start class, which will give a false
6654      * positive that will be resolved when the match is done again as not part
6655      * of the synthetic start class */
6656     if (!match) {
6657         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6658             match = TRUE;       /* Everything above 255 matches */
6659         }
6660         else if (ANYOF_NONBITMAP(n)
6661                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6662                      || (utf8_target
6663                          && (c >=256
6664                              || (! (flags & ANYOF_LOCALE))
6665                              || (flags & ANYOF_IS_SYNTHETIC)))))
6666         {
6667             AV *av;
6668             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6669
6670             if (sw) {
6671                 U8 * utf8_p;
6672                 if (utf8_target) {
6673                     utf8_p = (U8 *) p;
6674                 } else {
6675
6676                     /* Not utf8.  Convert as much of the string as available up
6677                      * to the limit of how far the (single) character in the
6678                      * pattern can possibly match (no need to go further).  If
6679                      * the node is a straight ANYOF or not folding, it can't
6680                      * match more than one.  Otherwise, It can match up to how
6681                      * far a single char can fold to.  Since not utf8, each
6682                      * character is a single byte, so the max it can be in
6683                      * bytes is the same as the max it can be in characters */
6684                     STRLEN len = (OP(n) == ANYOF
6685                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6686                                   ? 1
6687                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6688                                     ? maxlen
6689                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6690                     utf8_p = bytes_to_utf8(p, &len);
6691                 }
6692
6693                 if (swash_fetch(sw, utf8_p, TRUE))
6694                     match = TRUE;
6695                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6696
6697                     /* Here, we need to test if the fold of the target string
6698                      * matches.  The non-multi char folds have all been moved to
6699                      * the compilation phase, and the multi-char folds have
6700                      * been stored by regcomp into 'av'; we linearly check to
6701                      * see if any match the target string (folded).   We know
6702                      * that the originals were each one character, but we don't
6703                      * currently know how many characters/bytes each folded to,
6704                      * except we do know that there are small limits imposed by
6705                      * Unicode.  XXX A performance enhancement would be to have
6706                      * regcomp.c store the max number of chars/bytes that are
6707                      * in an av entry, as, say the 0th element.  Even better
6708                      * would be to have a hash of the few characters that can
6709                      * start a multi-char fold to the max number of chars of
6710                      * those folds.
6711                      *
6712                      * If there is a match, we will need to advance (if lenp is
6713                      * specified) the match pointer in the target string.  But
6714                      * what we are comparing here isn't that string directly,
6715                      * but its fold, whose length may differ from the original.
6716                      * As we go along in constructing the fold, therefore, we
6717                      * create a map so that we know how many bytes in the
6718                      * source to advance given that we have matched a certain
6719                      * number of bytes in the fold.  This map is stored in
6720                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6721                      * the fold of the first character that we are folding.
6722                      * Then map_fold_len_back[n] is set to the number of bytes
6723                      * in that first character.  Similarly let m be the
6724                      * corresponding number for the second character to be
6725                      * folded.  Then map_fold_len_back[n+m] is set to the
6726                      * number of bytes occupied by the first two source
6727                      * characters. ... */
6728                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6729                     U8 folded[UTF8_MAXBYTES_CASE+1];
6730                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6731                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6732                                                   chars */
6733
6734                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6735
6736                         /* Here, only need to fold the first char of the target
6737                          * string.  It the source wasn't utf8, is 1 byte long */
6738                         to_utf8_fold(utf8_p, folded, &foldlen);
6739                         total_foldlen = foldlen;
6740                         map_fold_len_back[foldlen] = (utf8_target)
6741                                                      ? UTF8SKIP(utf8_p)
6742                                                      : 1;
6743                     }
6744                     else {
6745
6746                         /* Here, need to fold more than the first char.  Do so
6747                          * up to the limits */
6748                         U8* source_ptr = utf8_p;    /* The source for the fold
6749                                                        is the regex target
6750                                                        string */
6751                         U8* folded_ptr = folded;
6752                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6753                                                        available byte in the
6754                                                        target string */
6755                         U8 i;
6756                         for (i = 0;
6757                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6758                              i++)
6759                         {
6760
6761                             /* Fold the next character */
6762                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6763                             STRLEN this_char_foldlen;
6764                             to_utf8_fold(source_ptr,
6765                                          this_char_folded,
6766                                          &this_char_foldlen);
6767
6768                             /* Bail if it would exceed the byte limit for
6769                              * folding a single char. */
6770                             if (this_char_foldlen + folded_ptr - folded >
6771                                                             UTF8_MAXBYTES_CASE)
6772                             {
6773                                 break;
6774                             }
6775
6776                             /* Add the fold of this character */
6777                             Copy(this_char_folded,
6778                                  folded_ptr,
6779                                  this_char_foldlen,
6780                                  U8);
6781                             source_ptr += UTF8SKIP(source_ptr);
6782                             folded_ptr += this_char_foldlen;
6783                             total_foldlen = folded_ptr - folded;
6784
6785                             /* Create map from the number of bytes in the fold
6786                              * back to the number of bytes in the source.  If
6787                              * the source isn't utf8, the byte count is just
6788                              * the number of characters so far */
6789                             map_fold_len_back[total_foldlen]
6790                                                       = (utf8_target)
6791                                                         ? source_ptr - utf8_p
6792                                                         : i + 1;
6793                         }
6794                         *folded_ptr = '\0';
6795                     }
6796
6797
6798                     /* Do the linear search to see if the fold is in the list
6799                      * of multi-char folds. */
6800                     if (av) {
6801                         I32 i;
6802                         for (i = 0; i <= av_len(av); i++) {
6803                             SV* const sv = *av_fetch(av, i, FALSE);
6804                             STRLEN len;
6805                             const char * const s = SvPV_const(sv, len);
6806
6807                             if (len <= total_foldlen
6808                                 && memEQ(s, (char*)folded, len)
6809
6810                                    /* If 0, means matched a partial char. See
6811                                     * [perl #90536] */
6812                                 && map_fold_len_back[len])
6813                             {
6814
6815                                 /* Advance the target string ptr to account for
6816                                  * this fold, but have to translate from the
6817                                  * folded length to the corresponding source
6818                                  * length. */
6819                                 if (lenp) {
6820                                     *lenp = map_fold_len_back[len];
6821                                 }
6822                                 match = TRUE;
6823                                 break;
6824                             }
6825                         }
6826                     }
6827                 }
6828
6829                 /* If we allocated a string above, free it */
6830                 if (! utf8_target) Safefree(utf8_p);
6831             }
6832         }
6833     }
6834
6835     return (flags & ANYOF_INVERT) ? !match : match;
6836 }
6837
6838 STATIC U8 *
6839 S_reghop3(U8 *s, I32 off, const U8* lim)
6840 {
6841     /* return the position 'off' UTF-8 characters away from 's', forward if
6842      * 'off' >= 0, backwards if negative.  But don't go outside of position
6843      * 'lim', which better be < s  if off < 0 */
6844
6845     dVAR;
6846
6847     PERL_ARGS_ASSERT_REGHOP3;
6848
6849     if (off >= 0) {
6850         while (off-- && s < lim) {
6851             /* XXX could check well-formedness here */
6852             s += UTF8SKIP(s);
6853         }
6854     }
6855     else {
6856         while (off++ && s > lim) {
6857             s--;
6858             if (UTF8_IS_CONTINUED(*s)) {
6859                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6860                     s--;
6861             }
6862             /* XXX could check well-formedness here */
6863         }
6864     }
6865     return s;
6866 }
6867
6868 #ifdef XXX_dmq
6869 /* there are a bunch of places where we use two reghop3's that should
6870    be replaced with this routine. but since thats not done yet
6871    we ifdef it out - dmq
6872 */
6873 STATIC U8 *
6874 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6875 {
6876     dVAR;
6877
6878     PERL_ARGS_ASSERT_REGHOP4;
6879
6880     if (off >= 0) {
6881         while (off-- && s < rlim) {
6882             /* XXX could check well-formedness here */
6883             s += UTF8SKIP(s);
6884         }
6885     }
6886     else {
6887         while (off++ && s > llim) {
6888             s--;
6889             if (UTF8_IS_CONTINUED(*s)) {
6890                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6891                     s--;
6892             }
6893             /* XXX could check well-formedness here */
6894         }
6895     }
6896     return s;
6897 }
6898 #endif
6899
6900 STATIC U8 *
6901 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6902 {
6903     dVAR;
6904
6905     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6906
6907     if (off >= 0) {
6908         while (off-- && s < lim) {
6909             /* XXX could check well-formedness here */
6910             s += UTF8SKIP(s);
6911         }
6912         if (off >= 0)
6913             return NULL;
6914     }
6915     else {
6916         while (off++ && s > lim) {
6917             s--;
6918             if (UTF8_IS_CONTINUED(*s)) {
6919                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6920                     s--;
6921             }
6922             /* XXX could check well-formedness here */
6923         }
6924         if (off <= 0)
6925             return NULL;
6926     }
6927     return s;
6928 }
6929
6930 static void
6931 restore_pos(pTHX_ void *arg)
6932 {
6933     dVAR;
6934     regexp * const rex = (regexp *)arg;
6935     if (PL_reg_eval_set) {
6936         if (PL_reg_oldsaved) {
6937             rex->subbeg = PL_reg_oldsaved;
6938             rex->sublen = PL_reg_oldsavedlen;
6939 #ifdef PERL_OLD_COPY_ON_WRITE
6940             rex->saved_copy = PL_nrs;
6941 #endif
6942             RXp_MATCH_COPIED_on(rex);
6943         }
6944         PL_reg_magic->mg_len = PL_reg_oldpos;
6945         PL_reg_eval_set = 0;
6946         PL_curpm = PL_reg_oldcurpm;
6947     }
6948 }
6949
6950 STATIC void
6951 S_to_utf8_substr(pTHX_ register regexp *prog)
6952 {
6953     int i = 1;
6954
6955     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6956
6957     do {
6958         if (prog->substrs->data[i].substr
6959             && !prog->substrs->data[i].utf8_substr) {
6960             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6961             prog->substrs->data[i].utf8_substr = sv;
6962             sv_utf8_upgrade(sv);
6963             if (SvVALID(prog->substrs->data[i].substr)) {
6964                 if (SvTAIL(prog->substrs->data[i].substr)) {
6965                     /* Trim the trailing \n that fbm_compile added last
6966                        time.  */
6967                     SvCUR_set(sv, SvCUR(sv) - 1);
6968                     /* Whilst this makes the SV technically "invalid" (as its
6969                        buffer is no longer followed by "\0") when fbm_compile()
6970                        adds the "\n" back, a "\0" is restored.  */
6971                     fbm_compile(sv, FBMcf_TAIL);
6972                 } else
6973                     fbm_compile(sv, 0);
6974             }
6975             if (prog->substrs->data[i].substr == prog->check_substr)
6976                 prog->check_utf8 = sv;
6977         }
6978     } while (i--);
6979 }
6980
6981 STATIC void
6982 S_to_byte_substr(pTHX_ register regexp *prog)
6983 {
6984     dVAR;
6985     int i = 1;
6986
6987     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6988
6989     do {
6990         if (prog->substrs->data[i].utf8_substr
6991             && !prog->substrs->data[i].substr) {
6992             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6993             if (sv_utf8_downgrade(sv, TRUE)) {
6994                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6995                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
6996                         /* Trim the trailing \n that fbm_compile added last
6997                            time.  */
6998                         SvCUR_set(sv, SvCUR(sv) - 1);
6999                         fbm_compile(sv, FBMcf_TAIL);
7000                     } else
7001                         fbm_compile(sv, 0);
7002                 }
7003             } else {
7004                 SvREFCNT_dec(sv);
7005                 sv = &PL_sv_undef;
7006             }
7007             prog->substrs->data[i].substr = sv;
7008             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7009                 prog->check_substr = sv;
7010         }
7011     } while (i--);
7012 }
7013
7014 /*
7015  * Local variables:
7016  * c-indentation-style: bsd
7017  * c-basic-offset: 4
7018  * indent-tabs-mode: t
7019  * End:
7020  *
7021  * ex: set ts=8 sts=4 sw=4 noet:
7022  */