src/5015004/orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which, funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not be
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1       /* tainted information used? e.g. locale */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  98  * call if there are no complications: i.e., if everything matchable is
  99  * straight forward in the bitmap */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defines */
 125 #ifdef EBCDIC
 126     /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
 127      * skip the check on EBCDIC platforms */
 128 #   define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
 129 #else
 130 #   define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 131     if (!CAT2(PL_utf8_,class)) { \
 132         bool ok; \
 133         ENTER; save_re_context(); \
 134         ok=CAT2(is_utf8_,class)((const U8*)str); \
 135         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 136 #endif
 137
 138 /* Like LOAD_UTF8_CHARCLASS, but doesn't assert that the load succeeded */
 139 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 140     if (!CAT2(PL_utf8_,class)) { \
 141         bool throw_away __attribute__unused__; \
 142         ENTER; save_re_context(); \
 143         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 144         LEAVE; } } STMT_END
 145
 146 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 147 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 148 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 149
 150 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 151         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 152         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 153         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 154             * assert should likely and hopefully fail on an EBCDIC machine */ \
 155         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 156                                                                             \
 157         /* No asserts are done for these, in case called on an early        \
 158             * Unicode version in which they map to nothing */               \
 159         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 160         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 161         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 162         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 163         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 164         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 165         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 166
 167 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 168
 169 /* The actual code for CCC_TRY, which uses several variables from the routine
 170  * it's callable from.  It is designed to be the bulk of a case statement.
 171  * FUNC is the macro or function to call on non-utf8 targets that indicates if
 172  *      nextchr matches the class.
 173  * UTF8_TEST is the whole test string to use for utf8 targets
 174  * CLASS and STR are handed to LOAD_UTF8_CHARCLASS, which loads the swash for
 175  *      the class if it is not already present
 176  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 177  *      UTF8_TEST test.
 178  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 179  * utf8 and a variant, load the swash if necessary and test using the utf8
 180  * test.  Advance to the next character if test is ok, otherwise fail; If not
 181  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 182  * fails, or advance to the next character */
 183
 184 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 185     if (locinput >= PL_regeol) {                                              \
 186         sayNO;                                                                \
 187     }                                                                         \
 188     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 189         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 190         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 191             sayNO;                                                            \
 192         }                                                                     \
 193         locinput += PL_utf8skip[nextchr];                                     \
 194         nextchr = UCHARAT(locinput);                                          \
 195         break;                                                                \
 196     }                                                                         \
 197     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 198         sayNO;                                                                \
 199     }                                                                         \
 200     nextchr = UCHARAT(++locinput);                                            \
 201     break;
 202
 203 /* Handle the non-locale cases for a character class and its complement.  It
 204  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 205  * This is because that code fails when the test succeeds, so we want to have
 206  * the test fail so that the code succeeds.  The swash is stored in a
 207  * predictable PL_ place */
 208 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 209                            CLASS, STR)                                        \
 210     case NAME:                                                                \
 211         _CCC_TRY_CODE( !, FUNC,                                               \
 212                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 213                                             (U8*)locinput, TRUE)),            \
 214                           CLASS, STR)                                         \
 215     case NNAME:                                                               \
 216         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 217                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 218                                             (U8*)locinput, TRUE)),            \
 219                           CLASS, STR)                                         \
 220
 221 /* Generate the case statements for both locale and non-locale character
 222  * classes in regmatch for classes that don't have special unicode semantics.
 223  * Locales don't use an immediate swash, but an intermediary special locale
 224  * function that is called on the pointer to the current place in the input
 225  * string.  That function will resolve to needing the same swash.  One might
 226  * think that because we don't know what the locale will match, we shouldn't
 227  * check with the swash loading function that it loaded properly; ie, that we
 228  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 229  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 230  * irrelevant here */
 231 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 232                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 233                 NAMEA, NNAMEA, FUNCA,                                         \
 234                 CLASS, STR)                                                   \
 235     case NAMEL:                                                               \
 236         PL_reg_flags |= RF_tainted;                                           \
 237         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 238     case NNAMEL:                                                              \
 239         PL_reg_flags |= RF_tainted;                                           \
 240         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 241                        CLASS, STR)                                            \
 242     case NAMEA:                                                               \
 243         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 244             sayNO;                                                            \
 245         }                                                                     \
 246         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 247         nextchr = UCHARAT(++locinput);                                        \
 248         break;                                                                \
 249     case NNAMEA:                                                              \
 250         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 251             sayNO;                                                            \
 252         }                                                                     \
 253         if (utf8_target) {                                                    \
 254             locinput += PL_utf8skip[nextchr];                                 \
 255             nextchr = UCHARAT(locinput);                                      \
 256         }                                                                     \
 257         else {                                                                \
 258             nextchr = UCHARAT(++locinput);                                    \
 259         }                                                                     \
 260         break;                                                                \
 261     /* Generate the non-locale cases */                                       \
 262     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 263
 264 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 265  * statements to handle separate Unicode semantics nodes */
 266 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 267                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 268                   NAMEU, NNAMEU, FUNCU,                                        \
 269                   NAMEA, NNAMEA, FUNCA,                                        \
 270                   CLASS, STR)                                                  \
 271     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 272             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 273             NAMEA, NNAMEA, FUNCA,                                              \
 274             CLASS, STR)                                                        \
 275     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 276
 277 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 278
 279 /* for use after a quantifier and before an EXACT-like node -- japhy */
 280 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 281  *
 282  * NOTE that *nothing* that affects backtracking should be in here, specifically
 283  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 284  * node that is in between two EXACT like nodes when ascertaining what the required
 285  * "follow" character is. This should probably be moved to regex compile time
 286  * although it may be done at run time because of the REF possibility - more
 287  * investigation required. -- demerphq
 288 */
 289 #define JUMPABLE(rn) (      \
 290     OP(rn) == OPEN ||       \
 291     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 292     OP(rn) == EVAL ||   \
 293     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 294     OP(rn) == PLUS || OP(rn) == MINMOD || \
 295     OP(rn) == KEEPS || \
 296     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 297 )
 298 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 299
 300 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 301
 302 #if 0
 303 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 304    we don't need this definition. */
 305 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 306 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 307 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 308
 309 #else
 310 /* ... so we use this as it's faster. */
 311 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 312 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 313 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 314 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 315
 316 #endif
 317
 318 /*
 319   Search for mandatory following text node; for lookahead, the text must
 320   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 321 */
 322 #define FIND_NEXT_IMPT(rn) STMT_START { \
 323     while (JUMPABLE(rn)) { \
 324         const OPCODE type = OP(rn); \
 325         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 326             rn = NEXTOPER(NEXTOPER(rn)); \
 327         else if (type == PLUS) \
 328             rn = NEXTOPER(rn); \
 329         else if (type == IFMATCH) \
 330             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 331         else rn += NEXT_OFF(rn); \
 332     } \
 333 } STMT_END
 334
 335
 336 static void restore_pos(pTHX_ void *arg);
 337
 338 #define REGCP_PAREN_ELEMS 4
 339 #define REGCP_OTHER_ELEMS 5
 340 #define REGCP_FRAME_ELEMS 1
 341 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 342  * are needed for the regexp context stack bookkeeping. */
 343 /*
      * S_regcppush - checkpoint the capture-group state on the savestack.
      *
      * For every paren index p in (parenfloor, PL_regsize] this pushes
      * REGCP_PAREN_ELEMS items (end, start, start_tmp, p), then the
      * REGCP_OTHER_ELEMS global items, and finally a SAVEt_REGCONTEXT cookie
      * carrying the total element count.  S_regcppop() pops in exactly the
      * reverse order, so the two functions must be kept in step.
      *
      * parenfloor: groups numbered above this value are saved.
      *
      * Returns the value PL_savestack_ix had on entry.  Croaks with a "panic"
      * message if the element count is negative or does not survive being
      * shifted by SAVE_TIGHT_SHIFT.
      */
 344 STATIC CHECKPOINT
 345 S_regcppush(pTHX_ I32 parenfloor)
 346 {
 347     dVAR;
 348     const int retval = PL_savestack_ix;
 349     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 350     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 351     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 352     int p;
 353     GET_RE_DEBUG_FLAGS_DECL;
 354
         /* parenfloor above PL_regsize would give a negative count. */
 355     if (paren_elems_to_push < 0)
 356         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 357
         /* The count is packed into the cookie by shifting it left; shifting
          * it back must give the same value, otherwise high bits were lost. */
 358     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 359         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 360                    " out of range (%lu-%ld)",
 361                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 362
         /* Reserve the space up front.  REGCP_FRAME_ELEMS presumably accounts
          * for the cookie slot pushed last -- TODO confirm. */
 363     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 364
         /* Highest-numbered group first, so that the pop loop sees the
          * lowest-numbered saved group first. */
 365     for (p = PL_regsize; p > parenfloor; p--) {
 366 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 367         SSPUSHINT(PL_regoffs[p].end);
 368         SSPUSHINT(PL_regoffs[p].start);
 369         SSPUSHPTR(PL_reg_start_tmp[p]);
 370         SSPUSHINT(p);
 371         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 372           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 373                       (UV)p, (IV)PL_regoffs[p].start,
 374                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 375                       (IV)PL_regoffs[p].end
 376         ));
 377     }
 378 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 379     SSPUSHPTR(PL_regoffs);
 380     SSPUSHINT(PL_regsize);
 381     SSPUSHINT(*PL_reglastparen);
 382     SSPUSHINT(*PL_reglastcloseparen);
 383     SSPUSHPTR(PL_reginput);
 384     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 385
 386     return retval;
 387 }
 388
 389 /* These are needed since we do not localize EVAL nodes: */
 390 #define REGCP_SET(cp)                                           \
 391     DEBUG_STATE_r(                                              \
 392             PerlIO_printf(Perl_debug_log,                       \
 393                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 394                 (IV)PL_savestack_ix));                          \
 395     cp = PL_savestack_ix
 396
 397 #define REGCP_UNWIND(cp)                                        \
 398     DEBUG_STATE_r(                                              \
 399         if (cp != PL_savestack_ix)                              \
 400             PerlIO_printf(Perl_debug_log,                       \
 401                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 402                 (IV)(cp), (IV)PL_savestack_ix));                \
 403     regcpblow(cp)
 404 /*
      * S_regcppop - restore the capture-group state saved by S_regcppush().
      *
      * Pops the SAVEt_REGCONTEXT cookie, the REGCP_OTHER_ELEMS global items
      * and then one REGCP_PAREN_ELEMS group per saved paren, in the reverse
      * of the order S_regcppush() pushed them.  Afterwards every group
      * numbered above *PL_reglastparen, up to rex->nparens, has its end
      * offset reset to -1 (and its start offset too when the group is also
      * above PL_regsize).
      *
      * rex: the regexp being matched; only rex->nparens is read here.
      *
      * Returns the input pointer that was popped from the savestack.
      */
 405 STATIC char *
 406 S_regcppop(pTHX_ const regexp *rex)
 407 {
 408     dVAR;
 409     UV i;
 410     char *input;
 411     GET_RE_DEBUG_FLAGS_DECL;
 412
 413     PERL_ARGS_ASSERT_REGCPPOP;
 414
 415     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 416     i = SSPOPUV;
 417     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 418     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 419     input = (char *) SSPOPPTR;
 420     *PL_reglastcloseparen = SSPOPINT;
 421     *PL_reglastparen = SSPOPINT;
 422     PL_regsize = SSPOPINT;
 423     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 424
         /* The count in the cookie included the items popped just above. */
 425     i -= REGCP_OTHER_ELEMS;
 426     /* Now restore the parentheses context. */
         /* i is unsigned, so this loop relies on the remaining count being an
          * exact multiple of REGCP_PAREN_ELEMS. */
 427     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 428         I32 tmps;
 429         U32 paren = (U32)SSPOPINT;
 430         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 431         PL_regoffs[paren].start = SSPOPINT;
 432         tmps = SSPOPINT;
             /* The saved end offset is only reinstated for groups at or below
              * the restored *PL_reglastparen; the others are reset below. */
 433         if (paren <= *PL_reglastparen)
 434             PL_regoffs[paren].end = tmps;
 435         DEBUG_BUFFERS_r(
 436             PerlIO_printf(Perl_debug_log,
 437                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 438                           (UV)paren, (IV)PL_regoffs[paren].start,
 439                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 440                           (IV)PL_regoffs[paren].end,
 441                           (paren > *PL_reglastparen ? "(no)" : ""));
 442         );
 443     }
 444     DEBUG_BUFFERS_r(
 445         if (*PL_reglastparen + 1 <= rex->nparens) {
 446             PerlIO_printf(Perl_debug_log,
 447                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 448                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 449         }
 450     );
 451 #if 1
 452     /* It would seem that the similar code in regtry()
 453      * already takes care of this, and in fact it is in
 454      * a better location too, since this code can be #if 0-ed out
 455      * but the code in regtry() is needed or otherwise tests
 456      * requiring null fields (pat.t#187 and split.t#{13,14}
 457      * (as of patchlevel 7877)  will fail.  Then again,
 458      * this code seems to be necessary or otherwise
 459      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 460      * --jhi updated by dapm */
 461     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 462         if (i > PL_regsize)
 463             PL_regoffs[i].start = -1;
 464         PL_regoffs[i].end = -1;
 465     }
 466 #endif
 467     return input;
 468 }
 469
 470 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 471
 472 /*
 473  * pregexec and friends
 474  */
 475
 476 #ifndef PERL_IN_XSUB_RE
 477 /*
 478  - pregexec - match a regexp against a string
      *
      * Thin wrapper: forwards prog, stringarg, strend, strbeg, minend and
      * screamer unchanged to regexec_flags(), passes NULL for the argument
      * that follows screamer, and passes REXEC_COPY_STR as the flags unless
      * nosave is nonzero, in which case the flags are 0.  Returns whatever
      * regexec_flags() returns.
 479  */
 480 I32
 481 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 482          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 483 /* strend: pointer to null at end of string */
 484 /* strbeg: real beginning of string */
 485 /* minend: end of match must be >=minend after stringarg. */
 486 /* nosave: For optimizations. */
 487 {
 488     PERL_ARGS_ASSERT_PREGEXEC;
 489
 490     return
 491         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 492                       nosave ? 0 : REXEC_COPY_STR);
 493 }
 494 #endif
 495
 496 /*
 497  * Need to implement the following flags for reg_anch:
 498  *
 499  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 500  * USE_INTUIT_ML
 501  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 502  * INTUIT_AUTORITATIVE_ML
 503  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 504  * INTUIT_ONCE_ML
 505  *
 506  * Another flag for this function: SECOND_TIME (so that float substrs
 507  * with giant delta may be not rechecked).
 508  */
 509
 510 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 511
 512 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 513    Otherwise, only SvCUR(sv) is used to get strbeg. */
 514
 515 /* XXXX We assume that strpos is strbeg unless sv. */
 516
 517 /* XXXX Some places assume that there is a fixed substring.
 518         An update may be needed if optimizer marks as "INTUITable"
 519         RExen without fixed substrings.  Similarly, it is assumed that
 520         lengths of all the strings are no more than minlen, thus they
 521         cannot come from lookahead.
 522         (Or minlen should take into account lookahead.)
 523   NOTE: Some of this comment is not correct. minlen does now take account
 524   of lookahead/behind. Further research is required. -- demerphq
 525
 526 */
 527
 528 /* A failure to find a constant substring means that there is no need to make
 529    an expensive call to the REx engine, thus we celebrate a failure.  Similarly,
 530    finding a substring too deep into the string means that fewer calls to
 531    regtry() should be needed.
 532
 533    REx compiler's optimizer found 4 possible hints:
 534         a) Anchored substring;
 535         b) Fixed substring;
 536         c) Whether we are anchored (beginning-of-line or \G);
 537         d) First node (of those at offset 0) which may distinguish positions;
 538    We use a)b)d) and multiline-part of c), and try to find a position in the
 539    string which does not contradict any of them.
 540  */
 541
 542 /* Most of the decisions we make here should have been made at compile time.
 543    The nodes of the REx which we used for the search should have been
 544    deleted from the finite automaton. */
 545
 546 char *
 547 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 548                      char *strend, const U32 flags, re_scream_pos_data *data)
 549 {
 550     dVAR;
 551     struct regexp *const prog = (struct regexp *)SvANY(rx);
 552     register I32 start_shift = 0;
 553     /* Should be nonnegative! */
 554     register I32 end_shift   = 0;
 555     register char *s;
 556     register SV *check;
 557     char *strbeg;
 558     char *t;
 559     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 560     I32 ml_anch;
 561     register char *other_last = NULL;   /* other substr checked before this */
 562     char *check_at = NULL;              /* check substr found at this pos */
 563     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 564     RXi_GET_DECL(prog,progi);
 565 #ifdef DEBUGGING
 566     const char * const i_strpos = strpos;
 567 #endif
 568     GET_RE_DEBUG_FLAGS_DECL;
 569
 570     PERL_ARGS_ASSERT_RE_INTUIT_START;
 571
 572     RX_MATCH_UTF8_set(rx,utf8_target);
 573
 574     if (RX_UTF8(rx)) {
 575         PL_reg_flags |= RF_utf8;
 576     }
 577     DEBUG_EXECUTE_r(
 578         debug_start_match(rx, utf8_target, strpos, strend,
 579             sv ? "Guessing start of match in sv for"
 580                : "Guessing start of match in string for");
 581               );
 582
 583     /* CHR_DIST() would be more correct here but it makes things slow. */
 584     if (prog->minlen > strend - strpos) {
 585         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 586                               "String too short... [re_intuit_start]\n"));
 587         goto fail;
 588     }
 589
 590     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 591     PL_regeol = strend;
 592     if (utf8_target) {
 593         if (!prog->check_utf8 && prog->check_substr)
 594             to_utf8_substr(prog);
 595         check = prog->check_utf8;
 596     } else {
 597         if (!prog->check_substr && prog->check_utf8)
 598             to_byte_substr(prog);
 599         check = prog->check_substr;
 600     }
 601     if (check == &PL_sv_undef) {
 602         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 603                 "Non-utf8 string cannot match utf8 check string\n"));
 604         goto fail;
 605     }
 606     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 607         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 608                      || ( (prog->extflags & RXf_ANCH_BOL)
 609                           && !multiline ) );    /* Check after \n? */
 610
 611         if (!ml_anch) {
 612           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 613                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 614                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 615                && sv && !SvROK(sv)
 616                && (strpos != strbeg)) {
 617               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 618               goto fail;
 619           }
 620           if (prog->check_offset_min == prog->check_offset_max &&
 621               !(prog->extflags & RXf_CANY_SEEN)) {
 622             /* Substring at constant offset from beg-of-str... */
 623             I32 slen;
 624
 625             s = HOP3c(strpos, prog->check_offset_min, strend);
 626
 627             if (SvTAIL(check)) {
 628                 slen = SvCUR(check);    /* >= 1 */
 629
 630                 if ( strend - s > slen || strend - s < slen - 1
 631                      || (strend - s == slen && strend[-1] != '\n')) {
 632                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 633                     goto fail_finish;
 634                 }
 635                 /* Now should match s[0..slen-2] */
 636                 slen--;
 637                 if (slen && (*SvPVX_const(check) != *s
 638                              || (slen > 1
 639                                  && memNE(SvPVX_const(check), s, slen)))) {
 640                   report_neq:
 641                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 642                     goto fail_finish;
 643                 }
 644             }
 645             else if (*SvPVX_const(check) != *s
 646                      || ((slen = SvCUR(check)) > 1
 647                          && memNE(SvPVX_const(check), s, slen)))
 648                 goto report_neq;
 649             check_at = s;
 650             goto success_at_start;
 651           }
 652         }
 653         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 654         s = strpos;
 655         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 656         end_shift = prog->check_end_shift;
 657
 658         if (!ml_anch) {
 659             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 660                                          - (SvTAIL(check) != 0);
 661             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 662
 663             if (end_shift < eshift)
 664                 end_shift = eshift;
 665         }
 666     }
 667     else {                              /* Can match at random position */
 668         ml_anch = 0;
 669         s = strpos;
 670         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 671         end_shift = prog->check_end_shift;
 672
 673         /* end shift should be non negative here */
 674     }
 675
 676 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 677     if (end_shift < 0)
 678         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 679                    (IV)end_shift, RX_PRECOMP(prog));
 680 #endif
 681
 682   restart:
 683     /* Find a possible match in the region s..strend by looking for
 684        the "check" substring in the region corrected by start/end_shift. */
 685
 686     {
 687         I32 srch_start_shift = start_shift;
 688         I32 srch_end_shift = end_shift;
 689         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 690             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 691             srch_start_shift = strbeg - s;
 692         }
 693     DEBUG_OPTIMISE_MORE_r({
 694         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 695             (IV)prog->check_offset_min,
 696             (IV)srch_start_shift,
 697             (IV)srch_end_shift,
 698             (IV)prog->check_end_shift);
 699     });
 700
 701     if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
 702         I32 p = -1;                     /* Internal iterator of scream. */
 703         I32 * const pp = data ? data->scream_pos : &p;
 704         const MAGIC *mg;
 705         bool found = FALSE;
 706
 707         assert(SvMAGICAL(sv));
 708         mg = mg_find(sv, PERL_MAGIC_study);
 709         assert(mg);
 710
 711         if (mg->mg_private == 1) {
 712             found = ((U8 *)mg->mg_ptr)[BmRARE(check)] != (U8)~0;
 713         } else if (mg->mg_private == 2) {
 714             found = ((U16 *)mg->mg_ptr)[BmRARE(check)] != (U16)~0;
 715         } else {
 716             assert (mg->mg_private == 4);
 717             found = ((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0;
 718         }
 719
 720         if (found
 721             || ( BmRARE(check) == '\n'
 722                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 723                  && SvTAIL(check) ))
 724             s = screaminstr(sv, check,
 725                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 726         else
 727             goto fail_finish;
 728         /* we may be pointing at the wrong string */
 729         if (s && RXp_MATCH_COPIED(prog))
 730             s = strbeg + (s - SvPVX_const(sv));
 731         if (data)
 732             *data->scream_olds = s;
 733     }
 734     else {
 735         U8* start_point;
 736         U8* end_point;
 737         if (prog->extflags & RXf_CANY_SEEN) {
 738             start_point= (U8*)(s + srch_start_shift);
 739             end_point= (U8*)(strend - srch_end_shift);
 740         } else {
 741             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 742             end_point= HOP3(strend, -srch_end_shift, strbeg);
 743         }
 744         DEBUG_OPTIMISE_MORE_r({
 745             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 746                 (int)(end_point - start_point),
 747                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 748                 start_point);
 749         });
 750
 751         s = fbm_instr( start_point, end_point,
 752                       check, multiline ? FBMrf_MULTILINE : 0);
 753     }
 754     }
 755     /* Update the count-of-usability, remove useless subpatterns,
 756         unshift s.  */
 757
 758     DEBUG_EXECUTE_r({
 759         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 760             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 761         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 762                           (s ? "Found" : "Did not find"),
 763             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 764                 ? "anchored" : "floating"),
 765             quoted,
 766             RE_SV_TAIL(check),
 767             (s ? " at offset " : "...\n") );
 768     });
 769
 770     if (!s)
 771         goto fail_finish;
 772     /* Finish the diagnostic message */
 773     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 774
 775     /* XXX dmq: first branch is for positive lookbehind...
 776        Our check string is offset from the beginning of the pattern.
 777        So we need to do any stclass tests offset forward from that
 778        point. I think. :-(
 779      */
 780
 781
 782
 783     check_at=s;
 784
 785
 786     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 787        Start with the other substr.
 788        XXXX no SCREAM optimization yet - and a very coarse implementation
 789        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 790                 *always* match.  Probably should be marked during compile...
 791        Probably it is right to do no SCREAM here...
 792      */
 793
 794     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 795                 : (prog->float_substr && prog->anchored_substr))
 796     {
 797         /* Take into account the "other" substring. */
 798         /* XXXX May be hopelessly wrong for UTF... */
 799         if (!other_last)
 800             other_last = strpos;
 801         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 802           do_other_anchored:
 803             {
 804                 char * const last = HOP3c(s, -start_shift, strbeg);
 805                 char *last1, *last2;
 806                 char * const saved_s = s;
 807                 SV* must;
 808
 809                 t = s - prog->check_offset_max;
 810                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 811                     && (!utf8_target
 812                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 813                             && t > strpos)))
 814                     NOOP;
 815                 else
 816                     t = strpos;
 817                 t = HOP3c(t, prog->anchored_offset, strend);
 818                 if (t < other_last)     /* These positions already checked */
 819                     t = other_last;
 820                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 821                 if (last < last1)
 822                     last1 = last;
 823                 /* XXXX It is not documented what units *_offsets are in.
 824                    We assume bytes, but this is clearly wrong.
 825                    Meaning this code needs to be carefully reviewed for errors.
 826                    dmq.
 827                   */
 828
 829                 /* On end-of-str: see comment below. */
 830                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 831                 if (must == &PL_sv_undef) {
 832                     s = (char*)NULL;
 833                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 834                 }
 835                 else
 836                     s = fbm_instr(
 837                         (unsigned char*)t,
 838                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 839                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 840                         must,
 841                         multiline ? FBMrf_MULTILINE : 0
 842                     );
 843                 DEBUG_EXECUTE_r({
 844                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 845                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 846                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 847                         (s ? "Found" : "Contradicts"),
 848                         quoted, RE_SV_TAIL(must));
 849                 });
 850
 851
 852                 if (!s) {
 853                     if (last1 >= last2) {
 854                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 855                                                 ", giving up...\n"));
 856                         goto fail_finish;
 857                     }
 858                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 859                         ", trying floating at offset %ld...\n",
 860                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 861                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 862                     s = HOP3c(last, 1, strend);
 863                     goto restart;
 864                 }
 865                 else {
 866                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 867                           (long)(s - i_strpos)));
 868                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 869                     other_last = HOP3c(s, 1, strend);
 870                     s = saved_s;
 871                     if (t == strpos)
 872                         goto try_at_start;
 873                     goto try_at_offset;
 874                 }
 875             }
 876         }
 877         else {          /* Take into account the floating substring. */
 878             char *last, *last1;
 879             char * const saved_s = s;
 880             SV* must;
 881
 882             t = HOP3c(s, -start_shift, strbeg);
 883             last1 = last =
 884                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 885             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 886                 last = HOP3c(t, prog->float_max_offset, strend);
 887             s = HOP3c(t, prog->float_min_offset, strend);
 888             if (s < other_last)
 889                 s = other_last;
 890  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 891             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 892             /* fbm_instr() takes into account exact value of end-of-str
 893                if the check is SvTAIL(ed).  Since false positives are OK,
 894                and end-of-str is not later than strend we are OK. */
 895             if (must == &PL_sv_undef) {
 896                 s = (char*)NULL;
 897                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 898             }
 899             else
 900                 s = fbm_instr((unsigned char*)s,
 901                               (unsigned char*)last + SvCUR(must)
 902                                   - (SvTAIL(must)!=0),
 903                               must, multiline ? FBMrf_MULTILINE : 0);
 904             DEBUG_EXECUTE_r({
 905                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 906                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 907                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 908                     (s ? "Found" : "Contradicts"),
 909                     quoted, RE_SV_TAIL(must));
 910             });
 911             if (!s) {
 912                 if (last1 == last) {
 913                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 914                                             ", giving up...\n"));
 915                     goto fail_finish;
 916                 }
 917                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 918                     ", trying anchored starting at offset %ld...\n",
 919                     (long)(saved_s + 1 - i_strpos)));
 920                 other_last = last;
 921                 s = HOP3c(t, 1, strend);
 922                 goto restart;
 923             }
 924             else {
 925                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 926                       (long)(s - i_strpos)));
 927                 other_last = s; /* Fix this later. --Hugo */
 928                 s = saved_s;
 929                 if (t == strpos)
 930                     goto try_at_start;
 931                 goto try_at_offset;
 932             }
 933         }
 934     }
 935
 936
 937     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 938
 939     DEBUG_OPTIMISE_MORE_r(
 940         PerlIO_printf(Perl_debug_log,
 941             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 942             (IV)prog->check_offset_min,
 943             (IV)prog->check_offset_max,
 944             (IV)(s-strpos),
 945             (IV)(t-strpos),
 946             (IV)(t-s),
 947             (IV)(strend-strpos)
 948         )
 949     );
 950
 951     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 952         && (!utf8_target
 953             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 954                  && t > strpos)))
 955     {
 956         /* Fixed substring is found far enough so that the match
 957            cannot start at strpos. */
 958       try_at_offset:
 959         if (ml_anch && t[-1] != '\n') {
 960             /* Eventually fbm_*() should handle this, but often
 961                anchored_offset is not 0, so this check will not be wasted. */
 962             /* XXXX In the code below we prefer to look for "^" even in
 963                presence of anchored substrings.  And we search even
 964                beyond the found float position.  These pessimizations
 965                are historical artefacts only.  */
 966           find_anchor:
 967             while (t < strend - prog->minlen) {
 968                 if (*t == '\n') {
 969                     if (t < check_at - prog->check_offset_min) {
 970                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 971                             /* Since we moved from the found position,
 972                                we definitely contradict the found anchored
 973                                substr.  Due to the above check we do not
 974                                contradict "check" substr.
 975                                Thus we can arrive here only if check substr
 976                                is float.  Redo checking for "other"=="fixed".
 977                              */
 978                             strpos = t + 1;
 979                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 980                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 981                             goto do_other_anchored;
 982                         }
 983                         /* We don't contradict the found floating substring. */
 984                         /* XXXX Why not check for STCLASS? */
 985                         s = t + 1;
 986                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 987                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 988                         goto set_useful;
 989                     }
 990                     /* Position contradicts check-string */
 991                     /* XXXX probably better to look for check-string
 992                        than for "\n", so one should lower the limit for t? */
 993                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 994                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 995                     other_last = strpos = s = t + 1;
 996                     goto restart;
 997                 }
 998                 t++;
 999             }
1000             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1001                         PL_colors[0], PL_colors[1]));
1002             goto fail_finish;
1003         }
1004         else {
1005             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1006                         PL_colors[0], PL_colors[1]));
1007         }
1008         s = t;
1009       set_useful:
1010         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1011     }
1012     else {
1013         /* The found string does not prohibit matching at strpos,
1014            - no optimization of calling REx engine can be performed,
1015            unless it was an MBOL and we are not after MBOL,
1016            or a future STCLASS check will fail this. */
1017       try_at_start:
1018         /* Even in this situation we may use MBOL flag if strpos is offset
1019            wrt the start of the string. */
1020         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1021             && (strpos != strbeg) && strpos[-1] != '\n'
1022             /* May be due to an implicit anchor of m{.*foo}  */
1023             && !(prog->intflags & PREGf_IMPLICIT))
1024         {
1025             t = strpos;
1026             goto find_anchor;
1027         }
1028         DEBUG_EXECUTE_r( if (ml_anch)
1029             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1030                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1031         );
1032       success_at_start:
1033         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1034             && (utf8_target ? (
1035                 prog->check_utf8                /* Could be deleted already */
1036                 && --BmUSEFUL(prog->check_utf8) < 0
1037                 && (prog->check_utf8 == prog->float_utf8)
1038             ) : (
1039                 prog->check_substr              /* Could be deleted already */
1040                 && --BmUSEFUL(prog->check_substr) < 0
1041                 && (prog->check_substr == prog->float_substr)
1042             )))
1043         {
1044             /* If flags & SOMETHING - do not do it many times on the same match */
1045             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1046             /* XXX Does the destruction order have to change with utf8_target? */
1047             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1048             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1049             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1050             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1051             check = NULL;                       /* abort */
1052             s = strpos;
1053             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1054                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1055             if (prog->intflags & PREGf_IMPLICIT)
1056                 prog->extflags &= ~RXf_ANCH_MBOL;
1057             /* XXXX This is a remnant of the old implementation.  It
1058                     looks wasteful, since now INTUIT can use many
1059                     other heuristics. */
1060             prog->extflags &= ~RXf_USE_INTUIT;
1061             /* XXXX What other flags might need to be cleared in this branch? */
1062         }
1063         else
1064             s = strpos;
1065     }
1066
1067     /* Last resort... */
1068     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1069     /* trie stclasses are too expensive to use here, we are better off to
1070        leave it to regmatch itself */
1071     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1072         /* minlen == 0 is possible if regstclass is \b or \B,
1073            and the fixed substr is ''$.
1074            Since minlen is already taken into account, s+1 is before strend;
1075            accidentally, minlen >= 1 guarantees no false positives at s + 1
1076            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1077            regstclass does not come from lookahead...  */
1078         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1079            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1080         const U8* const str = (U8*)STRING(progi->regstclass);
1081         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1082                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1083                     : 1);
1084         char * endpos;
1085         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1086             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1087         else if (prog->float_substr || prog->float_utf8)
1088             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1089         else
1090             endpos= strend;
1091
1092         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1093                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1094
1095         t = s;
1096         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1097         if (!s) {
1098 #ifdef DEBUGGING
1099             const char *what = NULL;
1100 #endif
1101             if (endpos == strend) {
1102                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1103                                 "Could not match STCLASS...\n") );
1104                 goto fail;
1105             }
1106             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1107                                    "This position contradicts STCLASS...\n") );
1108             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1109                 goto fail;
1110             /* Contradict one of substrings */
1111             if (prog->anchored_substr || prog->anchored_utf8) {
1112                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1113                     DEBUG_EXECUTE_r( what = "anchored" );
1114                   hop_and_restart:
1115                     s = HOP3c(t, 1, strend);
1116                     if (s + start_shift + end_shift > strend) {
1117                         /* XXXX Should be taken into account earlier? */
1118                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119                                                "Could not match STCLASS...\n") );
1120                         goto fail;
1121                     }
1122                     if (!check)
1123                         goto giveup;
1124                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1125                                 "Looking for %s substr starting at offset %ld...\n",
1126                                  what, (long)(s + start_shift - i_strpos)) );
1127                     goto restart;
1128                 }
1129                 /* Have both, check_string is floating */
1130                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1131                     goto retry_floating_check;
1132                 /* Recheck anchored substring, but not floating... */
1133                 s = check_at;
1134                 if (!check)
1135                     goto giveup;
1136                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1137                           "Looking for anchored substr starting at offset %ld...\n",
1138                           (long)(other_last - i_strpos)) );
1139                 goto do_other_anchored;
1140             }
1141             /* Another way we could have checked stclass at the
1142                current position only: */
1143             if (ml_anch) {
1144                 s = t = t + 1;
1145                 if (!check)
1146                     goto giveup;
1147                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1148                           "Looking for /%s^%s/m starting at offset %ld...\n",
1149                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1150                 goto try_at_offset;
1151             }
1152             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1153                 goto fail;
1154             /* Check is floating substring. */
1155           retry_floating_check:
1156             t = check_at - start_shift;
1157             DEBUG_EXECUTE_r( what = "floating" );
1158             goto hop_and_restart;
1159         }
1160         if (t != s) {
1161             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1162                         "By STCLASS: moving %ld --> %ld\n",
1163                                   (long)(t - i_strpos), (long)(s - i_strpos))
1164                    );
1165         }
1166         else {
1167             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1168                                   "Does not contradict STCLASS...\n");
1169                    );
1170         }
1171     }
1172   giveup:
1173     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1174                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1175                           PL_colors[5], (long)(s - i_strpos)) );
1176     return s;
1177
1178   fail_finish:                          /* Substring not found */
1179     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1180         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1181   fail:
1182     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1183                           PL_colors[4], PL_colors[5]));
1184     return NULL;
1185 }
1186
/* DECL_TRIE_TYPE(scan): declare a const local named `trie_type`, of an
 * anonymous enum type, describing how characters must be read for a trie
 * node.
 *
 *   scan   pointer to the node; only its `flags` member is read.
 *
 * Selection:
 *   (scan)->flags != EXACT:
 *       utf8_target set  -> trie_utf8_fold
 *       else UTF_PATTERN -> trie_latin_utf8_fold
 *       else             -> trie_plain
 *   (scan)->flags == EXACT:
 *       utf8_target set  -> trie_utf8
 *       else             -> trie_plain
 *
 * `utf8_target` and `UTF_PATTERN` are taken from the expansion site.  The
 * expansion is a declaration without its terminating semicolon; the caller
 * supplies it.  Because it introduces the enumerators and `trie_type`, it
 * can be used only once per scope.
 *
 * The parameter is parenthesized so that a non-trivial pointer expression
 * (e.g. `p + 1`) is dereferenced as a whole rather than having `->` bind to
 * its last operand.
 */
1187 #define DECL_TRIE_TYPE(scan) \
1188     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1189                     trie_type = ((scan)->flags != EXACT) \
1190                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1191                               : (utf8_target ? trie_utf8 : trie_plain)
1192
/* REXEC_TRIE_READ_CHAR: fetch the next code point to feed to a trie and
 * translate it into the trie's character id.
 *
 * Inputs:  trie_type (one of the DECL_TRIE_TYPE enumerators), trie,
 *          widecharmap (may be NULL), uc (current input position),
 *          foldbuf (scratch buffer for a fold), uniflags (passed through to
 *          utf8n_to_uvuni).
 * Outputs (all assigned to, so they must be lvalues):
 *          uscan, len, uvc, charid, foldlen.
 *
 * Step 1 - obtain uvc, by trie_type:
 *   trie_utf8_fold
 *       foldlen > 0:  bytes of an earlier fold are still pending.  Decode
 *                     the next code point from uscan, subtract the decoded
 *                     length from foldlen, advance uscan by it, and set
 *                     len to 0.
 *       otherwise:    decode a code point from uc, fold it with
 *                     to_uni_fold() into foldbuf, then subtract
 *                     UNISKIP(uvc) from foldlen and set uscan to
 *                     foldbuf + UNISKIP(uvc).
 *   trie_latin_utf8_fold
 *       Same as above, except that when nothing is pending the input is the
 *       single byte *uc and len is set to 1.
 *   trie_utf8
 *       Decode a code point from uc; no folding.
 *   trie_plain
 *       uvc is the byte *uc and len is 1.
 *
 * Step 2 - map uvc to charid:
 *   uvc < 256:  charid = trie->charmap[uvc].
 *   otherwise:  charid = 0, unless widecharmap is non-NULL and hv_fetch()
 *               finds an entry keyed by the sizeof(UV) raw bytes of uvc, in
 *               which case charid = (U16)SvIV of that entry.
 *
 * NOTE(review): utf8n_to_uvuni() and to_uni_fold() are defined elsewhere;
 * that they store a byte length in len / foldlen is inferred from how those
 * values are used here - confirm against their definitions.
 * NOTE(review): `trie` and `uc` are expanded unparenthesized, so callers
 * should pass simple identifiers.
 */
1193 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1194 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1195     switch (trie_type) {                                                    \
1196     case trie_utf8_fold:                                                    \
1197         if ( foldlen>0 ) {                                                  \
1198             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1199             foldlen -= len;                                                 \
1200             uscan += len;                                                   \
1201             len=0;                                                          \
1202         } else {                                                            \
1203             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1204             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1205             foldlen -= UNISKIP( uvc );                                      \
1206             uscan = foldbuf + UNISKIP( uvc );                               \
1207         }                                                                   \
1208         break;                                                              \
1209     case trie_latin_utf8_fold:                                              \
1210         if ( foldlen>0 ) {                                                  \
1211             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1212             foldlen -= len;                                                 \
1213             uscan += len;                                                   \
1214             len=0;                                                          \
1215         } else {                                                            \
1216             len = 1;                                                        \
1217             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1218             foldlen -= UNISKIP( uvc );                                      \
1219             uscan = foldbuf + UNISKIP( uvc );                               \
1220         }                                                                   \
1221         break;                                                              \
1222     case trie_utf8:                                                         \
1223         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1224         break;                                                              \
1225     case trie_plain:                                                        \
1226         uvc = (UV)*uc;                                                      \
1227         len = 1;                                                            \
1228     }                                                                       \
1229     if (uvc < 256) {                                                        \
1230         charid = trie->charmap[ uvc ];                                      \
1231     }                                                                       \
1232     else {                                                                  \
1233         charid = 0;                                                         \
1234         if (widecharmap) {                                                  \
1235             SV** const svpp = hv_fetch(widecharmap,                         \
1236                         (char*)&uvc, sizeof(UV), 0);                        \
1237             if (svpp)                                                       \
1238                 charid = (U16)SvIV(*svpp);                                  \
1239         }                                                                   \
1240     }                                                                       \
1241 } STMT_END
1242
/* REXEC_FBC_EXACTISH_SCAN(CoNd): byte-wise scan for an exact-ish match.
 *
 * Walks `s` forward one byte at a time while s <= e (the bound is
 * inclusive).  At each position it jumps to the label `got_it` when all of
 * the following hold, evaluated in this order with short-circuiting:
 *   1. CoNd is true (the caller's test at s);
 *   2. ln == 1, or folder(s, pat_string, ln) returns non-zero;
 *   3. reginfo is NULL, or regtry(reginfo, &s) returns non-zero.
 * If no position qualifies, the loop ends with s > e and control falls
 * through.
 *
 * Uses from the expansion site: s, e, ln, folder, pat_string, reginfo and
 * the label got_it.  regtry() is handed &s, so it is able to modify s.
 */
1243 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1244 STMT_START {                                              \
1245     while (s <= e) {                                      \
1246         if ( (CoNd)                                       \
1247              && (ln == 1 || folder(s, pat_string, ln))    \
1248              && (!reginfo || regtry(reginfo, &s)) )       \
1249             goto got_it;                                  \
1250         s++;                                              \
1251     }                                                     \
1252 } STMT_END
1253
/* REXEC_FBC_UTF8_SCAN(CoDe): run CoDe once per UTF8SKIP-sized step of the
 * target.
 *
 * Each iteration first sets `uskip = UTF8SKIP(s)` inside the loop condition
 * and continues only while s + uskip <= strend, i.e. while the whole step
 * fits before strend.  CoDe is then spliced in verbatim, after which s is
 * advanced by uskip.
 *
 * Consequences of this shape:
 *   - uskip is computed before CoDe runs, so the advance uses the value
 *     taken at the start of the iteration even if CoDe changes s;
 *   - a `continue` inside CoDe would skip the `s += uskip` step.
 *
 * Uses from the expansion site: s, uskip, strend.
 */
1254 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1255 STMT_START {                                          \
1256     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1257         CoDe                                          \
1258         s += uskip;                                   \
1259     }                                                 \
1260 } STMT_END
1261
/* REXEC_FBC_SCAN(CoDe): run CoDe at every byte position, incrementing s
 * by one after each iteration, while s < strend.
 *
 * CoDe is pasted verbatim as one or more statements ahead of the "s++"
 * step.  Uses s and strend from the expansion site. */
1262 #define REXEC_FBC_SCAN(CoDe)                          \
1263 STMT_START {                                          \
1264     while (s < strend) {                              \
1265         CoDe                                          \
1266         s++;                                          \
1267     }                                                 \
1268 } STMT_END
1269
/* REXEC_FBC_UTF8_CLASS_SCAN(CoNd): scan with REXEC_FBC_UTF8_SCAN and try
 * a match at positions where CoNd holds.
 *
 * tmp gates the attempts:
 *   - CoNd true and tmp nonzero: accept the position (jump to got_it) if
 *     reginfo is NULL or regtry(reginfo, &s) succeeds; otherwise set
 *     tmp = doevery.
 *   - CoNd true and tmp zero: no attempt; tmp = doevery.
 *   - CoNd false: tmp = 1.
 * Consequently, when doevery is 0, only the first position of each run of
 * consecutive positions satisfying CoNd is tried.
 *
 * Uses tmp, doevery, reginfo, s and the got_it label (plus everything
 * REXEC_FBC_UTF8_SCAN needs) from the expansion site. */
1270 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1271 REXEC_FBC_UTF8_SCAN(                                  \
1272     if (CoNd) {                                       \
1273         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1274             goto got_it;                              \
1275         else                                          \
1276             tmp = doevery;                            \
1277     }                                                 \
1278     else                                              \
1279         tmp = 1;                                      \
1280 )
1281
/* REXEC_FBC_CLASS_SCAN(CoNd): scan with REXEC_FBC_SCAN (one byte per
 * step) and try a match at positions where CoNd holds.
 *
 * tmp gates the attempts:
 *   - CoNd true and tmp nonzero: accept the position (jump to got_it) if
 *     reginfo is NULL or regtry(reginfo, &s) succeeds; otherwise set
 *     tmp = doevery.
 *   - CoNd true and tmp zero: no attempt; tmp = doevery.
 *   - CoNd false: tmp = 1.
 * Consequently, when doevery is 0, only the first position of each run of
 * consecutive positions satisfying CoNd is tried.
 *
 * Uses tmp, doevery, reginfo, s and the got_it label (plus everything
 * REXEC_FBC_SCAN needs) from the expansion site. */
1282 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1283 REXEC_FBC_SCAN(                                       \
1284     if (CoNd) {                                       \
1285         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1286             goto got_it;                              \
1287         else                                          \
1288             tmp = doevery;                            \
1289     }                                                 \
1290     else                                              \
1291         tmp = 1;                                      \
1292 )
1293
/* REXEC_FBC_TRYIT: attempt a match at s and jump to got_it on success.
 * When reginfo is NULL, regtry() is not called and the jump is taken
 * unconditionally.
 *
 * Expands to a bare, unbraced "if (...) goto got_it" with no trailing
 * semicolon; the expansion site supplies the ';'.  Being an unbraced if,
 * an "else" written directly after it would bind to this if. */
1294 #define REXEC_FBC_TRYIT               \
1295 if ((!reginfo || regtry(reginfo, &s))) \
1296     goto got_it
1297
/* REXEC_FBC_CSCAN(CoNdUtF8, CoNd): class scan that picks its form from
 * utf8_target: REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8) when it is true,
 * REXEC_FBC_CLASS_SCAN(CoNd) otherwise.
 *
 * Expands to a plain if/else (not wrapped in STMT_START/STMT_END), so a
 * ';' written after the invocation is an extra empty statement. */
1298 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1299     if (utf8_target) {                                             \
1300         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1301     }                                                          \
1302     else {                                                     \
1303         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1304     }
1305
/* REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd, CoNdUtF8, CoNd): like a two-way
 * class scan selected by utf8_target, except that in the utf8_target
 * branch the statement UtFpReLoAd is executed once before
 * REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8) runs.  UtFpReLoAd is not executed
 * in the non-UTF-8 branch, which runs REXEC_FBC_CLASS_SCAN(CoNd).
 *
 * Expands to a plain if/else (not wrapped in STMT_START/STMT_END), so a
 * ';' written after the invocation is an extra empty statement. */
1306 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1307     if (utf8_target) {                                             \
1308         UtFpReLoAd;                                            \
1309         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1310     }                                                          \
1311     else {                                                     \
1312         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1313     }
1314
/* REXEC_FBC_CSCAN_TAINT(CoNdUtF8, CoNd): set RF_tainted in PL_reg_flags,
 * then run REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8) if utf8_target is true or
 * REXEC_FBC_CLASS_SCAN(CoNd) otherwise.
 *
 * Expands to TWO statements (the flag update, then the if/else) with no
 * enclosing block, so it must not be used as the sole body of an unbraced
 * if/else/loop: only the flag update would be governed by it. */
1315 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1316     PL_reg_flags |= RF_tainted;                                \
1317     if (utf8_target) {                                             \
1318         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1319     }                                                          \
1320     else {                                                     \
1321         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1322     }
1323
/* DUMP_EXEC_POS(li, s, doutf8): shorthand for dump_exec_pos() that passes
 * li, s and doutf8 through and fills the remaining arguments from the
 * globals PL_regeol, PL_bostr and PL_reg_starttry, in that order. */
1324 #define DUMP_EXEC_POS(li,s,doutf8) \
1325     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1326
1327
/* UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL): boundary scan for a
 * UTF-8 target that classifies positions by a single byte only.
 *
 * Setup: tmp is the byte before s (UCHARAT(s - 1)), or '\n' when s is at
 * PL_bostr; it is then replaced by TEST_NON_UTF8(tmp).
 *
 * Loop (REXEC_FBC_UTF8_SCAN): at each position, TEST_NON_UTF8 is applied
 * to the byte at s.  If tmp == !that result, tmp is flipped (tmp = !tmp)
 * and IF_SUCCESS runs; otherwise IF_FAIL runs.
 *
 * Unlike UTF8_LOAD, there is no decoding of the previous character and
 * no LOAD_UTF8_CHARCLASS_ALNUM() call.
 *
 * NOTE(review): "tmp == ! TEST(...)" compares tmp with 0 or 1, so this
 * appears to rely on TEST_NON_UTF8 yielding exactly 0 or 1 for the
 * initial tmp -- confirm for each test passed in.
 *
 * Expands to several bare statements; uses tmp, s, PL_bostr and whatever
 * REXEC_FBC_UTF8_SCAN needs from the expansion site. */
1328 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1329         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1330         tmp = TEST_NON_UTF8(tmp);                                              \
1331         REXEC_FBC_UTF8_SCAN(                                                   \
1332             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1333                 tmp = !tmp;                                                    \
1334                 IF_SUCCESS;                                                    \
1335             }                                                                  \
1336             else {                                                             \
1337                 IF_FAIL;                                                       \
1338             }                                                                  \
1339         );                                                                     \
1340
/* UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL): boundary scan
 * for a UTF-8 target that decodes the previous character.
 *
 * Setup: if s is at PL_bostr, tmp is '\n'.  Otherwise reghop3() is used
 * to step back one position from s (bounded by PL_bostr) and tmp becomes
 * the value utf8n_to_uvchr() returns for the character found there.
 * tmp is then replaced by the expression TeSt1_UtF8 (the callers in this
 * file pass an expression that reads tmp), and
 * LOAD_UTF8_CHARCLASS_ALNUM() is invoked before the loop starts.
 *
 * Loop (REXEC_FBC_UTF8_SCAN): at each position the expression TeSt2_UtF8
 * is evaluated (the callers in this file pass an expression that reads
 * s).  If tmp == !that result, tmp is flipped (tmp = !tmp) and IF_SUCCESS
 * runs; otherwise IF_FAIL runs.
 *
 * NOTE(review): "tmp == ! (...)" compares tmp with 0 or 1, so this
 * appears to rely on TeSt1_UtF8 yielding exactly 0 or 1 -- confirm for
 * each test passed in.
 *
 * Expands to several bare statements (including a block declaring the
 * local r); uses tmp, s, PL_bostr and whatever REXEC_FBC_UTF8_SCAN needs
 * from the expansion site. */
1341 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1342         if (s == PL_bostr) {                                                   \
1343             tmp = '\n';                                                        \
1344         }                                                                      \
1345         else {                                                                 \
1346             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1347             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1348         }                                                                      \
1349         tmp = TeSt1_UtF8;                                                      \
1350         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1351         REXEC_FBC_UTF8_SCAN(                                                   \
1352             if (tmp == ! (TeSt2_UtF8)) { \
1353                 tmp = !tmp;                                                    \
1354                 IF_SUCCESS;                                                    \
1355             }                                                                  \
1356             else {                                                             \
1357                 IF_FAIL;                                                       \
1358             }                                                                  \
1359         );                                                                     \
1360
1361 /* The only difference between the BOUND and NBOUND cases is that
1362  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1363  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1364  * with the other one being empty */
/* FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8): expands
 * FBC_BOUND_COMMON with a UTF8_LOAD-based UTF-8 branch built from
 * TEST1_UTF8/TEST2_UTF8, and TEST_NON_UTF8 for the non-UTF-8 branch.
 * In both branches REXEC_FBC_TRYIT is the IF_SUCCESS action and
 * PLACEHOLDER the IF_FAIL action, so a try happens where the wordness
 * test result changes.  PLACEHOLDER is not defined in this part of the
 * file; presumably it expands to nothing -- confirm at its definition. */
1365 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1366     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1367
/* FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8): expands
 * FBC_BOUND_COMMON with a UTF8_NOLOAD-based UTF-8 branch, using
 * TEST_NON_UTF8 in both the UTF-8 and the non-UTF-8 branch.
 * REXEC_FBC_TRYIT is the IF_SUCCESS action and PLACEHOLDER the IF_FAIL
 * action.  TEST1_UTF8 and TEST2_UTF8 are accepted but do not appear in
 * the expansion. */
1368 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1369     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1370
/* FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8): expands
 * FBC_BOUND_COMMON with a UTF8_LOAD-based UTF-8 branch built from
 * TEST1_UTF8/TEST2_UTF8, and TEST_NON_UTF8 for the non-UTF-8 branch.
 * In both branches PLACEHOLDER is the IF_SUCCESS action and
 * REXEC_FBC_TRYIT the IF_FAIL action, so a try happens where the
 * wordness test result does NOT change.  PLACEHOLDER is not defined in
 * this part of the file; presumably it expands to nothing -- confirm at
 * its definition. */
1371 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1372     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1373
/* FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8): expands
 * FBC_BOUND_COMMON with a UTF8_NOLOAD-based UTF-8 branch, using
 * TEST_NON_UTF8 in both the UTF-8 and the non-UTF-8 branch.
 * PLACEHOLDER is the IF_SUCCESS action and REXEC_FBC_TRYIT the IF_FAIL
 * action.  TEST1_UTF8 and TEST2_UTF8 are accepted but do not appear in
 * the expansion. */
1374 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1375     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1376
1377
1378 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1379  * be passed in completely with the variable name being tested, which isn't
1380  * such a clean interface, but this is easier to read than it was before.  We
1381  * are looking for the boundary (or non-boundary) between a word and non-word
1382  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1383  * must be different.  Find the "wordness" of the character just prior to this
1384  * one, and compare it with the wordness of this one.  If they differ, we have
1385  * a boundary.  At the beginning of the string, pretend that the previous
1386  * character was a new-line */
/* FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL):
 *
 *   UTF8_CODE      statements pasted verbatim as the utf8_target branch
 *   TEST_NON_UTF8  function-like test applied to single bytes in the
 *                  non-UTF-8 branch
 *   IF_SUCCESS     statement run when the test result at s differs from
 *                  the previous one (tmp is flipped first)
 *   IF_FAIL        statement run when it does not differ
 *
 * Non-UTF-8 branch: tmp starts as TEST_NON_UTF8 of the byte before s, or
 * of '\n' when s is at PL_bostr; REXEC_FBC_SCAN then walks s one byte at
 * a time.
 *
 * After either branch, one further attempt is made at the final value of
 * s, but only when prog->minlen is 0 and tmp is nonzero; as elsewhere, a
 * NULL reginfo skips the regtry() call and jumps straight to got_it.
 *
 * NOTE(review): "tmp == ! TEST(...)" compares tmp with 0 or 1, so this
 * appears to rely on TEST_NON_UTF8 yielding exactly 0 or 1 for the
 * initial tmp -- confirm for each test passed in.
 *
 * Expands to an if/else followed by a second if that already ends in
 * ';' (no STMT_START/STMT_END wrapper), so it is more than one statement
 * at the expansion site. */
1387 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1388     if (utf8_target) {                                                         \
1389                 UTF8_CODE \
1390     }                                                                          \
1391     else {  /* Not utf8 */                                                     \
1392         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1393         tmp = TEST_NON_UTF8(tmp);                                              \
1394         REXEC_FBC_SCAN(                                                        \
1395             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1396                 tmp = !tmp;                                                    \
1397                 IF_SUCCESS;                                                    \
1398             }                                                                  \
1399             else {                                                             \
1400                 IF_FAIL;                                                       \
1401             }                                                                  \
1402         );                                                                     \
1403     }                                                                          \
1404     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1405         goto got_it;
1406
1407 /* We know what class REx starts with.  Try to find this position... */
1408 /* if reginfo is NULL, it's a dry run */
1409 /* annoyingly all the vars in this routine have different names from their counterparts
1410    in regmatch. /grrr */
1411
1412 STATIC char *
1413 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1414     const char *strend, regmatch_info *reginfo)
1415 {
1416         dVAR;
1417         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1418         char *pat_string;   /* The pattern's exactish string */
1419         char *pat_end;      /* ptr to end char of pat_string */
1420         re_fold_t folder;       /* Function for computing non-utf8 folds */
1421         const U8 *fold_array;   /* array for folding ords < 256 */
1422         STRLEN ln;
1423         STRLEN lnc;
1424         register STRLEN uskip;
1425         U8 c1;
1426         U8 c2;
1427         char *e;
1428         register I32 tmp = 1;   /* Scratch variable? */
1429         register const bool utf8_target = PL_reg_match_utf8;
1430         UV utf8_fold_flags = 0;
1431         RXi_GET_DECL(prog,progi);
1432
1433         PERL_ARGS_ASSERT_FIND_BYCLASS;
1434
1435         /* We know what class it must start with. */
1436         switch (OP(c)) {
1437         case ANYOFV:
1438         case ANYOF:
1439             if (utf8_target || OP(c) == ANYOFV) {
1440                 STRLEN inclasslen = strend - s;
1441                 REXEC_FBC_UTF8_CLASS_SCAN(
1442                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1443             }
1444             else {
1445                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1446             }
1447             break;
1448         case CANY:
1449             REXEC_FBC_SCAN(
1450                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1451                     goto got_it;
1452                 else
1453                     tmp = doevery;
1454             );
1455             break;
1456
1457         case EXACTFA:
1458             if (UTF_PATTERN || utf8_target) {
1459                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1460                 goto do_exactf_utf8;
1461             }
1462             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1463             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1464             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1465
1466         case EXACTF:
1467             if (UTF_PATTERN || utf8_target) {
1468
1469                 /* regcomp.c already folded this if pattern is in UTF-8 */
1470                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1471                 goto do_exactf_utf8;
1472             }
1473             fold_array = PL_fold;
1474             folder = foldEQ;
1475             goto do_exactf_non_utf8;
1476
1477         case EXACTFL:
1478             if (UTF_PATTERN || utf8_target) {
1479                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1480                 goto do_exactf_utf8;
1481             }
1482             fold_array = PL_fold_locale;
1483             folder = foldEQ_locale;
1484             goto do_exactf_non_utf8;
1485
1486         case EXACTFU:
1487             if (UTF_PATTERN || utf8_target) {
1488                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1489                 goto do_exactf_utf8;
1490             }
1491
1492             /* Any 'ss' in the pattern should have been replaced by regcomp,
1493              * so we don't have to worry here about this single special case
1494              * in the Latin1 range */
1495             fold_array = PL_fold_latin1;
1496             folder = foldEQ_latin1;
1497
1498             /* FALL THROUGH */
1499
1500         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1501
1502             /* The idea in the non-utf8 EXACTF* cases is to first find the
1503              * first character of the EXACTF* node and then, if necessary,
1504              * case-insensitively compare the full text of the node.  c1 is the
1505              * first character.  c2 is its fold.  This logic will not work for
1506              * Unicode semantics and the german sharp ss, which hence should
1507              * not be compiled into a node that gets here. */
1508             pat_string = STRING(c);
1509             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1510
1511             /* We know that we have to match at least 'ln' bytes (which is the
1512              * same as characters, since not utf8).  If we have to match 3
1513              * characters, and there are only 2 availabe, we know without
1514              * trying that it will fail; so don't start a match past the
1515              * required minimum number from the far end */
1516             e = HOP3c(strend, -((I32)ln), s);
1517
1518             if (!reginfo && e < s) {
1519                 e = s;                  /* Due to minlen logic of intuit() */
1520             }
1521
1522             c1 = *pat_string;
1523             c2 = fold_array[c1];
1524             if (c1 == c2) { /* If char and fold are the same */
1525                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1526             }
1527             else {
1528                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1529             }
1530             break;
1531
1532         do_exactf_utf8:
1533         {
1534             unsigned expansion;
1535
1536
1537             /* If one of the operands is in utf8, we can't use the simpler
1538              * folding above, due to the fact that many different characters
1539              * can have the same fold, or portion of a fold, or different-
1540              * length fold */
1541             pat_string = STRING(c);
1542             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1543             pat_end = pat_string + ln;
1544             lnc = (UTF_PATTERN) /* length to match in characters */
1545                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1546                     : ln;
1547
1548             /* We have 'lnc' characters to match in the pattern, but because of
1549              * multi-character folding, each character in the target can match
1550              * up to 3 characters (Unicode guarantees it will never exceed
1551              * this) if it is utf8-encoded; and up to 2 if not (based on the
1552              * fact that the Latin 1 folds are already determined, and the
1553              * only multi-char fold in that range is the sharp-s folding to
1554              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1555              * string character.  Adjust lnc accordingly, always matching at
1556              * least 1 */
1557             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1558             lnc = (lnc < expansion) ? 1 : lnc / expansion;
1559
1560             /* As in the non-UTF8 case, if we have to match 3 characters, and
1561              * only 2 are left, it's guaranteed to fail, so don't start a
1562              * match that would require us to go beyond the end of the string
1563              */
1564             e = HOP3c(strend, -((I32)lnc), s);
1565
1566             if (!reginfo && e < s) {
1567                 e = s;                  /* Due to minlen logic of intuit() */
1568             }
1569
1570             /* XXX Note that we could recalculate e every so-often through the
1571              * loop to stop earlier, as the worst case expansion above will
1572              * rarely be met, and as we go along we would usually find that e
1573              * moves further to the left.  Unclear if worth the expense */
1574
1575             while (s <= e) {
1576                 char *my_strend= (char *)strend;
1577                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1578                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1579                     && (!reginfo || regtry(reginfo, &s)) )
1580                 {
1581                     goto got_it;
1582                 }
1583                 s += UTF8SKIP(s);
1584             }
1585             break;
1586         }
1587         case BOUNDL:
1588             PL_reg_flags |= RF_tainted;
1589             FBC_BOUND(isALNUM_LC,
1590                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1591                       isALNUM_LC_utf8((U8*)s));
1592             break;
1593         case NBOUNDL:
1594             PL_reg_flags |= RF_tainted;
1595             FBC_NBOUND(isALNUM_LC,
1596                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1597                        isALNUM_LC_utf8((U8*)s));
1598             break;
1599         case BOUND:
1600             FBC_BOUND(isWORDCHAR,
1601                       isALNUM_uni(tmp),
1602                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1603             break;
1604         case BOUNDA:
1605             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1606                              isWORDCHAR_A(tmp),
1607                              isWORDCHAR_A((U8*)s));
1608             break;
1609         case NBOUND:
1610             FBC_NBOUND(isWORDCHAR,
1611                        isALNUM_uni(tmp),
1612                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1613             break;
1614         case NBOUNDA:
1615             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1616                               isWORDCHAR_A(tmp),
1617                               isWORDCHAR_A((U8*)s));
1618             break;
1619         case BOUNDU:
1620             FBC_BOUND(isWORDCHAR_L1,
1621                       isALNUM_uni(tmp),
1622                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1623             break;
1624         case NBOUNDU:
1625             FBC_NBOUND(isWORDCHAR_L1,
1626                        isALNUM_uni(tmp),
1627                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1628             break;
1629         case ALNUML:
1630             REXEC_FBC_CSCAN_TAINT(
1631                 isALNUM_LC_utf8((U8*)s),
1632                 isALNUM_LC(*s)
1633             );
1634             break;
1635         case ALNUMU:
1636             REXEC_FBC_CSCAN_PRELOAD(
1637                 LOAD_UTF8_CHARCLASS_ALNUM(),
1638                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1639                 isWORDCHAR_L1((U8) *s)
1640             );
1641             break;
1642         case ALNUM:
1643             REXEC_FBC_CSCAN_PRELOAD(
1644                 LOAD_UTF8_CHARCLASS_ALNUM(),
1645                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1646                 isWORDCHAR((U8) *s)
1647             );
1648             break;
1649         case ALNUMA:
1650             /* Don't need to worry about utf8, as it can match only a single
1651              * byte invariant character */
1652             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1653             break;
1654         case NALNUMU:
1655             REXEC_FBC_CSCAN_PRELOAD(
1656                 LOAD_UTF8_CHARCLASS_ALNUM(),
1657                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1658                 ! isWORDCHAR_L1((U8) *s)
1659             );
1660             break;
1661         case NALNUM:
1662             REXEC_FBC_CSCAN_PRELOAD(
1663                 LOAD_UTF8_CHARCLASS_ALNUM(),
1664                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1665                 ! isALNUM(*s)
1666             );
1667             break;
1668         case NALNUMA:
1669             REXEC_FBC_CSCAN(
1670                 !isWORDCHAR_A(*s),
1671                 !isWORDCHAR_A(*s)
1672             );
1673             break;
1674         case NALNUML:
1675             REXEC_FBC_CSCAN_TAINT(
1676                 !isALNUM_LC_utf8((U8*)s),
1677                 !isALNUM_LC(*s)
1678             );
1679             break;
1680         case SPACEU:
1681             REXEC_FBC_CSCAN_PRELOAD(
1682                 LOAD_UTF8_CHARCLASS_SPACE(),
1683                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1684                 isSPACE_L1((U8) *s)
1685             );
1686             break;
1687         case SPACE:
1688             REXEC_FBC_CSCAN_PRELOAD(
1689                 LOAD_UTF8_CHARCLASS_SPACE(),
1690                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1691                 isSPACE((U8) *s)
1692             );
1693             break;
1694         case SPACEA:
1695             /* Don't need to worry about utf8, as it can match only a single
1696              * byte invariant character */
1697             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1698             break;
1699         case SPACEL:
1700             REXEC_FBC_CSCAN_TAINT(
1701                 isSPACE_LC_utf8((U8*)s),
1702                 isSPACE_LC(*s)
1703             );
1704             break;
1705         case NSPACEU:
1706             REXEC_FBC_CSCAN_PRELOAD(
1707                 LOAD_UTF8_CHARCLASS_SPACE(),
1708                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1709                 ! isSPACE_L1((U8) *s)
1710             );
1711             break;
1712         case NSPACE:
1713             REXEC_FBC_CSCAN_PRELOAD(
1714                 LOAD_UTF8_CHARCLASS_SPACE(),
1715                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1716                 ! isSPACE((U8) *s)
1717             );
1718             break;
1719         case NSPACEA:
1720             REXEC_FBC_CSCAN(
1721                 !isSPACE_A(*s),
1722                 !isSPACE_A(*s)
1723             );
1724             break;
1725         case NSPACEL:
1726             REXEC_FBC_CSCAN_TAINT(
1727                 !isSPACE_LC_utf8((U8*)s),
1728                 !isSPACE_LC(*s)
1729             );
1730             break;
1731         case DIGIT:
1732             REXEC_FBC_CSCAN_PRELOAD(
1733                 LOAD_UTF8_CHARCLASS_DIGIT(),
1734                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1735                 isDIGIT(*s)
1736             );
1737             break;
1738         case DIGITA:
1739             /* Don't need to worry about utf8, as it can match only a single
1740              * byte invariant character */
1741             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1742             break;
1743         case DIGITL:
1744             REXEC_FBC_CSCAN_TAINT(
1745                 isDIGIT_LC_utf8((U8*)s),
1746                 isDIGIT_LC(*s)
1747             );
1748             break;
1749         case NDIGIT:
1750             REXEC_FBC_CSCAN_PRELOAD(
1751                 LOAD_UTF8_CHARCLASS_DIGIT(),
1752                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1753                 !isDIGIT(*s)
1754             );
1755             break;
1756         case NDIGITA:
1757             REXEC_FBC_CSCAN(
1758                 !isDIGIT_A(*s),
1759                 !isDIGIT_A(*s)
1760             );
1761             break;
1762         case NDIGITL:
1763             REXEC_FBC_CSCAN_TAINT(
1764                 !isDIGIT_LC_utf8((U8*)s),
1765                 !isDIGIT_LC(*s)
1766             );
1767             break;
1768         case LNBREAK:
1769             REXEC_FBC_CSCAN(
1770                 is_LNBREAK_utf8(s),
1771                 is_LNBREAK_latin1(s)
1772             );
1773             break;
1774         case VERTWS:
1775             REXEC_FBC_CSCAN(
1776                 is_VERTWS_utf8(s),
1777                 is_VERTWS_latin1(s)
1778             );
1779             break;
1780         case NVERTWS:
1781             REXEC_FBC_CSCAN(
1782                 !is_VERTWS_utf8(s),
1783                 !is_VERTWS_latin1(s)
1784             );
1785             break;
1786         case HORIZWS:
1787             REXEC_FBC_CSCAN(
1788                 is_HORIZWS_utf8(s),
1789                 is_HORIZWS_latin1(s)
1790             );
1791             break;
1792         case NHORIZWS:
1793             REXEC_FBC_CSCAN(
1794                 !is_HORIZWS_utf8(s),
1795                 !is_HORIZWS_latin1(s)
1796             );
1797             break;
1798         case AHOCORASICKC:
1799         case AHOCORASICK:
1800             {
1801                 DECL_TRIE_TYPE(c);
1802                 /* what trie are we using right now */
1803                 reg_ac_data *aho
1804                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1805                 reg_trie_data *trie
1806                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1807                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1808
1809                 const char *last_start = strend - trie->minlen;
1810 #ifdef DEBUGGING
1811                 const char *real_start = s;
1812 #endif
1813                 STRLEN maxlen = trie->maxlen;
1814                 SV *sv_points;
1815                 U8 **points; /* map of where we were in the input string
1816                                 when reading a given char. For ASCII this
1817                                 is unnecessary overhead as the relationship
1818                                 is always 1:1, but for Unicode, especially
1819                                 case folded Unicode this is not true. */
1820                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1821                 U8 *bitmap=NULL;
1822
1823
1824                 GET_RE_DEBUG_FLAGS_DECL;
1825
1826                 /* We can't just allocate points here. We need to wrap it in
1827                  * an SV so it gets freed properly if there is a croak while
1828                  * running the match */
1829                 ENTER;
1830                 SAVETMPS;
1831                 sv_points=newSV(maxlen * sizeof(U8 *));
1832                 SvCUR_set(sv_points,
1833                     maxlen * sizeof(U8 *));
1834                 SvPOK_on(sv_points);
1835                 sv_2mortal(sv_points);
1836                 points=(U8**)SvPV_nolen(sv_points );
1837                 if ( trie_type != trie_utf8_fold
1838                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1839                 {
1840                     if (trie->bitmap)
1841                         bitmap=(U8*)trie->bitmap;
1842                     else
1843                         bitmap=(U8*)ANYOF_BITMAP(c);
1844                 }
1845                 /* this is the Aho-Corasick algorithm modified a touch
1846                    to include special handling for long "unknown char"
1847                    sequences. The basic idea being that we use AC as long
1848                    as we are dealing with a possible matching char, when
1849                    we encounter an unknown char (and we have not encountered
1850                    an accepting state) we scan forward until we find a legal
1851                    starting char.
1852                    AC matching is basically that of trie matching, except
1853                    that when we encounter a failing transition, we fall back
1854                    to the current states "fail state", and try the current char
1855                    again, a process we repeat until we reach the root state,
1856                    state 1, or a legal transition. If we fail on the root state
1857                    then we can either terminate if we have reached an accepting
1858                    state previously, or restart the entire process from the beginning
1859                    if we have not.
1860
1861                  */
1862                 while (s <= last_start) {
1863                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1864                     U8 *uc = (U8*)s;
1865                     U16 charid = 0;
1866                     U32 base = 1;
1867                     U32 state = 1;
1868                     UV uvc = 0;
1869                     STRLEN len = 0;
1870                     STRLEN foldlen = 0;
1871                     U8 *uscan = (U8*)NULL;
1872                     U8 *leftmost = NULL;
1873 #ifdef DEBUGGING
1874                     U32 accepted_word= 0;
1875 #endif
1876                     U32 pointpos = 0;
1877
1878                     while ( state && uc <= (U8*)strend ) {
1879                         int failed=0;
1880                         U32 word = aho->states[ state ].wordnum;
1881
1882                         if( state==1 ) {
1883                             if ( bitmap ) {
1884                                 DEBUG_TRIE_EXECUTE_r(
1885                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1886                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1887                                             (char *)uc, utf8_target );
1888                                         PerlIO_printf( Perl_debug_log,
1889                                             " Scanning for legal start char...\n");
1890                                     }
1891                                 );
1892                                 if (utf8_target) {
1893                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1894                                         uc += UTF8SKIP(uc);
1895                                     }
1896                                 } else {
1897                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1898                                         uc++;
1899                                     }
1900                                 }
1901                                 s= (char *)uc;
1902                             }
1903                             if (uc >(U8*)last_start) break;
1904                         }
1905
1906                         if ( word ) {
1907                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1908                             if (!leftmost || lpos < leftmost) {
1909                                 DEBUG_r(accepted_word=word);
1910                                 leftmost= lpos;
1911                             }
1912                             if (base==0) break;
1913
1914                         }
1915                         points[pointpos++ % maxlen]= uc;
1916                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1917                                              uscan, len, uvc, charid, foldlen,
1918                                              foldbuf, uniflags);
1919                         DEBUG_TRIE_EXECUTE_r({
1920                             dump_exec_pos( (char *)uc, c, strend, real_start,
1921                                 s,   utf8_target );
1922                             PerlIO_printf(Perl_debug_log,
1923                                 " Charid:%3u CP:%4"UVxf" ",
1924                                  charid, uvc);
1925                         });
1926
1927                         do {
1928 #ifdef DEBUGGING
1929                             word = aho->states[ state ].wordnum;
1930 #endif
1931                             base = aho->states[ state ].trans.base;
1932
1933                             DEBUG_TRIE_EXECUTE_r({
1934                                 if (failed)
1935                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1936                                         s,   utf8_target );
1937                                 PerlIO_printf( Perl_debug_log,
1938                                     "%sState: %4"UVxf", word=%"UVxf,
1939                                     failed ? " Fail transition to " : "",
1940                                     (UV)state, (UV)word);
1941                             });
1942                             if ( base ) {
1943                                 U32 tmp;
1944                                 I32 offset;
1945                                 if (charid &&
1946                                      ( ((offset = base + charid
1947                                         - 1 - trie->uniquecharcount)) >= 0)
1948                                      && ((U32)offset < trie->lasttrans)
1949                                      && trie->trans[offset].check == state
1950                                      && (tmp=trie->trans[offset].next))
1951                                 {
1952                                     DEBUG_TRIE_EXECUTE_r(
1953                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1954                                     state = tmp;
1955                                     break;
1956                                 }
1957                                 else {
1958                                     DEBUG_TRIE_EXECUTE_r(
1959                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1960                                     failed = 1;
1961                                     state = aho->fail[state];
1962                                 }
1963                             }
1964                             else {
1965                                 /* we must be accepting here */
1966                                 DEBUG_TRIE_EXECUTE_r(
1967                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1968                                 failed = 1;
1969                                 break;
1970                             }
1971                         } while(state);
1972                         uc += len;
1973                         if (failed) {
1974                             if (leftmost)
1975                                 break;
1976                             if (!state) state = 1;
1977                         }
1978                     }
1979                     if ( aho->states[ state ].wordnum ) {
1980                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1981                         if (!leftmost || lpos < leftmost) {
1982                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1983                             leftmost = lpos;
1984                         }
1985                     }
1986                     if (leftmost) {
1987                         s = (char*)leftmost;
1988                         DEBUG_TRIE_EXECUTE_r({
1989                             PerlIO_printf(
1990                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1991                                 (UV)accepted_word, (IV)(s - real_start)
1992                             );
1993                         });
1994                         if (!reginfo || regtry(reginfo, &s)) {
1995                             FREETMPS;
1996                             LEAVE;
1997                             goto got_it;
1998                         }
1999                         s = HOPc(s,1);
2000                         DEBUG_TRIE_EXECUTE_r({
2001                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2002                         });
2003                     } else {
2004                         DEBUG_TRIE_EXECUTE_r(
2005                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2006                         break;
2007                     }
2008                 }
2009                 FREETMPS;
2010                 LEAVE;
2011             }
2012             break;
2013         default:
2014             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2015             break;
2016         }
2017         return 0;
2018       got_it:
2019         return s;
2020 }
2021
2022
2023 /*
2024  - regexec_flags - match a regexp against a string
        
        Tries to match the compiled pattern rx against the text that runs
        from stringarg up to strend.  Returns 1 when a match is found and 0
        otherwise.  Croaks when the program pointer or stringarg is NULL, or
        when the compiled program does not begin with REG_MAGIC.

        Parameters not covered by the notes that follow the signature:
          rx        the compiled pattern; its struct regexp is read via SvANY
          stringarg where matching is asked to start
          sv        the scalar being matched; consulted for UTF-8-ness,
                    pos() magic, SCREAM state and (optionally) COW copying
          flags     REXEC_* bits

        Order of work, as coded below:
          1. fail at once if the text is shorter than prog->minlen allows;
          2. when RXf_GPOS_SEEN is set, compute reginfo.ganch;
          3. when rx is also the regexp of PL_curpm, give prog a fresh offs
             buffer and park the old one in "swap";
          4. unless REXEC_CHECKED is set, let re_intuit_start() look for the
             check substring;
          5. choose a scan: anchored, GPOS check, single leading character
             (PREGf_SKIP), anchored/floating substring search, start class
             via find_byclass(), or finally every position in turn;
          6. "got_it" records subbeg/sublen for later use, "phooey" puts the
             parked offs buffer back.
2025  */
2026 I32
2027 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2028               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2029 /* strend: pointer to null at end of string */
2030 /* strbeg: real beginning of string */
2031 /* minend: end of match must be >=minend after stringarg. */
2032 /* data: May be used for some additional optimizations.
2033          Currently it's only used, read back with PTR2UV(), for transmitting
2034          the ganch offset when doing a /g match. This will change */
2035 /* flags: REXEC_* option bits; tested below for IGNOREPOS, CHECKED, SCREAM, NOT_FIRST and COPY_STR. */
2036 {
2037     dVAR;
2038     struct regexp *const prog = (struct regexp *)SvANY(rx);
2039     /*register*/ char *s;
2040     register regnode *c;
2041     /*register*/ char *startpos = stringarg;
2042     I32 minlen;         /* must match at least this many chars */
2043     I32 dontbother = 0; /* how many characters not to try at end */
2044     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2045     I32 scream_pos = -1;                /* Internal iterator of scream. */
2046     char *scream_olds = NULL;
2047     const bool utf8_target = cBOOL(DO_UTF8(sv));
2048     I32 multiline;
2049     RXi_GET_DECL(prog,progi);
2050     regmatch_info reginfo;  /* create some info to pass to regtry etc */
         /* Holds the previous prog->offs while a replacement buffer is in use;
          * stays NULL when no replacement was made. */
2051     regexp_paren_pair *swap = NULL;
2052     GET_RE_DEBUG_FLAGS_DECL;
2053
2054     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2055     PERL_UNUSED_ARG(data);
         /* NOTE(review): data is nevertheless read in the RXf_GPOS_SEEN branch
          * further down; presumably the macro above only quiets compiler
          * warnings -- confirm. */
2056
2057     /* Be paranoid... */
2058     if (prog == NULL || startpos == NULL) {
2059         Perl_croak(aTHX_ "NULL regexp parameter");
2060         return 0;
2061     }
2062
2063     multiline = prog->extflags & RXf_PMf_MULTILINE;
2064     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2065
2066     RX_MATCH_UTF8_set(rx, utf8_target);
2067     DEBUG_EXECUTE_r(
2068         debug_start_match(rx, utf8_target, startpos, strend,
2069         "Matching");
2070     );
2071
2072     minlen = prog->minlen;
2073
         /* Cheap rejection: the remaining text must be at least minlen long;
          * a negative check_offset_min lowers that bound. */
2074     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2075         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2076                               "String too short [regexec_flags]...\n"));
2077         goto phooey;
2078     }
2079
2080
2081     /* Check validity of program. */
2082     if (UCHARAT(progi->program) != REG_MAGIC) {
2083         Perl_croak(aTHX_ "corrupted regexp program");
2084     }
2085
         /* Start this match with clean per-match globals. */
2086     PL_reg_flags = 0;
2087     PL_reg_eval_set = 0;
2088     PL_reg_maxiter = 0;
2089
2090     if (RX_UTF8(rx))
2091         PL_reg_flags |= RF_utf8;
2092
2093     /* Mark beginning of line for ^ and lookbehind. */
2094     reginfo.bol = startpos; /* XXX not used ??? */
2095     PL_bostr  = strbeg;
2096     reginfo.sv = sv;
2097
2098     /* Mark end of line for $ (and such) */
2099     PL_regeol = strend;
2100
2101     /* see how far we have to get to not match where we matched before */
2102     reginfo.till = startpos+minend;
2103
2104     /* If there is a "must appear" string, look for it. */
2105     s = startpos;
2106
2107     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2108         MAGIC *mg;
2109         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2110             reginfo.ganch = startpos + prog->gofs;
2111             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2112               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2113         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2114                   && SvMAGIC(sv)
2115                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2116                   && mg->mg_len >= 0) {
2117             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2118             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2119                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2120
2121             if (prog->extflags & RXf_ANCH_GPOS) {
                     /* Fail if the start position is already beyond ganch. */
2122                 if (s > reginfo.ganch)
2123                     goto phooey;
2124                 s = reginfo.ganch - prog->gofs;
2125                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2126                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2127                 if (s < strbeg)
2128                     goto phooey;
2129             }
2130         }
2131         else if (data) {
2132             reginfo.ganch = strbeg + PTR2UV(data);
2133             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2134                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2135
2136         } else {                                /* pos() not defined */
2137             reginfo.ganch = strbeg;
2138             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2139                  "GPOS: reginfo.ganch = strbeg\n"));
2140         }
2141     }
2142     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2143         /* We have to be careful. If the previous successful match
2144            was from this regex we don't want a subsequent partially
2145            successful match to clobber the old results.
2146            So when we detect this possibility we add a swap buffer
2147            to the re, and switch the buffer each match. If we fail
2148            we switch it back, otherwise we leave it swapped.
2149         */
2150         swap = prog->offs;
2151         /* do we need a save destructor here for eval dies? */
2152         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2153     }
         /* Unless the caller says the check was already done (REXEC_CHECKED),
          * ask re_intuit_start() about the check substring.  A NULL result
          * ends the match attempt; any other result replaces s. */
2154     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2155         re_scream_pos_data d;
2156
2157         d.scream_olds = &scream_olds;
2158         d.scream_pos = &scream_pos;
2159         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2160         if (!s) {
2161             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2162             goto phooey;        /* not present */
2163         }
2164     }
2165
2166
2167
2168     /* Simplest case:  anchored match need be tried only once. */
2169     /*  [unless only anchor is BOL and multiline is set] */
2170     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2171         if (s == startpos && regtry(&reginfo, &startpos))
2172             goto got_it;
2173         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2174                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2175         {
2176             char *end;
2177
2178             if (minlen)
2179                 dontbother = minlen - 1;
                 /* end bounds the scans below: strend hopped back by dontbother
                  * characters, then one byte more. */
2180             end = HOP3c(strend, -dontbother, strbeg) - 1;
2181             /* for multiline we only have to try after newlines */
2182             if (prog->check_substr || prog->check_utf8) {
2183                 /* because of the goto we can not easily reuse the macros for bifurcating the
2184                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2185                 if (utf8_target) {
                         /* When s == startpos the regtry() at the top of this
                          * branch has already failed at this position, so the
                          * loop is entered just past its own regtry(). */
2186                     if (s == startpos)
2187                         goto after_try_utf8;
2188                     while (1) {
2189                         if (regtry(&reginfo, &s)) {
2190                             goto got_it;
2191                         }
2192                       after_try_utf8:
2193                         if (s > end) {
2194                             goto phooey;
2195                         }
2196                         if (prog->extflags & RXf_USE_INTUIT) {
2197                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2198                             if (!s) {
2199                                 goto phooey;
2200                             }
2201                         }
2202                         else {
2203                             s += UTF8SKIP(s);
2204                         }
2205                     }
2206                 } /* end search for check string in unicode */
2207                 else {
                         /* Same scheme as the UTF-8 loop above, advancing one
                          * byte at a time. */
2208                     if (s == startpos) {
2209                         goto after_try_latin;
2210                     }
2211                     while (1) {
2212                         if (regtry(&reginfo, &s)) {
2213                             goto got_it;
2214                         }
2215                       after_try_latin:
2216                         if (s > end) {
2217                             goto phooey;
2218                         }
2219                         if (prog->extflags & RXf_USE_INTUIT) {
2220                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2221                             if (!s) {
2222                                 goto phooey;
2223                             }
2224                         }
2225                         else {
2226                             s++;
2227                         }
2228                     }
2229                 } /* end search for check string in latin*/
2230             } /* end search for check string */
2231             else { /* search for newline */
2232                 if (s > startpos) {
2233                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2234                     s--;
2235                 }
2236                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2237                 while (s < end) {
2238                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2239                         if (regtry(&reginfo, &s))
2240                             goto got_it;
2241                     }
2242                 }
2243             } /* end search for newline */
2244         } /* end anchored/multiline check string search */
2245         goto phooey;
2246     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2247     {
2248         /* the warning about reginfo.ganch being used without initialization
2249            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2250            and we only enter this block when the same bit is set. */
2251         char *tmp_s = reginfo.ganch - prog->gofs;
2252
2253         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2254             goto got_it;
2255         goto phooey;
2256     }
2257
2258     /* Messy cases:  unanchored match. */
2259     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2260         /* we have /x+whatever/ */
2261         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2262         char ch;
2263 #ifdef DEBUGGING
2264         int did_match = 0;
2265 #endif
             /* Create the substring variant that matches the target's
              * encoding if only the other variant exists so far. */
2266         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2267             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
             /* Only the first byte of the anchored substring is compared in
              * the scans below. */
2268         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2269
             /* In both scans: after a failed try at a position holding ch, the
              * rest of the run of ch is stepped over before scanning goes on. */
2270         if (utf8_target) {
2271             REXEC_FBC_SCAN(
2272                 if (*s == ch) {
2273                     DEBUG_EXECUTE_r( did_match = 1 );
2274                     if (regtry(&reginfo, &s)) goto got_it;
2275                     s += UTF8SKIP(s);
2276                     while (s < strend && *s == ch)
2277                         s += UTF8SKIP(s);
2278                 }
2279             );
2280         }
2281         else {
2282             REXEC_FBC_SCAN(
2283                 if (*s == ch) {
2284                     DEBUG_EXECUTE_r( did_match = 1 );
2285                     if (regtry(&reginfo, &s)) goto got_it;
2286                     s++;
2287                     while (s < strend && *s == ch)
2288                         s++;
2289                 }
2290             );
2291         }
2292         DEBUG_EXECUTE_r(if (!did_match)
2293                 PerlIO_printf(Perl_debug_log,
2294                                   "Did not find anchored character...\n")
2295                );
2296     }
2297     else if (prog->anchored_substr != NULL
2298               || prog->anchored_utf8 != NULL
2299               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2300                   && prog->float_max_offset < strend - s)) {
             /* must:     the substring searched for (anchored if there is one,
              *           floating otherwise)
              * back_max: largest distance from a match start to must
              * back_min: smallest such distance */
2301         SV *must;
2302         I32 back_max;
2303         I32 back_min;
2304         char *last;
2305         char *last1;            /* Last position checked before */
2306 #ifdef DEBUGGING
2307         int did_match = 0;
2308 #endif
2309         if (prog->anchored_substr || prog->anchored_utf8) {
2310             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2311                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2312             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2313             back_max = back_min = prog->anchored_offset;
2314         } else {
2315             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2316                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2317             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2318             back_max = prog->float_max_offset;
2319             back_min = prog->float_min_offset;
2320         }
2321
2322
2323         if (must == &PL_sv_undef)
2324             /* could not downgrade utf8 check substring, so must fail */
2325             goto phooey;
2326
2327         if (back_min<0) {
2328             last = strend;
2329         } else {
2330             last = HOP3c(strend,        /* Cannot start after this */
2331                   -(I32)(CHR_SVLEN(must)
2332                          - (SvTAIL(must) != 0) + back_min), strbeg);
2333         }
2334         if (s > PL_bostr)
2335             last1 = HOPc(s, -1);
2336         else
2337             last1 = s - 1;      /* bogus */
2338
2339         /* XXXX check_substr already used to find "s", can optimize if
2340            check_substr==must. */
2341         scream_pos = -1;
             /* end_shift still has its initial value 0 at this point; nothing
              * in this function assigns to it. */
2342         dontbother = end_shift;
2343         strend = HOPc(strend, -dontbother);
             /* Each pass finds the next occurrence of must at or after
              * s + back_min, using screaminstr() when REXEC_SCREAM is set and
              * sv is SCREAMed, fbm_instr() otherwise.  The loop ends when no
              * further occurrence is found or s has moved past last. */
2344         while ( (s <= last) &&
2345                 ((flags & REXEC_SCREAM) && SvSCREAM(sv)
2346                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2347                                     end_shift, &scream_pos, 0))
2348                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2349                                   (unsigned char*)strend, must,
2350                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2351             /* we may be pointing at the wrong string */
2352             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2353                 s = strbeg + (s - SvPVX_const(sv));
2354             DEBUG_EXECUTE_r( did_match = 1 );
                 /* s is an occurrence of must.  A match may start between
                  * back_max and back_min characters before it.  Begin at the
                  * later of (s - back_max) and the position right after last1,
                  * and record (s - back_min) as the new last1. */
2355             if (HOPc(s, -back_max) > last1) {
2356                 last1 = HOPc(s, -back_min);
2357                 s = HOPc(s, -back_max);
2358             }
2359             else {
2360                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2361
2362                 last1 = HOPc(s, -back_min);
2363                 s = t;
2364             }
                 /* Try every candidate start up to and including last1. */
2365             if (utf8_target) {
2366                 while (s <= last1) {
2367                     if (regtry(&reginfo, &s))
2368                         goto got_it;
2369                     s += UTF8SKIP(s);
2370                 }
2371             }
2372             else {
2373                 while (s <= last1) {
2374                     if (regtry(&reginfo, &s))
2375                         goto got_it;
2376                     s++;
2377                 }
2378             }
2379         }
2380         DEBUG_EXECUTE_r(if (!did_match) {
2381             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2382                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2383             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2384                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2385                                ? "anchored" : "floating"),
2386                 quoted, RE_SV_TAIL(must));
2387         });
2388         goto phooey;
2389     }
2390     else if ( (c = progi->regstclass) ) {
2391         if (minlen) {
2392             const OPCODE op = OP(progi->regstclass);
2393             /* don't bother with what can't match */
2394             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2395                 strend = HOPc(strend, -(minlen - 1));
2396         }
2397         DEBUG_EXECUTE_r({
2398             SV * const prop = sv_newmortal();
2399             regprop(prog, prop, c);
2400             {
2401                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2402                     s,strend-s,60);
2403                 PerlIO_printf(Perl_debug_log,
2404                     "Matching stclass %.*s against %s (%d bytes)\n",
2405                     (int)SvCUR(prop), SvPVX_const(prop),
2406                      quoted, (int)(strend - s));
2407             }
2408         });
             /* find_byclass() does the scanning and the regtry() calls itself
              * (it is handed &reginfo); a non-NULL result means success. */
2409         if (find_byclass(prog, c, s, strend, &reginfo))
2410             goto got_it;
2411         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2412     }
2413     else {
2414         dontbother = 0;
2415         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2416             /* Trim the end. */
2417             char *last;
2418             SV* float_real;
2419
2420             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2421                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2422             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2423
2424             if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
2425                 last = screaminstr(sv, float_real, s - strbeg,
2426                                    end_shift, &scream_pos, 1); /* last one */
2427                 if (!last)
2428                     last = scream_olds; /* Only one occurrence. */
2429                 /* we may be pointing at the wrong string */
2430                 else if (RXp_MATCH_COPIED(prog))
2431                     s = strbeg + (s - SvPVX_const(sv));
2432             }
2433             else {
2434                 STRLEN len;
2435                 const char * const little = SvPV_const(float_real, len);
2436
                     /* For a SvTAIL substring, first compare all but its last
                      * byte against the very end of the text, then (when not
                      * multiline) the whole of it; otherwise fall back to
                      * searching for the last occurrence with rninstr(). */
2437                 if (SvTAIL(float_real)) {
2438                     if (memEQ(strend - len + 1, little, len - 1))
2439                         last = strend - len + 1;
2440                     else if (!multiline)
2441                         last = memEQ(strend - len, little, len)
2442                             ? strend - len : NULL;
2443                     else
2444                         goto find_last;
2445                 } else {
2446                   find_last:
2447                     if (len)
2448                         last = rninstr(s, strend, little, little + len);
2449                     else
2450                         last = strend;  /* matching "$" */
2451                 }
2452             }
2453             if (last == NULL) {
2454                 DEBUG_EXECUTE_r(
2455                     PerlIO_printf(Perl_debug_log,
2456                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2457                         PL_colors[4], PL_colors[5]));
2458                 goto phooey; /* Should not happen! */
2459             }
2460             dontbother = strend - last + prog->float_min_offset;
2461         }
2462         if (minlen && (dontbother < minlen))
2463             dontbother = minlen - 1;
2464         strend -= dontbother;              /* this one's always in bytes! */
2465         /* We don't know much -- general case. */
2466         if (utf8_target) {
2467             for (;;) {
2468                 if (regtry(&reginfo, &s))
2469                     goto got_it;
2470                 if (s >= strend)
2471                     break;
2472                 s += UTF8SKIP(s);
2473             };
2474         }
2475         else {
                 /* The post-increment sits in the loop condition, so the
                  * position s == strend itself is tried as well. */
2476             do {
2477                 if (regtry(&reginfo, &s))
2478                     goto got_it;
2479             } while (s++ < strend);
2480         }
2481     }
2482
2483     /* Failure. */
2484     goto phooey;
2485
2486 got_it:
         /* Success: the freshly filled prog->offs stays in place and the
          * parked buffer (if any) is released. */
2487     Safefree(swap);
2488     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2489
2490     if (PL_reg_eval_set)
2491         restore_pos(aTHX_ prog);
2492     if (RXp_PAREN_NAMES(prog))
2493         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2494
2495     /* make sure $`, $&, $', and $digit will work later */
2496     if ( !(flags & REXEC_NOT_FIRST) ) {
2497         RX_MATCH_COPY_FREE(rx);
2498         if (flags & REXEC_COPY_STR) {
                 /* Number of bytes to keep; this equals PL_regeol - strbeg for
                  * as long as startpos is still equal to stringarg (regtry()
                  * can move *startpos to a cutpoint when it fails). */
2499             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2500 #ifdef PERL_OLD_COPY_ON_WRITE
2501             if ((SvIsCOW(sv)
2502                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2503                 if (DEBUG_C_TEST) {
2504                     PerlIO_printf(Perl_debug_log,
2505                                   "Copy on write: regexp capture, type %d\n",
2506                                   (int) SvTYPE(sv));
2507                 }
2508                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2509                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2510                 assert (SvPOKp(prog->saved_copy));
2511             } else
2512 #endif
2513             {
2514                 RX_MATCH_COPIED_on(rx);
2515                 s = savepvn(strbeg, i);
2516                 prog->subbeg = s;
2517             }
2518             prog->sublen = i;
2519         }
2520         else {
                 /* No copy requested: point straight at the caller's buffer. */
2521             prog->subbeg = strbeg;
2522             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2523         }
2524     }
2525
2526     return 1;
2527
2528 phooey:
2529     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2530                           PL_colors[4], PL_colors[5]));
2531     if (PL_reg_eval_set)
2532         restore_pos(aTHX_ prog);
2533     if (swap) {
2534         /* we failed :-( roll it back */
2535         Safefree(prog->offs);
2536         prog->offs = swap;
2537     }
2538
2539     return 0;
2540 }
2541
2542
2543 /*
2544  - regtry - try match at specific point
        
        Runs one match attempt of reginfo->prog with the match starting at
        *startpos.

        On the first attempt of a match whose pattern has RXf_EVAL_SEEN set
        (and PL_reg_eval_set still clear) it first prepares the interpreter
        for embedded code: saves stack position and temporaries, makes the
        target scalar available as $_, ensures the target carries
        PERL_MAGIC_regex_global magic, installs PL_reg_curpm as PL_curpm and
        points prog->subbeg/sublen at the current string.

        For every attempt it records the start offset in offs[0], clears
        lastparen/lastcloseparen, sets the paren pairs to -1 and calls
        regmatch() on the node after the program's first one.

        Returns 1 on success, after storing the end offset in offs[0].end.
        Returns 0 on failure; in that case *startpos is overwritten with
        reginfo->cutpoint when one was recorded, and REGCP_UNWIND is applied
        to the checkpoint taken by REGCP_SET.
2545  */
2546 STATIC I32                      /* 0 failure, 1 success */
2547 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2548 {
2549     dVAR;
2550     CHECKPOINT lastcp;
2551     REGEXP *const rx = reginfo->prog;
2552     regexp *const prog = (struct regexp *)SvANY(rx);
2553     RXi_GET_DECL(prog,progi);
2554     GET_RE_DEBUG_FLAGS_DECL;
2555
2556     PERL_ARGS_ASSERT_REGTRY;
2557
         /* Forget any cutpoint left over from an earlier attempt, so the test
          * after regmatch() only sees one recorded during this attempt. */
2558     reginfo->cutpoint=NULL;
2559
         /* Setup that is needed once per match: PL_reg_eval_set is assigned
          * inside, so later calls skip this block (assuming RS_init is
          * non-zero -- TODO confirm). */
2560     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2561         MAGIC *mg;
2562
2563         PL_reg_eval_set = RS_init;
2564         DEBUG_EXECUTE_r(DEBUG_s(
2565             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2566                           (IV)(PL_stack_sp - PL_stack_base));
2567             ));
2568         SAVESTACK_CXPOS();
2569         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2570         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2571         SAVETMPS;
2572         /* Apparently this is not needed, judging by wantarray. */
2573         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2574            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2575
2576         if (reginfo->sv) {
2577             /* Make $_ available to executed code. */
2578             if (reginfo->sv != DEFSV) {
2579                 SAVE_DEFSV;
2580                 DEFSV_set(reginfo->sv);
2581             }
2582
                 /* Reuse existing regex_global magic on the target if there
                  * is any; otherwise attach a fresh one below. */
2583             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2584                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2585                 /* prepare for quick setting of pos */
2586 #ifdef PERL_OLD_COPY_ON_WRITE
2587                 if (SvIsCOW(reginfo->sv))
2588                     sv_force_normal_flags(reginfo->sv, 0);
2589 #endif
2590                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2591                                  &PL_vtbl_mglob, NULL, 0);
                     /* Negative length: regexec_flags only treats
                      * mg_len >= 0 as a defined pos(). */
2592                 mg->mg_len = -1;
2593             }
                 /* Remember the magic and its current length, and register
                  * restore_pos to run when the savestack is unwound. */
2594             PL_reg_magic    = mg;
2595             PL_reg_oldpos   = mg->mg_len;
2596             SAVEDESTRUCTOR_X(restore_pos, prog);
2597         }
2598         if (!PL_reg_curpm) {
2599             Newxz(PL_reg_curpm, 1, PMOP);
2600 #ifdef USE_ITHREADS
2601             {
2602                 SV* const repointer = &PL_sv_undef;
2603                 /* this regexp is also owned by the new PL_reg_curpm, which
2604                    will try to free it.  */
2605                 av_push(PL_regex_padav, repointer);
2606                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2607                 PL_regex_pad = AvARRAY(PL_regex_padav);
2608             }
2609 #endif
2610         }
2611 #ifdef USE_ITHREADS
2612         /* It seems that non-ithreads works both with and without this code.
2613            So for efficiency reasons it seems best not to have the code
2614            compiled when it is not needed.  */
2615         /* This is safe against NULLs: */
2616         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2617         /* PM_reg_curpm owns a reference to this regexp.  */
2618         (void)ReREFCNT_inc(rx);
2619 #endif
             /* Make PL_reg_curpm the current PMOP for the duration of the
              * match, keeping the previous one in PL_reg_oldcurpm. */
2620         PM_SETRE(PL_reg_curpm, rx);
2621         PL_reg_oldcurpm = PL_curpm;
2622         PL_curpm = PL_reg_curpm;
2623         if (RXp_MATCH_COPIED(prog)) {
2624             /*  Here is a serious problem: we cannot rewrite subbeg,
2625                 since it may be needed if this match fails.  Thus
2626                 $` inside (?{}) could fail... */
2627             PL_reg_oldsaved = prog->subbeg;
2628             PL_reg_oldsavedlen = prog->sublen;
2629 #ifdef PERL_OLD_COPY_ON_WRITE
2630             PL_nrs = prog->saved_copy;
2631 #endif
2632             RXp_MATCH_COPIED_off(prog);
2633         }
2634         else
2635             PL_reg_oldsaved = NULL;
2636         prog->subbeg = PL_bostr;
2637         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2638     }
         /* Per-attempt state: record where this attempt starts and reset the
          * paren bookkeeping. */
2639     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2640     prog->offs[0].start = *startpos - PL_bostr;
2641     PL_reginput = *startpos;
2642     PL_reglastparen = &prog->lastparen;
2643     PL_reglastcloseparen = &prog->lastcloseparen;
2644     prog->lastparen = 0;
2645     prog->lastcloseparen = 0;
2646     PL_regsize = 0;
2647     PL_regoffs = prog->offs;
         /* Make sure PL_reg_start_tmp has more entries than the pattern has
          * parens, growing (or first allocating) it when it does not. */
2648     if (PL_reg_start_tmpl <= prog->nparens) {
2649         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2650         if(PL_reg_start_tmp)
2651             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2652         else
2653             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2654     }
2655
2656     /* XXXX What this code is doing here?!!!  There should be no need
2657        to do this again and again, PL_reglastparen should take care of
2658        this!  --ilya*/
2659
2660     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2661      * Actually, the code in regcppop() (which Ilya may be meaning by
2662      * PL_reglastparen), is not needed at all by the test suite
2663      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2664      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2665      * Meanwhile, this code *is* needed for the
2666      * above-mentioned test suite tests to succeed.  The common theme
2667      * on those tests seems to be returning null fields from matches.
2668      * --jhi updated by dapm */
2669 #if 1
2670     if (prog->nparens) {
2671         regexp_paren_pair *pp = PL_regoffs;
2672         register I32 i;
             /* prog->lastparen was zeroed just above, so the loop runs nparens
              * times; pp is advanced before each store, so it visits
              * offs[1]..offs[nparens] and sets both ends of each pair to -1. */
2673         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2674             ++pp;
2675             pp->start = -1;
2676             pp->end = -1;
2677         }
2678     }
2679 #endif
2680     REGCP_SET(lastcp);
2681     if (regmatch(reginfo, progi->program + 1)) {
2682         PL_regoffs[0].end = PL_reginput - PL_bostr;
2683         return 1;
2684     }
         /* Failure: hand a recorded cutpoint back to the caller through
          * *startpos, then unwind to the checkpoint taken before regmatch(). */
2685     if (reginfo->cutpoint)
2686         *startpos= reginfo->cutpoint;
2687     REGCP_UNWIND(lastcp);
2688     return 0;
2689 }
2690
2691
/* Shorthands for leaving the current op.  Each one expands to a bare
   goto, so the labels 'yes', 'no' and 'no_silent' must exist in the
   function that uses them. */
2692 #define sayYES goto yes
2693 #define sayNO goto no
2694 #define sayNO_SILENT goto no_silent
2695
2696 /* we don't use STMT_START/END here because it leads to
2697    "unreachable code" warnings, which are bogus, but distracting. */
/* CACHEsayNO: if ST.cache_mask is non-zero, OR it into
   PL_reg_poscache[ST.cache_offset], then do sayNO.  ST must be #defined
   at the point of use.  Because the expansion is an unwrapped 'if'
   followed by a goto, use it only as a complete statement inside braces,
   never as the unbraced body of an if/else. */
2698 #define CACHEsayNO \
2699     if (ST.cache_mask) \
2700        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2701     sayNO
2702
2703 /* this is used to determine how far from the left messages like
2704    'failed...' are printed. It should be set such that messages
2705    are in line with the regop output that created them.
2706 */
2707 #define REPORT_CODE_OFF 32
2708
2709
/* Sentinel values for the c1/c2 "next char" test. */
2710 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2711 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2712
/* Address of the first and of the last state slot in slab 's'; the slab
   holds PERL_REGMATCH_SLAB_SLOTS slots in its states[] array. */
2713 #define SLAB_FIRST(s) (&(s)->states[0])
2714 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2715
2716 /* grab a new slab and return the first slot in it */
2717
2718 STATIC regmatch_state *
2719 S_push_slab(pTHX)
2720 {
2721 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2722     dMY_CXT;
2723 #endif
2724     regmatch_slab *s = PL_regmatch_slab->next;
2725     if (!s) {
2726         Newx(s, 1, regmatch_slab);
2727         s->prev = PL_regmatch_slab;
2728         s->next = NULL;
2729         PL_regmatch_slab->next = s;
2730     }
2731     PL_regmatch_slab = s;
2732     return SLAB_FIRST(s);
2733 }
2734
2735
2736 /* push a new state then goto it */
/* Expands to: scan = node; st->resume_state = state; goto push_state;
   The variables 'scan' and 'st' and the label 'push_state' must be
   visible at the point of use.  The expansion is several statements that
   end in a goto, so use it only as a complete statement inside braces. */
2737
2738 #define PUSH_STATE_GOTO(state, node) \
2739     scan = node; \
2740     st->resume_state = state; \
2741     goto push_state;
2742
2743 /* push a new state with success backtracking, then goto it */
/* Same as PUSH_STATE_GOTO except that it jumps to the 'push_yes_state'
   label instead. */
2744
2745 #define PUSH_YES_STATE_GOTO(state, node) \
2746     scan = node; \
2747     st->resume_state = state; \
2748     goto push_yes_state;
2749
2750
2751
2752 /*
2753
2754 regmatch() - main matching routine
2755
2756 This is basically one big switch statement in a loop. We execute an op,
2757 set 'next' to point to the next op, and continue. If we come to a point which
2758 we may need to backtrack to on failure such as (A|B|C), we push a
2759 backtrack state onto the backtrack stack. On failure, we pop the top
2760 state, and re-enter the loop at the state indicated. If there are no more
2761 states to pop, we return failure.
2762
2763 Sometimes we also need to backtrack on success; for example /A+/, where
2764 after successfully matching one A, we need to go back and try to
2765 match another one; similarly for lookahead assertions: if the assertion
2766 completes successfully, we backtrack to the state just before the assertion
2767 and then carry on.  In these cases, the pushed state is marked as
2768 'backtrack on success too'. This marking is in fact done by a chain of
2769 pointers, each pointing to the previous 'yes' state. On success, we pop to
2770 the nearest yes state, discarding any intermediate failure-only states.
2771 Sometimes a yes state is pushed just to force some cleanup code to be
2772 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2773 it to free the inner regex.
2774
2775 Note that failure backtracking rewinds the cursor position, while
2776 success backtracking leaves it alone.
2777
2778 A pattern is complete when the END op is executed, while a subpattern
2779 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2780 ops trigger the "pop to last yes state if any, otherwise return true"
2781 behaviour.
2782
2783 A common convention in this function is to use A and B to refer to the two
2784 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2785 the subpattern to be matched possibly multiple times, while B is the entire
2786 rest of the pattern. Variable and state names reflect this convention.
2787
2788 The states in the main switch are the union of ops and failure/success of
2789 substates associated with that op.  For example, IFMATCH is the op
2790 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2791 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2792 successfully matched A and IFMATCH_A_fail is a state saying that we have
2793 just failed to match A. Resume states always come in pairs. The backtrack
2794 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2795 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2796 on success or failure.
2797
2798 The struct that holds a backtracking state is actually a big union, with
2799 one variant for each major type of op. The variable st points to the
2800 top-most backtrack struct. To make the code clearer, within each
2801 block of code we #define ST to alias the relevant union.
2802
2803 Here's a concrete example of a (vastly oversimplified) IFMATCH
2804 implementation:
2805
2806     switch (state) {
2807     ....
2808
2809 #define ST st->u.ifmatch
2810
2811     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2812         ST.foo = ...; // some state we wish to save
2813         ...
2814         // push a yes backtrack state with a resume value of
2815         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2816         // first node of A:
2817         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2818         // NOTREACHED
2819
2820     case IFMATCH_A: // we have successfully executed A; now continue with B
2821         next = B;
2822         bar = ST.foo; // do something with the preserved value
2823         break;
2824
2825     case IFMATCH_A_fail: // A failed, so the assertion failed
2826         ...;   // do some housekeeping, then ...
2827         sayNO; // propagate the failure
2828
2829 #undef ST
2830
2831     ...
2832     }
2833
2834 For any old-timers reading this who are familiar with the old recursive
2835 approach, the code above is equivalent to:
2836
2837     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2838     {
2839         int foo = ...
2840         ...
2841         if (regmatch(A)) {
2842             next = B;
2843             bar = foo;
2844             break;
2845         }
2846         ...;   // do some housekeeping, then ...
2847         sayNO; // propagate the failure
2848     }
2849
2850 The topmost backtrack state, pointed to by st, is usually free. If you
2851 want to claim it, populate any ST.foo fields in it with values you wish to
2852 save, then do one of
2853
2854         PUSH_STATE_GOTO(resume_state, node);
2855         PUSH_YES_STATE_GOTO(resume_state, node);
2856
2857 which sets that backtrack state's resume value to 'resume_state', pushes a
2858 new free entry to the top of the backtrack stack, then goes to 'node'.
2859 On backtracking, the free slot is popped, and the saved state becomes the
2860 new free state. An ST.foo field in this new top state can be temporarily
2861 accessed to retrieve values, but once the main loop is re-entered, it
2862 becomes available for reuse.
2863
2864 Note that the depth of the backtrack stack constantly increases during the
2865 left-to-right execution of the pattern, rather than going up and down with
2866 the pattern nesting. For example the stack is at its maximum at Z at the
2867 end of the pattern, rather than at X in the following:
2868
2869     /(((X)+)+)+....(Y)+....Z/
2870
2871 The only exceptions to this are lookahead/behind assertions and the cut,
2872 (?>A), which pop all the backtrack states associated with A before
2873 continuing.
2874
2875 Backtrack state structs are allocated in slabs of about 4K in size.
2876 PL_regmatch_state and st always point to the currently active state,
2877 and PL_regmatch_slab points to the slab currently containing
2878 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2879 allocated, and is never freed until interpreter destruction. When the slab
2880 is full, a new one is allocated and chained to the end. At exit from
2881 regmatch(), slabs allocated since entry are freed.
2882
2883 */
2884
2885
/* DEBUG_STATE_pp(pp): under DEBUG_STATE_r, dump the current execution
   position and then print one line made of: indentation of depth*2
   spaces, the text 'pp', the PL_reg_name[] entry for st->resume_state,
   and a "[Y]", "[M]" or "[YM]" tag when st is the current yes_state
   and/or mark_state.
   'pp' is pasted between two string literals, so it must itself be a
   string literal.  The expansion reads locinput, scan, utf8_target,
   depth, st, yes_state and mark_state from the enclosing scope, and it
   already ends in a semicolon. */
2886 #define DEBUG_STATE_pp(pp)                                  \
2887     DEBUG_STATE_r({                                         \
2888         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2889         PerlIO_printf(Perl_debug_log,                       \
2890             "    %*s"pp" %s%s%s%s%s\n",                     \
2891             depth*2, "",                                    \
2892             PL_reg_name[st->resume_state],                     \
2893             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2894             ((st==yes_state) ? "Y" : ""),                   \
2895             ((st==mark_state) ? "M" : ""),                  \
2896             ((st==yes_state||st==mark_state) ? "]" : "")    \
2897         );                                                  \
2898     });
2899
2900
/* REG_NODE_NUM(x): offset of node pointer x from 'prog' (which must be in
   scope at the point of use) as an int, or -1 when x is NULL.  Note that
   x is evaluated twice. */
2901 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2902
2903 #ifdef DEBUGGING
2904
2905 STATIC void
2906 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2907     const char *start, const char *end, const char *blurb)
2908 {
2909     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2910
2911     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2912
2913     if (!PL_colorset)
2914             reginitcolors();
2915     {
2916         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2917             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2918
2919         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2920             start, end - start, 60);
2921
2922         PerlIO_printf(Perl_debug_log,
2923             "%s%s REx%s %s against %s\n",
2924                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2925
2926         if (utf8_target||utf8_pat)
2927             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2928                 utf8_pat ? "pattern" : "",
2929                 utf8_pat && utf8_target ? " and " : "",
2930                 utf8_target ? "string" : ""
2931             );
2932     }
2933 }
2934
2935 STATIC void
2936 S_dump_exec_pos(pTHX_ const char *locinput,
2937                       const regnode *scan,
2938                       const char *loc_regeol,
2939                       const char *loc_bostr,
2940                       const char *loc_reg_starttry,
2941                       const bool utf8_target)
2942 {
2943     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2944     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2945     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2946     /* The part of the string before starttry has one color
2947        (pref0_len chars), between starttry and current
2948        position another one (pref_len - pref0_len chars),
2949        after the current position the third one.
2950        We assume that pref0_len <= pref_len, otherwise we
2951        decrease pref0_len.  */
2952     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2953         ? (5 + taill) - l : locinput - loc_bostr;
2954     int pref0_len;
2955
2956     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2957
2958     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2959         pref_len++;
2960     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2961     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2962         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2963               ? (5 + taill) - pref_len : loc_regeol - locinput);
2964     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2965         l--;
2966     if (pref0_len < 0)
2967         pref0_len = 0;
2968     if (pref0_len > pref_len)
2969         pref0_len = pref_len;
2970     {
2971         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2972
2973         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2974             (locinput - pref_len),pref0_len, 60, 4, 5);
2975
2976         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2977                     (locinput - pref_len + pref0_len),
2978                     pref_len - pref0_len, 60, 2, 3);
2979
2980         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2981                     locinput, loc_regeol - locinput, 10, 0, 1);
2982
2983         const STRLEN tlen=len0+len1+len2;
2984         PerlIO_printf(Perl_debug_log,
2985                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2986                     (IV)(locinput - loc_bostr),
2987                     len0, s0,
2988                     len1, s1,
2989                     (docolor ? "" : "> <"),
2990                     len2, s2,
2991                     (int)(tlen > 19 ? 0 :  19 - tlen),
2992                     "");
2993     }
2994 }
2995
2996 #endif
2997
2998 /* reg_check_named_buff_matched()
2999  * Checks to see if a named buffer has matched. The data array of
3000  * buffer numbers corresponding to the buffer is expected to reside
3001  * in the regexp->data->data array in the slot stored in the ARG() of
3002  * node involved. Note that this routine doesn't actually care about the
3003  * name, that information is not preserved from compilation to execution.
3004  * Returns the index of the leftmost defined buffer with the given name
3005  * or 0 if non of the buffers matched.
3006  */
3007 STATIC I32
3008 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3009 {
3010     I32 n;
3011     RXi_GET_DECL(rex,rexi);
3012     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3013     I32 *nums=(I32*)SvPVX(sv_dat);
3014
3015     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3016
3017     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3018         if ((I32)*PL_reglastparen >= nums[n] &&
3019             PL_regoffs[nums[n]].end != -1)
3020         {
3021             return nums[n];
3022         }
3023     }
3024     return 0;
3025 }
3026
3027
3028 /* free all slabs above current one  - called during LEAVE_SCOPE */
3029
3030 STATIC void
3031 S_clear_backtrack_stack(pTHX_ void *p)
3032 {
3033     regmatch_slab *s = PL_regmatch_slab->next;
3034     PERL_UNUSED_ARG(p);
3035
3036     if (!s)
3037         return;
3038     PL_regmatch_slab->next = NULL;
3039     while (s) {
3040         regmatch_slab * const osl = s;
3041         s = s->next;
3042         Safefree(osl);
3043     }
3044 }
3045
3046
/* SETREX(Re1, Re2): make Re2 the current regex by assigning it to Re1
 * and, when PL_reg_eval_set is true, also installing it in PL_reg_curpm
 * via PM_SETRE.
 * The body is wrapped in STMT_START/STMT_END so that the macro behaves
 * as a single statement: "if (c) SETREX(a,b);" then guards the assignment
 * as well, and it can be followed by an "else".
 * Note that Re2 may be evaluated twice. */
#define SETREX(Re1,Re2) \
    STMT_START { \
        if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
        Re1 = (Re2); \
    } STMT_END
3050
3051 STATIC I32                      /* 0 failure, 1 success */
3052 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3053 {
3054 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3055     dMY_CXT;
3056 #endif
3057     dVAR;
3058     register const bool utf8_target = PL_reg_match_utf8;
3059     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3060     REGEXP *rex_sv = reginfo->prog;
3061     regexp *rex = (struct regexp *)SvANY(rex_sv);
3062     RXi_GET_DECL(rex,rexi);
3063     I32 oldsave;
3064     /* the current state. This is a cached copy of PL_regmatch_state */
3065     register regmatch_state *st;
3066     /* cache heavy used fields of st in registers */
3067     register regnode *scan;
3068     register regnode *next;
3069     register U32 n = 0; /* general value; init to avoid compiler warning */
3070     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3071     register char *locinput = PL_reginput;
3072     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3073
3074     bool result = 0;        /* return value of S_regmatch */
3075     int depth = 0;          /* depth of backtrack stack */
3076     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3077     const U32 max_nochange_depth =
3078         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3079         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3080     regmatch_state *yes_state = NULL; /* state to pop to on success of
3081                                                             subpattern */
3082     /* mark_state piggy backs on the yes_state logic so that when we unwind
3083        the stack on success we can update the mark_state as we go */
3084     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3085     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3086     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3087     U32 state_num;
3088     bool no_final = 0;      /* prevent failure from backtracking? */
3089     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3090     char *startpoint = PL_reginput;
3091     SV *popmark = NULL;     /* are we looking for a mark? */
3092     SV *sv_commit = NULL;   /* last mark name seen in failure */
3093     SV *sv_yes_mark = NULL; /* last mark name we have seen
3094                                during a successful match */
3095     U32 lastopen = 0;       /* last open we saw */
3096     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3097     SV* const oreplsv = GvSV(PL_replgv);
3098     /* these three flags are set by various ops to signal information to
3099      * the very next op. They have a useful lifetime of exactly one loop
3100      * iteration, and are not preserved or restored by state pushes/pops
3101      */
3102     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3103     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3104     int logical = 0;        /* the following EVAL is:
3105                                 0: (?{...})
3106                                 1: (?(?{...})X|Y)
3107                                 2: (??{...})
3108                                or the following IFMATCH/UNLESSM is:
3109                                 false: plain (?=foo)
3110                                 true:  used as a condition: (?(?=foo))
3111                             */
3112 #ifdef DEBUGGING
3113     GET_RE_DEBUG_FLAGS_DECL;
3114 #endif
3115
3116     PERL_ARGS_ASSERT_REGMATCH;
3117
3118     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3119             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3120     }));
3121     /* on first ever call to regmatch, allocate first slab */
3122     if (!PL_regmatch_slab) {
3123         Newx(PL_regmatch_slab, 1, regmatch_slab);
3124         PL_regmatch_slab->prev = NULL;
3125         PL_regmatch_slab->next = NULL;
3126         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3127     }
3128
3129     oldsave = PL_savestack_ix;
3130     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3131     SAVEVPTR(PL_regmatch_slab);
3132     SAVEVPTR(PL_regmatch_state);
3133
3134     /* grab next free state slot */
3135     st = ++PL_regmatch_state;
3136     if (st >  SLAB_LAST(PL_regmatch_slab))
3137         st = PL_regmatch_state = S_push_slab(aTHX);
3138
3139     /* Note that nextchr is a byte even in UTF */
3140     nextchr = UCHARAT(locinput);
3141     scan = prog;
3142     while (scan != NULL) {
3143
3144         DEBUG_EXECUTE_r( {
3145             SV * const prop = sv_newmortal();
3146             regnode *rnext=regnext(scan);
3147             DUMP_EXEC_POS( locinput, scan, utf8_target );
3148             regprop(rex, prop, scan);
3149
3150             PerlIO_printf(Perl_debug_log,
3151                     "%3"IVdf":%*s%s(%"IVdf")\n",
3152                     (IV)(scan - rexi->program), depth*2, "",
3153                     SvPVX_const(prop),
3154                     (PL_regkind[OP(scan)] == END || !rnext) ?
3155                         0 : (IV)(rnext - rexi->program));
3156         });
3157
3158         next = scan + NEXT_OFF(scan);
3159         if (next == scan)
3160             next = NULL;
3161         state_num = OP(scan);
3162
3163       reenter_switch:
3164
3165         assert(PL_reglastparen == &rex->lastparen);
3166         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3167         assert(PL_regoffs == rex->offs);
3168
3169         switch (state_num) {
3170         case BOL:
3171             if (locinput == PL_bostr)
3172             {
3173                 /* reginfo->till = reginfo->bol; */
3174                 break;
3175             }
3176             sayNO;
3177         case MBOL:
3178             if (locinput == PL_bostr ||
3179                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3180             {
3181                 break;
3182             }
3183             sayNO;
3184         case SBOL:
3185             if (locinput == PL_bostr)
3186                 break;
3187             sayNO;
3188         case GPOS:
3189             if (locinput == reginfo->ganch)
3190                 break;
3191             sayNO;
3192
3193         case KEEPS:
3194             /* update the startpoint */
3195             st->u.keeper.val = PL_regoffs[0].start;
3196             PL_reginput = locinput;
3197             PL_regoffs[0].start = locinput - PL_bostr;
3198             PUSH_STATE_GOTO(KEEPS_next, next);
3199             /*NOT-REACHED*/
3200         case KEEPS_next_fail:
3201             /* rollback the start point change */
3202             PL_regoffs[0].start = st->u.keeper.val;
3203             sayNO_SILENT;
3204             /*NOT-REACHED*/
3205         case EOL:
3206                 goto seol;
3207         case MEOL:
3208             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3209                 sayNO;
3210             break;
3211         case SEOL:
3212           seol:
3213             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3214                 sayNO;
3215             if (PL_regeol - locinput > 1)
3216                 sayNO;
3217             break;
3218         case EOS:
3219             if (PL_regeol != locinput)
3220                 sayNO;
3221             break;
3222         case SANY:
3223             if (!nextchr && locinput >= PL_regeol)
3224                 sayNO;
3225             if (utf8_target) {
3226                 locinput += PL_utf8skip[nextchr];
3227                 if (locinput > PL_regeol)
3228                     sayNO;
3229                 nextchr = UCHARAT(locinput);
3230             }
3231             else
3232                 nextchr = UCHARAT(++locinput);
3233             break;
3234         case CANY:
3235             if (!nextchr && locinput >= PL_regeol)
3236                 sayNO;
3237             nextchr = UCHARAT(++locinput);
3238             break;
3239         case REG_ANY:
3240             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3241                 sayNO;
3242             if (utf8_target) {
3243                 locinput += PL_utf8skip[nextchr];
3244                 if (locinput > PL_regeol)
3245                     sayNO;
3246                 nextchr = UCHARAT(locinput);
3247             }
3248             else
3249                 nextchr = UCHARAT(++locinput);
3250             break;
3251
3252 #undef  ST
3253 #define ST st->u.trie
3254         case TRIEC:
3255             /* In this case the charclass data is available inline so
3256                we can fail fast without a lot of extra overhead.
3257              */
3258             if (scan->flags == EXACT || !utf8_target) {
3259                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3260                     DEBUG_EXECUTE_r(
3261                         PerlIO_printf(Perl_debug_log,
3262                                   "%*s  %sfailed to match trie start class...%s\n",
3263                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3264                     );
3265                     sayNO_SILENT;
3266                     /* NOTREACHED */
3267                 }
3268             }
3269             /* FALL THROUGH */
3270         case TRIE:
3271             /* the basic plan of execution of the trie is:
3272              * At the beginning, run though all the states, and
3273              * find the longest-matching word. Also remember the position
3274              * of the shortest matching word. For example, this pattern:
3275              *    1  2 3 4    5
3276              *    ab|a|x|abcd|abc
3277              * when matched against the string "abcde", will generate
3278              * accept states for all words except 3, with the longest
3279              * matching word being 4, and the shortest being 1 (with
3280              * the position being after char 1 of the string).
3281              *
3282              * Then for each matching word, in word order (i.e. 1,2,4,5),
3283              * we run the remainder of the pattern; on each try setting
3284              * the current position to the character following the word,
3285              * returning to try the next word on failure.
3286              *
3287              * We avoid having to build a list of words at runtime by
3288              * using a compile-time structure, wordinfo[].prev, which
3289              * gives, for each word, the previous accepting word (if any).
3290              * In the case above it would contain the mappings 1->2, 2->0,
3291              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3292              * the longest word (4 above), a list of all words, by
3293              * following the list of prev pointers; this gives us the
3294              * unordered list 4,5,1,2. Then given the current word we have
3295              * just tried, we can go through the list and find the
3296              * next-biggest word to try (so if we just failed on word 2,
3297              * the next in the list is 4).
3298              *
3299              * Since at runtime we don't record the matching position in
3300              * the string for each word, we have to work that out for
3301              * each word we're about to process. The wordinfo table holds
3302              * the character length of each word; given that we recorded
3303              * at the start: the position of the shortest word and its
3304              * length in chars, we just need to move the pointer the
3305              * difference between the two char lengths. Depending on
3306              * Unicode status and folding, that's cheap or expensive.
3307              *
3308              * This algorithm is optimised for the case where there are only a
3309              * small number of accept states, i.e. 0,1, or maybe 2.
3310              * With lots of accepts states, and having to try all of them,
3311              * it becomes quadratic on number of accept states to find all
3312              * the next words.
3313              */
3314
3315             {
3316                 /* what type of TRIE am I? (utf8 makes this contextual) */
3317                 DECL_TRIE_TYPE(scan);
3318
3319                 /* what trie are we using right now */
3320                 reg_trie_data * const trie
3321                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3322                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3323                 U32 state = trie->startstate;
3324
3325                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3326                     !TRIE_BITMAP_TEST(trie,*locinput)
3327                 ) {
3328                     if (trie->states[ state ].wordnum) {
3329                          DEBUG_EXECUTE_r(
3330                             PerlIO_printf(Perl_debug_log,
3331                                           "%*s  %smatched empty string...%s\n",
3332                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3333                         );
3334                         if (!trie->jump)
3335                             break;
3336                     } else {
3337                         DEBUG_EXECUTE_r(
3338                             PerlIO_printf(Perl_debug_log,
3339                                           "%*s  %sfailed to match trie start class...%s\n",
3340                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3341                         );
3342                         sayNO_SILENT;
3343                    }
3344                 }
3345
3346             {
3347                 U8 *uc = ( U8* )locinput;
3348
3349                 STRLEN len = 0;
3350                 STRLEN foldlen = 0;
3351                 U8 *uscan = (U8*)NULL;
3352                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3353                 U32 charcount = 0; /* how many input chars we have matched */
3354                 U32 accepted = 0; /* have we seen any accepting states? */
3355
3356                 ST.B = next;
3357                 ST.jump = trie->jump;
3358                 ST.me = scan;
3359                 ST.firstpos = NULL;
3360                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3361                 ST.nextword = 0;
3362
3363                 /* fully traverse the TRIE; note the position of the
3364                    shortest accept state and the wordnum of the longest
3365                    accept state */
3366
3367                 while ( state && uc <= (U8*)PL_regeol ) {
3368                     U32 base = trie->states[ state ].trans.base;
3369                     UV uvc = 0;
3370                     U16 charid = 0;
3371                     U16 wordnum;
3372                     wordnum = trie->states[ state ].wordnum;
3373
3374                     if (wordnum) { /* it's an accept state */
3375                         if (!accepted) {
3376                             accepted = 1;
3377                             /* record first match position */
3378                             if (ST.longfold) {
3379                                 ST.firstpos = (U8*)locinput;
3380                                 ST.firstchars = 0;
3381                             }
3382                             else {
3383                                 ST.firstpos = uc;
3384                                 ST.firstchars = charcount;
3385                             }
3386                         }
3387                         if (!ST.nextword || wordnum < ST.nextword)
3388                             ST.nextword = wordnum;
3389                         ST.topword = wordnum;
3390                     }
3391
3392                     DEBUG_TRIE_EXECUTE_r({
3393                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3394                                 PerlIO_printf( Perl_debug_log,
3395                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3396                                     2+depth * 2, "", PL_colors[4],
3397                                     (UV)state, (accepted ? 'Y' : 'N'));
3398                     });
3399
3400                     /* read a char and goto next state */
3401                     if ( base ) {
3402                         I32 offset;
3403                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3404                                              uscan, len, uvc, charid, foldlen,
3405                                              foldbuf, uniflags);
3406                         charcount++;
3407                         if (foldlen>0)
3408                             ST.longfold = TRUE;
3409                         if (charid &&
3410                              ( ((offset =
3411                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3412
3413                              && ((U32)offset < trie->lasttrans)
3414                              && trie->trans[offset].check == state)
3415                         {
3416                             state = trie->trans[offset].next;
3417                         }
3418                         else {
3419                             state = 0;
3420                         }
3421                         uc += len;
3422
3423                     }
3424                     else {
3425                         state = 0;
3426                     }
3427                     DEBUG_TRIE_EXECUTE_r(
3428                         PerlIO_printf( Perl_debug_log,
3429                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3430                             charid, uvc, (UV)state, PL_colors[5] );
3431                     );
3432                 }
3433                 if (!accepted)
3434                    sayNO;
3435
3436                 /* calculate total number of accept states */
3437                 {
3438                     U16 w = ST.topword;
3439                     accepted = 0;
3440                     while (w) {
3441                         w = trie->wordinfo[w].prev;
3442                         accepted++;
3443                     }
3444                     ST.accepted = accepted;
3445                 }
3446
3447                 DEBUG_EXECUTE_r(
3448                     PerlIO_printf( Perl_debug_log,
3449                         "%*s  %sgot %"IVdf" possible matches%s\n",
3450                         REPORT_CODE_OFF + depth * 2, "",
3451                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3452                 );
3453                 goto trie_first_try; /* jump into the fail handler */
3454             }}
3455             /* NOTREACHED */
3456
3457         case TRIE_next_fail: /* we failed - try next alternative */
3458             if ( ST.jump) {
3459                 REGCP_UNWIND(ST.cp);
3460                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3461                     PL_regoffs[n].end = -1;
3462                 *PL_reglastparen = n;
3463             }
3464             if (!--ST.accepted) {
3465                 DEBUG_EXECUTE_r({
3466                     PerlIO_printf( Perl_debug_log,
3467                         "%*s  %sTRIE failed...%s\n",
3468                         REPORT_CODE_OFF+depth*2, "",
3469                         PL_colors[4],
3470                         PL_colors[5] );
3471                 });
3472                 sayNO_SILENT;
3473             }
3474             {
3475                 /* Find next-highest word to process.  Note that this code
3476                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3477                 register U16 min = 0;
3478                 register U16 word;
3479                 register U16 const nextword = ST.nextword;
3480                 register reg_trie_wordinfo * const wordinfo
3481                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3482                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3483                     if (word > nextword && (!min || word < min))
3484                         min = word;
3485                 }
3486                 ST.nextword = min;
3487             }
3488
3489           trie_first_try:
3490             if (do_cutgroup) {
3491                 do_cutgroup = 0;
3492                 no_final = 0;
3493             }
3494
3495             if ( ST.jump) {
3496                 ST.lastparen = *PL_reglastparen;
3497                 REGCP_SET(ST.cp);
3498             }
3499
3500             /* find the input position just past the end of current word */
3501             {
3502                 U32 chars; /* how many chars to skip */
3503                 U8 *uc = ST.firstpos;
3504                 reg_trie_data * const trie
3505                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3506
3507                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3508                             >=  ST.firstchars);
3509                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3510                             - ST.firstchars;
3511
3512                 if (ST.longfold) {
3513                     /* the hard option - fold each char in turn and find
3514                      * its folded length (which may be different) */
3515                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3516                     STRLEN foldlen;
3517                     STRLEN len;
3518                     UV uvc;
3519                     U8 *uscan;
3520
3521                     while (chars) {
3522                         if (utf8_target) {
3523                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3524                                                     uniflags);
3525                             uc += len;
3526                         }
3527                         else {
3528                             uvc = *uc;
3529                             uc++;
3530                         }
3531                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3532                         uscan = foldbuf;
3533                         while (foldlen) {
3534                             if (!--chars)
3535                                 break;
3536                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3537                                             uniflags);
3538                             uscan += len;
3539                             foldlen -= len;
3540                         }
3541                     }
3542                 }
3543                 else {
3544                     if (utf8_target)
3545                         while (chars--)
3546                             uc += UTF8SKIP(uc);
3547                     else
3548                         uc += chars;
3549                 }
3550                 PL_reginput = (char *)uc;
3551             }
3552
3553             scan = (ST.jump && ST.jump[ST.nextword])
3554                         ? ST.me + ST.jump[ST.nextword]
3555                         : ST.B;
3556
3557             DEBUG_EXECUTE_r({
3558                 PerlIO_printf( Perl_debug_log,
3559                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3560                     REPORT_CODE_OFF+depth*2, "",
3561                     PL_colors[4],
3562                     ST.nextword,
3563                     PL_colors[5]
3564                     );
3565             });
3566
3567             if (ST.accepted > 1 || has_cutgroup) {
3568                 PUSH_STATE_GOTO(TRIE_next, scan);
3569                 /* NOTREACHED */
3570             }
3571             /* only one choice left - just continue */
3572             DEBUG_EXECUTE_r({
3573                 AV *const trie_words
3574                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3575                 SV ** const tmp = av_fetch( trie_words,
3576                     ST.nextword-1, 0 );
3577                 SV *sv= tmp ? sv_newmortal() : NULL;
3578
3579                 PerlIO_printf( Perl_debug_log,
3580                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3581                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3582                     ST.nextword,
3583                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3584                             PL_colors[0], PL_colors[1],
3585                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3586                         )
3587                     : "not compiled under -Dr",
3588                     PL_colors[5] );
3589             });
3590
3591             locinput = PL_reginput;
3592             nextchr = UCHARAT(locinput);
3593             continue; /* execute rest of RE */
3594             /* NOTREACHED */
3595 #undef  ST
3596
3597         case EXACT: {
3598             char *s = STRING(scan);
3599             ln = STR_LEN(scan);
3600             if (utf8_target != UTF_PATTERN) {
3601                 /* The target and the pattern have differing utf8ness. */
3602                 char *l = locinput;
3603                 const char * const e = s + ln;
3604
3605                 if (utf8_target) {
3606                     /* The target is utf8, the pattern is not utf8. */
3607                     while (s < e) {
3608                         STRLEN ulen;
3609                         if (l >= PL_regeol)
3610                              sayNO;
3611                         if (NATIVE_TO_UNI(*(U8*)s) !=
3612                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3613                                             uniflags))
3614                              sayNO;
3615                         l += ulen;
3616                         s ++;
3617                     }
3618                 }
3619                 else {
3620                     /* The target is not utf8, the pattern is utf8. */
3621                     while (s < e) {
3622                         STRLEN ulen;
3623                         if (l >= PL_regeol)
3624                             sayNO;
3625                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3626                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3627                                            uniflags))
3628                             sayNO;
3629                         s += ulen;
3630                         l ++;
3631                     }
3632                 }
3633                 locinput = l;
3634                 nextchr = UCHARAT(locinput);
3635                 break;
3636             }
3637             /* The target and the pattern have the same utf8ness. */
3638             /* Inline the first character, for speed. */
3639             if (UCHARAT(s) != nextchr)
3640                 sayNO;
3641             if (PL_regeol - locinput < ln)
3642                 sayNO;
3643             if (ln > 1 && memNE(s, locinput, ln))
3644                 sayNO;
3645             locinput += ln;
3646             nextchr = UCHARAT(locinput);
3647             break;
3648             }
3649         case EXACTFL: {
3650             re_fold_t folder;
3651             const U8 * fold_array;
3652             const char * s;
3653             U32 fold_utf8_flags;
3654
3655             PL_reg_flags |= RF_tainted;
3656             folder = foldEQ_locale;
3657             fold_array = PL_fold_locale;
3658             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3659             goto do_exactf;
3660
3661         case EXACTFU:
3662             folder = foldEQ_latin1;
3663             fold_array = PL_fold_latin1;
3664             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3665             goto do_exactf;
3666
3667         case EXACTFA:
3668             folder = foldEQ_latin1;
3669             fold_array = PL_fold_latin1;
3670             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3671             goto do_exactf;
3672
3673         case EXACTF:
3674             folder = foldEQ;
3675             fold_array = PL_fold;
3676             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3677
3678           do_exactf:
3679             s = STRING(scan);
3680             ln = STR_LEN(scan);
3681
3682             if (utf8_target || UTF_PATTERN) {
3683               /* Either target or the pattern are utf8. */
3684                 const char * const l = locinput;
3685                 char *e = PL_regeol;
3686
3687                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3688                                l, &e, 0,  utf8_target, fold_utf8_flags))
3689                 {
3690                     sayNO;
3691                 }
3692                 locinput = e;
3693                 nextchr = UCHARAT(locinput);
3694                 break;
3695             }
3696
3697             /* Neither the target nor the pattern are utf8 */
3698             if (UCHARAT(s) != nextchr &&
3699                 UCHARAT(s) != fold_array[nextchr])
3700             {
3701                 sayNO;
3702             }
3703             if (PL_regeol - locinput < ln)
3704                 sayNO;
3705             if (ln > 1 && ! folder(s, locinput, ln))
3706                 sayNO;
3707             locinput += ln;
3708             nextchr = UCHARAT(locinput);
3709             break;
3710         }
3711
3712         /* XXX Could improve efficiency by separating these all out using a
3713          * macro or in-line function.  At that point regcomp.c would no longer
3714          * have to set the FLAGS fields of these */
3715         case BOUNDL:
3716         case NBOUNDL:
3717             PL_reg_flags |= RF_tainted;
3718             /* FALL THROUGH */
3719         case BOUND:
3720         case BOUNDU:
3721         case BOUNDA:
3722         case NBOUND:
3723         case NBOUNDU:
3724         case NBOUNDA:
3725             /* was last char in word? */
3726             if (utf8_target
3727                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3728                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3729             {
3730                 if (locinput == PL_bostr)
3731                     ln = '\n';
3732                 else {
3733                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3734
3735                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3736                 }
3737                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3738                     ln = isALNUM_uni(ln);
3739                     LOAD_UTF8_CHARCLASS_ALNUM();
3740                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3741                 }
3742                 else {
3743                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3744                     n = isALNUM_LC_utf8((U8*)locinput);
3745                 }
3746             }
3747             else {
3748
3749                 /* Here the string isn't utf8, or is utf8 and only ascii
3750                  * characters are to match \w.  In the latter case looking at
3751                  * the byte just prior to the current one may be just the final
3752                  * byte of a multi-byte character.  This is ok.  There are two
3753                  * cases:
3754                  * 1) it is a single byte character, and then the test is doing
3755                  *      just what it's supposed to.
3756                  * 2) it is a multi-byte character, in which case the final
3757                  *      byte is never mistakable for ASCII, and so the test
3758                  *      will say it is not a word character, which is the
3759                  *      correct answer. */
3760                 ln = (locinput != PL_bostr) ?
3761                     UCHARAT(locinput - 1) : '\n';
3762                 switch (FLAGS(scan)) {
3763                     case REGEX_UNICODE_CHARSET:
3764                         ln = isWORDCHAR_L1(ln);
3765                         n = isWORDCHAR_L1(nextchr);
3766                         break;
3767                     case REGEX_LOCALE_CHARSET:
3768                         ln = isALNUM_LC(ln);
3769                         n = isALNUM_LC(nextchr);
3770                         break;
3771                     case REGEX_DEPENDS_CHARSET:
3772                         ln = isALNUM(ln);
3773                         n = isALNUM(nextchr);
3774                         break;
3775                     case REGEX_ASCII_RESTRICTED_CHARSET:
3776                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3777                         ln = isWORDCHAR_A(ln);
3778                         n = isWORDCHAR_A(nextchr);
3779                         break;
3780                     default:
3781                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3782                         break;
3783                 }
3784             }
3785             /* Note: the test below requires that all BOUND ops be numbered
3786              * lower than all NBOUND ops in regcomp.sym */
3787             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3788                     sayNO;
3789             break;
3790         case ANYOFV:
3791         case ANYOF:
3792             if (utf8_target || state_num == ANYOFV) {
3793                 STRLEN inclasslen = PL_regeol - locinput;
3794                 if (locinput >= PL_regeol)
3795                     sayNO;
3796
3797                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3798                     sayNO;
3799                 locinput += inclasslen;
3800                 nextchr = UCHARAT(locinput);
3801                 break;
3802             }
3803             else {
3804                 if (nextchr < 0)
3805                     nextchr = UCHARAT(locinput);
3806                 if (!nextchr && locinput >= PL_regeol)
3807                     sayNO;
3808                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3809                     sayNO;
3810                 nextchr = UCHARAT(++locinput);
3811                 break;
3812             }
3813             break;
3814         /* Special char classes - The defines start on line 129 or so */
3815         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3816                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3817                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3818                   ALNUMA, NALNUMA, isWORDCHAR_A,
3819                   alnum, "a");
3820
3821         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3822                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3823                   SPACEU, NSPACEU, isSPACE_L1,
3824                   SPACEA, NSPACEA, isSPACE_A,
3825                   space, " ");
3826
3827         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3828                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3829                 DIGITA, NDIGITA, isDIGIT_A,
3830                 digit, "0");
3831
3832         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3833                        a Unicode extended Grapheme Cluster */
3834             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3835               extended Grapheme Cluster is:
3836
3837                CR LF
3838                | Prepend* Begin Extend*
3839                | .
3840
3841                Begin is (Hangul-syllable | ! Control)
3842                Extend is (Grapheme_Extend | Spacing_Mark)
3843                Control is [ GCB_Control CR LF ]
3844
3845                The discussion below shows how the code for CLUMP is derived
3846                from this regex.  Note that most of these concepts are from
3847                property values of the Grapheme Cluster Boundary (GCB) property.
3848                No code point can have multiple property values for a given
3849                property.  Thus a code point in Prepend can't be in Control, but
3850                it must be in !Control.  This is why Control above includes
3851                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3852                property separately, and so can't be in GCB_Control, even though
3853                they logically are controls.  Control is not the same as gc=cc,
3854                but includes format and other characters as well.
3855
3856                The Unicode definition of Hangul-syllable is:
3857                   (L+
3858                    | (L* ( ( V | LV ) V* | LVT ) T*)
3859                    | T+
3860                   )
3861                Each of these is a value for the GCB property, and hence must be
3862                disjoint, so the order they are tested is immaterial, so the
3863                above can safely be changed to
3864                    T+
3865                    | L+
3866                    | (L* ( LVT | ( V | LV ) V*) T*)
3867
3868                The last two terms can be combined like this:
3869                    L* ( L
3870                         | (( LVT | ( V | LV ) V*) T*))
3871
3872                And refactored into this:
3873                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3874
3875                That means that if we have seen any L's at all we can quit
3876                there, but if the next character is an LVT, a V, or an LV we
3877                should keep going.
3878
3879                There is a subtlety with Prepend* which showed up in testing.
3880                Note that the Begin, and only the Begin is required in:
3881                 | Prepend* Begin Extend*
3882                Also, Begin contains '! Control'.  A Prepend must be a
3883                '!  Control', which means it must also be a Begin.  What it
3884                comes down to is that if we match Prepend* and then find no
3885                suitable Begin afterwards, that if we backtrack the last
3886                Prepend, that one will be a suitable Begin.
3887             */
3888
3889             if (locinput >= PL_regeol)
3890                 sayNO;
3891             if  (! utf8_target) {
3892
3893                 /* Match either CR LF  or '.', as all the other possibilities
3894                  * require utf8 */
3895                 locinput++;         /* Match the . or CR */
3896                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3897                                        match the LF */
3898                     && locinput < PL_regeol
3899                     && UCHARAT(locinput) == '\n') locinput++;
3900             }
3901             else {
3902
3903                 /* Utf8: See if is ( CR LF ); already know that locinput <
3904                  * PL_regeol, so locinput+1 is in bounds */
3905                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3906                     locinput += 2;
3907                 }
3908                 else {
3909                     /* In case have to backtrack to beginning, then match '.' */
3910                     char *starting = locinput;
3911
3912                     /* In case have to backtrack the last prepend */
3913                     char *previous_prepend = 0;
3914
3915                     LOAD_UTF8_CHARCLASS_GCB();
3916
3917                     /* Match (prepend)* */
3918                     while (locinput < PL_regeol
3919                            && swash_fetch(PL_utf8_X_prepend,
3920                                           (U8*)locinput, utf8_target))
3921                     {
3922                         previous_prepend = locinput;
3923                         locinput += UTF8SKIP(locinput);
3924                     }
3925
3926                     /* As noted above, if we matched a prepend character, but
3927                      * the next thing won't match, back off the last prepend we
3928                      * matched, as it is guaranteed to match the begin */
3929                     if (previous_prepend
3930                         && (locinput >=  PL_regeol
3931                             || ! swash_fetch(PL_utf8_X_begin,
3932                                              (U8*)locinput, utf8_target)))
3933                     {
3934                         locinput = previous_prepend;
3935                     }
3936
3937                     /* Note that here we know PL_regeol > locinput, as we
3938                      * tested that upon input to this switch case, and if we
3939                      * moved locinput forward, we tested the result just above
3940                      * and it either passed, or we backed off so that it will
3941                      * now pass */
3942                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3943
3944                         /* Here did not match the required 'Begin' in the
3945                          * second term.  So just match the very first
3946                          * character, the '.' of the final term of the regex */
3947                         locinput = starting + UTF8SKIP(starting);
3948                     } else {
3949
3950                         /* Here is the beginning of a character that can have
3951                          * an extender.  It is either a hangul syllable, or a
3952                          * non-control */
3953                         if (swash_fetch(PL_utf8_X_non_hangul,
3954                                         (U8*)locinput, utf8_target))
3955                         {
3956
3957                             /* Here not a Hangul syllable, must be a
3958                              * ('! Control') */
3959                             locinput += UTF8SKIP(locinput);
3960                         } else {
3961
3962                             /* Here is a Hangul syllable.  It can be composed
3963                              * of several individual characters.  One
3964                              * possibility is T+ */
3965                             if (swash_fetch(PL_utf8_X_T,
3966                                             (U8*)locinput, utf8_target))
3967                             {
3968                                 while (locinput < PL_regeol
3969                                         && swash_fetch(PL_utf8_X_T,
3970                                                         (U8*)locinput, utf8_target))
3971                                 {
3972                                     locinput += UTF8SKIP(locinput);
3973                                 }
3974                             } else {
3975
3976                                 /* Here, not T+, but is a Hangul.  That means
3977                                  * it is one of the others: L, LV, LVT or V,
3978                                  * and matches:
3979                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3980
3981                                 /* Match L*           */
3982                                 while (locinput < PL_regeol
3983                                         && swash_fetch(PL_utf8_X_L,
3984                                                         (U8*)locinput, utf8_target))
3985                                 {
3986                                     locinput += UTF8SKIP(locinput);
3987                                 }
3988
3989                                 /* Here, have exhausted L*.  If the next
3990                                  * character is not an LV, LVT nor V, it means
3991                                  * we had to have at least one L, so matches L+
3992                                  * in the original equation, we have a complete
3993                                  * hangul syllable.  Are done. */
3994
3995                                 if (locinput < PL_regeol
3996                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3997                                                     (U8*)locinput, utf8_target))
3998                                 {
3999
4000                                     /* Otherwise keep going.  Must be LV, LVT
4001                                      * or V.  See if LVT */
4002                                     if (swash_fetch(PL_utf8_X_LVT,
4003                                                     (U8*)locinput, utf8_target))
4004                                     {
4005                                         locinput += UTF8SKIP(locinput);
4006                                     } else {
4007
4008                                         /* Must be  V or LV.  Take it, then
4009                                          * match V*     */
4010                                         locinput += UTF8SKIP(locinput);
4011                                         while (locinput < PL_regeol
4012                                                 && swash_fetch(PL_utf8_X_V,
4013                                                          (U8*)locinput, utf8_target))
4014                                         {
4015                                             locinput += UTF8SKIP(locinput);
4016                                         }
4017                                     }
4018
4019                                     /* And any of LV, LVT, or V can be followed
4020                                      * by T*            */
4021                                     while (locinput < PL_regeol
4022                                            && swash_fetch(PL_utf8_X_T,
4023                                                            (U8*)locinput,
4024                                                            utf8_target))
4025                                     {
4026                                         locinput += UTF8SKIP(locinput);
4027                                     }
4028                                 }
4029                             }
4030                         }
4031
4032                         /* Match any extender */
4033                         while (locinput < PL_regeol
4034                                 && swash_fetch(PL_utf8_X_extend,
4035                                                 (U8*)locinput, utf8_target))
4036                         {
4037                             locinput += UTF8SKIP(locinput);
4038                         }
4039                     }
4040                 }
4041                 if (locinput > PL_regeol) sayNO;
4042             }
4043             nextchr = UCHARAT(locinput);
4044             break;
4045
4046         case NREFFL:
4047         {   /* The capture buffer cases.  The ones beginning with N for the
4048                named buffers just convert to the equivalent numbered and
4049                pretend they were called as the corresponding numbered buffer
4050                op.  */
4051             /* don't initialize these in the declaration, it makes C++
4052                unhappy */
4053             char *s;
4054             char type;
4055             re_fold_t folder;
4056             const U8 *fold_array;
4057             UV utf8_fold_flags;
4058
4059             PL_reg_flags |= RF_tainted;
4060             folder = foldEQ_locale;
4061             fold_array = PL_fold_locale;
4062             type = REFFL;
4063             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4064             goto do_nref;
4065
4066         case NREFFA:
4067             folder = foldEQ_latin1;
4068             fold_array = PL_fold_latin1;
4069             type = REFFA;
4070             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4071             goto do_nref;
4072
4073         case NREFFU:
4074             folder = foldEQ_latin1;
4075             fold_array = PL_fold_latin1;
4076             type = REFFU;
4077             utf8_fold_flags = 0;
4078             goto do_nref;
4079
4080         case NREFF:
4081             folder = foldEQ;
4082             fold_array = PL_fold;
4083             type = REFF;
4084             utf8_fold_flags = 0;
4085             goto do_nref;
4086
4087         case NREF:
4088             type = REF;
4089             folder = NULL;
4090             fold_array = NULL;
4091             utf8_fold_flags = 0;
4092           do_nref:
4093
4094             /* For the named back references, find the corresponding buffer
4095              * number */
4096             n = reg_check_named_buff_matched(rex,scan);
4097
4098             if ( ! n ) {
4099                 sayNO;
4100             }
4101             goto do_nref_ref_common;
4102
4103         case REFFL:
4104             PL_reg_flags |= RF_tainted;
4105             folder = foldEQ_locale;
4106             fold_array = PL_fold_locale;
4107             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4108             goto do_ref;
4109
4110         case REFFA:
4111             folder = foldEQ_latin1;
4112             fold_array = PL_fold_latin1;
4113             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4114             goto do_ref;
4115
4116         case REFFU:
4117             folder = foldEQ_latin1;
4118             fold_array = PL_fold_latin1;
4119             utf8_fold_flags = 0;
4120             goto do_ref;
4121
4122         case REFF:
4123             folder = foldEQ;
4124             fold_array = PL_fold;
4125             utf8_fold_flags = 0;
4126             goto do_ref;
4127
4128         case REF:
4129             folder = NULL;
4130             fold_array = NULL;
4131             utf8_fold_flags = 0;
4132
4133           do_ref:
4134             type = OP(scan);
4135             n = ARG(scan);  /* which paren pair */
4136
4137           do_nref_ref_common:
4138             ln = PL_regoffs[n].start;
4139             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4140             if (*PL_reglastparen < n || ln == -1)
4141                 sayNO;                  /* Do not match unless seen CLOSEn. */
4142             if (ln == PL_regoffs[n].end)
4143                 break;
4144
4145             s = PL_bostr + ln;
4146             if (type != REF     /* REF can do byte comparison */
4147                 && (utf8_target || type == REFFU))
4148             { /* XXX handle REFFL better */
4149                 char * limit = PL_regeol;
4150
4151                 /* This call case insensitively compares the entire buffer
4152                     * at s, with the current input starting at locinput, but
4153                     * not going off the end given by PL_regeol, and returns in
4154                     * limit upon success, how much of the current input was
4155                     * matched */
4156                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4157                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4158                 {
4159                     sayNO;
4160                 }
4161                 locinput = limit;
4162                 nextchr = UCHARAT(locinput);
4163                 break;
4164             }
4165
4166             /* Not utf8:  Inline the first character, for speed. */
4167             if (UCHARAT(s) != nextchr &&
4168                 (type == REF ||
4169                  UCHARAT(s) != fold_array[nextchr]))
4170                 sayNO;
4171             ln = PL_regoffs[n].end - ln;
4172             if (locinput + ln > PL_regeol)
4173                 sayNO;
4174             if (ln > 1 && (type == REF
4175                            ? memNE(s, locinput, ln)
4176                            : ! folder(s, locinput, ln)))
4177                 sayNO;
4178             locinput += ln;
4179             nextchr = UCHARAT(locinput);
4180             break;
4181         }
4182         case NOTHING:
4183         case TAIL:
4184             break;
4185         case BACK:
4186             break;
4187
4188 #undef  ST
4189 #define ST st->u.eval
4190         {
4191             SV *ret;
4192             REGEXP *re_sv;
4193             regexp *re;
4194             regexp_internal *rei;
4195             regnode *startpoint;
4196
4197         case GOSTART:
4198         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4199             if (cur_eval && cur_eval->locinput==locinput) {
4200                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4201                     Perl_croak(aTHX_ "Infinite recursion in regex");
4202                 if ( ++nochange_depth > max_nochange_depth )
4203                     Perl_croak(aTHX_
4204                         "Pattern subroutine nesting without pos change"
4205                         " exceeded limit in regex");
4206             } else {
4207                 nochange_depth = 0;
4208             }
4209             re_sv = rex_sv;
4210             re = rex;
4211             rei = rexi;
4212             (void)ReREFCNT_inc(rex_sv);
4213             if (OP(scan)==GOSUB) {
4214                 startpoint = scan + ARG2L(scan);
4215                 ST.close_paren = ARG(scan);
4216             } else {
4217                 startpoint = rei->program+1;
4218                 ST.close_paren = 0;
4219             }
4220             goto eval_recurse_doit;
4221             /* NOTREACHED */
4222         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4223             if (cur_eval && cur_eval->locinput==locinput) {
4224                 if ( ++nochange_depth > max_nochange_depth )
4225                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4226             } else {
4227                 nochange_depth = 0;
4228             }
4229             {
4230                 /* execute the code in the {...} */
4231                 dSP;
4232                 SV ** const before = SP;
4233                 OP_4tree * const oop = PL_op;
4234                 COP * const ocurcop = PL_curcop;
4235                 PAD *old_comppad;
4236                 char *saved_regeol = PL_regeol;
4237                 struct re_save_state saved_state;
4238
4239                 /* To not corrupt the existing regex state while executing the
4240                  * eval we would normally put it on the save stack, like with
4241                  * save_re_context. However, re-evals have a weird scoping so we
4242                  * can't just add ENTER/LEAVE here. With that, things like
4243                  *
4244                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4245                  *
4246                  * would break, as they expect the localisation to be unwound
4247                  * only when the re-engine backtracks through the bit that
4248                  * localised it.
4249                  *
4250                  * What we do instead is just save the state in a local C
4251                  * variable.
4252                  */
4253                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4254
4255                 n = ARG(scan);
4256                 PL_op = (OP_4tree*)rexi->data->data[n];
4257                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4258                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4259                 /* wrap the call in two SAVECOMPPADs. This ensures that
4260                  * when the save stack is eventually unwound, all the
4261                  * accumulated SAVEt_CLEARSV's will be processed with
4262                  * interspersed SAVEt_COMPPAD's to ensure that lexicals
4263                  * are cleared in the right pad */
4264                 SAVECOMPPAD();
4265                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4266                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4267
4268                 if (sv_yes_mark) {
4269                     SV *sv_mrk = get_sv("REGMARK", 1);
4270                     sv_setsv(sv_mrk, sv_yes_mark);
4271                 }
4272
4273                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4274                 SPAGAIN;
4275                 if (SP == before)
4276                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4277                 else {
4278                     ret = POPs;
4279                     PUTBACK;
4280                 }
4281
4282                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4283
4284                 PL_op = oop;
4285                 SAVECOMPPAD();
4286                 PAD_RESTORE_LOCAL(old_comppad);
4287                 PL_curcop = ocurcop;
4288                 PL_regeol = saved_regeol;
4289                 if (!logical) {
4290                     /* /(?{...})/ */
4291                     sv_setsv(save_scalar(PL_replgv), ret);
4292                     break;
4293                 }
4294             }
4295             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4296                 logical = 0;
4297                 {
4298                     /* extract RE object from returned value; compiling if
4299                      * necessary */
4300                     MAGIC *mg = NULL;
4301                     REGEXP *rx = NULL;
4302
4303                     if (SvROK(ret)) {
4304                         SV *const sv = SvRV(ret);
4305
4306                         if (SvTYPE(sv) == SVt_REGEXP) {
4307                             rx = (REGEXP*) sv;
4308                         } else if (SvSMAGICAL(sv)) {
4309                             mg = mg_find(sv, PERL_MAGIC_qr);
4310                             assert(mg);
4311                         }
4312                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4313                         rx = (REGEXP*) ret;
4314                     } else if (SvSMAGICAL(ret)) {
4315                         if (SvGMAGICAL(ret)) {
4316                             /* I don't believe that there is ever qr magic
4317                                here.  */
4318                             assert(!mg_find(ret, PERL_MAGIC_qr));
4319                             sv_unmagic(ret, PERL_MAGIC_qr);
4320                         }
4321                         else {
4322                             mg = mg_find(ret, PERL_MAGIC_qr);
4323                             /* testing suggests mg only ends up non-NULL for
4324                                scalars that were upgraded and compiled in the
4325                                else block below. In turn, this is only
4326                                triggered in the "postponed utf8 string" tests
4327                                in t/op/pat.t  */
4328                         }
4329                     }
4330
4331                     if (mg) {
4332                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4333                         assert(rx);
4334                     }
4335                     if (rx) {
4336                         rx = reg_temp_copy(NULL, rx);
4337                     }
4338                     else {
4339                         U32 pm_flags = 0;
4340                         const I32 osize = PL_regsize;
4341
4342                         if (DO_UTF8(ret)) {
4343                             assert (SvUTF8(ret));
4344                         } else if (SvUTF8(ret)) {
4345                             /* Not doing UTF-8, despite what the SV says. Is
4346                                this only if we're trapped in use 'bytes'?  */
4347                             /* Make a copy of the octet sequence, but without
4348                                the flag on, as the compiler now honours the
4349                                SvUTF8 flag on ret.  */
4350                             STRLEN len;
4351                             const char *const p = SvPV(ret, len);
4352                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4353                         }
4354                         rx = CALLREGCOMP(ret, pm_flags);
4355                         if (!(SvFLAGS(ret)
4356                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4357                                  | SVs_GMG))) {
4358                             /* This isn't a first class regexp. Instead, it's
4359                                caching a regexp onto an existing, Perl visible
4360                                scalar.  */
4361                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4362                         }
4363                         PL_regsize = osize;
4364                     }
4365                     re_sv = rx;
4366                     re = (struct regexp *)SvANY(rx);
4367                 }
4368                 RXp_MATCH_COPIED_off(re);
4369                 re->subbeg = rex->subbeg;
4370                 re->sublen = rex->sublen;
4371                 rei = RXi_GET(re);
4372                 DEBUG_EXECUTE_r(
4373                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4374                         "Matching embedded");
4375                 );
4376                 startpoint = rei->program + 1;
4377                 ST.close_paren = 0; /* only used for GOSUB */
4378                 /* borrowed from regtry */
4379                 if (PL_reg_start_tmpl <= re->nparens) {
4380                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4381                     if(PL_reg_start_tmp)
4382                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4383                     else
4384                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4385                 }
4386
4387         eval_recurse_doit: /* Share code with GOSUB below this line */
4388                 /* run the pattern returned from (??{...}) */
4389                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4390                 REGCP_SET(ST.lastcp);
4391
4392                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4393
4394                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (I don't know why) :dmq */
4395                 PL_reglastparen = &re->lastparen;
4396                 PL_reglastcloseparen = &re->lastcloseparen;
4397                 re->lastparen = 0;
4398                 re->lastcloseparen = 0;
4399
4400                 PL_reginput = locinput;
4401                 PL_regsize = 0;
4402
4403                 /* XXXX This is too dramatic a measure... */
4404                 PL_reg_maxiter = 0;
4405
4406                 ST.toggle_reg_flags = PL_reg_flags;
4407                 if (RX_UTF8(re_sv))
4408                     PL_reg_flags |= RF_utf8;
4409                 else
4410                     PL_reg_flags &= ~RF_utf8;
4411                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4412
4413                 ST.prev_rex = rex_sv;
4414                 ST.prev_curlyx = cur_curlyx;
4415                 SETREX(rex_sv,re_sv);
4416                 rex = re;
4417                 rexi = rei;
4418                 cur_curlyx = NULL;
4419                 ST.B = next;
4420                 ST.prev_eval = cur_eval;
4421                 cur_eval = st;
4422                 /* now continue from first node in postponed RE */
4423                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4424                 /* NOTREACHED */
4425             }
4426             /* logical is 1,   /(?(?{...})X|Y)/ */
4427             sw = cBOOL(SvTRUE(ret));
4428             logical = 0;
4429             break;
4430         }
4431
4432         case EVAL_AB: /* cleanup after a successful (??{A})B */
4433             /* note: this is called twice; first after popping B, then A */
4434             PL_reg_flags ^= ST.toggle_reg_flags;
4435             ReREFCNT_dec(rex_sv);
4436             SETREX(rex_sv,ST.prev_rex);
4437             rex = (struct regexp *)SvANY(rex_sv);
4438             rexi = RXi_GET(rex);
4439             regcpblow(ST.cp);
4440             cur_eval = ST.prev_eval;
4441             cur_curlyx = ST.prev_curlyx;
4442
4443             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4444             PL_reglastparen = &rex->lastparen;
4445             PL_reglastcloseparen = &rex->lastcloseparen;
4446             /* also update PL_regoffs */
4447             PL_regoffs = rex->offs;
4448
4449             /* XXXX This is too dramatic a measure... */
4450             PL_reg_maxiter = 0;
4451             if ( nochange_depth )
4452                 nochange_depth--;
4453             sayYES;
4454
4455
4456         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4457             /* note: this is called twice; first after popping B, then A */
4458             PL_reg_flags ^= ST.toggle_reg_flags;
4459             ReREFCNT_dec(rex_sv);
4460             SETREX(rex_sv,ST.prev_rex);
4461             rex = (struct regexp *)SvANY(rex_sv);
4462             rexi = RXi_GET(rex);
4463             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4464             PL_reglastparen = &rex->lastparen;
4465             PL_reglastcloseparen = &rex->lastcloseparen;
4466
4467             PL_reginput = locinput;
4468             REGCP_UNWIND(ST.lastcp);
4469             regcppop(rex);
4470             cur_eval = ST.prev_eval;
4471             cur_curlyx = ST.prev_curlyx;
4472             /* XXXX This is too dramatic a measure... */
4473             PL_reg_maxiter = 0;
4474             if ( nochange_depth )
4475                 nochange_depth--;
4476             sayNO_SILENT;
4477 #undef ST
4478
4479         case OPEN:
4480             n = ARG(scan);  /* which paren pair */
4481             PL_reg_start_tmp[n] = locinput;
4482             if (n > PL_regsize)
4483                 PL_regsize = n;
4484             lastopen = n;
4485             break;
4486         case CLOSE:
4487             n = ARG(scan);  /* which paren pair */
4488             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4489             PL_regoffs[n].end = locinput - PL_bostr;
4490             /*if (n > PL_regsize)
4491                 PL_regsize = n;*/
4492             if (n > *PL_reglastparen)
4493                 *PL_reglastparen = n;
4494             *PL_reglastcloseparen = n;
4495             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4496                 goto fake_end;
4497             }
4498             break;
4499         case ACCEPT:
4500             if (ARG(scan)){
4501                 regnode *cursor;
4502                 for (cursor=scan;
4503                      cursor && OP(cursor)!=END;
4504                      cursor=regnext(cursor))
4505                 {
4506                     if ( OP(cursor)==CLOSE ){
4507                         n = ARG(cursor);
4508                         if ( n <= lastopen ) {
4509                             PL_regoffs[n].start
4510                                 = PL_reg_start_tmp[n] - PL_bostr;
4511                             PL_regoffs[n].end = locinput - PL_bostr;
4512                             /*if (n > PL_regsize)
4513                             PL_regsize = n;*/
4514                             if (n > *PL_reglastparen)
4515                                 *PL_reglastparen = n;
4516                             *PL_reglastcloseparen = n;
4517                             if ( n == ARG(scan) || (cur_eval &&
4518                                 cur_eval->u.eval.close_paren == n))
4519                                 break;
4520                         }
4521                     }
4522                 }
4523             }
4524             goto fake_end;
4525             /*NOTREACHED*/
4526         case GROUPP:
4527             n = ARG(scan);  /* which paren pair */
4528             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4529             break;
4530         case NGROUPP:
4531             /* reg_check_named_buff_matched returns 0 for no match */
4532             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4533             break;
4534         case INSUBP:
4535             n = ARG(scan);
4536             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4537             break;
4538         case DEFINEP:
4539             sw = 0;
4540             break;
4541         case IFTHEN:
4542             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4543             if (sw)
4544                 next = NEXTOPER(NEXTOPER(scan));
4545             else {
4546                 next = scan + ARG(scan);
4547                 if (OP(next) == IFTHEN) /* Fake one. */
4548                     next = NEXTOPER(NEXTOPER(next));
4549             }
4550             break;
4551         case LOGICAL:
4552             logical = scan->flags;
4553             break;
4554
4555 /*******************************************************************
4556
4557 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4558 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4559 STAR/PLUS/CURLY/CURLYN are used instead.)
4560
4561 A*B is compiled as <CURLYX><A><WHILEM><B>
4562
4563 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4564 state, which contains the current count, initialised to -1. It also sets
4565 cur_curlyx to point to this state, with any previous value saved in the
4566 state block.
4567
4568 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4569 since the pattern may possibly match zero times (i.e. it's a while {} loop
4570 rather than a do {} while loop).
4571
4572 Each entry to WHILEM represents a successful match of A. The count in the
4573 CURLYX block is incremented, another WHILEM state is pushed, and execution
4574 passes to A or B depending on greediness and the current count.
4575
4576 For example, if matching against the string a1a2a3b (where the aN are
4577 substrings that match /A/), then the match progresses as follows: (the
4578 pushed states are interspersed with the bits of strings matched so far):
4579
4580     <CURLYX cnt=-1>
4581     <CURLYX cnt=0><WHILEM>
4582     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4583     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4584     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4585     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4586
4587 (Contrast this with something like CURLYM, which maintains only a single
4588 backtrack state:
4589
4590     <CURLYM cnt=0> a1
4591     a1 <CURLYM cnt=1> a2
4592     a1 a2 <CURLYM cnt=2> a3
4593     a1 a2 a3 <CURLYM cnt=3> b
4594 )
4595
4596 Each WHILEM state block marks a point to backtrack to upon partial failure
4597 of A or B, and also contains some minor state data related to that
4598 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4599 overall state, such as the count, and pointers to the A and B ops.
4600
4601 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4602 must always point to the *current* CURLYX block, the rules are:
4603
4604 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4605 and set cur_curlyx to point to the new block.
4606
4607 When popping the CURLYX block after a successful or unsuccessful match,
4608 restore the previous cur_curlyx.
4609
4610 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4611 to the outer one saved in the CURLYX block.
4612
4613 When popping the WHILEM block after a successful or unsuccessful B match,
4614 restore the previous cur_curlyx.
4615
4616 Here's an example for the pattern (AI* BI)*BO
4617 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4618
4619 cur_
4620 curlyx backtrack stack
4621 ------ ---------------
4622 NULL
4623 CO     <CO prev=NULL> <WO>
4624 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4625 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4626 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4627
4628 At this point the pattern succeeds, and we work back down the stack to
4629 clean up, restoring as we go:
4630
4631 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4632 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4633 CO     <CO prev=NULL> <WO>
4634 NULL
4635
4636 *******************************************************************/
4637
4638 #define ST st->u.curlyx
4639
4640         case CURLYX:    /* start of /A*B/  (for complex A) */
4641         {
4642             /* No need to save/restore up to this paren */
4643             I32 parenfloor = scan->flags;
4644
4645             assert(next); /* keep Coverity happy */
4646             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4647                 next += ARG(next);
4648
4649             /* XXXX Probably it is better to teach regpush to support
4650                parenfloor > PL_regsize... */
4651             if (parenfloor > (I32)*PL_reglastparen)
4652                 parenfloor = *PL_reglastparen; /* Pessimization... */
4653
4654             ST.prev_curlyx= cur_curlyx;
4655             cur_curlyx = st;
4656             ST.cp = PL_savestack_ix;
4657
4658             /* these fields contain the state of the current curly.
4659              * they are accessed by subsequent WHILEMs */
4660             ST.parenfloor = parenfloor;
4661             ST.me = scan;
4662             ST.B = next;
4663             ST.minmod = minmod;
4664             minmod = 0;
4665             ST.count = -1;      /* this will be updated by WHILEM */
4666             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4667
4668             PL_reginput = locinput;
4669             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4670             /* NOTREACHED */
4671         }
4672
4673         case CURLYX_end: /* just finished matching all of A*B */
4674             cur_curlyx = ST.prev_curlyx;
4675             sayYES;
4676             /* NOTREACHED */
4677
4678         case CURLYX_end_fail: /* just failed to match all of A*B */
4679             regcpblow(ST.cp);
4680             cur_curlyx = ST.prev_curlyx;
4681             sayNO;
4682             /* NOTREACHED */
4683
4684
4685 #undef ST
4686 #define ST st->u.whilem
4687
4688         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4689         {
4690             /* see the discussion above about CURLYX/WHILEM */
4691             I32 n;
4692             int min = ARG1(cur_curlyx->u.curlyx.me);
4693             int max = ARG2(cur_curlyx->u.curlyx.me);
4694             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4695
4696             assert(cur_curlyx); /* keep Coverity happy */
4697             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4698             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4699             ST.cache_offset = 0;
4700             ST.cache_mask = 0;
4701
4702             PL_reginput = locinput;
4703
4704             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4705                   "%*s  whilem: matched %ld out of %d..%d\n",
4706                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4707             );
4708
4709             /* First just match a string of min A's. */
4710
4711             if (n < min) {
4712                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4713                 cur_curlyx->u.curlyx.lastloc = locinput;
4714                 REGCP_SET(ST.lastcp);
4715
4716                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4717                 /* NOTREACHED */
4718             }
4719
4720             /* If degenerate A matches "", assume A done. */
4721
4722             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4723                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4724                    "%*s  whilem: empty match detected, trying continuation...\n",
4725                    REPORT_CODE_OFF+depth*2, "")
4726                 );
4727                 goto do_whilem_B_max;
4728             }
4729
4730             /* super-linear cache processing */
4731
4732             if (scan->flags) {
4733
4734                 if (!PL_reg_maxiter) {
4735                     /* start the countdown: Postpone detection until we
4736                      * know the match is clearly not linear. */
4737                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4738                     /* possible overflow for long strings and many CURLYX's */
4739                     if (PL_reg_maxiter < 0)
4740                         PL_reg_maxiter = I32_MAX;
4741                     PL_reg_leftiter = PL_reg_maxiter;
4742                 }
4743
4744                 if (PL_reg_leftiter-- == 0) {
4745                     /* initialise cache */
4746                     const I32 size = (PL_reg_maxiter + 7)/8;
4747                     if (PL_reg_poscache) {
4748                         if ((I32)PL_reg_poscache_size < size) {
4749                             Renew(PL_reg_poscache, size, char);
4750                             PL_reg_poscache_size = size;
4751                         }
4752                         Zero(PL_reg_poscache, size, char);
4753                     }
4754                     else {
4755                         PL_reg_poscache_size = size;
4756                         Newxz(PL_reg_poscache, size, char);
4757                     }
4758                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4759       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4760                               PL_colors[4], PL_colors[5])
4761                     );
4762                 }
4763
4764                 if (PL_reg_leftiter < 0) {
4765                     /* have we already failed at this position? */
4766                     I32 offset, mask;
4767                     offset  = (scan->flags & 0xf) - 1
4768                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4769                     mask    = 1 << (offset % 8);
4770                     offset /= 8;
4771                     if (PL_reg_poscache[offset] & mask) {
4772                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4773                             "%*s  whilem: (cache) already tried at this position...\n",
4774                             REPORT_CODE_OFF+depth*2, "")
4775                         );
4776                         sayNO; /* cache records failure */
4777                     }
4778                     ST.cache_offset = offset;
4779                     ST.cache_mask   = mask;
4780                 }
4781             }
4782
4783             /* Prefer B over A for minimal matching. */
4784
4785             if (cur_curlyx->u.curlyx.minmod) {
4786                 ST.save_curlyx = cur_curlyx;
4787                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4788                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4789                 REGCP_SET(ST.lastcp);
4790                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4791                 /* NOTREACHED */
4792             }
4793
4794             /* Prefer A over B for maximal matching. */
4795
4796             if (n < max) { /* More greed allowed? */
4797                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4798                 cur_curlyx->u.curlyx.lastloc = locinput;
4799                 REGCP_SET(ST.lastcp);
4800                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4801                 /* NOTREACHED */
4802             }
4803             goto do_whilem_B_max;
4804         }
4805         /* NOTREACHED */
4806
4807         case WHILEM_B_min: /* just matched B in a minimal match */
4808         case WHILEM_B_max: /* just matched B in a maximal match */
4809             cur_curlyx = ST.save_curlyx;
4810             sayYES;
4811             /* NOTREACHED */
4812
4813         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4814             cur_curlyx = ST.save_curlyx;
4815             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4816             cur_curlyx->u.curlyx.count--;
4817             CACHEsayNO;
4818             /* NOTREACHED */
4819
4820         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4821             /* FALL THROUGH */
4822         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4823             REGCP_UNWIND(ST.lastcp);
4824             regcppop(rex);
4825             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4826             cur_curlyx->u.curlyx.count--;
4827             CACHEsayNO;
4828             /* NOTREACHED */
4829
4830         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4831             REGCP_UNWIND(ST.lastcp);
4832             regcppop(rex);      /* Restore some previous $<digit>s? */
4833             PL_reginput = locinput;
4834             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4835                 "%*s  whilem: failed, trying continuation...\n",
4836                 REPORT_CODE_OFF+depth*2, "")
4837             );
4838           do_whilem_B_max:
4839             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4840                 && ckWARN(WARN_REGEXP)
4841                 && !(PL_reg_flags & RF_warned))
4842             {
4843                 PL_reg_flags |= RF_warned;
4844                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4845                      "Complex regular subexpression recursion",
4846                      REG_INFTY - 1);
4847             }
4848
4849             /* now try B */
4850             ST.save_curlyx = cur_curlyx;
4851             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4852             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4853             /* NOTREACHED */
4854
4855         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4856             cur_curlyx = ST.save_curlyx;
4857             REGCP_UNWIND(ST.lastcp);
4858             regcppop(rex);
4859
4860             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4861                 /* Maximum greed exceeded */
4862                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4863                     && ckWARN(WARN_REGEXP)
4864                     && !(PL_reg_flags & RF_warned))
4865                 {
4866                     PL_reg_flags |= RF_warned;
4867                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4868                         "%s limit (%d) exceeded",
4869                         "Complex regular subexpression recursion",
4870                         REG_INFTY - 1);
4871                 }
4872                 cur_curlyx->u.curlyx.count--;
4873                 CACHEsayNO;
4874             }
4875
4876             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4877                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4878             );
4879             /* Try grabbing another A and see if it helps. */
4880             PL_reginput = locinput;
4881             cur_curlyx->u.curlyx.lastloc = locinput;
4882             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4883             REGCP_SET(ST.lastcp);
4884             PUSH_STATE_GOTO(WHILEM_A_min,
4885                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4886             /* NOTREACHED */
4887
4888 #undef  ST
4889 #define ST st->u.branch
4890
4891         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4892             next = scan + ARG(scan);
4893             if (next == scan)
4894                 next = NULL;
4895             scan = NEXTOPER(scan);
4896             /* FALL THROUGH */
4897
4898         case BRANCH:        /*  /(...|A|...)/ */
4899             scan = NEXTOPER(scan); /* scan now points to inner node */
4900             ST.lastparen = *PL_reglastparen;
4901             ST.next_branch = next;
4902             REGCP_SET(ST.cp);
4903             PL_reginput = locinput;
4904
4905             /* Now go into the branch */
4906             if (has_cutgroup) {
4907                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4908             } else {
4909                 PUSH_STATE_GOTO(BRANCH_next, scan);
4910             }
4911             /* NOTREACHED */
4912         case CUTGROUP:
4913             PL_reginput = locinput;
4914             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4915                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4916             PUSH_STATE_GOTO(CUTGROUP_next,next);
4917             /* NOTREACHED */
4918         case CUTGROUP_next_fail:
4919             do_cutgroup = 1;
4920             no_final = 1;
4921             if (st->u.mark.mark_name)
4922                 sv_commit = st->u.mark.mark_name;
4923             sayNO;
4924             /* NOTREACHED */
4925         case BRANCH_next:
4926             sayYES;
4927             /* NOTREACHED */
4928         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4929             if (do_cutgroup) {
4930                 do_cutgroup = 0;
4931                 no_final = 0;
4932             }
4933             REGCP_UNWIND(ST.cp);
4934             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4935                 PL_regoffs[n].end = -1;
4936             *PL_reglastparen = n;
4937             /*dmq: *PL_reglastcloseparen = n; */
4938             scan = ST.next_branch;
4939             /* no more branches? */
4940             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4941                 DEBUG_EXECUTE_r({
4942                     PerlIO_printf( Perl_debug_log,
4943                         "%*s  %sBRANCH failed...%s\n",
4944                         REPORT_CODE_OFF+depth*2, "",
4945                         PL_colors[4],
4946                         PL_colors[5] );
4947                 });
4948                 sayNO_SILENT;
4949             }
4950             continue; /* execute next BRANCH[J] op */
4951             /* NOTREACHED */
4952
4953         case MINMOD:
4954             minmod = 1;
4955             break;
4956
4957 #undef  ST
4958 #define ST st->u.curlym
4959
4960         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4961
4962             /* This is an optimisation of CURLYX that enables us to push
4963              * only a single backtracking state, no matter how many matches
4964              * there are in {m,n}. It relies on the pattern being constant
4965              * length, with no parens to influence future backrefs
4966              */
4967
4968             ST.me = scan;
4969             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4970
4971             /* if paren positive, emulate an OPEN/CLOSE around A */
4972             if (ST.me->flags) {
4973                 U32 paren = ST.me->flags;
4974                 if (paren > PL_regsize)
4975                     PL_regsize = paren;
4976                 if (paren > *PL_reglastparen)
4977                     *PL_reglastparen = paren;
4978                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4979             }
4980             ST.A = scan;
4981             ST.B = next;
4982             ST.alen = 0;
4983             ST.count = 0;
4984             ST.minmod = minmod;
4985             minmod = 0;
4986             ST.c1 = CHRTEST_UNINIT;
4987             REGCP_SET(ST.cp);
4988
4989             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4990                 goto curlym_do_B;
4991
4992           curlym_do_A: /* execute the A in /A{m,n}B/  */
4993             PL_reginput = locinput;
4994             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4995             /* NOTREACHED */
4996
4997         case CURLYM_A: /* we've just matched an A */
4998             locinput = st->locinput;
4999             nextchr = UCHARAT(locinput);
5000
5001             ST.count++;
5002             /* after first match, determine A's length: u.curlym.alen */
5003             if (ST.count == 1) {
5004                 if (PL_reg_match_utf8) {
5005                     char *s = locinput;
5006                     while (s < PL_reginput) {
5007                         ST.alen++;
5008                         s += UTF8SKIP(s);
5009                     }
5010                 }
5011                 else {
5012                     ST.alen = PL_reginput - locinput;
5013                 }
5014                 if (ST.alen == 0)
5015                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5016             }
5017             DEBUG_EXECUTE_r(
5018                 PerlIO_printf(Perl_debug_log,
5019                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5020                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5021                           (IV) ST.count, (IV)ST.alen)
5022             );
5023
5024             locinput = PL_reginput;
5025
5026             if (cur_eval && cur_eval->u.eval.close_paren &&
5027                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5028                 goto fake_end;
5029
5030             {
5031                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5032                 if ( max == REG_INFTY || ST.count < max )
5033                     goto curlym_do_A; /* try to match another A */
5034             }
5035             goto curlym_do_B; /* try to match B */
5036
5037         case CURLYM_A_fail: /* just failed to match an A */
5038             REGCP_UNWIND(ST.cp);
5039
5040             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5041                 || (cur_eval && cur_eval->u.eval.close_paren &&
5042                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5043                 sayNO;
5044
5045           curlym_do_B: /* execute the B in /A{m,n}B/  */
5046             PL_reginput = locinput;
5047             if (ST.c1 == CHRTEST_UNINIT) {
5048                 /* calculate c1 and c2 for possible match of 1st char
5049                  * following curly */
5050                 ST.c1 = ST.c2 = CHRTEST_VOID;
5051                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5052                     regnode *text_node = ST.B;
5053                     if (! HAS_TEXT(text_node))
5054                         FIND_NEXT_IMPT(text_node);
5055                     /* this used to be
5056
5057                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5058
5059                         But the former is redundant in light of the latter.
5060
5061                         if this changes back then the macro for
5062                         IS_TEXT and friends need to change.
5063                      */
5064                     if (PL_regkind[OP(text_node)] == EXACT)
5065                     {
5066
5067                         ST.c1 = (U8)*STRING(text_node);
5068                         switch (OP(text_node)) {
5069                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5070                             case EXACTFA:
5071                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5072                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5073                             default: ST.c2 = ST.c1;
5074                         }
5075                     }
5076                 }
5077             }
5078
5079             DEBUG_EXECUTE_r(
5080                 PerlIO_printf(Perl_debug_log,
5081                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5082                     (int)(REPORT_CODE_OFF+(depth*2)),
5083                     "", (IV)ST.count)
5084                 );
5085             if (ST.c1 != CHRTEST_VOID
5086                     && UCHARAT(PL_reginput) != ST.c1
5087                     && UCHARAT(PL_reginput) != ST.c2)
5088             {
5089                 /* simulate B failing */
5090                 DEBUG_OPTIMISE_r(
5091                     PerlIO_printf(Perl_debug_log,
5092                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5093                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5094                         (IV)ST.c1,(IV)ST.c2
5095                 ));
5096                 state_num = CURLYM_B_fail;
5097                 goto reenter_switch;
5098             }
5099
5100             if (ST.me->flags) {
5101                 /* mark current A as captured */
5102                 I32 paren = ST.me->flags;
5103                 if (ST.count) {
5104                     PL_regoffs[paren].start
5105                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5106                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5107                     /*dmq: *PL_reglastcloseparen = paren; */
5108                 }
5109                 else
5110                     PL_regoffs[paren].end = -1;
5111                 if (cur_eval && cur_eval->u.eval.close_paren &&
5112                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5113                 {
5114                     if (ST.count)
5115                         goto fake_end;
5116                     else
5117                         sayNO;
5118                 }
5119             }
5120
5121             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5122             /* NOTREACHED */
5123
5124         case CURLYM_B_fail: /* just failed to match a B */
5125             REGCP_UNWIND(ST.cp);
5126             if (ST.minmod) {
5127                 I32 max = ARG2(ST.me);
5128                 if (max != REG_INFTY && ST.count == max)
5129                     sayNO;
5130                 goto curlym_do_A; /* try to match a further A */
5131             }
5132             /* backtrack one A */
5133             if (ST.count == ARG1(ST.me) /* min */)
5134                 sayNO;
5135             ST.count--;
5136             locinput = HOPc(locinput, -ST.alen);
5137             goto curlym_do_B; /* try to match B */
5138
5139 #undef ST
5140 #define ST st->u.curly
5141
5142 #define CURLY_SETPAREN(paren, success) /* record or clear capture group 'paren' after a width-1 repeat */ \
5143     if (paren) { /* 0 = no group to set (STAR, PLUS and CURLY use ST.paren = 0) */ \
5144         if (success) { /* group runs from one HOPc step before locinput up to locinput */ \
5145             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5146             PL_regoffs[paren].end = locinput - PL_bostr; \
5147             *PL_reglastcloseparen = paren; \
5148         } \
5149         else /* callers pass ST.count as 'success'; zero iterations => end offset reset to -1 */ \
5150             PL_regoffs[paren].end = -1; \
5151     } /* NOTE(review): expands to a bare if/else, not do { } while (0); use only as a full statement */
5152
5153         case STAR:              /*  /A*B/ where A is width 1 */
5154             ST.paren = 0;
5155             ST.min = 0;
5156             ST.max = REG_INFTY;
5157             scan = NEXTOPER(scan);
5158             goto repeat;
5159         case PLUS:              /*  /A+B/ where A is width 1 */
5160             ST.paren = 0;
5161             ST.min = 1;
5162             ST.max = REG_INFTY;
5163             scan = NEXTOPER(scan);
5164             goto repeat;
5165         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5166             ST.paren = scan->flags;     /* Which paren to set */
5167             if (ST.paren > PL_regsize)
5168                 PL_regsize = ST.paren;
5169             if (ST.paren > *PL_reglastparen)
5170                 *PL_reglastparen = ST.paren;
5171             ST.min = ARG1(scan);  /* min to match */
5172             ST.max = ARG2(scan);  /* max to match */
5173             if (cur_eval && cur_eval->u.eval.close_paren &&
5174                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5175                 ST.min=1;
5176                 ST.max=1;
5177             }
5178             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5179             goto repeat;
5180         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5181             ST.paren = 0;
5182             ST.min = ARG1(scan);  /* min to match */
5183             ST.max = ARG2(scan);  /* max to match */
5184             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5185           repeat:
5186             /*
5187             * Lookahead to avoid useless match attempts
5188             * when we know what character comes next.
5189             *
5190             * Used to only do .*x and .*?x, but now it allows
5191             * for )'s, ('s and (?{ ... })'s to be in the way
5192             * of the quantifier and the EXACT-like node.  -- japhy
5193             */
5194
5195             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5196                 sayNO;
5197             if (HAS_TEXT(next) || JUMPABLE(next)) {
5198                 U8 *s;
5199                 regnode *text_node = next;
5200
5201                 if (! HAS_TEXT(text_node))
5202                     FIND_NEXT_IMPT(text_node);
5203
5204                 if (! HAS_TEXT(text_node))
5205                     ST.c1 = ST.c2 = CHRTEST_VOID;
5206                 else {
5207                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5208                         ST.c1 = ST.c2 = CHRTEST_VOID;
5209                         goto assume_ok_easy;
5210                     }
5211                     else
5212                         s = (U8*)STRING(text_node);
5213
5214                     /*  Currently we only get here when
5215
5216                         PL_regkind[OP(text_node)] == EXACT
5217
5218                         if this changes back then the macro for IS_TEXT and
5219                         friends need to change. */
5220                     if (!UTF_PATTERN) {
5221                         ST.c1 = *s;
5222                         switch (OP(text_node)) {
5223                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5224                             case EXACTFA:
5225                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5226                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5227                             default: ST.c2 = ST.c1; break;
5228                         }
5229                     }
5230                     else { /* UTF_PATTERN */
5231                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5232                              STRLEN ulen1, ulen2;
5233                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5234                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5235
5236                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5237                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5238 #ifdef EBCDIC
5239                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5240                                                     ckWARN(WARN_UTF8) ?
5241                                                     0 : UTF8_ALLOW_ANY);
5242                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5243                                                     ckWARN(WARN_UTF8) ?
5244                                                     0 : UTF8_ALLOW_ANY);
5245 #else
5246                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5247                                                     uniflags);
5248                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5249                                                     uniflags);
5250 #endif
5251                         }
5252                         else {
5253                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5254                                                      uniflags);
5255                         }
5256                     }
5257                 }
5258             }
5259             else
5260                 ST.c1 = ST.c2 = CHRTEST_VOID;
5261         assume_ok_easy:
5262
5263             ST.A = scan;
5264             ST.B = next;
5265             PL_reginput = locinput;
5266             if (minmod) {
5267                 minmod = 0;
5268                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5269                     sayNO;
5270                 ST.count = ST.min;
5271                 locinput = PL_reginput;
5272                 REGCP_SET(ST.cp);
5273                 if (ST.c1 == CHRTEST_VOID)
5274                     goto curly_try_B_min;
5275
5276                 ST.oldloc = locinput;
5277
5278                 /* set ST.maxpos to the furthest point along the
5279                  * string that could possibly match */
5280                 if  (ST.max == REG_INFTY) {
5281                     ST.maxpos = PL_regeol - 1;
5282                     if (utf8_target)
5283                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5284                             ST.maxpos--;
5285                 }
5286                 else if (utf8_target) {
5287                     int m = ST.max - ST.min;
5288                     for (ST.maxpos = locinput;
5289                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5290                         ST.maxpos += UTF8SKIP(ST.maxpos);
5291                 }
5292                 else {
5293                     ST.maxpos = locinput + ST.max - ST.min;
5294                     if (ST.maxpos >= PL_regeol)
5295                         ST.maxpos = PL_regeol - 1;
5296                 }
5297                 goto curly_try_B_min_known;
5298
5299             }
5300             else {
5301                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5302                 locinput = PL_reginput;
5303                 if (ST.count < ST.min)
5304                     sayNO;
5305                 if ((ST.count > ST.min)
5306                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5307                 {
5308                     /* A{m,n} must come at the end of the string, there's
5309                      * no point in backing off ... */
5310                     ST.min = ST.count;
5311                     /* ...except that $ and \Z can match before *and* after
5312                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5313                        We may back off by one in this case. */
5314                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5315                         ST.min--;
5316                 }
5317                 REGCP_SET(ST.cp);
5318                 goto curly_try_B_max;
5319             }
5320             /* NOTREACHED */
5321
5322
5323         case CURLY_B_min_known_fail:
5324             /* failed to find B in a non-greedy match where c1,c2 valid */
5325             if (ST.paren && ST.count)
5326                 PL_regoffs[ST.paren].end = -1;
5327
5328             PL_reginput = locinput;     /* Could be reset... */
5329             REGCP_UNWIND(ST.cp);
5330             /* Couldn't or didn't -- move forward. */
5331             ST.oldloc = locinput;
5332             if (utf8_target)
5333                 locinput += UTF8SKIP(locinput);
5334             else
5335                 locinput++;
5336             ST.count++;
5337           curly_try_B_min_known:
5338              /* find the next place where 'B' could work, then call B */
5339             {
5340                 int n;
5341                 if (utf8_target) {
5342                     n = (ST.oldloc == locinput) ? 0 : 1;
5343                     if (ST.c1 == ST.c2) {
5344                         STRLEN len;
5345                         /* set n to utf8_distance(oldloc, locinput) */
5346                         while (locinput <= ST.maxpos &&
5347                                utf8n_to_uvchr((U8*)locinput,
5348                                               UTF8_MAXBYTES, &len,
5349                                               uniflags) != (UV)ST.c1) {
5350                             locinput += len;
5351                             n++;
5352                         }
5353                     }
5354                     else {
5355                         /* set n to utf8_distance(oldloc, locinput) */
5356                         while (locinput <= ST.maxpos) {
5357                             STRLEN len;
5358                             const UV c = utf8n_to_uvchr((U8*)locinput,
5359                                                   UTF8_MAXBYTES, &len,
5360                                                   uniflags);
5361                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5362                                 break;
5363                             locinput += len;
5364                             n++;
5365                         }
5366                     }
5367                 }
5368                 else {
5369                     if (ST.c1 == ST.c2) {
5370                         while (locinput <= ST.maxpos &&
5371                                UCHARAT(locinput) != ST.c1)
5372                             locinput++;
5373                     }
5374                     else {
5375                         while (locinput <= ST.maxpos
5376                                && UCHARAT(locinput) != ST.c1
5377                                && UCHARAT(locinput) != ST.c2)
5378                             locinput++;
5379                     }
5380                     n = locinput - ST.oldloc;
5381                 }
5382                 if (locinput > ST.maxpos)
5383                     sayNO;
5384                 /* PL_reginput == oldloc now */
5385                 if (n) {
5386                     ST.count += n;
5387                     if (regrepeat(rex, ST.A, n, depth) < n)
5388                         sayNO;
5389                 }
5390                 PL_reginput = locinput;
5391                 CURLY_SETPAREN(ST.paren, ST.count);
5392                 if (cur_eval && cur_eval->u.eval.close_paren &&
5393                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5394                     goto fake_end;
5395                 }
5396                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5397             }
5398             /* NOTREACHED */
5399
5400
5401         case CURLY_B_min_fail:
5402             /* failed to find B in a non-greedy match where c1,c2 invalid */
5403             if (ST.paren && ST.count)
5404                 PL_regoffs[ST.paren].end = -1;
5405
5406             REGCP_UNWIND(ST.cp);
5407             /* failed -- move forward one */
5408             PL_reginput = locinput;
5409             if (regrepeat(rex, ST.A, 1, depth)) {
5410                 ST.count++;
5411                 locinput = PL_reginput;
5412                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5413                         ST.count > 0)) /* count overflow ? */
5414                 {
5415                   curly_try_B_min:
5416                     CURLY_SETPAREN(ST.paren, ST.count);
5417                     if (cur_eval && cur_eval->u.eval.close_paren &&
5418                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5419                         goto fake_end;
5420                     }
5421                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5422                 }
5423             }
5424             sayNO;
5425             /* NOTREACHED */
5426
5427
5428         curly_try_B_max:
5429             /* a successful greedy match: now try to match B */
5430             if (cur_eval && cur_eval->u.eval.close_paren &&
5431                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5432                 goto fake_end;
5433             }
5434             {
5435                 UV c = 0;
5436                 if (ST.c1 != CHRTEST_VOID)
5437                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5438                                            UTF8_MAXBYTES, 0, uniflags)
5439                                 : (UV) UCHARAT(PL_reginput);
5440                 /* If it could work, try it. */
5441                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5442                     CURLY_SETPAREN(ST.paren, ST.count);
5443                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5444                     /* NOTREACHED */
5445                 }
5446             }
5447             /* FALL THROUGH */
5448         case CURLY_B_max_fail:
5449             /* failed to find B in a greedy match */
5450             if (ST.paren && ST.count)
5451                 PL_regoffs[ST.paren].end = -1;
5452
5453             REGCP_UNWIND(ST.cp);
5454             /*  back up. */
5455             if (--ST.count < ST.min)
5456                 sayNO;
5457             PL_reginput = locinput = HOPc(locinput, -1);
5458             goto curly_try_B_max;
5459
5460 #undef ST
5461
5462         case END:
5463             fake_end:
5464             if (cur_eval) {
5465                 /* we've just finished A in /(??{A})B/; now continue with B */
5466                 I32 tmpix;
5467                 st->u.eval.toggle_reg_flags
5468                             = cur_eval->u.eval.toggle_reg_flags;
5469                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5470
5471                 st->u.eval.prev_rex = rex_sv;           /* inner */
5472                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5473                 rex = (struct regexp *)SvANY(rex_sv);
5474                 rexi = RXi_GET(rex);
5475                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5476                 (void)ReREFCNT_inc(rex_sv);
5477                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5478
5479                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5480                 PL_reglastparen = &rex->lastparen;
5481                 PL_reglastcloseparen = &rex->lastcloseparen;
5482
5483                 REGCP_SET(st->u.eval.lastcp);
5484                 PL_reginput = locinput;
5485
5486                 /* Restore parens of the outer rex without popping the
5487                  * savestack */
5488                 tmpix = PL_savestack_ix;
5489                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5490                 regcppop(rex);
5491                 PL_savestack_ix = tmpix;
5492
5493                 st->u.eval.prev_eval = cur_eval;
5494                 cur_eval = cur_eval->u.eval.prev_eval;
5495                 DEBUG_EXECUTE_r(
5496                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5497                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5498                 if ( nochange_depth )
5499                     nochange_depth--;
5500
5501                 PUSH_YES_STATE_GOTO(EVAL_AB,
5502                         st->u.eval.prev_eval->u.eval.B); /* match B */
5503             }
5504
5505             if (locinput < reginfo->till) {
5506                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5507                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5508                                       PL_colors[4],
5509                                       (long)(locinput - PL_reg_starttry),
5510                                       (long)(reginfo->till - PL_reg_starttry),
5511                                       PL_colors[5]));
5512
5513                 sayNO_SILENT;           /* Cannot match: too short. */
5514             }
5515             PL_reginput = locinput;     /* put where regtry can find it */
5516             sayYES;                     /* Success! */
5517
5518         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5519             DEBUG_EXECUTE_r(
5520             PerlIO_printf(Perl_debug_log,
5521                 "%*s  %ssubpattern success...%s\n",
5522                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5523             PL_reginput = locinput;     /* put where regtry can find it */
5524             sayYES;                     /* Success! */
5525
5526 #undef  ST
5527 #define ST st->u.ifmatch
5528
5529         case SUSPEND:   /* (?>A) */
5530             ST.wanted = 1;
5531             PL_reginput = locinput;
5532             goto do_ifmatch;
5533
5534         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5535             ST.wanted = 0;
5536             goto ifmatch_trivial_fail_test;
5537
5538         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5539             ST.wanted = 1;
5540           ifmatch_trivial_fail_test:
5541             if (scan->flags) {
5542                 char * const s = HOPBACKc(locinput, scan->flags);
5543                 if (!s) {
5544                     /* trivial fail */
5545                     if (logical) {
5546                         logical = 0;
5547                         sw = 1 - cBOOL(ST.wanted);
5548                     }
5549                     else if (ST.wanted)
5550                         sayNO;
5551                     next = scan + ARG(scan);
5552                     if (next == scan)
5553                         next = NULL;
5554                     break;
5555                 }
5556                 PL_reginput = s;
5557             }
5558             else
5559                 PL_reginput = locinput;
5560
5561           do_ifmatch:
5562             ST.me = scan;
5563             ST.logical = logical;
5564             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5565
5566             /* execute body of (?...A) */
5567             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5568             /* NOTREACHED */
5569
5570         case IFMATCH_A_fail: /* body of (?...A) failed */
5571             ST.wanted = !ST.wanted;
5572             /* FALL THROUGH */
5573
5574         case IFMATCH_A: /* body of (?...A) succeeded */
5575             if (ST.logical) {
5576                 sw = cBOOL(ST.wanted);
5577             }
5578             else if (!ST.wanted)
5579                 sayNO;
5580
5581             if (OP(ST.me) == SUSPEND)
5582                 locinput = PL_reginput;
5583             else {
5584                 locinput = PL_reginput = st->locinput;
5585                 nextchr = UCHARAT(locinput);
5586             }
5587             scan = ST.me + ARG(ST.me);
5588             if (scan == ST.me)
5589                 scan = NULL;
5590             continue; /* execute B */
5591
5592 #undef ST
5593
5594         case LONGJMP:
5595             next = scan + ARG(scan);
5596             if (next == scan)
5597                 next = NULL;
5598             break;
5599         case COMMIT:
5600             reginfo->cutpoint = PL_regeol;
5601             /* FALLTHROUGH */
5602         case PRUNE:
5603             PL_reginput = locinput;
5604             if (!scan->flags)
5605                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5606             PUSH_STATE_GOTO(COMMIT_next,next);
5607             /* NOTREACHED */
5608         case COMMIT_next_fail:
5609             no_final = 1;
5610             /* FALLTHROUGH */
5611         case OPFAIL:
5612             sayNO;
5613             /* NOTREACHED */
5614
5615 #define ST st->u.mark
5616         case MARKPOINT:
5617             ST.prev_mark = mark_state;
5618             ST.mark_name = sv_commit = sv_yes_mark
5619                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5620             mark_state = st;
5621             ST.mark_loc = PL_reginput = locinput;
5622             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5623             /* NOTREACHED */
5624         case MARKPOINT_next:
5625             mark_state = ST.prev_mark;
5626             sayYES;
5627             /* NOTREACHED */
5628         case MARKPOINT_next_fail:
5629             if (popmark && sv_eq(ST.mark_name,popmark))
5630             {
5631                 if (ST.mark_loc > startpoint)
5632                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5633                 popmark = NULL; /* we found our mark */
5634                 sv_commit = ST.mark_name;
5635
5636                 DEBUG_EXECUTE_r({
5637                         PerlIO_printf(Perl_debug_log,
5638                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5639                             REPORT_CODE_OFF+depth*2, "",
5640                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5641                 });
5642             }
5643             mark_state = ST.prev_mark;
5644             sv_yes_mark = mark_state ?
5645                 mark_state->u.mark.mark_name : NULL;
5646             sayNO;
5647             /* NOTREACHED */
5648         case SKIP:
5649             PL_reginput = locinput;
5650             if (scan->flags) {
5651                 /* (*SKIP) : if we fail we cut here*/
5652                 ST.mark_name = NULL;
5653                 ST.mark_loc = locinput;
5654                 PUSH_STATE_GOTO(SKIP_next,next);
5655             } else {
5656                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5657                    otherwise do nothing.  Meaning we need to scan
5658                  */
5659                 regmatch_state *cur = mark_state;
5660                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5661
5662                 while (cur) {
5663                     if ( sv_eq( cur->u.mark.mark_name,
5664                                 find ) )
5665                     {
5666                         ST.mark_name = find;
5667                         PUSH_STATE_GOTO( SKIP_next, next );
5668                     }
5669                     cur = cur->u.mark.prev_mark;
5670                 }
5671             }
5672             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5673             break;
5674         case SKIP_next_fail:
5675             if (ST.mark_name) {
5676                 /* (*CUT:NAME) - Set up to search for the name as we
5677                    collapse the stack*/
5678                 popmark = ST.mark_name;
5679             } else {
5680                 /* (*CUT) - No name, we cut here.*/
5681                 if (ST.mark_loc > startpoint)
5682                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5683                 /* but we set sv_commit to latest mark_name if there
5684                    is one so they can test to see how things lead to this
5685                    cut */
5686                 if (mark_state)
5687                     sv_commit=mark_state->u.mark.mark_name;
5688             }
5689             no_final = 1;
5690             sayNO;
5691             /* NOTREACHED */
5692 #undef ST
5693         case FOLDCHAR:
5694             n = ARG(scan);
5695             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5696                 locinput += ln;
5697             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5698                 sayNO;
5699             } else  {
5700                 U8 folded[UTF8_MAXBYTES_CASE+1];
5701                 STRLEN foldlen;
5702                 const char * const l = locinput;
5703                 char *e = PL_regeol;
5704                 to_uni_fold(n, folded, &foldlen);
5705
5706                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5707                                l, &e, 0,  utf8_target)) {
5708                         sayNO;
5709                 }
5710                 locinput = e;
5711             }
5712             nextchr = UCHARAT(locinput);
5713             break;
5714         case LNBREAK:
5715             if ((n=is_LNBREAK(locinput,utf8_target))) {
5716                 locinput += n;
5717                 nextchr = UCHARAT(locinput);
5718             } else
5719                 sayNO;
5720             break;
5721
5722 #define CASE_CLASS(nAmE)                              \
5723         case nAmE:                                    \
5724             if (locinput >= PL_regeol)                \
5725                 sayNO;                                \
5726             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5727                 locinput += n;                        \
5728                 nextchr = UCHARAT(locinput);          \
5729             } else                                    \
5730                 sayNO;                                \
5731             break;                                    \
5732         case N##nAmE:                                 \
5733             if (locinput >= PL_regeol)                \
5734                 sayNO;                                \
5735             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5736                 sayNO;                                \
5737             } else {                                  \
5738                 locinput += UTF8SKIP(locinput);       \
5739                 nextchr = UCHARAT(locinput);          \
5740             }                                         \
5741             break
5742
5743         CASE_CLASS(VERTWS);
5744         CASE_CLASS(HORIZWS);
5745 #undef CASE_CLASS
5746
5747         default:
5748             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5749                           PTR2UV(scan), OP(scan));
5750             Perl_croak(aTHX_ "regexp memory corruption");
5751
5752         } /* end switch */
5753
5754         /* switch break jumps here */
5755         scan = next; /* prepare to execute the next op and ... */
5756         continue;    /* ... jump back to the top, reusing st */
5757         /* NOTREACHED */
5758
5759       push_yes_state:
5760         /* push a state that backtracks on success */
5761         st->u.yes.prev_yes_state = yes_state;
5762         yes_state = st;
5763         /* FALL THROUGH */
5764       push_state:
5765         /* push a new regex state, then continue at scan  */
5766         {
5767             regmatch_state *newst;
5768
5769             DEBUG_STACK_r({
5770                 regmatch_state *cur = st;
5771                 regmatch_state *curyes = yes_state;
5772                 int curd = depth;
5773                 regmatch_slab *slab = PL_regmatch_slab;
5774                 for (;curd > -1;cur--,curd--) {
5775                     if (cur < SLAB_FIRST(slab)) {
5776                         slab = slab->prev;
5777                         cur = SLAB_LAST(slab);
5778                     }
5779                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5780                         REPORT_CODE_OFF + 2 + depth * 2,"",
5781                         curd, PL_reg_name[cur->resume_state],
5782                         (curyes == cur) ? "yes" : ""
5783                     );
5784                     if (curyes == cur)
5785                         curyes = cur->u.yes.prev_yes_state;
5786                 }
5787             } else
5788                 DEBUG_STATE_pp("push")
5789             );
5790             depth++;
5791             st->locinput = locinput;
5792             newst = st+1;
5793             if (newst >  SLAB_LAST(PL_regmatch_slab))
5794                 newst = S_push_slab(aTHX);
5795             PL_regmatch_state = newst;
5796
5797             locinput = PL_reginput;
5798             nextchr = UCHARAT(locinput);
5799             st = newst;
5800             continue;
5801             /* NOTREACHED */
5802         }
5803     }
5804
5805     /*
5806     * We get here only if there's trouble -- normally "case END" is
5807     * the terminating point.
5808     */
5809     Perl_croak(aTHX_ "corrupted regexp pointers");
5810     /*NOTREACHED*/
5811     sayNO;
5812
5813 yes:
5814     if (yes_state) {
5815         /* we have successfully completed a subexpression, but we must now
5816          * pop to the state marked by yes_state and continue from there */
5817         assert(st != yes_state);
5818 #ifdef DEBUGGING
5819         while (st != yes_state) {
5820             st--;
5821             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5822                 PL_regmatch_slab = PL_regmatch_slab->prev;
5823                 st = SLAB_LAST(PL_regmatch_slab);
5824             }
5825             DEBUG_STATE_r({
5826                 if (no_final) {
5827                     DEBUG_STATE_pp("pop (no final)");
5828                 } else {
5829                     DEBUG_STATE_pp("pop (yes)");
5830                 }
5831             });
5832             depth--;
5833         }
5834 #else
5835         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5836             || yes_state > SLAB_LAST(PL_regmatch_slab))
5837         {
5838             /* not in this slab, pop slab */
5839             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5840             PL_regmatch_slab = PL_regmatch_slab->prev;
5841             st = SLAB_LAST(PL_regmatch_slab);
5842         }
5843         depth -= (st - yes_state);
5844 #endif
5845         st = yes_state;
5846         yes_state = st->u.yes.prev_yes_state;
5847         PL_regmatch_state = st;
5848
5849         if (no_final) {
5850             locinput= st->locinput;
5851             nextchr = UCHARAT(locinput);
5852         }
5853         state_num = st->resume_state + no_final;
5854         goto reenter_switch;
5855     }
5856
5857     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5858                           PL_colors[4], PL_colors[5]));
5859
5860     if (PL_reg_eval_set) {
5861         /* each successfully executed (?{...}) block does the equivalent of
5862          *   local $^R = do {...}
5863          * When popping the save stack, all these locals would be undone;
5864          * bypass this by setting the outermost saved $^R to the latest
5865          * value */
5866         if (oreplsv != GvSV(PL_replgv))
5867             sv_setsv(oreplsv, GvSV(PL_replgv));
5868     }
5869     result = 1;
5870     goto final_exit;
5871
5872 no:
5873     DEBUG_EXECUTE_r(
5874         PerlIO_printf(Perl_debug_log,
5875             "%*s  %sfailed...%s\n",
5876             REPORT_CODE_OFF+depth*2, "",
5877             PL_colors[4], PL_colors[5])
5878         );
5879
5880 no_silent:
5881     if (no_final) {
5882         if (yes_state) {
5883             goto yes;
5884         } else {
5885             goto final_exit;
5886         }
5887     }
5888     if (depth) {
5889         /* there's a previous state to backtrack to */
5890         st--;
5891         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5892             PL_regmatch_slab = PL_regmatch_slab->prev;
5893             st = SLAB_LAST(PL_regmatch_slab);
5894         }
5895         PL_regmatch_state = st;
5896         locinput= st->locinput;
5897         nextchr = UCHARAT(locinput);
5898
5899         DEBUG_STATE_pp("pop");
5900         depth--;
5901         if (yes_state == st)
5902             yes_state = st->u.yes.prev_yes_state;
5903
5904         state_num = st->resume_state + 1; /* failure = success + 1 */
5905         goto reenter_switch;
5906     }
5907     result = 0;
5908
5909   final_exit:
5910     if (rex->intflags & PREGf_VERBARG_SEEN) {
5911         SV *sv_err = get_sv("REGERROR", 1);
5912         SV *sv_mrk = get_sv("REGMARK", 1);
5913         if (result) {
5914             sv_commit = &PL_sv_no;
5915             if (!sv_yes_mark)
5916                 sv_yes_mark = &PL_sv_yes;
5917         } else {
5918             if (!sv_commit)
5919                 sv_commit = &PL_sv_yes;
5920             sv_yes_mark = &PL_sv_no;
5921         }
5922         sv_setsv(sv_err, sv_commit);
5923         sv_setsv(sv_mrk, sv_yes_mark);
5924     }
5925
5926     /* clean up; in particular, free all slabs above current one */
5927     LEAVE_SCOPE(oldsave);
5928
5929     return result;
5930 }
5931
5932 /*
5933  - regrepeat - repeatedly match something simple, report how many
5934  */
5935 /*
5936  * [This routine now assumes that it will only match on things of length 1.
5937  * That was true before, but now we assume scan - reginput is the count,
5938  * rather than incrementing count on every character.  [Er, except utf8.]]
5939  */
5940 STATIC I32
5941 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5942 {
5943     dVAR;
5944     register char *scan;
5945     register I32 c;
5946     register char *loceol = PL_regeol;
5947     register I32 hardcount = 0;
5948     register bool utf8_target = PL_reg_match_utf8;
5949     UV utf8_flags;
5950 #ifndef DEBUGGING
5951     PERL_UNUSED_ARG(depth);
5952 #endif
5953
5954     PERL_ARGS_ASSERT_REGREPEAT;
5955
5956     scan = PL_reginput;
5957     if (max == REG_INFTY)
5958         max = I32_MAX;
5959     else if (max < loceol - scan)
5960         loceol = scan + max;
5961     switch (OP(p)) {
5962     case REG_ANY:
5963         if (utf8_target) {
5964             loceol = PL_regeol;
5965             while (scan < loceol && hardcount < max && *scan != '\n') {
5966                 scan += UTF8SKIP(scan);
5967                 hardcount++;
5968             }
5969         } else {
5970             while (scan < loceol && *scan != '\n')
5971                 scan++;
5972         }
5973         break;
5974     case SANY:
5975         if (utf8_target) {
5976             loceol = PL_regeol;
5977             while (scan < loceol && hardcount < max) {
5978                 scan += UTF8SKIP(scan);
5979                 hardcount++;
5980             }
5981         }
5982         else
5983             scan = loceol;
5984         break;
5985     case CANY:
5986         scan = loceol;
5987         break;
5988     case EXACT:
5989         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5990          * means they match only characters in the string that can be expressed
5991          * as a single byte.  For non-utf8 strings, that means a simple match.
5992          * For utf8 strings, the character matched must be an invariant, or
5993          * downgradable to a single byte.  The pattern's utf8ness is
5994          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5995          * it is, it's an invariant */
5996
5997         c = (U8)*STRING(p);
5998         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5999
6000         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6001             while (scan < loceol && UCHARAT(scan) == c) {
6002                 scan++;
6003             }
6004         }
6005         else {
6006
6007             /* Here, the string is utf8, and the pattern char is different
6008              * in utf8 than not, so can't compare them directly.  Outside the
6009              * loop, find find the two utf8 bytes that represent c, and then
6010              * look for those in sequence in the utf8 string */
6011             U8 high = UTF8_TWO_BYTE_HI(c);
6012             U8 low = UTF8_TWO_BYTE_LO(c);
6013             loceol = PL_regeol;
6014
6015             while (hardcount < max
6016                     && scan + 1 < loceol
6017                     && UCHARAT(scan) == high
6018                     && UCHARAT(scan + 1) == low)
6019             {
6020                 scan += 2;
6021                 hardcount++;
6022             }
6023         }
6024         break;
6025     case EXACTFA:
6026         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6027         goto do_exactf;
6028
6029     case EXACTFL:
6030         PL_reg_flags |= RF_tainted;
6031         utf8_flags = FOLDEQ_UTF8_LOCALE;
6032         goto do_exactf;
6033
6034     case EXACTF:
6035     case EXACTFU:
6036         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6037
6038         /* The comments for the EXACT case above apply as well to these fold
6039          * ones */
6040
6041     do_exactf:
6042         c = (U8)*STRING(p);
6043         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6044
6045         if (utf8_target) { /* Use full Unicode fold matching */
6046             char *tmpeol = loceol;
6047             while (hardcount < max
6048                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6049                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6050             {
6051                 scan = tmpeol;
6052                 tmpeol = loceol;
6053                 hardcount++;
6054             }
6055
6056             /* XXX Note that the above handles properly the German sharp s in
6057              * the pattern matching ss in the string.  But it doesn't handle
6058              * properly cases where the string contains say 'LIGATURE ff' and
6059              * the pattern is 'f+'.  This would require, say, a new function or
6060              * revised interface to foldEQ_utf8(), in which the maximum number
6061              * of characters to match could be passed and it would return how
6062              * many actually did.  This is just one of many cases where
6063              * multi-char folds don't work properly, and so the fix is being
6064              * deferred */
6065         }
6066         else {
6067             U8 folded;
6068
6069             /* Here, the string isn't utf8 and c is a single byte; and either
6070              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6071              * doesn't affect c.  Can just do simple comparisons for exact or
6072              * fold matching. */
6073             switch (OP(p)) {
6074                 case EXACTF: folded = PL_fold[c]; break;
6075                 case EXACTFA:
6076                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6077                 case EXACTFL: folded = PL_fold_locale[c]; break;
6078                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6079             }
6080             while (scan < loceol &&
6081                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6082             {
6083                 scan++;
6084             }
6085         }
6086         break;
6087     case ANYOFV:
6088     case ANYOF:
6089         if (utf8_target || OP(p) == ANYOFV) {
6090             STRLEN inclasslen;
6091             loceol = PL_regeol;
6092             inclasslen = loceol - scan;
6093             while (hardcount < max
6094                    && ((inclasslen = loceol - scan) > 0)
6095                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6096             {
6097                 scan += inclasslen;
6098                 hardcount++;
6099             }
6100         } else {
6101             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6102                 scan++;
6103         }
6104         break;
6105     case ALNUMU:
6106         if (utf8_target) {
6107     utf8_wordchar:
6108             loceol = PL_regeol;
6109             LOAD_UTF8_CHARCLASS_ALNUM();
6110             while (hardcount < max && scan < loceol &&
6111                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6112             {
6113                 scan += UTF8SKIP(scan);
6114                 hardcount++;
6115             }
6116         } else {
6117             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6118                 scan++;
6119             }
6120         }
6121         break;
6122     case ALNUM:
6123         if (utf8_target)
6124             goto utf8_wordchar;
6125         while (scan < loceol && isALNUM((U8) *scan)) {
6126             scan++;
6127         }
6128         break;
6129     case ALNUMA:
6130         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6131             scan++;
6132         }
6133         break;
6134     case ALNUML:
6135         PL_reg_flags |= RF_tainted;
6136         if (utf8_target) {
6137             loceol = PL_regeol;
6138             while (hardcount < max && scan < loceol &&
6139                    isALNUM_LC_utf8((U8*)scan)) {
6140                 scan += UTF8SKIP(scan);
6141                 hardcount++;
6142             }
6143         } else {
6144             while (scan < loceol && isALNUM_LC(*scan))
6145                 scan++;
6146         }
6147         break;
6148     case NALNUMU:
6149         if (utf8_target) {
6150
6151     utf8_Nwordchar:
6152
6153             loceol = PL_regeol;
6154             LOAD_UTF8_CHARCLASS_ALNUM();
6155             while (hardcount < max && scan < loceol &&
6156                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6157             {
6158                 scan += UTF8SKIP(scan);
6159                 hardcount++;
6160             }
6161         } else {
6162             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6163                 scan++;
6164             }
6165         }
6166         break;
6167     case NALNUM:
6168         if (utf8_target)
6169             goto utf8_Nwordchar;
6170         while (scan < loceol && ! isALNUM((U8) *scan)) {
6171             scan++;
6172         }
6173         break;
6174     case NALNUMA:
6175         if (utf8_target) {
6176             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6177                 scan += UTF8SKIP(scan);
6178             }
6179         }
6180         else {
6181             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6182                 scan++;
6183             }
6184         }
6185         break;
6186     case NALNUML:
6187         PL_reg_flags |= RF_tainted;
6188         if (utf8_target) {
6189             loceol = PL_regeol;
6190             while (hardcount < max && scan < loceol &&
6191                    !isALNUM_LC_utf8((U8*)scan)) {
6192                 scan += UTF8SKIP(scan);
6193                 hardcount++;
6194             }
6195         } else {
6196             while (scan < loceol && !isALNUM_LC(*scan))
6197                 scan++;
6198         }
6199         break;
6200     case SPACEU:
6201         if (utf8_target) {
6202
6203     utf8_space:
6204
6205             loceol = PL_regeol;
6206             LOAD_UTF8_CHARCLASS_SPACE();
6207             while (hardcount < max && scan < loceol &&
6208                    (*scan == ' ' ||
6209                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6210             {
6211                 scan += UTF8SKIP(scan);
6212                 hardcount++;
6213             }
6214             break;
6215         }
6216         else {
6217             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6218                 scan++;
6219             }
6220             break;
6221         }
6222     case SPACE:
6223         if (utf8_target)
6224             goto utf8_space;
6225
6226         while (scan < loceol && isSPACE((U8) *scan)) {
6227             scan++;
6228         }
6229         break;
6230     case SPACEA:
6231         while (scan < loceol && isSPACE_A((U8) *scan)) {
6232             scan++;
6233         }
6234         break;
6235     case SPACEL:
6236         PL_reg_flags |= RF_tainted;
6237         if (utf8_target) {
6238             loceol = PL_regeol;
6239             while (hardcount < max && scan < loceol &&
6240                    isSPACE_LC_utf8((U8*)scan)) {
6241                 scan += UTF8SKIP(scan);
6242                 hardcount++;
6243             }
6244         } else {
6245             while (scan < loceol && isSPACE_LC(*scan))
6246                 scan++;
6247         }
6248         break;
6249     case NSPACEU:
6250         if (utf8_target) {
6251
6252     utf8_Nspace:
6253
6254             loceol = PL_regeol;
6255             LOAD_UTF8_CHARCLASS_SPACE();
6256             while (hardcount < max && scan < loceol &&
6257                    ! (*scan == ' ' ||
6258                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6259             {
6260                 scan += UTF8SKIP(scan);
6261                 hardcount++;
6262             }
6263             break;
6264         }
6265         else {
6266             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6267                 scan++;
6268             }
6269         }
6270         break;
6271     case NSPACE:
6272         if (utf8_target)
6273             goto utf8_Nspace;
6274
6275         while (scan < loceol && ! isSPACE((U8) *scan)) {
6276             scan++;
6277         }
6278         break;
6279     case NSPACEA:
6280         if (utf8_target) {
6281             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6282                 scan += UTF8SKIP(scan);
6283             }
6284         }
6285         else {
6286             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6287                 scan++;
6288             }
6289         }
6290         break;
6291     case NSPACEL:
6292         PL_reg_flags |= RF_tainted;
6293         if (utf8_target) {
6294             loceol = PL_regeol;
6295             while (hardcount < max && scan < loceol &&
6296                    !isSPACE_LC_utf8((U8*)scan)) {
6297                 scan += UTF8SKIP(scan);
6298                 hardcount++;
6299             }
6300         } else {
6301             while (scan < loceol && !isSPACE_LC(*scan))
6302                 scan++;
6303         }
6304         break;
6305     case DIGIT:
6306         if (utf8_target) {
6307             loceol = PL_regeol;
6308             LOAD_UTF8_CHARCLASS_DIGIT();
6309             while (hardcount < max && scan < loceol &&
6310                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6311                 scan += UTF8SKIP(scan);
6312                 hardcount++;
6313             }
6314         } else {
6315             while (scan < loceol && isDIGIT(*scan))
6316                 scan++;
6317         }
6318         break;
6319     case DIGITA:
6320         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6321             scan++;
6322         }
6323         break;
6324     case DIGITL:
6325         PL_reg_flags |= RF_tainted;
6326         if (utf8_target) {
6327             loceol = PL_regeol;
6328             while (hardcount < max && scan < loceol &&
6329                    isDIGIT_LC_utf8((U8*)scan)) {
6330                 scan += UTF8SKIP(scan);
6331                 hardcount++;
6332             }
6333         } else {
6334             while (scan < loceol && isDIGIT_LC(*scan))
6335                 scan++;
6336         }
6337         break;
6338     case NDIGIT:
6339         if (utf8_target) {
6340             loceol = PL_regeol;
6341             LOAD_UTF8_CHARCLASS_DIGIT();
6342             while (hardcount < max && scan < loceol &&
6343                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6344                 scan += UTF8SKIP(scan);
6345                 hardcount++;
6346             }
6347         } else {
6348             while (scan < loceol && !isDIGIT(*scan))
6349                 scan++;
6350         }
6351         break;
6352     case NDIGITA:
6353         if (utf8_target) {
6354             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6355                 scan += UTF8SKIP(scan);
6356             }
6357         }
6358         else {
6359             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6360                 scan++;
6361             }
6362         }
6363         break;
6364     case NDIGITL:
6365         PL_reg_flags |= RF_tainted;
6366         if (utf8_target) {
6367             loceol = PL_regeol;
6368             while (hardcount < max && scan < loceol &&
6369                    !isDIGIT_LC_utf8((U8*)scan)) {
6370                 scan += UTF8SKIP(scan);
6371                 hardcount++;
6372             }
6373         } else {
6374             while (scan < loceol && !isDIGIT_LC(*scan))
6375                 scan++;
6376         }
6377         break;
6378     case LNBREAK:
6379         if (utf8_target) {
6380             loceol = PL_regeol;
6381             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6382                 scan += c;
6383                 hardcount++;
6384             }
6385         } else {
6386             /*
6387               LNBREAK can match two latin chars, which is ok,
6388               because we have a null terminated string, but we
6389               have to use hardcount in this situation
6390             */
6391             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6392                 scan+=c;
6393                 hardcount++;
6394             }
6395         }
6396         break;
6397     case HORIZWS:
6398         if (utf8_target) {
6399             loceol = PL_regeol;
6400             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6401                 scan += c;
6402                 hardcount++;
6403             }
6404         } else {
6405             while (scan < loceol && is_HORIZWS_latin1(scan))
6406                 scan++;
6407         }
6408         break;
6409     case NHORIZWS:
6410         if (utf8_target) {
6411             loceol = PL_regeol;
6412             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6413                 scan += UTF8SKIP(scan);
6414                 hardcount++;
6415             }
6416         } else {
6417             while (scan < loceol && !is_HORIZWS_latin1(scan))
6418                 scan++;
6419
6420         }
6421         break;
6422     case VERTWS:
6423         if (utf8_target) {
6424             loceol = PL_regeol;
6425             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6426                 scan += c;
6427                 hardcount++;
6428             }
6429         } else {
6430             while (scan < loceol && is_VERTWS_latin1(scan))
6431                 scan++;
6432
6433         }
6434         break;
6435     case NVERTWS:
6436         if (utf8_target) {
6437             loceol = PL_regeol;
6438             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6439                 scan += UTF8SKIP(scan);
6440                 hardcount++;
6441             }
6442         } else {
6443             while (scan < loceol && !is_VERTWS_latin1(scan))
6444                 scan++;
6445
6446         }
6447         break;
6448
6449     default:            /* Called on something of 0 width. */
6450         break;          /* So match right here or not at all. */
6451     }
6452
6453     if (hardcount)
6454         c = hardcount;
6455     else
6456         c = scan - PL_reginput;
6457     PL_reginput = scan;
6458
6459     DEBUG_r({
6460         GET_RE_DEBUG_FLAGS_DECL;
6461         DEBUG_EXECUTE_r({
6462             SV * const prop = sv_newmortal();
6463             regprop(prog, prop, p);
6464             PerlIO_printf(Perl_debug_log,
6465                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6466                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6467         });
6468     });
6469
6470     return(c);
6471 }
6472
6473
6474 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6475 /*
6476 - regclass_swash - prepare the utf8 swash
6477 */
6478
6479 SV *
6480 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6481 {
6482     dVAR;
6483     SV *sw  = NULL;
6484     SV *si  = NULL;
6485     SV *alt = NULL;
6486     RXi_GET_DECL(prog,progi);
6487     const struct reg_data * const data = prog ? progi->data : NULL;
6488
6489     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6490
6491     assert(ANYOF_NONBITMAP(node));
6492
6493     if (data && data->count) {
6494         const U32 n = ARG(node);
6495
6496         if (data->what[n] == 's') {
6497             SV * const rv = MUTABLE_SV(data->data[n]);
6498             AV * const av = MUTABLE_AV(SvRV(rv));
6499             SV **const ary = AvARRAY(av);
6500             SV **a, **b;
6501
6502             /* See the end of regcomp.c:S_regclass() for
6503              * documentation of these array elements. */
6504
6505             si = *ary;
6506             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6507             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6508
6509             if (a)
6510                 sw = *a;
6511             else if (si && doinit) {
6512                 sw = swash_init("utf8", "", si, 1, 0);
6513                 (void)av_store(av, 1, sw);
6514             }
6515             if (b)
6516                 alt = *b;
6517         }
6518     }
6519
6520     if (listsvp)
6521         *listsvp = si;
6522     if (altsvp)
6523         *altsvp  = alt;
6524
6525     return sw;
6526 }
6527 #endif
6528
6529 /*
6530  - reginclass - determine if a character falls into a character class
6531
6532   n is the ANYOF regnode
6533   p is the target string
6534   lenp is pointer to the maximum number of bytes of how far to go in p
6535     (This is assumed wthout checking to always be at least the current
6536     character's size)
6537   utf8_target tells whether p is in UTF-8.
6538
6539   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6540   from a successful match, the value it points to will be updated to how many
6541   bytes in p were matched.  If there was no match, the value is undefined,
6542   possibly changed from the input.
6543
6544   Note that this can be a synthetic start class, a combination of various
6545   nodes, so things you think might be mutually exclusive, such as locale,
6546   aren't.  It can match both locale and non-locale
6547
6548  */
6549
6550 STATIC bool
6551 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6552 {
6553     dVAR;
6554     const char flags = ANYOF_FLAGS(n);
6555     bool match = FALSE;
6556     UV c = *p;
6557     STRLEN c_len = 0;
6558     STRLEN maxlen;
6559
6560     PERL_ARGS_ASSERT_REGINCLASS;
6561
6562     /* If c is not already the code point, get it */
6563     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6564         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6565                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6566                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6567                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6568                  * UTF8_ALLOW_FFFF */
6569         if (c_len == (STRLEN)-1)
6570             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6571     }
6572     else {
6573         c_len = 1;
6574     }
6575
6576     /* Use passed in max length, or one character if none passed in or less
6577      * than one character.  And assume will match just one character.  This is
6578      * overwritten later if matched more. */
6579     if (lenp) {
6580         maxlen = (*lenp > c_len) ? *lenp : c_len;
6581         *lenp = c_len;
6582
6583     }
6584     else {
6585         maxlen = c_len;
6586     }
6587
6588     /* If this character is potentially in the bitmap, check it */
6589     if (c < 256) {
6590         if (ANYOF_BITMAP_TEST(n, c))
6591             match = TRUE;
6592         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6593                 && ! utf8_target
6594                 && ! isASCII(c))
6595         {
6596             match = TRUE;
6597         }
6598
6599         else if (flags & ANYOF_LOCALE) {
6600             PL_reg_flags |= RF_tainted;
6601
6602             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6603                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6604             {
6605                 match = TRUE;
6606             }
6607             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6608                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6609                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6610                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6611                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6612                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6613                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6614                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6615                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6616                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6617                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6618                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6619                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6620                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6621                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6622                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6623                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6624                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6625                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6626                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6627                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6628                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6629                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6630                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6631                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6632                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6633                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6634                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6635                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6636                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6637                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6638                      ) /* How's that for a conditional? */
6639             ) {
6640                 match = TRUE;
6641             }
6642         }
6643     }
6644
6645     /* If the bitmap didn't (or couldn't) match, and something outside the
6646      * bitmap could match, try that.  Locale nodes specifiy completely the
6647      * behavior of code points in the bit map (otherwise, a utf8 target would
6648      * cause them to be treated as Unicode and not locale), except in
6649      * the very unlikely event when this node is a synthetic start class, which
6650      * could be a combination of locale and non-locale nodes.  So allow locale
6651      * to match for the synthetic start class, which will give a false
6652      * positive that will be resolved when the match is done again as not part
6653      * of the synthetic start class */
6654     if (!match) {
6655         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6656             match = TRUE;       /* Everything above 255 matches */
6657         }
6658         else if (ANYOF_NONBITMAP(n)
6659                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6660                      || (utf8_target
6661                          && (c >=256
6662                              || (! (flags & ANYOF_LOCALE))
6663                              || (flags & ANYOF_IS_SYNTHETIC)))))
6664         {
6665             AV *av;
6666             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6667
6668             if (sw) {
6669                 U8 * utf8_p;
6670                 if (utf8_target) {
6671                     utf8_p = (U8 *) p;
6672                 } else {
6673
6674                     /* Not utf8.  Convert as much of the string as available up
6675                      * to the limit of how far the (single) character in the
6676                      * pattern can possibly match (no need to go further).  If
6677                      * the node is a straight ANYOF or not folding, it can't
6678                      * match more than one.  Otherwise, It can match up to how
6679                      * far a single char can fold to.  Since not utf8, each
6680                      * character is a single byte, so the max it can be in
6681                      * bytes is the same as the max it can be in characters */
6682                     STRLEN len = (OP(n) == ANYOF
6683                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6684                                   ? 1
6685                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6686                                     ? maxlen
6687                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6688                     utf8_p = bytes_to_utf8(p, &len);
6689                 }
6690
6691                 if (swash_fetch(sw, utf8_p, TRUE))
6692                     match = TRUE;
6693                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6694
6695                     /* Here, we need to test if the fold of the target string
6696                      * matches.  The non-multi char folds have all been moved to
6697                      * the compilation phase, and the multi-char folds have
6698                      * been stored by regcomp into 'av'; we linearly check to
6699                      * see if any match the target string (folded).   We know
6700                      * that the originals were each one character, but we don't
6701                      * currently know how many characters/bytes each folded to,
6702                      * except we do know that there are small limits imposed by
6703                      * Unicode.  XXX A performance enhancement would be to have
6704                      * regcomp.c store the max number of chars/bytes that are
6705                      * in an av entry, as, say the 0th element.  Even better
6706                      * would be to have a hash of the few characters that can
6707                      * start a multi-char fold to the max number of chars of
6708                      * those folds.
6709                      *
6710                      * If there is a match, we will need to advance (if lenp is
6711                      * specified) the match pointer in the target string.  But
6712                      * what we are comparing here isn't that string directly,
6713                      * but its fold, whose length may differ from the original.
6714                      * As we go along in constructing the fold, therefore, we
6715                      * create a map so that we know how many bytes in the
6716                      * source to advance given that we have matched a certain
6717                      * number of bytes in the fold.  This map is stored in
6718                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6719                      * the fold of the first character that we are folding.
6720                      * Then map_fold_len_back[n] is set to the number of bytes
6721                      * in that first character.  Similarly let m be the
6722                      * corresponding number for the second character to be
6723                      * folded.  Then map_fold_len_back[n+m] is set to the
6724                      * number of bytes occupied by the first two source
6725                      * characters. ... */
6726                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6727                     U8 folded[UTF8_MAXBYTES_CASE+1];
6728                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6729                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6730                                                   chars */
6731
6732                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6733
6734                         /* Here, only need to fold the first char of the target
6735                          * string.  It the source wasn't utf8, is 1 byte long */
6736                         to_utf8_fold(utf8_p, folded, &foldlen);
6737                         total_foldlen = foldlen;
6738                         map_fold_len_back[foldlen] = (utf8_target)
6739                                                      ? UTF8SKIP(utf8_p)
6740                                                      : 1;
6741                     }
6742                     else {
6743
6744                         /* Here, need to fold more than the first char.  Do so
6745                          * up to the limits */
6746                         U8* source_ptr = utf8_p;    /* The source for the fold
6747                                                        is the regex target
6748                                                        string */
6749                         U8* folded_ptr = folded;
6750                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6751                                                        available byte in the
6752                                                        target string */
6753                         U8 i;
6754                         for (i = 0;
6755                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6756                              i++)
6757                         {
6758
6759                             /* Fold the next character */
6760                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6761                             STRLEN this_char_foldlen;
6762                             to_utf8_fold(source_ptr,
6763                                          this_char_folded,
6764                                          &this_char_foldlen);
6765
6766                             /* Bail if it would exceed the byte limit for
6767                              * folding a single char. */
6768                             if (this_char_foldlen + folded_ptr - folded >
6769                                                             UTF8_MAXBYTES_CASE)
6770                             {
6771                                 break;
6772                             }
6773
6774                             /* Add the fold of this character */
6775                             Copy(this_char_folded,
6776                                  folded_ptr,
6777                                  this_char_foldlen,
6778                                  U8);
6779                             source_ptr += UTF8SKIP(source_ptr);
6780                             folded_ptr += this_char_foldlen;
6781                             total_foldlen = folded_ptr - folded;
6782
6783                             /* Create map from the number of bytes in the fold
6784                              * back to the number of bytes in the source.  If
6785                              * the source isn't utf8, the byte count is just
6786                              * the number of characters so far */
6787                             map_fold_len_back[total_foldlen]
6788                                                       = (utf8_target)
6789                                                         ? source_ptr - utf8_p
6790                                                         : i + 1;
6791                         }
6792                         *folded_ptr = '\0';
6793                     }
6794
6795
6796                     /* Do the linear search to see if the fold is in the list
6797                      * of multi-char folds. */
6798                     if (av) {
6799                         I32 i;
6800                         for (i = 0; i <= av_len(av); i++) {
6801                             SV* const sv = *av_fetch(av, i, FALSE);
6802                             STRLEN len;
6803                             const char * const s = SvPV_const(sv, len);
6804
6805                             if (len <= total_foldlen
6806                                 && memEQ(s, (char*)folded, len)
6807
6808                                    /* If 0, means matched a partial char. See
6809                                     * [perl #90536] */
6810                                 && map_fold_len_back[len])
6811                             {
6812
6813                                 /* Advance the target string ptr to account for
6814                                  * this fold, but have to translate from the
6815                                  * folded length to the corresponding source
6816                                  * length. */
6817                                 if (lenp) {
6818                                     *lenp = map_fold_len_back[len];
6819                                 }
6820                                 match = TRUE;
6821                                 break;
6822                             }
6823                         }
6824                     }
6825                 }
6826
6827                 /* If we allocated a string above, free it */
6828                 if (! utf8_target) Safefree(utf8_p);
6829             }
6830         }
6831     }
6832
6833     return (flags & ANYOF_INVERT) ? !match : match;
6834 }
6835
6836 STATIC U8 *
6837 S_reghop3(U8 *s, I32 off, const U8* lim)
6838 {
6839     /* return the position 'off' UTF-8 characters away from 's', forward if
6840      * 'off' >= 0, backwards if negative.  But don't go outside of position
6841      * 'lim', which better be < s  if off < 0 */
6842
6843     dVAR;
6844
6845     PERL_ARGS_ASSERT_REGHOP3;
6846
6847     if (off >= 0) {
6848         while (off-- && s < lim) {
6849             /* XXX could check well-formedness here */
6850             s += UTF8SKIP(s);
6851         }
6852     }
6853     else {
6854         while (off++ && s > lim) {
6855             s--;
6856             if (UTF8_IS_CONTINUED(*s)) {
6857                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6858                     s--;
6859             }
6860             /* XXX could check well-formedness here */
6861         }
6862     }
6863     return s;
6864 }
6865
6866 #ifdef XXX_dmq
6867 /* there are a bunch of places where we use two reghop3's that should
6868    be replaced with this routine. but since thats not done yet
6869    we ifdef it out - dmq
6870 */
6871 STATIC U8 *
6872 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6873 {
6874     dVAR;
6875
6876     PERL_ARGS_ASSERT_REGHOP4;
6877
6878     if (off >= 0) {
6879         while (off-- && s < rlim) {
6880             /* XXX could check well-formedness here */
6881             s += UTF8SKIP(s);
6882         }
6883     }
6884     else {
6885         while (off++ && s > llim) {
6886             s--;
6887             if (UTF8_IS_CONTINUED(*s)) {
6888                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6889                     s--;
6890             }
6891             /* XXX could check well-formedness here */
6892         }
6893     }
6894     return s;
6895 }
6896 #endif
6897
6898 STATIC U8 *
6899 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6900 {
6901     dVAR;
6902
6903     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6904
6905     if (off >= 0) {
6906         while (off-- && s < lim) {
6907             /* XXX could check well-formedness here */
6908             s += UTF8SKIP(s);
6909         }
6910         if (off >= 0)
6911             return NULL;
6912     }
6913     else {
6914         while (off++ && s > lim) {
6915             s--;
6916             if (UTF8_IS_CONTINUED(*s)) {
6917                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6918                     s--;
6919             }
6920             /* XXX could check well-formedness here */
6921         }
6922         if (off <= 0)
6923             return NULL;
6924     }
6925     return s;
6926 }
6927
6928 static void
6929 restore_pos(pTHX_ void *arg)
6930 {
6931     dVAR;
6932     regexp * const rex = (regexp *)arg;
6933     if (PL_reg_eval_set) {
6934         if (PL_reg_oldsaved) {
6935             rex->subbeg = PL_reg_oldsaved;
6936             rex->sublen = PL_reg_oldsavedlen;
6937 #ifdef PERL_OLD_COPY_ON_WRITE
6938             rex->saved_copy = PL_nrs;
6939 #endif
6940             RXp_MATCH_COPIED_on(rex);
6941         }
6942         PL_reg_magic->mg_len = PL_reg_oldpos;
6943         PL_reg_eval_set = 0;
6944         PL_curpm = PL_reg_oldcurpm;
6945     }
6946 }
6947
6948 STATIC void
6949 S_to_utf8_substr(pTHX_ register regexp *prog)
6950 {
6951     int i = 1;
6952
6953     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6954
6955     do {
6956         if (prog->substrs->data[i].substr
6957             && !prog->substrs->data[i].utf8_substr) {
6958             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6959             prog->substrs->data[i].utf8_substr = sv;
6960             sv_utf8_upgrade(sv);
6961             if (SvVALID(prog->substrs->data[i].substr)) {
6962                 if (SvTAIL(prog->substrs->data[i].substr)) {
6963                     /* Trim the trailing \n that fbm_compile added last
6964                        time.  */
6965                     SvCUR_set(sv, SvCUR(sv) - 1);
6966                     /* Whilst this makes the SV technically "invalid" (as its
6967                        buffer is no longer followed by "\0") when fbm_compile()
6968                        adds the "\n" back, a "\0" is restored.  */
6969                     fbm_compile(sv, FBMcf_TAIL);
6970                 } else
6971                     fbm_compile(sv, 0);
6972             }
6973             if (prog->substrs->data[i].substr == prog->check_substr)
6974                 prog->check_utf8 = sv;
6975         }
6976     } while (i--);
6977 }
6978
6979 STATIC void
6980 S_to_byte_substr(pTHX_ register regexp *prog)
6981 {
6982     dVAR;
6983     int i = 1;
6984
6985     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6986
6987     do {
6988         if (prog->substrs->data[i].utf8_substr
6989             && !prog->substrs->data[i].substr) {
6990             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6991             if (sv_utf8_downgrade(sv, TRUE)) {
6992                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6993                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
6994                         /* Trim the trailing \n that fbm_compile added last
6995                            time.  */
6996                         SvCUR_set(sv, SvCUR(sv) - 1);
6997                         fbm_compile(sv, FBMcf_TAIL);
6998                     } else
6999                         fbm_compile(sv, 0);
7000                 }
7001             } else {
7002                 SvREFCNT_dec(sv);
7003                 sv = &PL_sv_undef;
7004             }
7005             prog->substrs->data[i].substr = sv;
7006             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7007                 prog->check_substr = sv;
7008         }
7009     } while (i--);
7010 }
7011
7012 /*
7013  * Local variables:
7014  * c-indentation-style: bsd
7015  * c-basic-offset: 4
7016  * indent-tabs-mode: t
7017  * End:
7018  *
7019  * ex: set ts=8 sts=4 sw=4 noet:
7020  */