src/5015002/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1       /* tainted information used? e.g. locale */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  99  * call if there are no complications: i.e., if everything matchable is
 100  * straight forward in the bitmap */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102                                               : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 110
 111 #define HOPc(pos,off) \
 112         (char *)(PL_reg_match_utf8 \
 113             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 114             : (U8*)(pos + off))
 115 #define HOPBACKc(pos, off) \
 116         (char*)(PL_reg_match_utf8\
 117             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 118             : (pos - off >= PL_bostr)           \
 119                 ? (U8*)pos - off                \
 120                 : NULL)
 121
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defined */
 126 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 127     if (!CAT2(PL_utf8_,class)) { \
 128         bool ok; \
 129         ENTER; save_re_context(); \
 130         ok=CAT2(is_utf8_,class)((const U8*)str); \
 131         assert(ok); LEAVE; } } STMT_END
 132
 133 /* Doesn't do an assert to verify that is correct */
 134 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 135     if (!CAT2(PL_utf8_,class)) { \
 136         bool throw_away __attribute__unused__; \
 137         ENTER; save_re_context(); \
 138         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 139         LEAVE; } } STMT_END
 140
 141 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 142 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 143 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 144
 145 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 146         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 147         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 148         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 149             * assert should likely and hopefully fail on an EBCDIC machine */ \
 150         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 151                                                                             \
 152         /* No asserts are done for these, in case called on an early        \
 153             * Unicode version in which they map to nothing */               \
 154         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 155         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 156         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 157         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 158         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 159         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 160         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 161
 162 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 163
 164 /* The actual code for CCC_TRY, which uses several variables from the routine
 165  * it's callable from.  It is designed to be the bulk of a case statement.
 166  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 167  *      nextchr matches the class.
 168  * UTF8_TEST is the whole test string to use for utf8 targets
 169  * LOAD is what to use to test, and if not present to load in the swash for the
 170  *      class
 171  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 172  *      UTF8_TEST test.
 173  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 174  * utf8 and a variant, load the swash if necessary and test using the utf8
 175  * test.  Advance to the next character if test is ok, otherwise fail; If not
 176  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 177  * fails, or advance to the next character */
 178
 179 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 180     if (locinput >= PL_regeol) {                                              \
 181         sayNO;                                                                \
 182     }                                                                         \
 183     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 184         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 185         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 186             sayNO;                                                            \
 187         }                                                                     \
 188         locinput += PL_utf8skip[nextchr];                                     \
 189         nextchr = UCHARAT(locinput);                                          \
 190         break;                                                                \
 191     }                                                                         \
 192     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 193         sayNO;                                                                \
 194     }                                                                         \
 195     nextchr = UCHARAT(++locinput);                                            \
 196     break;
 197
 198 /* Handle the non-locale cases for a character class and its complement.  It
 199  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 200  * This is because that code fails when the test succeeds, so we want to have
 201  * the test fail so that the code succeeds.  The swash is stored in a
 202  * predictable PL_ place */
 203 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 204                            CLASS, STR)                                        \
 205     case NAME:                                                                \
 206         _CCC_TRY_CODE( !, FUNC,                                               \
 207                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 208                                             (U8*)locinput, TRUE)),            \
 209                           CLASS, STR)                                         \
 210     case NNAME:                                                               \
 211         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 212                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 213                                             (U8*)locinput, TRUE)),            \
 214                           CLASS, STR)                                         \
 215
 216 /* Generate the case statements for both locale and non-locale character
 217  * classes in regmatch for classes that don't have special unicode semantics.
 218  * Locales don't use an immediate swash, but an intermediary special locale
 219  * function that is called on the pointer to the current place in the input
 220  * string.  That function will resolve to needing the same swash.  One might
 221  * think that because we don't know what the locale will match, we shouldn't
 222  * check with the swash loading function that it loaded properly; ie, that we
 223  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 224  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 225  * irrelevant here */
 226 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 227                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 228                 NAMEA, NNAMEA, FUNCA,                                         \
 229                 CLASS, STR)                                                   \
 230     case NAMEL:                                                               \
 231         PL_reg_flags |= RF_tainted;                                           \
 232         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 233     case NNAMEL:                                                              \
 234         PL_reg_flags |= RF_tainted;                                           \
 235         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 236                        CLASS, STR)                                            \
 237     case NAMEA:                                                               \
 238         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 239             sayNO;                                                            \
 240         }                                                                     \
 241         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 242         nextchr = UCHARAT(++locinput);                                        \
 243         break;                                                                \
 244     case NNAMEA:                                                              \
 245         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 246             sayNO;                                                            \
 247         }                                                                     \
 248         if (utf8_target) {                                                    \
 249             locinput += PL_utf8skip[nextchr];                                 \
 250             nextchr = UCHARAT(locinput);                                      \
 251         }                                                                     \
 252         else {                                                                \
 253             nextchr = UCHARAT(++locinput);                                    \
 254         }                                                                     \
 255         break;                                                                \
 256     /* Generate the non-locale cases */                                       \
 257     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 258
 259 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 260  * statements to handle separate Unicode semantics nodes */
 261 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 262                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 263                   NAMEU, NNAMEU, FUNCU,                                        \
 264                   NAMEA, NNAMEA, FUNCA,                                        \
 265                   CLASS, STR)                                                  \
 266     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 267             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 268             NAMEA, NNAMEA, FUNCA,                                              \
 269             CLASS, STR)                                                        \
 270     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 271
 272 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 273
 274 /* for use after a quantifier and before an EXACT-like node -- japhy */
 275 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 276  *
 277  * NOTE that *nothing* that affects backtracking should be in here, specifically
 278  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 279  * node that is in between two EXACT like nodes when ascertaining what the required
 280  * "follow" character is. This should probably be moved to regex compile time
 281  * although it may be done at run time beause of the REF possibility - more
 282  * investigation required. -- demerphq
 283 */
 284 #define JUMPABLE(rn) (      \
 285     OP(rn) == OPEN ||       \
 286     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 287     OP(rn) == EVAL ||   \
 288     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 289     OP(rn) == PLUS || OP(rn) == MINMOD || \
 290     OP(rn) == KEEPS || \
 291     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 292 )
 293 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 294
 295 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 296
 297 #if 0
 298 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 299    we don't need this definition. */
 300 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 301 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 302 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 303
 304 #else
 305 /* ... so we use this as its faster. */
 306 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 307 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 308 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 309 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 310
 311 #endif
 312
 313 /*
 314   Search for mandatory following text node; for lookahead, the text must
 315   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 316 */
 317 #define FIND_NEXT_IMPT(rn) STMT_START { \
 318     while (JUMPABLE(rn)) { \
 319         const OPCODE type = OP(rn); \
 320         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 321             rn = NEXTOPER(NEXTOPER(rn)); \
 322         else if (type == PLUS) \
 323             rn = NEXTOPER(rn); \
 324         else if (type == IFMATCH) \
 325             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 326         else rn += NEXT_OFF(rn); \
 327     } \
 328 } STMT_END
 329
 330
 331 static void restore_pos(pTHX_ void *arg);
 332
 333 #define REGCP_PAREN_ELEMS 4
 334 #define REGCP_OTHER_ELEMS 5
 335 #define REGCP_FRAME_ELEMS 1
 336 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 337  * are needed for the regexp context stack bookkeeping. */
 338
 339 STATIC CHECKPOINT
 340 S_regcppush(pTHX_ I32 parenfloor)
 341 {
 342     dVAR;
 343     const int retval = PL_savestack_ix;
 344     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 345     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 346     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 347     int p;
 348     GET_RE_DEBUG_FLAGS_DECL;
 349
 350     if (paren_elems_to_push < 0)
 351         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 352
 353     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 354         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 355                    " out of range (%lu-%ld)",
 356                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 357
 358     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 359
 360     for (p = PL_regsize; p > parenfloor; p--) {
 361 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 362         SSPUSHINT(PL_regoffs[p].end);
 363         SSPUSHINT(PL_regoffs[p].start);
 364         SSPUSHPTR(PL_reg_start_tmp[p]);
 365         SSPUSHINT(p);
 366         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 367           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 368                       (UV)p, (IV)PL_regoffs[p].start,
 369                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 370                       (IV)PL_regoffs[p].end
 371         ));
 372     }
 373 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 374     SSPUSHPTR(PL_regoffs);
 375     SSPUSHINT(PL_regsize);
 376     SSPUSHINT(*PL_reglastparen);
 377     SSPUSHINT(*PL_reglastcloseparen);
 378     SSPUSHPTR(PL_reginput);
 379     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 380
 381     return retval;
 382 }
 383
 384 /* These are needed since we do not localize EVAL nodes: */
 385 #define REGCP_SET(cp)                                           \
 386     DEBUG_STATE_r(                                              \
 387             PerlIO_printf(Perl_debug_log,                       \
 388                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 389                 (IV)PL_savestack_ix));                          \
 390     cp = PL_savestack_ix
 391
 392 #define REGCP_UNWIND(cp)                                        \
 393     DEBUG_STATE_r(                                              \
 394         if (cp != PL_savestack_ix)                              \
 395             PerlIO_printf(Perl_debug_log,                       \
 396                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 397                 (IV)(cp), (IV)PL_savestack_ix));                \
 398     regcpblow(cp)
 399
 400 STATIC char *
 401 S_regcppop(pTHX_ const regexp *rex)
 402 {
 403     dVAR;
 404     UV i;
 405     char *input;
 406     GET_RE_DEBUG_FLAGS_DECL;
 407
 408     PERL_ARGS_ASSERT_REGCPPOP;
 409
 410     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 411     i = SSPOPUV;
 412     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 413     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 414     input = (char *) SSPOPPTR;
 415     *PL_reglastcloseparen = SSPOPINT;
 416     *PL_reglastparen = SSPOPINT;
 417     PL_regsize = SSPOPINT;
 418     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 419
 420     i -= REGCP_OTHER_ELEMS;
 421     /* Now restore the parentheses context. */
 422     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 423         I32 tmps;
 424         U32 paren = (U32)SSPOPINT;
 425         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 426         PL_regoffs[paren].start = SSPOPINT;
 427         tmps = SSPOPINT;
 428         if (paren <= *PL_reglastparen)
 429             PL_regoffs[paren].end = tmps;
 430         DEBUG_BUFFERS_r(
 431             PerlIO_printf(Perl_debug_log,
 432                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 433                           (UV)paren, (IV)PL_regoffs[paren].start,
 434                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 435                           (IV)PL_regoffs[paren].end,
 436                           (paren > *PL_reglastparen ? "(no)" : ""));
 437         );
 438     }
 439     DEBUG_BUFFERS_r(
 440         if (*PL_reglastparen + 1 <= rex->nparens) {
 441             PerlIO_printf(Perl_debug_log,
 442                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 443                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 444         }
 445     );
 446 #if 1
 447     /* It would seem that the similar code in regtry()
 448      * already takes care of this, and in fact it is in
 449      * a better location to since this code can #if 0-ed out
 450      * but the code in regtry() is needed or otherwise tests
 451      * requiring null fields (pat.t#187 and split.t#{13,14}
 452      * (as of patchlevel 7877)  will fail.  Then again,
 453      * this code seems to be necessary or otherwise
 454      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 455      * --jhi updated by dapm */
 456     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 457         if (i > PL_regsize)
 458             PL_regoffs[i].start = -1;
 459         PL_regoffs[i].end = -1;
 460     }
 461 #endif
 462     return input;
 463 }
 464
 465 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 466
 467 /*
 468  * pregexec and friends
 469  */
 470
 471 #ifndef PERL_IN_XSUB_RE
 472 /*
 473  - pregexec - match a regexp against a string
 474  */
 475 I32
 476 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 477          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 478 /* strend: pointer to null at end of string */
 479 /* strbeg: real beginning of string */
 480 /* minend: end of match must be >=minend after stringarg. */
 481 /* nosave: For optimizations. */
 482 {
 483     PERL_ARGS_ASSERT_PREGEXEC;
 484
 485     return
 486         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 487                       nosave ? 0 : REXEC_COPY_STR);
 488 }
 489 #endif
 490
 491 /*
 492  * Need to implement the following flags for reg_anch:
 493  *
 494  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 495  * USE_INTUIT_ML
 496  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 497  * INTUIT_AUTORITATIVE_ML
 498  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 499  * INTUIT_ONCE_ML
 500  *
 501  * Another flag for this function: SECOND_TIME (so that float substrs
 502  * with giant delta may be not rechecked).
 503  */
 504
 505 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 506
 507 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 508    Otherwise, only SvCUR(sv) is used to get strbeg. */
 509
 510 /* XXXX We assume that strpos is strbeg unless sv. */
 511
 512 /* XXXX Some places assume that there is a fixed substring.
 513         An update may be needed if optimizer marks as "INTUITable"
 514         RExen without fixed substrings.  Similarly, it is assumed that
 515         lengths of all the strings are no more than minlen, thus they
 516         cannot come from lookahead.
 517         (Or minlen should take into account lookahead.)
 518   NOTE: Some of this comment is not correct. minlen does now take account
 519   of lookahead/behind. Further research is required. -- demerphq
 520
 521 */
 522
 523 /* A failure to find a constant substring means that there is no need to make
 524    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 525    finding a substring too deep into the string means that less calls to
 526    regtry() should be needed.
 527
 528    REx compiler's optimizer found 4 possible hints:
 529         a) Anchored substring;
 530         b) Fixed substring;
 531         c) Whether we are anchored (beginning-of-line or \G);
 532         d) First node (of those at offset 0) which may distinguish positions;
 533    We use a)b)d) and multiline-part of c), and try to find a position in the
 534    string which does not contradict any of them.
 535  */
 536
 537 /* Most of decisions we do here should have been done at compile time.
 538    The nodes of the REx which we used for the search should have been
 539    deleted from the finite automaton. */
 540
 541 char *
 542 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 543                      char *strend, const U32 flags, re_scream_pos_data *data)
 544 {
 545     dVAR;
 546     struct regexp *const prog = (struct regexp *)SvANY(rx);
 547     register I32 start_shift = 0;
 548     /* Should be nonnegative! */
 549     register I32 end_shift   = 0;
 550     register char *s;
 551     register SV *check;
 552     char *strbeg;
 553     char *t;
 554     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 555     I32 ml_anch;
 556     register char *other_last = NULL;   /* other substr checked before this */
 557     char *check_at = NULL;              /* check substr found at this pos */
 558     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 559     RXi_GET_DECL(prog,progi);
 560 #ifdef DEBUGGING
 561     const char * const i_strpos = strpos;
 562 #endif
 563     GET_RE_DEBUG_FLAGS_DECL;
 564
 565     PERL_ARGS_ASSERT_RE_INTUIT_START;
 566
 567     RX_MATCH_UTF8_set(rx,utf8_target);
 568
 569     if (RX_UTF8(rx)) {
 570         PL_reg_flags |= RF_utf8;
 571     }
 572     DEBUG_EXECUTE_r(
 573         debug_start_match(rx, utf8_target, strpos, strend,
 574             sv ? "Guessing start of match in sv for"
 575                : "Guessing start of match in string for");
 576               );
 577
 578     /* CHR_DIST() would be more correct here but it makes things slow. */
 579     if (prog->minlen > strend - strpos) {
 580         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 581                               "String too short... [re_intuit_start]\n"));
 582         goto fail;
 583     }
 584
 585     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 586     PL_regeol = strend;
 587     if (utf8_target) {
 588         if (!prog->check_utf8 && prog->check_substr)
 589             to_utf8_substr(prog);
 590         check = prog->check_utf8;
 591     } else {
 592         if (!prog->check_substr && prog->check_utf8)
 593             to_byte_substr(prog);
 594         check = prog->check_substr;
 595     }
 596     if (check == &PL_sv_undef) {
 597         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 598                 "Non-utf8 string cannot match utf8 check string\n"));
 599         goto fail;
 600     }
 601     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 602         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 603                      || ( (prog->extflags & RXf_ANCH_BOL)
 604                           && !multiline ) );    /* Check after \n? */
 605
 606         if (!ml_anch) {
 607           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 608                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 609                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 610                && sv && !SvROK(sv)
 611                && (strpos != strbeg)) {
 612               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 613               goto fail;
 614           }
 615           if (prog->check_offset_min == prog->check_offset_max &&
 616               !(prog->extflags & RXf_CANY_SEEN)) {
 617             /* Substring at constant offset from beg-of-str... */
 618             I32 slen;
 619
 620             s = HOP3c(strpos, prog->check_offset_min, strend);
 621
 622             if (SvTAIL(check)) {
 623                 slen = SvCUR(check);    /* >= 1 */
 624
 625                 if ( strend - s > slen || strend - s < slen - 1
 626                      || (strend - s == slen && strend[-1] != '\n')) {
 627                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 628                     goto fail_finish;
 629                 }
 630                 /* Now should match s[0..slen-2] */
 631                 slen--;
 632                 if (slen && (*SvPVX_const(check) != *s
 633                              || (slen > 1
 634                                  && memNE(SvPVX_const(check), s, slen)))) {
 635                   report_neq:
 636                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 637                     goto fail_finish;
 638                 }
 639             }
 640             else if (*SvPVX_const(check) != *s
 641                      || ((slen = SvCUR(check)) > 1
 642                          && memNE(SvPVX_const(check), s, slen)))
 643                 goto report_neq;
 644             check_at = s;
 645             goto success_at_start;
 646           }
 647         }
 648         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 649         s = strpos;
 650         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 651         end_shift = prog->check_end_shift;
 652
 653         if (!ml_anch) {
 654             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 655                                          - (SvTAIL(check) != 0);
 656             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 657
 658             if (end_shift < eshift)
 659                 end_shift = eshift;
 660         }
 661     }
 662     else {                              /* Can match at random position */
 663         ml_anch = 0;
 664         s = strpos;
 665         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 666         end_shift = prog->check_end_shift;
 667
 668         /* end shift should be non negative here */
 669     }
 670
 671 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 672     if (end_shift < 0)
 673         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 674                    (IV)end_shift, RX_PRECOMP(prog));
 675 #endif
 676
 677   restart:
 678     /* Find a possible match in the region s..strend by looking for
 679        the "check" substring in the region corrected by start/end_shift. */
 680
 681     {
 682         I32 srch_start_shift = start_shift;
 683         I32 srch_end_shift = end_shift;
 684         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 685             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 686             srch_start_shift = strbeg - s;
 687         }
 688     DEBUG_OPTIMISE_MORE_r({
 689         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 690             (IV)prog->check_offset_min,
 691             (IV)srch_start_shift,
 692             (IV)srch_end_shift,
 693             (IV)prog->check_end_shift);
 694     });
 695
 696     if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
 697         I32 p = -1;                     /* Internal iterator of scream. */
 698         I32 * const pp = data ? data->scream_pos : &p;
 699         const MAGIC *mg;
 700         bool found = FALSE;
 701
 702         assert(SvMAGICAL(sv));
 703         mg = mg_find(sv, PERL_MAGIC_study);
 704         assert(mg);
 705
 706         if (mg->mg_private == 1) {
 707             found = ((U8 *)mg->mg_ptr)[BmRARE(check)] != (U8)~0;
 708         } else if (mg->mg_private == 2) {
 709             found = ((U16 *)mg->mg_ptr)[BmRARE(check)] != (U16)~0;
 710         } else {
 711             assert (mg->mg_private == 4);
 712             found = ((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0;
 713         }
 714
 715         if (found
 716             || ( BmRARE(check) == '\n'
 717                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 718                  && SvTAIL(check) ))
 719             s = screaminstr(sv, check,
 720                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 721         else
 722             goto fail_finish;
 723         /* we may be pointing at the wrong string */
 724         if (s && RXp_MATCH_COPIED(prog))
 725             s = strbeg + (s - SvPVX_const(sv));
 726         if (data)
 727             *data->scream_olds = s;
 728     }
 729     else {
 730         U8* start_point;
 731         U8* end_point;
 732         if (prog->extflags & RXf_CANY_SEEN) {
 733             start_point= (U8*)(s + srch_start_shift);
 734             end_point= (U8*)(strend - srch_end_shift);
 735         } else {
 736             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 737             end_point= HOP3(strend, -srch_end_shift, strbeg);
 738         }
 739         DEBUG_OPTIMISE_MORE_r({
 740             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 741                 (int)(end_point - start_point),
 742                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 743                 start_point);
 744         });
 745
 746         s = fbm_instr( start_point, end_point,
 747                       check, multiline ? FBMrf_MULTILINE : 0);
 748     }
 749     }
 750     /* Update the count-of-usability, remove useless subpatterns,
 751         unshift s.  */
 752
 753     DEBUG_EXECUTE_r({
 754         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 755             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 756         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 757                           (s ? "Found" : "Did not find"),
 758             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 759                 ? "anchored" : "floating"),
 760             quoted,
 761             RE_SV_TAIL(check),
 762             (s ? " at offset " : "...\n") );
 763     });
 764
 765     if (!s)
 766         goto fail_finish;
 767     /* Finish the diagnostic message */
 768     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 769
 770     /* XXX dmq: first branch is for positive lookbehind...
 771        Our check string is offset from the beginning of the pattern.
 772        So we need to do any stclass tests offset forward from that
 773        point. I think. :-(
 774      */
 775
 776
 777
 778     check_at=s;
 779
 780
 781     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 782        Start with the other substr.
 783        XXXX no SCREAM optimization yet - and a very coarse implementation
 784        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 785                 *always* match.  Probably should be marked during compile...
 786        Probably it is right to do no SCREAM here...
 787      */
 788
 789     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 790                 : (prog->float_substr && prog->anchored_substr))
 791     {
 792         /* Take into account the "other" substring. */
 793         /* XXXX May be hopelessly wrong for UTF... */
 794         if (!other_last)
 795             other_last = strpos;
 796         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 797           do_other_anchored:
 798             {
 799                 char * const last = HOP3c(s, -start_shift, strbeg);
 800                 char *last1, *last2;
 801                 char * const saved_s = s;
 802                 SV* must;
 803
 804                 t = s - prog->check_offset_max;
 805                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 806                     && (!utf8_target
 807                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 808                             && t > strpos)))
 809                     NOOP;
 810                 else
 811                     t = strpos;
 812                 t = HOP3c(t, prog->anchored_offset, strend);
 813                 if (t < other_last)     /* These positions already checked */
 814                     t = other_last;
 815                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 816                 if (last < last1)
 817                     last1 = last;
 818                 /* XXXX It is not documented what units *_offsets are in.
 819                    We assume bytes, but this is clearly wrong.
 820                    Meaning this code needs to be carefully reviewed for errors.
 821                    dmq.
 822                   */
 823
 824                 /* On end-of-str: see comment below. */
 825                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 826                 if (must == &PL_sv_undef) {
 827                     s = (char*)NULL;
 828                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 829                 }
 830                 else
 831                     s = fbm_instr(
 832                         (unsigned char*)t,
 833                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 834                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 835                         must,
 836                         multiline ? FBMrf_MULTILINE : 0
 837                     );
 838                 DEBUG_EXECUTE_r({
 839                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 840                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 841                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 842                         (s ? "Found" : "Contradicts"),
 843                         quoted, RE_SV_TAIL(must));
 844                 });
 845
 846
 847                 if (!s) {
 848                     if (last1 >= last2) {
 849                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 850                                                 ", giving up...\n"));
 851                         goto fail_finish;
 852                     }
 853                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 854                         ", trying floating at offset %ld...\n",
 855                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 856                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 857                     s = HOP3c(last, 1, strend);
 858                     goto restart;
 859                 }
 860                 else {
 861                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 862                           (long)(s - i_strpos)));
 863                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 864                     other_last = HOP3c(s, 1, strend);
 865                     s = saved_s;
 866                     if (t == strpos)
 867                         goto try_at_start;
 868                     goto try_at_offset;
 869                 }
 870             }
 871         }
 872         else {          /* Take into account the floating substring. */
 873             char *last, *last1;
 874             char * const saved_s = s;
 875             SV* must;
 876
 877             t = HOP3c(s, -start_shift, strbeg);
 878             last1 = last =
 879                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 880             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 881                 last = HOP3c(t, prog->float_max_offset, strend);
 882             s = HOP3c(t, prog->float_min_offset, strend);
 883             if (s < other_last)
 884                 s = other_last;
 885  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 886             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 887             /* fbm_instr() takes into account exact value of end-of-str
 888                if the check is SvTAIL(ed).  Since false positives are OK,
 889                and end-of-str is not later than strend we are OK. */
 890             if (must == &PL_sv_undef) {
 891                 s = (char*)NULL;
 892                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 893             }
 894             else
 895                 s = fbm_instr((unsigned char*)s,
 896                               (unsigned char*)last + SvCUR(must)
 897                                   - (SvTAIL(must)!=0),
 898                               must, multiline ? FBMrf_MULTILINE : 0);
 899             DEBUG_EXECUTE_r({
 900                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 901                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 902                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 903                     (s ? "Found" : "Contradicts"),
 904                     quoted, RE_SV_TAIL(must));
 905             });
 906             if (!s) {
 907                 if (last1 == last) {
 908                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 909                                             ", giving up...\n"));
 910                     goto fail_finish;
 911                 }
 912                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 913                     ", trying anchored starting at offset %ld...\n",
 914                     (long)(saved_s + 1 - i_strpos)));
 915                 other_last = last;
 916                 s = HOP3c(t, 1, strend);
 917                 goto restart;
 918             }
 919             else {
 920                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 921                       (long)(s - i_strpos)));
 922                 other_last = s; /* Fix this later. --Hugo */
 923                 s = saved_s;
 924                 if (t == strpos)
 925                     goto try_at_start;
 926                 goto try_at_offset;
 927             }
 928         }
 929     }
 930
 931
 932     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 933
 934     DEBUG_OPTIMISE_MORE_r(
 935         PerlIO_printf(Perl_debug_log,
 936             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 937             (IV)prog->check_offset_min,
 938             (IV)prog->check_offset_max,
 939             (IV)(s-strpos),
 940             (IV)(t-strpos),
 941             (IV)(t-s),
 942             (IV)(strend-strpos)
 943         )
 944     );
 945
 946     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 947         && (!utf8_target
 948             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 949                  && t > strpos)))
 950     {
 951         /* Fixed substring is found far enough so that the match
 952            cannot start at strpos. */
 953       try_at_offset:
 954         if (ml_anch && t[-1] != '\n') {
 955             /* Eventually fbm_*() should handle this, but often
 956                anchored_offset is not 0, so this check will not be wasted. */
 957             /* XXXX In the code below we prefer to look for "^" even in
 958                presence of anchored substrings.  And we search even
 959                beyond the found float position.  These pessimizations
 960                are historical artefacts only.  */
 961           find_anchor:
 962             while (t < strend - prog->minlen) {
 963                 if (*t == '\n') {
 964                     if (t < check_at - prog->check_offset_min) {
 965                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 966                             /* Since we moved from the found position,
 967                                we definitely contradict the found anchored
 968                                substr.  Due to the above check we do not
 969                                contradict "check" substr.
 970                                Thus we can arrive here only if check substr
 971                                is float.  Redo checking for "other"=="fixed".
 972                              */
 973                             strpos = t + 1;
 974                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 975                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 976                             goto do_other_anchored;
 977                         }
 978                         /* We don't contradict the found floating substring. */
 979                         /* XXXX Why not check for STCLASS? */
 980                         s = t + 1;
 981                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 982                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 983                         goto set_useful;
 984                     }
 985                     /* Position contradicts check-string */
 986                     /* XXXX probably better to look for check-string
 987                        than for "\n", so one should lower the limit for t? */
 988                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 989                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 990                     other_last = strpos = s = t + 1;
 991                     goto restart;
 992                 }
 993                 t++;
 994             }
 995             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 996                         PL_colors[0], PL_colors[1]));
 997             goto fail_finish;
 998         }
 999         else {
1000             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1001                         PL_colors[0], PL_colors[1]));
1002         }
1003         s = t;
1004       set_useful:
1005         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1006     }
1007     else {
1008         /* The found string does not prohibit matching at strpos,
1009            - no optimization of calling REx engine can be performed,
1010            unless it was an MBOL and we are not after MBOL,
1011            or a future STCLASS check will fail this. */
1012       try_at_start:
1013         /* Even in this situation we may use MBOL flag if strpos is offset
1014            wrt the start of the string. */
1015         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1016             && (strpos != strbeg) && strpos[-1] != '\n'
1017             /* May be due to an implicit anchor of m{.*foo}  */
1018             && !(prog->intflags & PREGf_IMPLICIT))
1019         {
1020             t = strpos;
1021             goto find_anchor;
1022         }
1023         DEBUG_EXECUTE_r( if (ml_anch)
1024             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1025                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1026         );
1027       success_at_start:
1028         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1029             && (utf8_target ? (
1030                 prog->check_utf8                /* Could be deleted already */
1031                 && --BmUSEFUL(prog->check_utf8) < 0
1032                 && (prog->check_utf8 == prog->float_utf8)
1033             ) : (
1034                 prog->check_substr              /* Could be deleted already */
1035                 && --BmUSEFUL(prog->check_substr) < 0
1036                 && (prog->check_substr == prog->float_substr)
1037             )))
1038         {
1039             /* If flags & SOMETHING - do not do it many times on the same match */
1040             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1041             /* XXX Does the destruction order has to change with utf8_target? */
1042             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1043             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1044             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1045             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1046             check = NULL;                       /* abort */
1047             s = strpos;
1048             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1049                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1050             if (prog->intflags & PREGf_IMPLICIT)
1051                 prog->extflags &= ~RXf_ANCH_MBOL;
1052             /* XXXX This is a remnant of the old implementation.  It
1053                     looks wasteful, since now INTUIT can use many
1054                     other heuristics. */
1055             prog->extflags &= ~RXf_USE_INTUIT;
1056             /* XXXX What other flags might need to be cleared in this branch? */
1057         }
1058         else
1059             s = strpos;
1060     }
1061
1062     /* Last resort... */
1063     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1064     /* trie stclasses are too expensive to use here, we are better off to
1065        leave it to regmatch itself */
1066     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1067         /* minlen == 0 is possible if regstclass is \b or \B,
1068            and the fixed substr is ''$.
1069            Since minlen is already taken into account, s+1 is before strend;
1070            accidentally, minlen >= 1 guaranties no false positives at s + 1
1071            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1072            regstclass does not come from lookahead...  */
1073         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1074            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1075         const U8* const str = (U8*)STRING(progi->regstclass);
1076         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1077                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1078                     : 1);
1079         char * endpos;
1080         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1081             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1082         else if (prog->float_substr || prog->float_utf8)
1083             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1084         else
1085             endpos= strend;
1086
1087         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1088                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1089
1090         t = s;
1091         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1092         if (!s) {
1093 #ifdef DEBUGGING
1094             const char *what = NULL;
1095 #endif
1096             if (endpos == strend) {
1097                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1098                                 "Could not match STCLASS...\n") );
1099                 goto fail;
1100             }
1101             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1102                                    "This position contradicts STCLASS...\n") );
1103             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1104                 goto fail;
1105             /* Contradict one of substrings */
1106             if (prog->anchored_substr || prog->anchored_utf8) {
1107                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1108                     DEBUG_EXECUTE_r( what = "anchored" );
1109                   hop_and_restart:
1110                     s = HOP3c(t, 1, strend);
1111                     if (s + start_shift + end_shift > strend) {
1112                         /* XXXX Should be taken into account earlier? */
1113                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1114                                                "Could not match STCLASS...\n") );
1115                         goto fail;
1116                     }
1117                     if (!check)
1118                         goto giveup;
1119                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1120                                 "Looking for %s substr starting at offset %ld...\n",
1121                                  what, (long)(s + start_shift - i_strpos)) );
1122                     goto restart;
1123                 }
1124                 /* Have both, check_string is floating */
1125                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1126                     goto retry_floating_check;
1127                 /* Recheck anchored substring, but not floating... */
1128                 s = check_at;
1129                 if (!check)
1130                     goto giveup;
1131                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1132                           "Looking for anchored substr starting at offset %ld...\n",
1133                           (long)(other_last - i_strpos)) );
1134                 goto do_other_anchored;
1135             }
1136             /* Another way we could have checked stclass at the
1137                current position only: */
1138             if (ml_anch) {
1139                 s = t = t + 1;
1140                 if (!check)
1141                     goto giveup;
1142                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1143                           "Looking for /%s^%s/m starting at offset %ld...\n",
1144                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1145                 goto try_at_offset;
1146             }
1147             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1148                 goto fail;
1149             /* Check is floating substring. */
1150           retry_floating_check:
1151             t = check_at - start_shift;
1152             DEBUG_EXECUTE_r( what = "floating" );
1153             goto hop_and_restart;
1154         }
1155         if (t != s) {
1156             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1157                         "By STCLASS: moving %ld --> %ld\n",
1158                                   (long)(t - i_strpos), (long)(s - i_strpos))
1159                    );
1160         }
1161         else {
1162             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1163                                   "Does not contradict STCLASS...\n");
1164                    );
1165         }
1166     }
1167   giveup:
1168     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1169                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1170                           PL_colors[5], (long)(s - i_strpos)) );
1171     return s;
1172
1173   fail_finish:                          /* Substring not found */
1174     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1175         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1176   fail:
1177     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1178                           PL_colors[4], PL_colors[5]));
1179     return NULL;
1180 }
1181
1182 #define DECL_TRIE_TYPE(scan) \
1183     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1184                     trie_type = (scan->flags != EXACT) \
1185                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1186                               : (utf8_target ? trie_utf8 : trie_plain)
1187
1188 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1189 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1190     switch (trie_type) {                                                    \
1191     case trie_utf8_fold:                                                    \
1192         if ( foldlen>0 ) {                                                  \
1193             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1194             foldlen -= len;                                                 \
1195             uscan += len;                                                   \
1196             len=0;                                                          \
1197         } else {                                                            \
1198             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1199             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1200             foldlen -= UNISKIP( uvc );                                      \
1201             uscan = foldbuf + UNISKIP( uvc );                               \
1202         }                                                                   \
1203         break;                                                              \
1204     case trie_latin_utf8_fold:                                              \
1205         if ( foldlen>0 ) {                                                  \
1206             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1207             foldlen -= len;                                                 \
1208             uscan += len;                                                   \
1209             len=0;                                                          \
1210         } else {                                                            \
1211             len = 1;                                                        \
1212             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1213             foldlen -= UNISKIP( uvc );                                      \
1214             uscan = foldbuf + UNISKIP( uvc );                               \
1215         }                                                                   \
1216         break;                                                              \
1217     case trie_utf8:                                                         \
1218         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1219         break;                                                              \
1220     case trie_plain:                                                        \
1221         uvc = (UV)*uc;                                                      \
1222         len = 1;                                                            \
1223     }                                                                       \
1224     if (uvc < 256) {                                                        \
1225         charid = trie->charmap[ uvc ];                                      \
1226     }                                                                       \
1227     else {                                                                  \
1228         charid = 0;                                                         \
1229         if (widecharmap) {                                                  \
1230             SV** const svpp = hv_fetch(widecharmap,                         \
1231                         (char*)&uvc, sizeof(UV), 0);                        \
1232             if (svpp)                                                       \
1233                 charid = (U16)SvIV(*svpp);                                  \
1234         }                                                                   \
1235     }                                                                       \
1236 } STMT_END
1237
1238 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1239 STMT_START {                                              \
1240     while (s <= e) {                                      \
1241         if ( (CoNd)                                       \
1242              && (ln == 1 || folder(s, pat_string, ln))    \
1243              && (!reginfo || regtry(reginfo, &s)) )       \
1244             goto got_it;                                  \
1245         s++;                                              \
1246     }                                                     \
1247 } STMT_END
1248
1249 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1250 STMT_START {                                          \
1251     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1252         CoDe                                          \
1253         s += uskip;                                   \
1254     }                                                 \
1255 } STMT_END
1256
1257 #define REXEC_FBC_SCAN(CoDe)                          \
1258 STMT_START {                                          \
1259     while (s < strend) {                              \
1260         CoDe                                          \
1261         s++;                                          \
1262     }                                                 \
1263 } STMT_END
1264
1265 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1266 REXEC_FBC_UTF8_SCAN(                                  \
1267     if (CoNd) {                                       \
1268         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1269             goto got_it;                              \
1270         else                                          \
1271             tmp = doevery;                            \
1272     }                                                 \
1273     else                                              \
1274         tmp = 1;                                      \
1275 )
1276
1277 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1278 REXEC_FBC_SCAN(                                       \
1279     if (CoNd) {                                       \
1280         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1281             goto got_it;                              \
1282         else                                          \
1283             tmp = doevery;                            \
1284     }                                                 \
1285     else                                              \
1286         tmp = 1;                                      \
1287 )
1288
1289 #define REXEC_FBC_TRYIT               \
1290 if ((!reginfo || regtry(reginfo, &s))) \
1291     goto got_it
1292
1293 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1294     if (utf8_target) {                                             \
1295         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1296     }                                                          \
1297     else {                                                     \
1298         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1299     }
1300
1301 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1302     if (utf8_target) {                                             \
1303         UtFpReLoAd;                                            \
1304         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1305     }                                                          \
1306     else {                                                     \
1307         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1308     }
1309
1310 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1311     PL_reg_flags |= RF_tainted;                                \
1312     if (utf8_target) {                                             \
1313         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1314     }                                                          \
1315     else {                                                     \
1316         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1317     }
1318
1319 #define DUMP_EXEC_POS(li,s,doutf8) \
1320     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1321
1322
1323 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1324         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1325         tmp = TEST_NON_UTF8(tmp);                                              \
1326         REXEC_FBC_UTF8_SCAN(                                                   \
1327             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1328                 tmp = !tmp;                                                    \
1329                 IF_SUCCESS;                                                    \
1330             }                                                                  \
1331             else {                                                             \
1332                 IF_FAIL;                                                       \
1333             }                                                                  \
1334         );                                                                     \
1335
1336 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1337         if (s == PL_bostr) {                                                   \
1338             tmp = '\n';                                                        \
1339         }                                                                      \
1340         else {                                                                 \
1341             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1342             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1343         }                                                                      \
1344         tmp = TeSt1_UtF8;                                                      \
1345         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1346         REXEC_FBC_UTF8_SCAN(                                                   \
1347             if (tmp == ! (TeSt2_UtF8)) { \
1348                 tmp = !tmp;                                                    \
1349                 IF_SUCCESS;                                                    \
1350             }                                                                  \
1351             else {                                                             \
1352                 IF_FAIL;                                                       \
1353             }                                                                  \
1354         );                                                                     \
1355
1356 /* The only difference between the BOUND and NBOUND cases is that
1357  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1358  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1359  * with the other one being empty */
1360 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1361     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1362
1363 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1364     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1365
1366 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1367     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1368
1369 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1370     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1371
1372
1373 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1374  * be passed in completely with the variable name being tested, which isn't
1375  * such a clean interface, but this is easier to read than it was before.  We
1376  * are looking for the boundary (or non-boundary between a word and non-word
1377  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1378  * must be different.  Find the "wordness" of the character just prior to this
1379  * one, and compare it with the wordness of this one.  If they differ, we have
1380  * a boundary.  At the beginning of the string, pretend that the previous
1381  * character was a new-line */
1382 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1383     if (utf8_target) {                                                         \
1384                 UTF8_CODE \
1385     }                                                                          \
1386     else {  /* Not utf8 */                                                     \
1387         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1388         tmp = TEST_NON_UTF8(tmp);                                              \
1389         REXEC_FBC_SCAN(                                                        \
1390             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1391                 tmp = !tmp;                                                    \
1392                 IF_SUCCESS;                                                    \
1393             }                                                                  \
1394             else {                                                             \
1395                 IF_FAIL;                                                       \
1396             }                                                                  \
1397         );                                                                     \
1398     }                                                                          \
1399     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1400         goto got_it;
1401
1402 /* We know what class REx starts with.  Try to find this position... */
1403 /* if reginfo is NULL, its a dryrun */
1404 /* annoyingly all the vars in this routine have different names from their counterparts
1405    in regmatch. /grrr */
1406
1407 STATIC char *
1408 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1409     const char *strend, regmatch_info *reginfo)
1410 {
1411         dVAR;
1412         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1413         char *pat_string;   /* The pattern's exactish string */
1414         char *pat_end;      /* ptr to end char of pat_string */
1415         re_fold_t folder;       /* Function for computing non-utf8 folds */
1416         const U8 *fold_array;   /* array for folding ords < 256 */
1417         STRLEN ln;
1418         STRLEN lnc;
1419         register STRLEN uskip;
1420         U8 c1;
1421         U8 c2;
1422         char *e;
1423         register I32 tmp = 1;   /* Scratch variable? */
1424         register const bool utf8_target = PL_reg_match_utf8;
1425         UV utf8_fold_flags = 0;
1426         RXi_GET_DECL(prog,progi);
1427
1428         PERL_ARGS_ASSERT_FIND_BYCLASS;
1429
1430         /* We know what class it must start with. */
1431         switch (OP(c)) {
1432         case ANYOFV:
1433         case ANYOF:
1434             if (utf8_target || OP(c) == ANYOFV) {
1435                 STRLEN inclasslen = strend - s;
1436                 REXEC_FBC_UTF8_CLASS_SCAN(
1437                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1438             }
1439             else {
1440                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1441             }
1442             break;
1443         case CANY:
1444             REXEC_FBC_SCAN(
1445                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1446                     goto got_it;
1447                 else
1448                     tmp = doevery;
1449             );
1450             break;
1451
1452         case EXACTFA:
1453             if (UTF_PATTERN || utf8_target) {
1454                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1455                 goto do_exactf_utf8;
1456             }
1457             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1458             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1459             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1460
1461         case EXACTFU:
1462             if (UTF_PATTERN || utf8_target) {
1463                 utf8_fold_flags = 0;
1464                 goto do_exactf_utf8;
1465             }
1466
1467             /* Any 'ss' in the pattern should have been replaced by regcomp,
1468              * so we don't have to worry here about this single special case
1469              * in the Latin1 range */
1470             fold_array = PL_fold_latin1;
1471             folder = foldEQ_latin1;
1472             goto do_exactf_non_utf8;
1473
1474         case EXACTF:
1475             if (UTF_PATTERN || utf8_target) {
1476                 utf8_fold_flags = 0;
1477                 goto do_exactf_utf8;
1478             }
1479             fold_array = PL_fold;
1480             folder = foldEQ;
1481             goto do_exactf_non_utf8;
1482
1483         case EXACTFL:
1484             if (UTF_PATTERN || utf8_target) {
1485                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1486                 goto do_exactf_utf8;
1487             }
1488             fold_array = PL_fold_locale;
1489             folder = foldEQ_locale;
1490
1491             /* FALL THROUGH */
1492
1493         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1494
1495             /* The idea in the non-utf8 EXACTF* cases is to first find the
1496              * first character of the EXACTF* node and then, if necessary,
1497              * case-insensitively compare the full text of the node.  c1 is the
1498              * first character.  c2 is its fold.  This logic will not work for
1499              * Unicode semantics and the german sharp ss, which hence should
1500              * not be compiled into a node that gets here. */
1501             pat_string = STRING(c);
1502             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1503
1504             e = HOP3c(strend, -((I32)ln), s);
1505
1506             if (!reginfo && e < s) {
1507                 e = s;                  /* Due to minlen logic of intuit() */
1508             }
1509
1510             c1 = *pat_string;
1511             c2 = fold_array[c1];
1512             if (c1 == c2) { /* If char and fold are the same */
1513                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1514             }
1515             else {
1516                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1517             }
1518             break;
1519
1520         do_exactf_utf8:
1521
1522             /* If one of the operands is in utf8, we can't use the simpler
1523              * folding above, due to the fact that many different characters
1524              * can have the same fold, or portion of a fold, or different-
1525              * length fold */
1526             pat_string = STRING(c);
1527             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1528             pat_end = pat_string + ln;
1529             lnc = (UTF_PATTERN) /* length to match in characters */
1530                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1531                     : ln;
1532
1533             e = HOP3c(strend, -((I32)lnc), s);
1534
1535             if (!reginfo && e < s) {
1536                 e = s;                  /* Due to minlen logic of intuit() */
1537             }
1538
1539             while (s <= e) {
1540                 char *my_strend= (char *)strend;
1541                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1542                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1543                     && (!reginfo || regtry(reginfo, &s)) )
1544                 {
1545                     goto got_it;
1546                 }
1547                 s += UTF8SKIP(s);
1548             }
1549             break;
1550         case BOUNDL:
1551             PL_reg_flags |= RF_tainted;
1552             FBC_BOUND(isALNUM_LC,
1553                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1554                       isALNUM_LC_utf8((U8*)s));
1555             break;
1556         case NBOUNDL:
1557             PL_reg_flags |= RF_tainted;
1558             FBC_NBOUND(isALNUM_LC,
1559                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1560                        isALNUM_LC_utf8((U8*)s));
1561             break;
1562         case BOUND:
1563             FBC_BOUND(isWORDCHAR,
1564                       isALNUM_uni(tmp),
1565                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1566             break;
1567         case BOUNDA:
1568             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1569                              isWORDCHAR_A(tmp),
1570                              isWORDCHAR_A((U8*)s));
1571             break;
1572         case NBOUND:
1573             FBC_NBOUND(isWORDCHAR,
1574                        isALNUM_uni(tmp),
1575                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1576             break;
1577         case NBOUNDA:
1578             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1579                               isWORDCHAR_A(tmp),
1580                               isWORDCHAR_A((U8*)s));
1581             break;
1582         case BOUNDU:
1583             FBC_BOUND(isWORDCHAR_L1,
1584                       isALNUM_uni(tmp),
1585                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1586             break;
1587         case NBOUNDU:
1588             FBC_NBOUND(isWORDCHAR_L1,
1589                        isALNUM_uni(tmp),
1590                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1591             break;
1592         case ALNUML:
1593             REXEC_FBC_CSCAN_TAINT(
1594                 isALNUM_LC_utf8((U8*)s),
1595                 isALNUM_LC(*s)
1596             );
1597             break;
1598         case ALNUMU:
1599             REXEC_FBC_CSCAN_PRELOAD(
1600                 LOAD_UTF8_CHARCLASS_ALNUM(),
1601                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1602                 isWORDCHAR_L1((U8) *s)
1603             );
1604             break;
1605         case ALNUM:
1606             REXEC_FBC_CSCAN_PRELOAD(
1607                 LOAD_UTF8_CHARCLASS_ALNUM(),
1608                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1609                 isWORDCHAR((U8) *s)
1610             );
1611             break;
1612         case ALNUMA:
1613             /* Don't need to worry about utf8, as it can match only a single
1614              * byte invariant character */
1615             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1616             break;
1617         case NALNUMU:
1618             REXEC_FBC_CSCAN_PRELOAD(
1619                 LOAD_UTF8_CHARCLASS_ALNUM(),
1620                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1621                 ! isWORDCHAR_L1((U8) *s)
1622             );
1623             break;
1624         case NALNUM:
1625             REXEC_FBC_CSCAN_PRELOAD(
1626                 LOAD_UTF8_CHARCLASS_ALNUM(),
1627                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1628                 ! isALNUM(*s)
1629             );
1630             break;
1631         case NALNUMA:
1632             REXEC_FBC_CSCAN(
1633                 !isWORDCHAR_A(*s),
1634                 !isWORDCHAR_A(*s)
1635             );
1636             break;
1637         case NALNUML:
1638             REXEC_FBC_CSCAN_TAINT(
1639                 !isALNUM_LC_utf8((U8*)s),
1640                 !isALNUM_LC(*s)
1641             );
1642             break;
1643         case SPACEU:
1644             REXEC_FBC_CSCAN_PRELOAD(
1645                 LOAD_UTF8_CHARCLASS_SPACE(),
1646                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1647                 isSPACE_L1((U8) *s)
1648             );
1649             break;
1650         case SPACE:
1651             REXEC_FBC_CSCAN_PRELOAD(
1652                 LOAD_UTF8_CHARCLASS_SPACE(),
1653                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1654                 isSPACE((U8) *s)
1655             );
1656             break;
1657         case SPACEA:
1658             /* Don't need to worry about utf8, as it can match only a single
1659              * byte invariant character */
1660             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1661             break;
1662         case SPACEL:
1663             REXEC_FBC_CSCAN_TAINT(
1664                 isSPACE_LC_utf8((U8*)s),
1665                 isSPACE_LC(*s)
1666             );
1667             break;
1668         case NSPACEU:
1669             REXEC_FBC_CSCAN_PRELOAD(
1670                 LOAD_UTF8_CHARCLASS_SPACE(),
1671                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1672                 ! isSPACE_L1((U8) *s)
1673             );
1674             break;
1675         case NSPACE:
1676             REXEC_FBC_CSCAN_PRELOAD(
1677                 LOAD_UTF8_CHARCLASS_SPACE(),
1678                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1679                 ! isSPACE((U8) *s)
1680             );
1681             break;
1682         case NSPACEA:
1683             REXEC_FBC_CSCAN(
1684                 !isSPACE_A(*s),
1685                 !isSPACE_A(*s)
1686             );
1687             break;
1688         case NSPACEL:
1689             REXEC_FBC_CSCAN_TAINT(
1690                 !isSPACE_LC_utf8((U8*)s),
1691                 !isSPACE_LC(*s)
1692             );
1693             break;
1694         case DIGIT:
1695             REXEC_FBC_CSCAN_PRELOAD(
1696                 LOAD_UTF8_CHARCLASS_DIGIT(),
1697                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1698                 isDIGIT(*s)
1699             );
1700             break;
1701         case DIGITA:
1702             /* Don't need to worry about utf8, as it can match only a single
1703              * byte invariant character */
1704             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1705             break;
1706         case DIGITL:
1707             REXEC_FBC_CSCAN_TAINT(
1708                 isDIGIT_LC_utf8((U8*)s),
1709                 isDIGIT_LC(*s)
1710             );
1711             break;
1712         case NDIGIT:
1713             REXEC_FBC_CSCAN_PRELOAD(
1714                 LOAD_UTF8_CHARCLASS_DIGIT(),
1715                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1716                 !isDIGIT(*s)
1717             );
1718             break;
1719         case NDIGITA:
1720             REXEC_FBC_CSCAN(
1721                 !isDIGIT_A(*s),
1722                 !isDIGIT_A(*s)
1723             );
1724             break;
1725         case NDIGITL:
1726             REXEC_FBC_CSCAN_TAINT(
1727                 !isDIGIT_LC_utf8((U8*)s),
1728                 !isDIGIT_LC(*s)
1729             );
1730             break;
1731         case LNBREAK:
1732             REXEC_FBC_CSCAN(
1733                 is_LNBREAK_utf8(s),
1734                 is_LNBREAK_latin1(s)
1735             );
1736             break;
1737         case VERTWS:
1738             REXEC_FBC_CSCAN(
1739                 is_VERTWS_utf8(s),
1740                 is_VERTWS_latin1(s)
1741             );
1742             break;
1743         case NVERTWS:
1744             REXEC_FBC_CSCAN(
1745                 !is_VERTWS_utf8(s),
1746                 !is_VERTWS_latin1(s)
1747             );
1748             break;
1749         case HORIZWS:
1750             REXEC_FBC_CSCAN(
1751                 is_HORIZWS_utf8(s),
1752                 is_HORIZWS_latin1(s)
1753             );
1754             break;
1755         case NHORIZWS:
1756             REXEC_FBC_CSCAN(
1757                 !is_HORIZWS_utf8(s),
1758                 !is_HORIZWS_latin1(s)
1759             );
1760             break;
1761         case AHOCORASICKC:
1762         case AHOCORASICK:
1763             {
1764                 DECL_TRIE_TYPE(c);
1765                 /* what trie are we using right now */
1766                 reg_ac_data *aho
1767                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1768                 reg_trie_data *trie
1769                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1770                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1771
1772                 const char *last_start = strend - trie->minlen;
1773 #ifdef DEBUGGING
1774                 const char *real_start = s;
1775 #endif
1776                 STRLEN maxlen = trie->maxlen;
1777                 SV *sv_points;
1778                 U8 **points; /* map of where we were in the input string
1779                                 when reading a given char. For ASCII this
1780                                 is unnecessary overhead as the relationship
1781                                 is always 1:1, but for Unicode, especially
1782                                 case folded Unicode this is not true. */
1783                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1784                 U8 *bitmap=NULL;
1785
1786
1787                 GET_RE_DEBUG_FLAGS_DECL;
1788
1789                 /* We can't just allocate points here. We need to wrap it in
1790                  * an SV so it gets freed properly if there is a croak while
1791                  * running the match */
1792                 ENTER;
1793                 SAVETMPS;
1794                 sv_points=newSV(maxlen * sizeof(U8 *));
1795                 SvCUR_set(sv_points,
1796                     maxlen * sizeof(U8 *));
1797                 SvPOK_on(sv_points);
1798                 sv_2mortal(sv_points);
1799                 points=(U8**)SvPV_nolen(sv_points );
1800                 if ( trie_type != trie_utf8_fold
1801                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1802                 {
1803                     if (trie->bitmap)
1804                         bitmap=(U8*)trie->bitmap;
1805                     else
1806                         bitmap=(U8*)ANYOF_BITMAP(c);
1807                 }
1808                 /* this is the Aho-Corasick algorithm modified a touch
1809                    to include special handling for long "unknown char"
1810                    sequences. The basic idea being that we use AC as long
1811                    as we are dealing with a possible matching char, when
1812                    we encounter an unknown char (and we have not encountered
1813                    an accepting state) we scan forward until we find a legal
1814                    starting char.
1815                    AC matching is basically that of trie matching, except
1816                    that when we encounter a failing transition, we fall back
1817                    to the current states "fail state", and try the current char
1818                    again, a process we repeat until we reach the root state,
1819                    state 1, or a legal transition. If we fail on the root state
1820                    then we can either terminate if we have reached an accepting
1821                    state previously, or restart the entire process from the beginning
1822                    if we have not.
1823
1824                  */
1825                 while (s <= last_start) {
1826                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1827                     U8 *uc = (U8*)s;
1828                     U16 charid = 0;
1829                     U32 base = 1;
1830                     U32 state = 1;
1831                     UV uvc = 0;
1832                     STRLEN len = 0;
1833                     STRLEN foldlen = 0;
1834                     U8 *uscan = (U8*)NULL;
1835                     U8 *leftmost = NULL;
1836 #ifdef DEBUGGING
1837                     U32 accepted_word= 0;
1838 #endif
1839                     U32 pointpos = 0;
1840
1841                     while ( state && uc <= (U8*)strend ) {
1842                         int failed=0;
1843                         U32 word = aho->states[ state ].wordnum;
1844
1845                         if( state==1 ) {
1846                             if ( bitmap ) {
1847                                 DEBUG_TRIE_EXECUTE_r(
1848                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1849                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1850                                             (char *)uc, utf8_target );
1851                                         PerlIO_printf( Perl_debug_log,
1852                                             " Scanning for legal start char...\n");
1853                                     }
1854                                 );
1855                                 if (utf8_target) {
1856                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1857                                         uc += UTF8SKIP(uc);
1858                                     }
1859                                 } else {
1860                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1861                                         uc++;
1862                                     }
1863                                 }
1864                                 s= (char *)uc;
1865                             }
1866                             if (uc >(U8*)last_start) break;
1867                         }
1868
1869                         if ( word ) {
1870                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1871                             if (!leftmost || lpos < leftmost) {
1872                                 DEBUG_r(accepted_word=word);
1873                                 leftmost= lpos;
1874                             }
1875                             if (base==0) break;
1876
1877                         }
1878                         points[pointpos++ % maxlen]= uc;
1879                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1880                                              uscan, len, uvc, charid, foldlen,
1881                                              foldbuf, uniflags);
1882                         DEBUG_TRIE_EXECUTE_r({
1883                             dump_exec_pos( (char *)uc, c, strend, real_start,
1884                                 s,   utf8_target );
1885                             PerlIO_printf(Perl_debug_log,
1886                                 " Charid:%3u CP:%4"UVxf" ",
1887                                  charid, uvc);
1888                         });
1889
1890                         do {
1891 #ifdef DEBUGGING
1892                             word = aho->states[ state ].wordnum;
1893 #endif
1894                             base = aho->states[ state ].trans.base;
1895
1896                             DEBUG_TRIE_EXECUTE_r({
1897                                 if (failed)
1898                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1899                                         s,   utf8_target );
1900                                 PerlIO_printf( Perl_debug_log,
1901                                     "%sState: %4"UVxf", word=%"UVxf,
1902                                     failed ? " Fail transition to " : "",
1903                                     (UV)state, (UV)word);
1904                             });
1905                             if ( base ) {
1906                                 U32 tmp;
1907                                 I32 offset;
1908                                 if (charid &&
1909                                      ( ((offset = base + charid
1910                                         - 1 - trie->uniquecharcount)) >= 0)
1911                                      && ((U32)offset < trie->lasttrans)
1912                                      && trie->trans[offset].check == state
1913                                      && (tmp=trie->trans[offset].next))
1914                                 {
1915                                     DEBUG_TRIE_EXECUTE_r(
1916                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1917                                     state = tmp;
1918                                     break;
1919                                 }
1920                                 else {
1921                                     DEBUG_TRIE_EXECUTE_r(
1922                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1923                                     failed = 1;
1924                                     state = aho->fail[state];
1925                                 }
1926                             }
1927                             else {
1928                                 /* we must be accepting here */
1929                                 DEBUG_TRIE_EXECUTE_r(
1930                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1931                                 failed = 1;
1932                                 break;
1933                             }
1934                         } while(state);
1935                         uc += len;
1936                         if (failed) {
1937                             if (leftmost)
1938                                 break;
1939                             if (!state) state = 1;
1940                         }
1941                     }
1942                     if ( aho->states[ state ].wordnum ) {
1943                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1944                         if (!leftmost || lpos < leftmost) {
1945                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1946                             leftmost = lpos;
1947                         }
1948                     }
1949                     if (leftmost) {
1950                         s = (char*)leftmost;
1951                         DEBUG_TRIE_EXECUTE_r({
1952                             PerlIO_printf(
1953                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1954                                 (UV)accepted_word, (IV)(s - real_start)
1955                             );
1956                         });
1957                         if (!reginfo || regtry(reginfo, &s)) {
1958                             FREETMPS;
1959                             LEAVE;
1960                             goto got_it;
1961                         }
1962                         s = HOPc(s,1);
1963                         DEBUG_TRIE_EXECUTE_r({
1964                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1965                         });
1966                     } else {
1967                         DEBUG_TRIE_EXECUTE_r(
1968                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1969                         break;
1970                     }
1971                 }
1972                 FREETMPS;
1973                 LEAVE;
1974             }
1975             break;
1976         default:
1977             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1978             break;
1979         }
1980         return 0;
1981       got_it:
1982         return s;
1983 }
1984
1985
1986 /*
1987  - regexec_flags - match a regexp against a string
1988  */
1989 I32
1990 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1991               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1992 /* strend: pointer to null at end of string */
1993 /* strbeg: real beginning of string */
1994 /* minend: end of match must be >=minend after stringarg. */
1995 /* data: May be used for some additional optimizations.
1996          Currently its only used, with a U32 cast, for transmitting
1997          the ganch offset when doing a /g match. This will change */
1998 /* nosave: For optimizations. */
1999 {
2000     dVAR;
2001     struct regexp *const prog = (struct regexp *)SvANY(rx);
2002     /*register*/ char *s;
2003     register regnode *c;
2004     /*register*/ char *startpos = stringarg;
2005     I32 minlen;         /* must match at least this many chars */
2006     I32 dontbother = 0; /* how many characters not to try at end */
2007     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2008     I32 scream_pos = -1;                /* Internal iterator of scream. */
2009     char *scream_olds = NULL;
2010     const bool utf8_target = cBOOL(DO_UTF8(sv));
2011     I32 multiline;
2012     RXi_GET_DECL(prog,progi);
2013     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2014     regexp_paren_pair *swap = NULL;
2015     GET_RE_DEBUG_FLAGS_DECL;
2016
2017     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2018     PERL_UNUSED_ARG(data);
2019
2020     /* Be paranoid... */
2021     if (prog == NULL || startpos == NULL) {
2022         Perl_croak(aTHX_ "NULL regexp parameter");
2023         return 0;
2024     }
2025
2026     multiline = prog->extflags & RXf_PMf_MULTILINE;
2027     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2028
2029     RX_MATCH_UTF8_set(rx, utf8_target);
2030     DEBUG_EXECUTE_r(
2031         debug_start_match(rx, utf8_target, startpos, strend,
2032         "Matching");
2033     );
2034
2035     minlen = prog->minlen;
2036
2037     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2038         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2039                               "String too short [regexec_flags]...\n"));
2040         goto phooey;
2041     }
2042
2043
2044     /* Check validity of program. */
2045     if (UCHARAT(progi->program) != REG_MAGIC) {
2046         Perl_croak(aTHX_ "corrupted regexp program");
2047     }
2048
2049     PL_reg_flags = 0;
2050     PL_reg_eval_set = 0;
2051     PL_reg_maxiter = 0;
2052
2053     if (RX_UTF8(rx))
2054         PL_reg_flags |= RF_utf8;
2055
2056     /* Mark beginning of line for ^ and lookbehind. */
2057     reginfo.bol = startpos; /* XXX not used ??? */
2058     PL_bostr  = strbeg;
2059     reginfo.sv = sv;
2060
2061     /* Mark end of line for $ (and such) */
2062     PL_regeol = strend;
2063
2064     /* see how far we have to get to not match where we matched before */
2065     reginfo.till = startpos+minend;
2066
2067     /* If there is a "must appear" string, look for it. */
2068     s = startpos;
2069
2070     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2071         MAGIC *mg;
2072         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2073             reginfo.ganch = startpos + prog->gofs;
2074             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2075               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2076         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2077                   && SvMAGIC(sv)
2078                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2079                   && mg->mg_len >= 0) {
2080             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2081             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2082                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2083
2084             if (prog->extflags & RXf_ANCH_GPOS) {
2085                 if (s > reginfo.ganch)
2086                     goto phooey;
2087                 s = reginfo.ganch - prog->gofs;
2088                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2089                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2090                 if (s < strbeg)
2091                     goto phooey;
2092             }
2093         }
2094         else if (data) {
2095             reginfo.ganch = strbeg + PTR2UV(data);
2096             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2097                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2098
2099         } else {                                /* pos() not defined */
2100             reginfo.ganch = strbeg;
2101             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2102                  "GPOS: reginfo.ganch = strbeg\n"));
2103         }
2104     }
2105     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2106         /* We have to be careful. If the previous successful match
2107            was from this regex we don't want a subsequent partially
2108            successful match to clobber the old results.
2109            So when we detect this possibility we add a swap buffer
2110            to the re, and switch the buffer each match. If we fail
2111            we switch it back, otherwise we leave it swapped.
2112         */
2113         swap = prog->offs;
2114         /* do we need a save destructor here for eval dies? */
2115         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2116     }
2117     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2118         re_scream_pos_data d;
2119
2120         d.scream_olds = &scream_olds;
2121         d.scream_pos = &scream_pos;
2122         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2123         if (!s) {
2124             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2125             goto phooey;        /* not present */
2126         }
2127     }
2128
2129
2130
2131     /* Simplest case:  anchored match need be tried only once. */
2132     /*  [unless only anchor is BOL and multiline is set] */
2133     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2134         if (s == startpos && regtry(&reginfo, &startpos))
2135             goto got_it;
2136         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2137                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2138         {
2139             char *end;
2140
2141             if (minlen)
2142                 dontbother = minlen - 1;
2143             end = HOP3c(strend, -dontbother, strbeg) - 1;
2144             /* for multiline we only have to try after newlines */
2145             if (prog->check_substr || prog->check_utf8) {
2146                 /* because of the goto we can not easily reuse the macros for bifurcating the
2147                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2148                 if (utf8_target) {
2149                     if (s == startpos)
2150                         goto after_try_utf8;
2151                     while (1) {
2152                         if (regtry(&reginfo, &s)) {
2153                             goto got_it;
2154                         }
2155                       after_try_utf8:
2156                         if (s > end) {
2157                             goto phooey;
2158                         }
2159                         if (prog->extflags & RXf_USE_INTUIT) {
2160                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2161                             if (!s) {
2162                                 goto phooey;
2163                             }
2164                         }
2165                         else {
2166                             s += UTF8SKIP(s);
2167                         }
2168                     }
2169                 } /* end search for check string in unicode */
2170                 else {
2171                     if (s == startpos) {
2172                         goto after_try_latin;
2173                     }
2174                     while (1) {
2175                         if (regtry(&reginfo, &s)) {
2176                             goto got_it;
2177                         }
2178                       after_try_latin:
2179                         if (s > end) {
2180                             goto phooey;
2181                         }
2182                         if (prog->extflags & RXf_USE_INTUIT) {
2183                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2184                             if (!s) {
2185                                 goto phooey;
2186                             }
2187                         }
2188                         else {
2189                             s++;
2190                         }
2191                     }
2192                 } /* end search for check string in latin*/
2193             } /* end search for check string */
2194             else { /* search for newline */
2195                 if (s > startpos) {
2196                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2197                     s--;
2198                 }
2199                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2200                 while (s < end) {
2201                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2202                         if (regtry(&reginfo, &s))
2203                             goto got_it;
2204                     }
2205                 }
2206             } /* end search for newline */
2207         } /* end anchored/multiline check string search */
2208         goto phooey;
2209     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2210     {
2211         /* the warning about reginfo.ganch being used without initialization
2212            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2213            and we only enter this block when the same bit is set. */
2214         char *tmp_s = reginfo.ganch - prog->gofs;
2215
2216         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2217             goto got_it;
2218         goto phooey;
2219     }
2220
2221     /* Messy cases:  unanchored match. */
2222     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2223         /* we have /x+whatever/ */
2224         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2225         char ch;
2226 #ifdef DEBUGGING
2227         int did_match = 0;
2228 #endif
2229         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2230             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2231         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2232
2233         if (utf8_target) {
2234             REXEC_FBC_SCAN(
2235                 if (*s == ch) {
2236                     DEBUG_EXECUTE_r( did_match = 1 );
2237                     if (regtry(&reginfo, &s)) goto got_it;
2238                     s += UTF8SKIP(s);
2239                     while (s < strend && *s == ch)
2240                         s += UTF8SKIP(s);
2241                 }
2242             );
2243         }
2244         else {
2245             REXEC_FBC_SCAN(
2246                 if (*s == ch) {
2247                     DEBUG_EXECUTE_r( did_match = 1 );
2248                     if (regtry(&reginfo, &s)) goto got_it;
2249                     s++;
2250                     while (s < strend && *s == ch)
2251                         s++;
2252                 }
2253             );
2254         }
2255         DEBUG_EXECUTE_r(if (!did_match)
2256                 PerlIO_printf(Perl_debug_log,
2257                                   "Did not find anchored character...\n")
2258                );
2259     }
2260     else if (prog->anchored_substr != NULL
2261               || prog->anchored_utf8 != NULL
2262               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2263                   && prog->float_max_offset < strend - s)) {
2264         SV *must;
2265         I32 back_max;
2266         I32 back_min;
2267         char *last;
2268         char *last1;            /* Last position checked before */
2269 #ifdef DEBUGGING
2270         int did_match = 0;
2271 #endif
2272         if (prog->anchored_substr || prog->anchored_utf8) {
2273             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2274                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2275             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2276             back_max = back_min = prog->anchored_offset;
2277         } else {
2278             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2279                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2280             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2281             back_max = prog->float_max_offset;
2282             back_min = prog->float_min_offset;
2283         }
2284
2285
2286         if (must == &PL_sv_undef)
2287             /* could not downgrade utf8 check substring, so must fail */
2288             goto phooey;
2289
2290         if (back_min<0) {
2291             last = strend;
2292         } else {
2293             last = HOP3c(strend,        /* Cannot start after this */
2294                   -(I32)(CHR_SVLEN(must)
2295                          - (SvTAIL(must) != 0) + back_min), strbeg);
2296         }
2297         if (s > PL_bostr)
2298             last1 = HOPc(s, -1);
2299         else
2300             last1 = s - 1;      /* bogus */
2301
2302         /* XXXX check_substr already used to find "s", can optimize if
2303            check_substr==must. */
2304         scream_pos = -1;
2305         dontbother = end_shift;
2306         strend = HOPc(strend, -dontbother);
2307         while ( (s <= last) &&
2308                 ((flags & REXEC_SCREAM) && SvSCREAM(sv)
2309                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2310                                     end_shift, &scream_pos, 0))
2311                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2312                                   (unsigned char*)strend, must,
2313                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2314             /* we may be pointing at the wrong string */
2315             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2316                 s = strbeg + (s - SvPVX_const(sv));
2317             DEBUG_EXECUTE_r( did_match = 1 );
2318             if (HOPc(s, -back_max) > last1) {
2319                 last1 = HOPc(s, -back_min);
2320                 s = HOPc(s, -back_max);
2321             }
2322             else {
2323                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2324
2325                 last1 = HOPc(s, -back_min);
2326                 s = t;
2327             }
2328             if (utf8_target) {
2329                 while (s <= last1) {
2330                     if (regtry(&reginfo, &s))
2331                         goto got_it;
2332                     s += UTF8SKIP(s);
2333                 }
2334             }
2335             else {
2336                 while (s <= last1) {
2337                     if (regtry(&reginfo, &s))
2338                         goto got_it;
2339                     s++;
2340                 }
2341             }
2342         }
2343         DEBUG_EXECUTE_r(if (!did_match) {
2344             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2345                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2346             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2347                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2348                                ? "anchored" : "floating"),
2349                 quoted, RE_SV_TAIL(must));
2350         });
2351         goto phooey;
2352     }
2353     else if ( (c = progi->regstclass) ) {
2354         if (minlen) {
2355             const OPCODE op = OP(progi->regstclass);
2356             /* don't bother with what can't match */
2357             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2358                 strend = HOPc(strend, -(minlen - 1));
2359         }
2360         DEBUG_EXECUTE_r({
2361             SV * const prop = sv_newmortal();
2362             regprop(prog, prop, c);
2363             {
2364                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2365                     s,strend-s,60);
2366                 PerlIO_printf(Perl_debug_log,
2367                     "Matching stclass %.*s against %s (%d bytes)\n",
2368                     (int)SvCUR(prop), SvPVX_const(prop),
2369                      quoted, (int)(strend - s));
2370             }
2371         });
2372         if (find_byclass(prog, c, s, strend, &reginfo))
2373             goto got_it;
2374         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2375     }
2376     else {
2377         dontbother = 0;
2378         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2379             /* Trim the end. */
2380             char *last;
2381             SV* float_real;
2382
2383             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2384                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2385             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2386
2387             if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
2388                 last = screaminstr(sv, float_real, s - strbeg,
2389                                    end_shift, &scream_pos, 1); /* last one */
2390                 if (!last)
2391                     last = scream_olds; /* Only one occurrence. */
2392                 /* we may be pointing at the wrong string */
2393                 else if (RXp_MATCH_COPIED(prog))
2394                     s = strbeg + (s - SvPVX_const(sv));
2395             }
2396             else {
2397                 STRLEN len;
2398                 const char * const little = SvPV_const(float_real, len);
2399
2400                 if (SvTAIL(float_real)) {
2401                     if (memEQ(strend - len + 1, little, len - 1))
2402                         last = strend - len + 1;
2403                     else if (!multiline)
2404                         last = memEQ(strend - len, little, len)
2405                             ? strend - len : NULL;
2406                     else
2407                         goto find_last;
2408                 } else {
2409                   find_last:
2410                     if (len)
2411                         last = rninstr(s, strend, little, little + len);
2412                     else
2413                         last = strend;  /* matching "$" */
2414                 }
2415             }
2416             if (last == NULL) {
2417                 DEBUG_EXECUTE_r(
2418                     PerlIO_printf(Perl_debug_log,
2419                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2420                         PL_colors[4], PL_colors[5]));
2421                 goto phooey; /* Should not happen! */
2422             }
2423             dontbother = strend - last + prog->float_min_offset;
2424         }
2425         if (minlen && (dontbother < minlen))
2426             dontbother = minlen - 1;
2427         strend -= dontbother;              /* this one's always in bytes! */
2428         /* We don't know much -- general case. */
2429         if (utf8_target) {
2430             for (;;) {
2431                 if (regtry(&reginfo, &s))
2432                     goto got_it;
2433                 if (s >= strend)
2434                     break;
2435                 s += UTF8SKIP(s);
2436             };
2437         }
2438         else {
2439             do {
2440                 if (regtry(&reginfo, &s))
2441                     goto got_it;
2442             } while (s++ < strend);
2443         }
2444     }
2445
2446     /* Failure. */
2447     goto phooey;
2448
2449 got_it:
2450     Safefree(swap);
2451     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2452
2453     if (PL_reg_eval_set)
2454         restore_pos(aTHX_ prog);
2455     if (RXp_PAREN_NAMES(prog))
2456         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2457
2458     /* make sure $`, $&, $', and $digit will work later */
2459     if ( !(flags & REXEC_NOT_FIRST) ) {
2460         RX_MATCH_COPY_FREE(rx);
2461         if (flags & REXEC_COPY_STR) {
2462             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2463 #ifdef PERL_OLD_COPY_ON_WRITE
2464             if ((SvIsCOW(sv)
2465                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2466                 if (DEBUG_C_TEST) {
2467                     PerlIO_printf(Perl_debug_log,
2468                                   "Copy on write: regexp capture, type %d\n",
2469                                   (int) SvTYPE(sv));
2470                 }
2471                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2472                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2473                 assert (SvPOKp(prog->saved_copy));
2474             } else
2475 #endif
2476             {
2477                 RX_MATCH_COPIED_on(rx);
2478                 s = savepvn(strbeg, i);
2479                 prog->subbeg = s;
2480             }
2481             prog->sublen = i;
2482         }
2483         else {
2484             prog->subbeg = strbeg;
2485             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2486         }
2487     }
2488
2489     return 1;
2490
2491 phooey:
2492     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2493                           PL_colors[4], PL_colors[5]));
2494     if (PL_reg_eval_set)
2495         restore_pos(aTHX_ prog);
2496     if (swap) {
2497         /* we failed :-( roll it back */
2498         Safefree(prog->offs);
2499         prog->offs = swap;
2500     }
2501
2502     return 0;
2503 }
2504
2505
2506 /*
2507  - regtry - try match at specific point
2508  */
2509 STATIC I32                      /* 0 failure, 1 success */
2510 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2511 {
2512     dVAR;
2513     CHECKPOINT lastcp;
2514     REGEXP *const rx = reginfo->prog;
2515     regexp *const prog = (struct regexp *)SvANY(rx);
2516     RXi_GET_DECL(prog,progi);
2517     GET_RE_DEBUG_FLAGS_DECL;
2518
2519     PERL_ARGS_ASSERT_REGTRY;
2520
2521     reginfo->cutpoint=NULL;
2522
2523     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2524         MAGIC *mg;
2525
2526         PL_reg_eval_set = RS_init;
2527         DEBUG_EXECUTE_r(DEBUG_s(
2528             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2529                           (IV)(PL_stack_sp - PL_stack_base));
2530             ));
2531         SAVESTACK_CXPOS();
2532         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2533         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2534         SAVETMPS;
2535         /* Apparently this is not needed, judging by wantarray. */
2536         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2537            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2538
2539         if (reginfo->sv) {
2540             /* Make $_ available to executed code. */
2541             if (reginfo->sv != DEFSV) {
2542                 SAVE_DEFSV;
2543                 DEFSV_set(reginfo->sv);
2544             }
2545
2546             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2547                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2548                 /* prepare for quick setting of pos */
2549 #ifdef PERL_OLD_COPY_ON_WRITE
2550                 if (SvIsCOW(reginfo->sv))
2551                     sv_force_normal_flags(reginfo->sv, 0);
2552 #endif
2553                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2554                                  &PL_vtbl_mglob, NULL, 0);
2555                 mg->mg_len = -1;
2556             }
2557             PL_reg_magic    = mg;
2558             PL_reg_oldpos   = mg->mg_len;
2559             SAVEDESTRUCTOR_X(restore_pos, prog);
2560         }
2561         if (!PL_reg_curpm) {
2562             Newxz(PL_reg_curpm, 1, PMOP);
2563 #ifdef USE_ITHREADS
2564             {
2565                 SV* const repointer = &PL_sv_undef;
2566                 /* this regexp is also owned by the new PL_reg_curpm, which
2567                    will try to free it.  */
2568                 av_push(PL_regex_padav, repointer);
2569                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2570                 PL_regex_pad = AvARRAY(PL_regex_padav);
2571             }
2572 #endif
2573         }
2574 #ifdef USE_ITHREADS
2575         /* It seems that non-ithreads works both with and without this code.
2576            So for efficiency reasons it seems best not to have the code
2577            compiled when it is not needed.  */
2578         /* This is safe against NULLs: */
2579         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2580         /* PM_reg_curpm owns a reference to this regexp.  */
2581         (void)ReREFCNT_inc(rx);
2582 #endif
2583         PM_SETRE(PL_reg_curpm, rx);
2584         PL_reg_oldcurpm = PL_curpm;
2585         PL_curpm = PL_reg_curpm;
2586         if (RXp_MATCH_COPIED(prog)) {
2587             /*  Here is a serious problem: we cannot rewrite subbeg,
2588                 since it may be needed if this match fails.  Thus
2589                 $` inside (?{}) could fail... */
2590             PL_reg_oldsaved = prog->subbeg;
2591             PL_reg_oldsavedlen = prog->sublen;
2592 #ifdef PERL_OLD_COPY_ON_WRITE
2593             PL_nrs = prog->saved_copy;
2594 #endif
2595             RXp_MATCH_COPIED_off(prog);
2596         }
2597         else
2598             PL_reg_oldsaved = NULL;
2599         prog->subbeg = PL_bostr;
2600         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2601     }
2602     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2603     prog->offs[0].start = *startpos - PL_bostr;
2604     PL_reginput = *startpos;
2605     PL_reglastparen = &prog->lastparen;
2606     PL_reglastcloseparen = &prog->lastcloseparen;
2607     prog->lastparen = 0;
2608     prog->lastcloseparen = 0;
2609     PL_regsize = 0;
2610     PL_regoffs = prog->offs;
2611     if (PL_reg_start_tmpl <= prog->nparens) {
2612         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2613         if(PL_reg_start_tmp)
2614             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2615         else
2616             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2617     }
2618
2619     /* XXXX What this code is doing here?!!!  There should be no need
2620        to do this again and again, PL_reglastparen should take care of
2621        this!  --ilya*/
2622
2623     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2624      * Actually, the code in regcppop() (which Ilya may be meaning by
2625      * PL_reglastparen), is not needed at all by the test suite
2626      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2627      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2628      * Meanwhile, this code *is* needed for the
2629      * above-mentioned test suite tests to succeed.  The common theme
2630      * on those tests seems to be returning null fields from matches.
2631      * --jhi updated by dapm */
2632 #if 1
2633     if (prog->nparens) {
2634         regexp_paren_pair *pp = PL_regoffs;
2635         register I32 i;
2636         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2637             ++pp;
2638             pp->start = -1;
2639             pp->end = -1;
2640         }
2641     }
2642 #endif
2643     REGCP_SET(lastcp);
2644     if (regmatch(reginfo, progi->program + 1)) {
2645         PL_regoffs[0].end = PL_reginput - PL_bostr;
2646         return 1;
2647     }
2648     if (reginfo->cutpoint)
2649         *startpos= reginfo->cutpoint;
2650     REGCP_UNWIND(lastcp);
2651     return 0;
2652 }
2653
2654
2655 #define sayYES goto yes
2656 #define sayNO goto no
2657 #define sayNO_SILENT goto no_silent
2658
2659 /* we dont use STMT_START/END here because it leads to
2660    "unreachable code" warnings, which are bogus, but distracting. */
2661 #define CACHEsayNO \
2662     if (ST.cache_mask) \
2663        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2664     sayNO
2665
2666 /* this is used to determine how far from the left messages like
2667    'failed...' are printed. It should be set such that messages
2668    are inline with the regop output that created them.
2669 */
2670 #define REPORT_CODE_OFF 32
2671
2672
2673 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2674 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2675
2676 #define SLAB_FIRST(s) (&(s)->states[0])
2677 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2678
2679 /* grab a new slab and return the first slot in it */
2680
2681 STATIC regmatch_state *
2682 S_push_slab(pTHX)
2683 {
2684 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2685     dMY_CXT;
2686 #endif
2687     regmatch_slab *s = PL_regmatch_slab->next;
2688     if (!s) {
2689         Newx(s, 1, regmatch_slab);
2690         s->prev = PL_regmatch_slab;
2691         s->next = NULL;
2692         PL_regmatch_slab->next = s;
2693     }
2694     PL_regmatch_slab = s;
2695     return SLAB_FIRST(s);
2696 }
2697
2698
2699 /* push a new state then goto it */
2700
2701 #define PUSH_STATE_GOTO(state, node) \
2702     scan = node; \
2703     st->resume_state = state; \
2704     goto push_state;
2705
2706 /* push a new state with success backtracking, then goto it */
2707
2708 #define PUSH_YES_STATE_GOTO(state, node) \
2709     scan = node; \
2710     st->resume_state = state; \
2711     goto push_yes_state;
2712
2713
2714
2715 /*
2716
2717 regmatch() - main matching routine
2718
2719 This is basically one big switch statement in a loop. We execute an op,
2720 set 'next' to point the next op, and continue. If we come to a point which
2721 we may need to backtrack to on failure such as (A|B|C), we push a
2722 backtrack state onto the backtrack stack. On failure, we pop the top
2723 state, and re-enter the loop at the state indicated. If there are no more
2724 states to pop, we return failure.
2725
2726 Sometimes we also need to backtrack on success; for example /A+/, where
2727 after successfully matching one A, we need to go back and try to
2728 match another one; similarly for lookahead assertions: if the assertion
2729 completes successfully, we backtrack to the state just before the assertion
2730 and then carry on.  In these cases, the pushed state is marked as
2731 'backtrack on success too'. This marking is in fact done by a chain of
2732 pointers, each pointing to the previous 'yes' state. On success, we pop to
2733 the nearest yes state, discarding any intermediate failure-only states.
2734 Sometimes a yes state is pushed just to force some cleanup code to be
2735 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2736 it to free the inner regex.
2737
2738 Note that failure backtracking rewinds the cursor position, while
2739 success backtracking leaves it alone.
2740
2741 A pattern is complete when the END op is executed, while a subpattern
2742 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2743 ops trigger the "pop to last yes state if any, otherwise return true"
2744 behaviour.
2745
2746 A common convention in this function is to use A and B to refer to the two
2747 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2748 the subpattern to be matched possibly multiple times, while B is the entire
2749 rest of the pattern. Variable and state names reflect this convention.
2750
2751 The states in the main switch are the union of ops and failure/success of
2752 substates associated with with that op.  For example, IFMATCH is the op
2753 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2754 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2755 successfully matched A and IFMATCH_A_fail is a state saying that we have
2756 just failed to match A. Resume states always come in pairs. The backtrack
2757 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2758 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2759 on success or failure.
2760
2761 The struct that holds a backtracking state is actually a big union, with
2762 one variant for each major type of op. The variable st points to the
2763 top-most backtrack struct. To make the code clearer, within each
2764 block of code we #define ST to alias the relevant union.
2765
2766 Here's a concrete example of a (vastly oversimplified) IFMATCH
2767 implementation:
2768
2769     switch (state) {
2770     ....
2771
2772 #define ST st->u.ifmatch
2773
2774     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2775         ST.foo = ...; // some state we wish to save
2776         ...
2777         // push a yes backtrack state with a resume value of
2778         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2779         // first node of A:
2780         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2781         // NOTREACHED
2782
2783     case IFMATCH_A: // we have successfully executed A; now continue with B
2784         next = B;
2785         bar = ST.foo; // do something with the preserved value
2786         break;
2787
2788     case IFMATCH_A_fail: // A failed, so the assertion failed
2789         ...;   // do some housekeeping, then ...
2790         sayNO; // propagate the failure
2791
2792 #undef ST
2793
2794     ...
2795     }
2796
2797 For any old-timers reading this who are familiar with the old recursive
2798 approach, the code above is equivalent to:
2799
2800     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2801     {
2802         int foo = ...
2803         ...
2804         if (regmatch(A)) {
2805             next = B;
2806             bar = foo;
2807             break;
2808         }
2809         ...;   // do some housekeeping, then ...
2810         sayNO; // propagate the failure
2811     }
2812
2813 The topmost backtrack state, pointed to by st, is usually free. If you
2814 want to claim it, populate any ST.foo fields in it with values you wish to
2815 save, then do one of
2816
2817         PUSH_STATE_GOTO(resume_state, node);
2818         PUSH_YES_STATE_GOTO(resume_state, node);
2819
2820 which sets that backtrack state's resume value to 'resume_state', pushes a
2821 new free entry to the top of the backtrack stack, then goes to 'node'.
2822 On backtracking, the free slot is popped, and the saved state becomes the
2823 new free state. An ST.foo field in this new top state can be temporarily
2824 accessed to retrieve values, but once the main loop is re-entered, it
2825 becomes available for reuse.
2826
2827 Note that the depth of the backtrack stack constantly increases during the
2828 left-to-right execution of the pattern, rather than going up and down with
2829 the pattern nesting. For example the stack is at its maximum at Z at the
2830 end of the pattern, rather than at X in the following:
2831
2832     /(((X)+)+)+....(Y)+....Z/
2833
2834 The only exceptions to this are lookahead/behind assertions and the cut,
2835 (?>A), which pop all the backtrack states associated with A before
2836 continuing.
2837
2838 Backtrack state structs are allocated in slabs of about 4K in size.
2839 PL_regmatch_state and st always point to the currently active state,
2840 and PL_regmatch_slab points to the slab currently containing
2841 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2842 allocated, and is never freed until interpreter destruction. When the slab
2843 is full, a new one is allocated and chained to the end. At exit from
2844 regmatch(), slabs allocated since entry are freed.
2845
2846 */
2847
2848
2849 #define DEBUG_STATE_pp(pp)                                  \
2850     DEBUG_STATE_r({                                         \
2851         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2852         PerlIO_printf(Perl_debug_log,                       \
2853             "    %*s"pp" %s%s%s%s%s\n",                     \
2854             depth*2, "",                                    \
2855             PL_reg_name[st->resume_state],                     \
2856             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2857             ((st==yes_state) ? "Y" : ""),                   \
2858             ((st==mark_state) ? "M" : ""),                  \
2859             ((st==yes_state||st==mark_state) ? "]" : "")    \
2860         );                                                  \
2861     });
2862
2863
2864 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2865
2866 #ifdef DEBUGGING
2867
2868 STATIC void
2869 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2870     const char *start, const char *end, const char *blurb)
2871 {
2872     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2873
2874     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2875
2876     if (!PL_colorset)
2877             reginitcolors();
2878     {
2879         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2880             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2881
2882         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2883             start, end - start, 60);
2884
2885         PerlIO_printf(Perl_debug_log,
2886             "%s%s REx%s %s against %s\n",
2887                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2888
2889         if (utf8_target||utf8_pat)
2890             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2891                 utf8_pat ? "pattern" : "",
2892                 utf8_pat && utf8_target ? " and " : "",
2893                 utf8_target ? "string" : ""
2894             );
2895     }
2896 }
2897
2898 STATIC void
2899 S_dump_exec_pos(pTHX_ const char *locinput,
2900                       const regnode *scan,
2901                       const char *loc_regeol,
2902                       const char *loc_bostr,
2903                       const char *loc_reg_starttry,
2904                       const bool utf8_target)
2905 {
2906     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2907     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2908     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2909     /* The part of the string before starttry has one color
2910        (pref0_len chars), between starttry and current
2911        position another one (pref_len - pref0_len chars),
2912        after the current position the third one.
2913        We assume that pref0_len <= pref_len, otherwise we
2914        decrease pref0_len.  */
2915     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2916         ? (5 + taill) - l : locinput - loc_bostr;
2917     int pref0_len;
2918
2919     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2920
2921     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2922         pref_len++;
2923     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2924     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2925         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2926               ? (5 + taill) - pref_len : loc_regeol - locinput);
2927     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2928         l--;
2929     if (pref0_len < 0)
2930         pref0_len = 0;
2931     if (pref0_len > pref_len)
2932         pref0_len = pref_len;
2933     {
2934         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2935
2936         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2937             (locinput - pref_len),pref0_len, 60, 4, 5);
2938
2939         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2940                     (locinput - pref_len + pref0_len),
2941                     pref_len - pref0_len, 60, 2, 3);
2942
2943         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2944                     locinput, loc_regeol - locinput, 10, 0, 1);
2945
2946         const STRLEN tlen=len0+len1+len2;
2947         PerlIO_printf(Perl_debug_log,
2948                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2949                     (IV)(locinput - loc_bostr),
2950                     len0, s0,
2951                     len1, s1,
2952                     (docolor ? "" : "> <"),
2953                     len2, s2,
2954                     (int)(tlen > 19 ? 0 :  19 - tlen),
2955                     "");
2956     }
2957 }
2958
2959 #endif
2960
2961 /* reg_check_named_buff_matched()
2962  * Checks to see if a named buffer has matched. The data array of
2963  * buffer numbers corresponding to the buffer is expected to reside
2964  * in the regexp->data->data array in the slot stored in the ARG() of
2965  * node involved. Note that this routine doesn't actually care about the
2966  * name, that information is not preserved from compilation to execution.
2967  * Returns the index of the leftmost defined buffer with the given name
2968  * or 0 if non of the buffers matched.
2969  */
2970 STATIC I32
2971 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2972 {
2973     I32 n;
2974     RXi_GET_DECL(rex,rexi);
2975     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2976     I32 *nums=(I32*)SvPVX(sv_dat);
2977
2978     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2979
2980     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2981         if ((I32)*PL_reglastparen >= nums[n] &&
2982             PL_regoffs[nums[n]].end != -1)
2983         {
2984             return nums[n];
2985         }
2986     }
2987     return 0;
2988 }
2989
2990
2991 /* free all slabs above current one  - called during LEAVE_SCOPE */
2992
2993 STATIC void
2994 S_clear_backtrack_stack(pTHX_ void *p)
2995 {
2996     regmatch_slab *s = PL_regmatch_slab->next;
2997     PERL_UNUSED_ARG(p);
2998
2999     if (!s)
3000         return;
3001     PL_regmatch_slab->next = NULL;
3002     while (s) {
3003         regmatch_slab * const osl = s;
3004         s = s->next;
3005         Safefree(osl);
3006     }
3007 }
3008
3009
3010 #define SETREX(Re1,Re2) \
3011     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3012     Re1 = (Re2)
3013
3014 STATIC I32                      /* 0 failure, 1 success */
3015 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3016 {
3017 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3018     dMY_CXT;
3019 #endif
3020     dVAR;
3021     register const bool utf8_target = PL_reg_match_utf8;
3022     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3023     REGEXP *rex_sv = reginfo->prog;
3024     regexp *rex = (struct regexp *)SvANY(rex_sv);
3025     RXi_GET_DECL(rex,rexi);
3026     I32 oldsave;
3027     /* the current state. This is a cached copy of PL_regmatch_state */
3028     register regmatch_state *st;
3029     /* cache heavy used fields of st in registers */
3030     register regnode *scan;
3031     register regnode *next;
3032     register U32 n = 0; /* general value; init to avoid compiler warning */
3033     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3034     register char *locinput = PL_reginput;
3035     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3036
3037     bool result = 0;        /* return value of S_regmatch */
3038     int depth = 0;          /* depth of backtrack stack */
3039     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3040     const U32 max_nochange_depth =
3041         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3042         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3043     regmatch_state *yes_state = NULL; /* state to pop to on success of
3044                                                             subpattern */
3045     /* mark_state piggy backs on the yes_state logic so that when we unwind
3046        the stack on success we can update the mark_state as we go */
3047     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3048     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3049     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3050     U32 state_num;
3051     bool no_final = 0;      /* prevent failure from backtracking? */
3052     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3053     char *startpoint = PL_reginput;
3054     SV *popmark = NULL;     /* are we looking for a mark? */
3055     SV *sv_commit = NULL;   /* last mark name seen in failure */
3056     SV *sv_yes_mark = NULL; /* last mark name we have seen
3057                                during a successful match */
3058     U32 lastopen = 0;       /* last open we saw */
3059     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3060     SV* const oreplsv = GvSV(PL_replgv);
3061     /* these three flags are set by various ops to signal information to
3062      * the very next op. They have a useful lifetime of exactly one loop
3063      * iteration, and are not preserved or restored by state pushes/pops
3064      */
3065     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3066     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3067     int logical = 0;        /* the following EVAL is:
3068                                 0: (?{...})
3069                                 1: (?(?{...})X|Y)
3070                                 2: (??{...})
3071                                or the following IFMATCH/UNLESSM is:
3072                                 false: plain (?=foo)
3073                                 true:  used as a condition: (?(?=foo))
3074                             */
3075 #ifdef DEBUGGING
3076     GET_RE_DEBUG_FLAGS_DECL;
3077 #endif
3078
3079     PERL_ARGS_ASSERT_REGMATCH;
3080
3081     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3082             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3083     }));
3084     /* on first ever call to regmatch, allocate first slab */
3085     if (!PL_regmatch_slab) {
3086         Newx(PL_regmatch_slab, 1, regmatch_slab);
3087         PL_regmatch_slab->prev = NULL;
3088         PL_regmatch_slab->next = NULL;
3089         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3090     }
3091
3092     oldsave = PL_savestack_ix;
3093     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3094     SAVEVPTR(PL_regmatch_slab);
3095     SAVEVPTR(PL_regmatch_state);
3096
3097     /* grab next free state slot */
3098     st = ++PL_regmatch_state;
3099     if (st >  SLAB_LAST(PL_regmatch_slab))
3100         st = PL_regmatch_state = S_push_slab(aTHX);
3101
3102     /* Note that nextchr is a byte even in UTF */
3103     nextchr = UCHARAT(locinput);
3104     scan = prog;
3105     while (scan != NULL) {
3106
3107         DEBUG_EXECUTE_r( {
3108             SV * const prop = sv_newmortal();
3109             regnode *rnext=regnext(scan);
3110             DUMP_EXEC_POS( locinput, scan, utf8_target );
3111             regprop(rex, prop, scan);
3112
3113             PerlIO_printf(Perl_debug_log,
3114                     "%3"IVdf":%*s%s(%"IVdf")\n",
3115                     (IV)(scan - rexi->program), depth*2, "",
3116                     SvPVX_const(prop),
3117                     (PL_regkind[OP(scan)] == END || !rnext) ?
3118                         0 : (IV)(rnext - rexi->program));
3119         });
3120
3121         next = scan + NEXT_OFF(scan);
3122         if (next == scan)
3123             next = NULL;
3124         state_num = OP(scan);
3125
3126         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3127       reenter_switch:
3128
3129         assert(PL_reglastparen == &rex->lastparen);
3130         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3131         assert(PL_regoffs == rex->offs);
3132
3133         switch (state_num) {
3134         case BOL:
3135             if (locinput == PL_bostr)
3136             {
3137                 /* reginfo->till = reginfo->bol; */
3138                 break;
3139             }
3140             sayNO;
3141         case MBOL:
3142             if (locinput == PL_bostr ||
3143                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3144             {
3145                 break;
3146             }
3147             sayNO;
3148         case SBOL:
3149             if (locinput == PL_bostr)
3150                 break;
3151             sayNO;
3152         case GPOS:
3153             if (locinput == reginfo->ganch)
3154                 break;
3155             sayNO;
3156
3157         case KEEPS:
3158             /* update the startpoint */
3159             st->u.keeper.val = PL_regoffs[0].start;
3160             PL_reginput = locinput;
3161             PL_regoffs[0].start = locinput - PL_bostr;
3162             PUSH_STATE_GOTO(KEEPS_next, next);
3163             /*NOT-REACHED*/
3164         case KEEPS_next_fail:
3165             /* rollback the start point change */
3166             PL_regoffs[0].start = st->u.keeper.val;
3167             sayNO_SILENT;
3168             /*NOT-REACHED*/
3169         case EOL:
3170                 goto seol;
3171         case MEOL:
3172             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3173                 sayNO;
3174             break;
3175         case SEOL:
3176           seol:
3177             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3178                 sayNO;
3179             if (PL_regeol - locinput > 1)
3180                 sayNO;
3181             break;
3182         case EOS:
3183             if (PL_regeol != locinput)
3184                 sayNO;
3185             break;
3186         case SANY:
3187             if (!nextchr && locinput >= PL_regeol)
3188                 sayNO;
3189             if (utf8_target) {
3190                 locinput += PL_utf8skip[nextchr];
3191                 if (locinput > PL_regeol)
3192                     sayNO;
3193                 nextchr = UCHARAT(locinput);
3194             }
3195             else
3196                 nextchr = UCHARAT(++locinput);
3197             break;
3198         case CANY:
3199             if (!nextchr && locinput >= PL_regeol)
3200                 sayNO;
3201             nextchr = UCHARAT(++locinput);
3202             break;
3203         case REG_ANY:
3204             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3205                 sayNO;
3206             if (utf8_target) {
3207                 locinput += PL_utf8skip[nextchr];
3208                 if (locinput > PL_regeol)
3209                     sayNO;
3210                 nextchr = UCHARAT(locinput);
3211             }
3212             else
3213                 nextchr = UCHARAT(++locinput);
3214             break;
3215
3216 #undef  ST
3217 #define ST st->u.trie
3218         case TRIEC:
3219             /* In this case the charclass data is available inline so
3220                we can fail fast without a lot of extra overhead.
3221              */
3222             if (scan->flags == EXACT || !utf8_target) {
3223                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3224                     DEBUG_EXECUTE_r(
3225                         PerlIO_printf(Perl_debug_log,
3226                                   "%*s  %sfailed to match trie start class...%s\n",
3227                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3228                     );
3229                     sayNO_SILENT;
3230                     /* NOTREACHED */
3231                 }
3232             }
3233             /* FALL THROUGH */
3234         case TRIE:
3235             /* the basic plan of execution of the trie is:
3236              * At the beginning, run though all the states, and
3237              * find the longest-matching word. Also remember the position
3238              * of the shortest matching word. For example, this pattern:
3239              *    1  2 3 4    5
3240              *    ab|a|x|abcd|abc
3241              * when matched against the string "abcde", will generate
3242              * accept states for all words except 3, with the longest
3243              * matching word being 4, and the shortest being 1 (with
3244              * the position being after char 1 of the string).
3245              *
3246              * Then for each matching word, in word order (i.e. 1,2,4,5),
3247              * we run the remainder of the pattern; on each try setting
3248              * the current position to the character following the word,
3249              * returning to try the next word on failure.
3250              *
3251              * We avoid having to build a list of words at runtime by
3252              * using a compile-time structure, wordinfo[].prev, which
3253              * gives, for each word, the previous accepting word (if any).
3254              * In the case above it would contain the mappings 1->2, 2->0,
3255              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3256              * the longest word (4 above), a list of all words, by
3257              * following the list of prev pointers; this gives us the
3258              * unordered list 4,5,1,2. Then given the current word we have
3259              * just tried, we can go through the list and find the
3260              * next-biggest word to try (so if we just failed on word 2,
3261              * the next in the list is 4).
3262              *
3263              * Since at runtime we don't record the matching position in
3264              * the string for each word, we have to work that out for
3265              * each word we're about to process. The wordinfo table holds
3266              * the character length of each word; given that we recorded
3267              * at the start: the position of the shortest word and its
3268              * length in chars, we just need to move the pointer the
3269              * difference between the two char lengths. Depending on
3270              * Unicode status and folding, that's cheap or expensive.
3271              *
3272              * This algorithm is optimised for the case where are only a
3273              * small number of accept states, i.e. 0,1, or maybe 2.
3274              * With lots of accepts states, and having to try all of them,
3275              * it becomes quadratic on number of accept states to find all
3276              * the next words.
3277              */
3278
3279             {
3280                 /* what type of TRIE am I? (utf8 makes this contextual) */
3281                 DECL_TRIE_TYPE(scan);
3282
3283                 /* what trie are we using right now */
3284                 reg_trie_data * const trie
3285                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3286                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3287                 U32 state = trie->startstate;
3288
3289                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3290                     !TRIE_BITMAP_TEST(trie,*locinput)
3291                 ) {
3292                     if (trie->states[ state ].wordnum) {
3293                          DEBUG_EXECUTE_r(
3294                             PerlIO_printf(Perl_debug_log,
3295                                           "%*s  %smatched empty string...%s\n",
3296                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3297                         );
3298                         if (!trie->jump)
3299                             break;
3300                     } else {
3301                         DEBUG_EXECUTE_r(
3302                             PerlIO_printf(Perl_debug_log,
3303                                           "%*s  %sfailed to match trie start class...%s\n",
3304                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3305                         );
3306                         sayNO_SILENT;
3307                    }
3308                 }
3309
3310             {
3311                 U8 *uc = ( U8* )locinput;
3312
3313                 STRLEN len = 0;
3314                 STRLEN foldlen = 0;
3315                 U8 *uscan = (U8*)NULL;
3316                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3317                 U32 charcount = 0; /* how many input chars we have matched */
3318                 U32 accepted = 0; /* have we seen any accepting states? */
3319
3320                 ST.B = next;
3321                 ST.jump = trie->jump;
3322                 ST.me = scan;
3323                 ST.firstpos = NULL;
3324                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3325                 ST.nextword = 0;
3326
3327                 /* fully traverse the TRIE; note the position of the
3328                    shortest accept state and the wordnum of the longest
3329                    accept state */
3330
3331                 while ( state && uc <= (U8*)PL_regeol ) {
3332                     U32 base = trie->states[ state ].trans.base;
3333                     UV uvc = 0;
3334                     U16 charid = 0;
3335                     U16 wordnum;
3336                     wordnum = trie->states[ state ].wordnum;
3337
3338                     if (wordnum) { /* it's an accept state */
3339                         if (!accepted) {
3340                             accepted = 1;
3341                             /* record first match position */
3342                             if (ST.longfold) {
3343                                 ST.firstpos = (U8*)locinput;
3344                                 ST.firstchars = 0;
3345                             }
3346                             else {
3347                                 ST.firstpos = uc;
3348                                 ST.firstchars = charcount;
3349                             }
3350                         }
3351                         if (!ST.nextword || wordnum < ST.nextword)
3352                             ST.nextword = wordnum;
3353                         ST.topword = wordnum;
3354                     }
3355
3356                     DEBUG_TRIE_EXECUTE_r({
3357                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3358                                 PerlIO_printf( Perl_debug_log,
3359                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3360                                     2+depth * 2, "", PL_colors[4],
3361                                     (UV)state, (accepted ? 'Y' : 'N'));
3362                     });
3363
3364                     /* read a char and goto next state */
3365                     if ( base ) {
3366                         I32 offset;
3367                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3368                                              uscan, len, uvc, charid, foldlen,
3369                                              foldbuf, uniflags);
3370                         charcount++;
3371                         if (foldlen>0)
3372                             ST.longfold = TRUE;
3373                         if (charid &&
3374                              ( ((offset =
3375                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3376
3377                              && ((U32)offset < trie->lasttrans)
3378                              && trie->trans[offset].check == state)
3379                         {
3380                             state = trie->trans[offset].next;
3381                         }
3382                         else {
3383                             state = 0;
3384                         }
3385                         uc += len;
3386
3387                     }
3388                     else {
3389                         state = 0;
3390                     }
3391                     DEBUG_TRIE_EXECUTE_r(
3392                         PerlIO_printf( Perl_debug_log,
3393                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3394                             charid, uvc, (UV)state, PL_colors[5] );
3395                     );
3396                 }
3397                 if (!accepted)
3398                    sayNO;
3399
3400                 /* calculate total number of accept states */
3401                 {
3402                     U16 w = ST.topword;
3403                     accepted = 0;
3404                     while (w) {
3405                         w = trie->wordinfo[w].prev;
3406                         accepted++;
3407                     }
3408                     ST.accepted = accepted;
3409                 }
3410
3411                 DEBUG_EXECUTE_r(
3412                     PerlIO_printf( Perl_debug_log,
3413                         "%*s  %sgot %"IVdf" possible matches%s\n",
3414                         REPORT_CODE_OFF + depth * 2, "",
3415                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3416                 );
3417                 goto trie_first_try; /* jump into the fail handler */
3418             }}
3419             /* NOTREACHED */
3420
3421         case TRIE_next_fail: /* we failed - try next alternative */
3422             if ( ST.jump) {
3423                 REGCP_UNWIND(ST.cp);
3424                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3425                     PL_regoffs[n].end = -1;
3426                 *PL_reglastparen = n;
3427             }
3428             if (!--ST.accepted) {
3429                 DEBUG_EXECUTE_r({
3430                     PerlIO_printf( Perl_debug_log,
3431                         "%*s  %sTRIE failed...%s\n",
3432                         REPORT_CODE_OFF+depth*2, "",
3433                         PL_colors[4],
3434                         PL_colors[5] );
3435                 });
3436                 sayNO_SILENT;
3437             }
3438             {
3439                 /* Find next-highest word to process.  Note that this code
3440                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3441                 register U16 min = 0;
3442                 register U16 word;
3443                 register U16 const nextword = ST.nextword;
3444                 register reg_trie_wordinfo * const wordinfo
3445                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3446                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3447                     if (word > nextword && (!min || word < min))
3448                         min = word;
3449                 }
3450                 ST.nextword = min;
3451             }
3452
3453           trie_first_try:
3454             if (do_cutgroup) {
3455                 do_cutgroup = 0;
3456                 no_final = 0;
3457             }
3458
3459             if ( ST.jump) {
3460                 ST.lastparen = *PL_reglastparen;
3461                 REGCP_SET(ST.cp);
3462             }
3463
3464             /* find start char of end of current word */
3465             {
3466                 U32 chars; /* how many chars to skip */
3467                 U8 *uc = ST.firstpos;
3468                 reg_trie_data * const trie
3469                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3470
3471                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3472                             >=  ST.firstchars);
3473                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3474                             - ST.firstchars;
3475
3476                 if (ST.longfold) {
3477                     /* the hard option - fold each char in turn and find
3478                      * its folded length (which may be different */
3479                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3480                     STRLEN foldlen;
3481                     STRLEN len;
3482                     UV uvc;
3483                     U8 *uscan;
3484
3485                     while (chars) {
3486                         if (utf8_target) {
3487                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3488                                                     uniflags);
3489                             uc += len;
3490                         }
3491                         else {
3492                             uvc = *uc;
3493                             uc++;
3494                         }
3495                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3496                         uscan = foldbuf;
3497                         while (foldlen) {
3498                             if (!--chars)
3499                                 break;
3500                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3501                                             uniflags);
3502                             uscan += len;
3503                             foldlen -= len;
3504                         }
3505                     }
3506                 }
3507                 else {
3508                     if (utf8_target)
3509                         while (chars--)
3510                             uc += UTF8SKIP(uc);
3511                     else
3512                         uc += chars;
3513                 }
3514                 PL_reginput = (char *)uc;
3515             }
3516
3517             scan = (ST.jump && ST.jump[ST.nextword])
3518                         ? ST.me + ST.jump[ST.nextword]
3519                         : ST.B;
3520
3521             DEBUG_EXECUTE_r({
3522                 PerlIO_printf( Perl_debug_log,
3523                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3524                     REPORT_CODE_OFF+depth*2, "",
3525                     PL_colors[4],
3526                     ST.nextword,
3527                     PL_colors[5]
3528                     );
3529             });
3530
3531             if (ST.accepted > 1 || has_cutgroup) {
3532                 PUSH_STATE_GOTO(TRIE_next, scan);
3533                 /* NOTREACHED */
3534             }
3535             /* only one choice left - just continue */
3536             DEBUG_EXECUTE_r({
3537                 AV *const trie_words
3538                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3539                 SV ** const tmp = av_fetch( trie_words,
3540                     ST.nextword-1, 0 );
3541                 SV *sv= tmp ? sv_newmortal() : NULL;
3542
3543                 PerlIO_printf( Perl_debug_log,
3544                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3545                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3546                     ST.nextword,
3547                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3548                             PL_colors[0], PL_colors[1],
3549                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3550                         )
3551                     : "not compiled under -Dr",
3552                     PL_colors[5] );
3553             });
3554
3555             locinput = PL_reginput;
3556             nextchr = UCHARAT(locinput);
3557             continue; /* execute rest of RE */
3558             /* NOTREACHED */
3559 #undef  ST
3560
3561         case EXACT: {
3562             char *s = STRING(scan);
3563             ln = STR_LEN(scan);
3564             if (utf8_target != UTF_PATTERN) {
3565                 /* The target and the pattern have differing utf8ness. */
3566                 char *l = locinput;
3567                 const char * const e = s + ln;
3568
3569                 if (utf8_target) {
3570                     /* The target is utf8, the pattern is not utf8. */
3571                     while (s < e) {
3572                         STRLEN ulen;
3573                         if (l >= PL_regeol)
3574                              sayNO;
3575                         if (NATIVE_TO_UNI(*(U8*)s) !=
3576                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3577                                             uniflags))
3578                              sayNO;
3579                         l += ulen;
3580                         s ++;
3581                     }
3582                 }
3583                 else {
3584                     /* The target is not utf8, the pattern is utf8. */
3585                     while (s < e) {
3586                         STRLEN ulen;
3587                         if (l >= PL_regeol)
3588                             sayNO;
3589                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3590                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3591                                            uniflags))
3592                             sayNO;
3593                         s += ulen;
3594                         l ++;
3595                     }
3596                 }
3597                 locinput = l;
3598                 nextchr = UCHARAT(locinput);
3599                 break;
3600             }
3601             /* The target and the pattern have the same utf8ness. */
3602             /* Inline the first character, for speed. */
3603             if (UCHARAT(s) != nextchr)
3604                 sayNO;
3605             if (PL_regeol - locinput < ln)
3606                 sayNO;
3607             if (ln > 1 && memNE(s, locinput, ln))
3608                 sayNO;
3609             locinput += ln;
3610             nextchr = UCHARAT(locinput);
3611             break;
3612             }
3613         case EXACTFL: {
3614             re_fold_t folder;
3615             const U8 * fold_array;
3616             const char * s;
3617             U32 fold_utf8_flags;
3618
3619             PL_reg_flags |= RF_tainted;
3620             folder = foldEQ_locale;
3621             fold_array = PL_fold_locale;
3622             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3623             goto do_exactf;
3624
3625         case EXACTFU:
3626             folder = foldEQ_latin1;
3627             fold_array = PL_fold_latin1;
3628             fold_utf8_flags = 0;
3629             goto do_exactf;
3630
3631         case EXACTFA:
3632             folder = foldEQ_latin1;
3633             fold_array = PL_fold_latin1;
3634             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3635             goto do_exactf;
3636
3637         case EXACTF:
3638             folder = foldEQ;
3639             fold_array = PL_fold;
3640             fold_utf8_flags = 0;
3641
3642           do_exactf:
3643             s = STRING(scan);
3644             ln = STR_LEN(scan);
3645
3646             if (utf8_target || UTF_PATTERN) {
3647               /* Either target or the pattern are utf8. */
3648                 const char * const l = locinput;
3649                 char *e = PL_regeol;
3650
3651                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3652                                l, &e, 0,  utf8_target, fold_utf8_flags))
3653                 {
3654                     sayNO;
3655                 }
3656                 locinput = e;
3657                 nextchr = UCHARAT(locinput);
3658                 break;
3659             }
3660
3661             /* Neither the target nor the pattern are utf8 */
3662             if (UCHARAT(s) != nextchr &&
3663                 UCHARAT(s) != fold_array[nextchr])
3664             {
3665                 sayNO;
3666             }
3667             if (PL_regeol - locinput < ln)
3668                 sayNO;
3669             if (ln > 1 && ! folder(s, locinput, ln))
3670                 sayNO;
3671             locinput += ln;
3672             nextchr = UCHARAT(locinput);
3673             break;
3674         }
3675
3676         /* XXX Could improve efficiency by separating these all out using a
3677          * macro or in-line function.  At that point regcomp.c would no longer
3678          * have to set the FLAGS fields of these */
3679         case BOUNDL:
3680         case NBOUNDL:
3681             PL_reg_flags |= RF_tainted;
3682             /* FALL THROUGH */
3683         case BOUND:
3684         case BOUNDU:
3685         case BOUNDA:
3686         case NBOUND:
3687         case NBOUNDU:
3688         case NBOUNDA:
3689             /* was last char in word? */
3690             if (utf8_target
3691                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3692                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3693             {
3694                 if (locinput == PL_bostr)
3695                     ln = '\n';
3696                 else {
3697                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3698
3699                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3700                 }
3701                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3702                     ln = isALNUM_uni(ln);
3703                     LOAD_UTF8_CHARCLASS_ALNUM();
3704                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3705                 }
3706                 else {
3707                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3708                     n = isALNUM_LC_utf8((U8*)locinput);
3709                 }
3710             }
3711             else {
3712
3713                 /* Here the string isn't utf8, or is utf8 and only ascii
3714                  * characters are to match \w.  In the latter case looking at
3715                  * the byte just prior to the current one may be just the final
3716                  * byte of a multi-byte character.  This is ok.  There are two
3717                  * cases:
3718                  * 1) it is a single byte character, and then the test is doing
3719                  *      just what it's supposed to.
3720                  * 2) it is a multi-byte character, in which case the final
3721                  *      byte is never mistakable for ASCII, and so the test
3722                  *      will say it is not a word character, which is the
3723                  *      correct answer. */
3724                 ln = (locinput != PL_bostr) ?
3725                     UCHARAT(locinput - 1) : '\n';
3726                 switch (FLAGS(scan)) {
3727                     case REGEX_UNICODE_CHARSET:
3728                         ln = isWORDCHAR_L1(ln);
3729                         n = isWORDCHAR_L1(nextchr);
3730                         break;
3731                     case REGEX_LOCALE_CHARSET:
3732                         ln = isALNUM_LC(ln);
3733                         n = isALNUM_LC(nextchr);
3734                         break;
3735                     case REGEX_DEPENDS_CHARSET:
3736                         ln = isALNUM(ln);
3737                         n = isALNUM(nextchr);
3738                         break;
3739                     case REGEX_ASCII_RESTRICTED_CHARSET:
3740                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3741                         ln = isWORDCHAR_A(ln);
3742                         n = isWORDCHAR_A(nextchr);
3743                         break;
3744                     default:
3745                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3746                         break;
3747                 }
3748             }
3749             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3750              * regcomp.sym */
3751             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3752                     sayNO;
3753             break;
3754         case ANYOFV:
3755         case ANYOF:
3756             if (utf8_target || state_num == ANYOFV) {
3757                 STRLEN inclasslen = PL_regeol - locinput;
3758                 if (locinput >= PL_regeol)
3759                     sayNO;
3760
3761                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3762                     sayNO;
3763                 locinput += inclasslen;
3764                 nextchr = UCHARAT(locinput);
3765                 break;
3766             }
3767             else {
3768                 if (nextchr < 0)
3769                     nextchr = UCHARAT(locinput);
3770                 if (!nextchr && locinput >= PL_regeol)
3771                     sayNO;
3772                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3773                     sayNO;
3774                 nextchr = UCHARAT(++locinput);
3775                 break;
3776             }
3777             break;
3778         /* Special char classes - The defines start on line 129 or so */
3779         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3780                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3781                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3782                   ALNUMA, NALNUMA, isWORDCHAR_A,
3783                   alnum, "a");
3784
3785         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3786                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3787                   SPACEU, NSPACEU, isSPACE_L1,
3788                   SPACEA, NSPACEA, isSPACE_A,
3789                   space, " ");
3790
3791         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3792                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3793                 DIGITA, NDIGITA, isDIGIT_A,
3794                 digit, "0");
3795
3796         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3797                        a Unicode extended Grapheme Cluster */
3798             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3799               extended Grapheme Cluster is:
3800
3801                CR LF
3802                | Prepend* Begin Extend*
3803                | .
3804
3805                Begin is (Hangul-syllable | ! Control)
3806                Extend is (Grapheme_Extend | Spacing_Mark)
3807                Control is [ GCB_Control CR LF ]
3808
3809                The discussion below shows how the code for CLUMP is derived
3810                from this regex.  Note that most of these concepts are from
3811                property values of the Grapheme Cluster Boundary (GCB) property.
3812                No code point can have multiple property values for a given
3813                property.  Thus a code point in Prepend can't be in Control, but
3814                it must be in !Control.  This is why Control above includes
3815                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3816                property separately, and so can't be in GCB_Control, even though
3817                they logically are controls.  Control is not the same as gc=cc,
3818                but includes format and other characters as well.
3819
3820                The Unicode definition of Hangul-syllable is:
3821                    L+
3822                    | (L* ( ( V | LV ) V* | LVT ) T*)
3823                    | T+
3824                   )
3825                Each of these is a value for the GCB property, and hence must be
3826                disjoint, so the order they are tested is immaterial, so the
3827                above can safely be changed to
3828                    T+
3829                    | L+
3830                    | (L* ( LVT | ( V | LV ) V*) T*)
3831
3832                The last two terms can be combined like this:
3833                    L* ( L
3834                         | (( LVT | ( V | LV ) V*) T*))
3835
3836                And refactored into this:
3837                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3838
3839                That means that if we have seen any L's at all we can quit
3840                there, but if the next character is an LVT, a V, or an LV we
3841                should keep going.
3842
3843                There is a subtlety with Prepend* which showed up in testing.
3844                Note that the Begin, and only the Begin is required in:
3845                 | Prepend* Begin Extend*
3846                Also, Begin contains '! Control'.  A Prepend must be a
3847                '!  Control', which means it must also be a Begin.  What it
3848                comes down to is that if we match Prepend* and then find no
3849                suitable Begin afterwards, that if we backtrack the last
3850                Prepend, that one will be a suitable Begin.
3851             */
3852
3853             if (locinput >= PL_regeol)
3854                 sayNO;
3855             if  (! utf8_target) {
3856
3857                 /* Match either CR LF  or '.', as all the other possibilities
3858                  * require utf8 */
3859                 locinput++;         /* Match the . or CR */
3860                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3861                                        match the LF */
3862                     && locinput < PL_regeol
3863                     && UCHARAT(locinput) == '\n') locinput++;
3864             }
3865             else {
3866
3867                 /* Utf8: See if is ( CR LF ); already know that locinput <
3868                  * PL_regeol, so locinput+1 is in bounds */
3869                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3870                     locinput += 2;
3871                 }
3872                 else {
3873                     /* In case have to backtrack to beginning, then match '.' */
3874                     char *starting = locinput;
3875
3876                     /* In case have to backtrack the last prepend */
3877                     char *previous_prepend = 0;
3878
3879                     LOAD_UTF8_CHARCLASS_GCB();
3880
3881                     /* Match (prepend)* */
3882                     while (locinput < PL_regeol
3883                            && swash_fetch(PL_utf8_X_prepend,
3884                                           (U8*)locinput, utf8_target))
3885                     {
3886                         previous_prepend = locinput;
3887                         locinput += UTF8SKIP(locinput);
3888                     }
3889
3890                     /* As noted above, if we matched a prepend character, but
3891                      * the next thing won't match, back off the last prepend we
3892                      * matched, as it is guaranteed to match the begin */
3893                     if (previous_prepend
3894                         && (locinput >=  PL_regeol
3895                             || ! swash_fetch(PL_utf8_X_begin,
3896                                              (U8*)locinput, utf8_target)))
3897                     {
3898                         locinput = previous_prepend;
3899                     }
3900
3901                     /* Note that here we know PL_regeol > locinput, as we
3902                      * tested that upon input to this switch case, and if we
3903                      * moved locinput forward, we tested the result just above
3904                      * and it either passed, or we backed off so that it will
3905                      * now pass */
3906                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3907
3908                         /* Here did not match the required 'Begin' in the
3909                          * second term.  So just match the very first
3910                          * character, the '.' of the final term of the regex */
3911                         locinput = starting + UTF8SKIP(starting);
3912                     } else {
3913
3914                         /* Here is the beginning of a character that can have
3915                          * an extender.  It is either a hangul syllable, or a
3916                          * non-control */
3917                         if (swash_fetch(PL_utf8_X_non_hangul,
3918                                         (U8*)locinput, utf8_target))
3919                         {
3920
3921                             /* Here not a Hangul syllable, must be a
3922                              * ('!  * Control') */
3923                             locinput += UTF8SKIP(locinput);
3924                         } else {
3925
3926                             /* Here is a Hangul syllable.  It can be composed
3927                              * of several individual characters.  One
3928                              * possibility is T+ */
3929                             if (swash_fetch(PL_utf8_X_T,
3930                                             (U8*)locinput, utf8_target))
3931                             {
3932                                 while (locinput < PL_regeol
3933                                         && swash_fetch(PL_utf8_X_T,
3934                                                         (U8*)locinput, utf8_target))
3935                                 {
3936                                     locinput += UTF8SKIP(locinput);
3937                                 }
3938                             } else {
3939
3940                                 /* Here, not T+, but is a Hangul.  That means
3941                                  * it is one of the others: L, LV, LVT or V,
3942                                  * and matches:
3943                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3944
3945                                 /* Match L*           */
3946                                 while (locinput < PL_regeol
3947                                         && swash_fetch(PL_utf8_X_L,
3948                                                         (U8*)locinput, utf8_target))
3949                                 {
3950                                     locinput += UTF8SKIP(locinput);
3951                                 }
3952
3953                                 /* Here, have exhausted L*.  If the next
3954                                  * character is not an LV, LVT nor V, it means
3955                                  * we had to have at least one L, so matches L+
3956                                  * in the original equation, we have a complete
3957                                  * hangul syllable.  Are done. */
3958
3959                                 if (locinput < PL_regeol
3960                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3961                                                     (U8*)locinput, utf8_target))
3962                                 {
3963
3964                                     /* Otherwise keep going.  Must be LV, LVT
3965                                      * or V.  See if LVT */
3966                                     if (swash_fetch(PL_utf8_X_LVT,
3967                                                     (U8*)locinput, utf8_target))
3968                                     {
3969                                         locinput += UTF8SKIP(locinput);
3970                                     } else {
3971
3972                                         /* Must be  V or LV.  Take it, then
3973                                          * match V*     */
3974                                         locinput += UTF8SKIP(locinput);
3975                                         while (locinput < PL_regeol
3976                                                 && swash_fetch(PL_utf8_X_V,
3977                                                          (U8*)locinput, utf8_target))
3978                                         {
3979                                             locinput += UTF8SKIP(locinput);
3980                                         }
3981                                     }
3982
3983                                     /* And any of LV, LVT, or V can be followed
3984                                      * by T*            */
3985                                     while (locinput < PL_regeol
3986                                            && swash_fetch(PL_utf8_X_T,
3987                                                            (U8*)locinput,
3988                                                            utf8_target))
3989                                     {
3990                                         locinput += UTF8SKIP(locinput);
3991                                     }
3992                                 }
3993                             }
3994                         }
3995
3996                         /* Match any extender */
3997                         while (locinput < PL_regeol
3998                                 && swash_fetch(PL_utf8_X_extend,
3999                                                 (U8*)locinput, utf8_target))
4000                         {
4001                             locinput += UTF8SKIP(locinput);
4002                         }
4003                     }
4004                 }
4005                 if (locinput > PL_regeol) sayNO;
4006             }
4007             nextchr = UCHARAT(locinput);
4008             break;
4009
4010         case NREFFL:
4011         {   /* The capture buffer cases.  The ones beginning with N for the
4012                named buffers just convert to the equivalent numbered and
4013                pretend they were called as the corresponding numbered buffer
4014                op.  */
4015             /* don't initialize these in the declaration, it makes C++
4016                unhappy */
4017             char *s;
4018             char type;
4019             re_fold_t folder;
4020             const U8 *fold_array;
4021             UV utf8_fold_flags;
4022
4023             PL_reg_flags |= RF_tainted;
4024             folder = foldEQ_locale;
4025             fold_array = PL_fold_locale;
4026             type = REFFL;
4027             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4028             goto do_nref;
4029
4030         case NREFFA:
4031             folder = foldEQ_latin1;
4032             fold_array = PL_fold_latin1;
4033             type = REFFA;
4034             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4035             goto do_nref;
4036
4037         case NREFFU:
4038             folder = foldEQ_latin1;
4039             fold_array = PL_fold_latin1;
4040             type = REFFU;
4041             utf8_fold_flags = 0;
4042             goto do_nref;
4043
4044         case NREFF:
4045             folder = foldEQ;
4046             fold_array = PL_fold;
4047             type = REFF;
4048             utf8_fold_flags = 0;
4049             goto do_nref;
4050
4051         case NREF:
4052             type = REF;
4053             folder = NULL;
4054             fold_array = NULL;
4055             utf8_fold_flags = 0;
4056           do_nref:
4057
4058             /* For the named back references, find the corresponding buffer
4059              * number */
4060             n = reg_check_named_buff_matched(rex,scan);
4061
4062             if ( ! n ) {
4063                 sayNO;
4064             }
4065             goto do_nref_ref_common;
4066
4067         case REFFL:
4068             PL_reg_flags |= RF_tainted;
4069             folder = foldEQ_locale;
4070             fold_array = PL_fold_locale;
4071             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4072             goto do_ref;
4073
4074         case REFFA:
4075             folder = foldEQ_latin1;
4076             fold_array = PL_fold_latin1;
4077             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4078             goto do_ref;
4079
4080         case REFFU:
4081             folder = foldEQ_latin1;
4082             fold_array = PL_fold_latin1;
4083             utf8_fold_flags = 0;
4084             goto do_ref;
4085
4086         case REFF:
4087             folder = foldEQ;
4088             fold_array = PL_fold;
4089             utf8_fold_flags = 0;
4090             goto do_ref;
4091
4092         case REF:
4093             folder = NULL;
4094             fold_array = NULL;
4095             utf8_fold_flags = 0;
4096
4097           do_ref:
4098             type = OP(scan);
4099             n = ARG(scan);  /* which paren pair */
4100
4101           do_nref_ref_common:
4102             ln = PL_regoffs[n].start;
4103             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4104             if (*PL_reglastparen < n || ln == -1)
4105                 sayNO;                  /* Do not match unless seen CLOSEn. */
4106             if (ln == PL_regoffs[n].end)
4107                 break;
4108
4109             s = PL_bostr + ln;
4110             if (type != REF     /* REF can do byte comparison */
4111                 && (utf8_target || type == REFFU))
4112             { /* XXX handle REFFL better */
4113                 char * limit = PL_regeol;
4114
4115                 /* This call case insensitively compares the entire buffer
4116                     * at s, with the current input starting at locinput, but
4117                     * not going off the end given by PL_regeol, and returns in
4118                     * limit upon success, how much of the current input was
4119                     * matched */
4120                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4121                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4122                 {
4123                     sayNO;
4124                 }
4125                 locinput = limit;
4126                 nextchr = UCHARAT(locinput);
4127                 break;
4128             }
4129
4130             /* Not utf8:  Inline the first character, for speed. */
4131             if (UCHARAT(s) != nextchr &&
4132                 (type == REF ||
4133                  UCHARAT(s) != fold_array[nextchr]))
4134                 sayNO;
4135             ln = PL_regoffs[n].end - ln;
4136             if (locinput + ln > PL_regeol)
4137                 sayNO;
4138             if (ln > 1 && (type == REF
4139                            ? memNE(s, locinput, ln)
4140                            : ! folder(s, locinput, ln)))
4141                 sayNO;
4142             locinput += ln;
4143             nextchr = UCHARAT(locinput);
4144             break;
4145         }
4146         case NOTHING:
4147         case TAIL:
4148             break;
4149         case BACK:
4150             break;
4151
4152 #undef  ST
4153 #define ST st->u.eval
4154         {
4155             SV *ret;
4156             REGEXP *re_sv;
4157             regexp *re;
4158             regexp_internal *rei;
4159             regnode *startpoint;
4160
4161         case GOSTART:
4162         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4163             if (cur_eval && cur_eval->locinput==locinput) {
4164                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4165                     Perl_croak(aTHX_ "Infinite recursion in regex");
4166                 if ( ++nochange_depth > max_nochange_depth )
4167                     Perl_croak(aTHX_
4168                         "Pattern subroutine nesting without pos change"
4169                         " exceeded limit in regex");
4170             } else {
4171                 nochange_depth = 0;
4172             }
4173             re_sv = rex_sv;
4174             re = rex;
4175             rei = rexi;
4176             (void)ReREFCNT_inc(rex_sv);
4177             if (OP(scan)==GOSUB) {
4178                 startpoint = scan + ARG2L(scan);
4179                 ST.close_paren = ARG(scan);
4180             } else {
4181                 startpoint = rei->program+1;
4182                 ST.close_paren = 0;
4183             }
4184             goto eval_recurse_doit;
4185             /* NOTREACHED */
4186         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4187             if (cur_eval && cur_eval->locinput==locinput) {
4188                 if ( ++nochange_depth > max_nochange_depth )
4189                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4190             } else {
4191                 nochange_depth = 0;
4192             }
4193             {
4194                 /* execute the code in the {...} */
4195                 dSP;
4196                 SV ** const before = SP;
4197                 OP_4tree * const oop = PL_op;
4198                 COP * const ocurcop = PL_curcop;
4199                 PAD *old_comppad;
4200                 char *saved_regeol = PL_regeol;
4201                 struct re_save_state saved_state;
4202
4203                 /* To not corrupt the existing regex state while executing the
4204                  * eval we would normally put it on the save stack, like with
4205                  * save_re_context. However, re-evals have a weird scoping so we
4206                  * can't just add ENTER/LEAVE here. With that, things like
4207                  *
4208                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4209                  *
4210                  * would break, as they expect the localisation to be unwound
4211                  * only when the re-engine backtracks through the bit that
4212                  * localised it.
4213                  *
4214                  * What we do instead is just saving the state in a local c
4215                  * variable.
4216                  */
4217                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4218
4219                 n = ARG(scan);
4220                 PL_op = (OP_4tree*)rexi->data->data[n];
4221                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4222                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4223                 /* wrap the call in two SAVECOMPPADs. This ensures that
4224                  * when the save stack is eventually unwound, all the
4225                  * accumulated SAVEt_CLEARSV's will be processed with
4226                  * interspersed SAVEt_COMPPAD's to ensure that lexicals
4227                  * are cleared in the right pad */
4228                 SAVECOMPPAD();
4229                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4230                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4231
4232                 if (sv_yes_mark) {
4233                     SV *sv_mrk = get_sv("REGMARK", 1);
4234                     sv_setsv(sv_mrk, sv_yes_mark);
4235                 }
4236
4237                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4238                 SPAGAIN;
4239                 if (SP == before)
4240                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4241                 else {
4242                     ret = POPs;
4243                     PUTBACK;
4244                 }
4245
4246                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4247
4248                 PL_op = oop;
4249                 SAVECOMPPAD();
4250                 PAD_RESTORE_LOCAL(old_comppad);
4251                 PL_curcop = ocurcop;
4252                 PL_regeol = saved_regeol;
4253                 if (!logical) {
4254                     /* /(?{...})/ */
4255                     sv_setsv(save_scalar(PL_replgv), ret);
4256                     break;
4257                 }
4258             }
4259             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4260                 logical = 0;
4261                 {
4262                     /* extract RE object from returned value; compiling if
4263                      * necessary */
4264                     MAGIC *mg = NULL;
4265                     REGEXP *rx = NULL;
4266
4267                     if (SvROK(ret)) {
4268                         SV *const sv = SvRV(ret);
4269
4270                         if (SvTYPE(sv) == SVt_REGEXP) {
4271                             rx = (REGEXP*) sv;
4272                         } else if (SvSMAGICAL(sv)) {
4273                             mg = mg_find(sv, PERL_MAGIC_qr);
4274                             assert(mg);
4275                         }
4276                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4277                         rx = (REGEXP*) ret;
4278                     } else if (SvSMAGICAL(ret)) {
4279                         if (SvGMAGICAL(ret)) {
4280                             /* I don't believe that there is ever qr magic
4281                                here.  */
4282                             assert(!mg_find(ret, PERL_MAGIC_qr));
4283                             sv_unmagic(ret, PERL_MAGIC_qr);
4284                         }
4285                         else {
4286                             mg = mg_find(ret, PERL_MAGIC_qr);
4287                             /* testing suggests mg only ends up non-NULL for
4288                                scalars who were upgraded and compiled in the
4289                                else block below. In turn, this is only
4290                                triggered in the "postponed utf8 string" tests
4291                                in t/op/pat.t  */
4292                         }
4293                     }
4294
4295                     if (mg) {
4296                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4297                         assert(rx);
4298                     }
4299                     if (rx) {
4300                         rx = reg_temp_copy(NULL, rx);
4301                     }
4302                     else {
4303                         U32 pm_flags = 0;
4304                         const I32 osize = PL_regsize;
4305
4306                         if (DO_UTF8(ret)) {
4307                             assert (SvUTF8(ret));
4308                         } else if (SvUTF8(ret)) {
4309                             /* Not doing UTF-8, despite what the SV says. Is
4310                                this only if we're trapped in use 'bytes'?  */
4311                             /* Make a copy of the octet sequence, but without
4312                                the flag on, as the compiler now honours the
4313                                SvUTF8 flag on ret.  */
4314                             STRLEN len;
4315                             const char *const p = SvPV(ret, len);
4316                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4317                         }
4318                         rx = CALLREGCOMP(ret, pm_flags);
4319                         if (!(SvFLAGS(ret)
4320                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4321                                  | SVs_GMG))) {
4322                             /* This isn't a first class regexp. Instead, it's
4323                                caching a regexp onto an existing, Perl visible
4324                                scalar.  */
4325                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4326                         }
4327                         PL_regsize = osize;
4328                     }
4329                     re_sv = rx;
4330                     re = (struct regexp *)SvANY(rx);
4331                 }
4332                 RXp_MATCH_COPIED_off(re);
4333                 re->subbeg = rex->subbeg;
4334                 re->sublen = rex->sublen;
4335                 rei = RXi_GET(re);
4336                 DEBUG_EXECUTE_r(
4337                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4338                         "Matching embedded");
4339                 );
4340                 startpoint = rei->program + 1;
4341                 ST.close_paren = 0; /* only used for GOSUB */
4342                 /* borrowed from regtry */
4343                 if (PL_reg_start_tmpl <= re->nparens) {
4344                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4345                     if(PL_reg_start_tmp)
4346                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4347                     else
4348                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4349                 }
4350
4351         eval_recurse_doit: /* Share code with GOSUB below this line */
4352                 /* run the pattern returned from (??{...}) */
4353                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4354                 REGCP_SET(ST.lastcp);
4355
4356                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4357
4358                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4359                 PL_reglastparen = &re->lastparen;
4360                 PL_reglastcloseparen = &re->lastcloseparen;
4361                 re->lastparen = 0;
4362                 re->lastcloseparen = 0;
4363
4364                 PL_reginput = locinput;
4365                 PL_regsize = 0;
4366
4367                 /* XXXX This is too dramatic a measure... */
4368                 PL_reg_maxiter = 0;
4369
4370                 ST.toggle_reg_flags = PL_reg_flags;
4371                 if (RX_UTF8(re_sv))
4372                     PL_reg_flags |= RF_utf8;
4373                 else
4374                     PL_reg_flags &= ~RF_utf8;
4375                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4376
4377                 ST.prev_rex = rex_sv;
4378                 ST.prev_curlyx = cur_curlyx;
4379                 SETREX(rex_sv,re_sv);
4380                 rex = re;
4381                 rexi = rei;
4382                 cur_curlyx = NULL;
4383                 ST.B = next;
4384                 ST.prev_eval = cur_eval;
4385                 cur_eval = st;
4386                 /* now continue from first node in postoned RE */
4387                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4388                 /* NOTREACHED */
4389             }
4390             /* logical is 1,   /(?(?{...})X|Y)/ */
4391             sw = cBOOL(SvTRUE(ret));
4392             logical = 0;
4393             break;
4394         }
4395
4396         case EVAL_AB: /* cleanup after a successful (??{A})B */
4397             /* note: this is called twice; first after popping B, then A */
4398             PL_reg_flags ^= ST.toggle_reg_flags;
4399             ReREFCNT_dec(rex_sv);
4400             SETREX(rex_sv,ST.prev_rex);
4401             rex = (struct regexp *)SvANY(rex_sv);
4402             rexi = RXi_GET(rex);
4403             regcpblow(ST.cp);
4404             cur_eval = ST.prev_eval;
4405             cur_curlyx = ST.prev_curlyx;
4406
4407             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4408             PL_reglastparen = &rex->lastparen;
4409             PL_reglastcloseparen = &rex->lastcloseparen;
4410             /* also update PL_regoffs */
4411             PL_regoffs = rex->offs;
4412
4413             /* XXXX This is too dramatic a measure... */
4414             PL_reg_maxiter = 0;
4415             if ( nochange_depth )
4416                 nochange_depth--;
4417             sayYES;
4418
4419
4420         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4421             /* note: this is called twice; first after popping B, then A */
4422             PL_reg_flags ^= ST.toggle_reg_flags;
4423             ReREFCNT_dec(rex_sv);
4424             SETREX(rex_sv,ST.prev_rex);
4425             rex = (struct regexp *)SvANY(rex_sv);
4426             rexi = RXi_GET(rex);
4427             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4428             PL_reglastparen = &rex->lastparen;
4429             PL_reglastcloseparen = &rex->lastcloseparen;
4430
4431             PL_reginput = locinput;
4432             REGCP_UNWIND(ST.lastcp);
4433             regcppop(rex);
4434             cur_eval = ST.prev_eval;
4435             cur_curlyx = ST.prev_curlyx;
4436             /* XXXX This is too dramatic a measure... */
4437             PL_reg_maxiter = 0;
4438             if ( nochange_depth )
4439                 nochange_depth--;
4440             sayNO_SILENT;
4441 #undef ST
4442
4443         case OPEN:
4444             n = ARG(scan);  /* which paren pair */
4445             PL_reg_start_tmp[n] = locinput;
4446             if (n > PL_regsize)
4447                 PL_regsize = n;
4448             lastopen = n;
4449             break;
4450         case CLOSE:
4451             n = ARG(scan);  /* which paren pair */
4452             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4453             PL_regoffs[n].end = locinput - PL_bostr;
4454             /*if (n > PL_regsize)
4455                 PL_regsize = n;*/
4456             if (n > *PL_reglastparen)
4457                 *PL_reglastparen = n;
4458             *PL_reglastcloseparen = n;
4459             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4460                 goto fake_end;
4461             }
4462             break;
4463         case ACCEPT:
4464             if (ARG(scan)){
4465                 regnode *cursor;
4466                 for (cursor=scan;
4467                      cursor && OP(cursor)!=END;
4468                      cursor=regnext(cursor))
4469                 {
4470                     if ( OP(cursor)==CLOSE ){
4471                         n = ARG(cursor);
4472                         if ( n <= lastopen ) {
4473                             PL_regoffs[n].start
4474                                 = PL_reg_start_tmp[n] - PL_bostr;
4475                             PL_regoffs[n].end = locinput - PL_bostr;
4476                             /*if (n > PL_regsize)
4477                             PL_regsize = n;*/
4478                             if (n > *PL_reglastparen)
4479                                 *PL_reglastparen = n;
4480                             *PL_reglastcloseparen = n;
4481                             if ( n == ARG(scan) || (cur_eval &&
4482                                 cur_eval->u.eval.close_paren == n))
4483                                 break;
4484                         }
4485                     }
4486                 }
4487             }
4488             goto fake_end;
4489             /*NOTREACHED*/
4490         case GROUPP:
4491             n = ARG(scan);  /* which paren pair */
4492             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4493             break;
4494         case NGROUPP:
4495             /* reg_check_named_buff_matched returns 0 for no match */
4496             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4497             break;
4498         case INSUBP:
4499             n = ARG(scan);
4500             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4501             break;
4502         case DEFINEP:
4503             sw = 0;
4504             break;
4505         case IFTHEN:
4506             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4507             if (sw)
4508                 next = NEXTOPER(NEXTOPER(scan));
4509             else {
4510                 next = scan + ARG(scan);
4511                 if (OP(next) == IFTHEN) /* Fake one. */
4512                     next = NEXTOPER(NEXTOPER(next));
4513             }
4514             break;
4515         case LOGICAL:
4516             logical = scan->flags;
4517             break;
4518
4519 /*******************************************************************
4520
4521 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4522 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4523 STAR/PLUS/CURLY/CURLYN are used instead.)
4524
4525 A*B is compiled as <CURLYX><A><WHILEM><B>
4526
4527 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4528 state, which contains the current count, initialised to -1. It also sets
4529 cur_curlyx to point to this state, with any previous value saved in the
4530 state block.
4531
4532 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4533 since the pattern may possibly match zero times (i.e. it's a while {} loop
4534 rather than a do {} while loop).
4535
4536 Each entry to WHILEM represents a successful match of A. The count in the
4537 CURLYX block is incremented, another WHILEM state is pushed, and execution
4538 passes to A or B depending on greediness and the current count.
4539
4540 For example, if matching against the string a1a2a3b (where the aN are
4541 substrings that match /A/), then the match progresses as follows: (the
4542 pushed states are interspersed with the bits of strings matched so far):
4543
4544     <CURLYX cnt=-1>
4545     <CURLYX cnt=0><WHILEM>
4546     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4547     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4548     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4549     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4550
4551 (Contrast this with something like CURLYM, which maintains only a single
4552 backtrack state:
4553
4554     <CURLYM cnt=0> a1
4555     a1 <CURLYM cnt=1> a2
4556     a1 a2 <CURLYM cnt=2> a3
4557     a1 a2 a3 <CURLYM cnt=3> b
4558 )
4559
4560 Each WHILEM state block marks a point to backtrack to upon partial failure
4561 of A or B, and also contains some minor state data related to that
4562 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4563 overall state, such as the count, and pointers to the A and B ops.
4564
4565 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4566 must always point to the *current* CURLYX block, the rules are:
4567
4568 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4569 and set cur_curlyx to point the new block.
4570
4571 When popping the CURLYX block after a successful or unsuccessful match,
4572 restore the previous cur_curlyx.
4573
4574 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4575 to the outer one saved in the CURLYX block.
4576
4577 When popping the WHILEM block after a successful or unsuccessful B match,
4578 restore the previous cur_curlyx.
4579
4580 Here's an example for the pattern (AI* BI)*BO
4581 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4582
4583 cur_
4584 curlyx backtrack stack
4585 ------ ---------------
4586 NULL
4587 CO     <CO prev=NULL> <WO>
4588 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4589 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4590 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4591
4592 At this point the pattern succeeds, and we work back down the stack to
4593 clean up, restoring as we go:
4594
4595 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4596 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4597 CO     <CO prev=NULL> <WO>
4598 NULL
4599
4600 *******************************************************************/
4601
4602 #define ST st->u.curlyx
4603
4604         case CURLYX:    /* start of /A*B/  (for complex A) */
4605         {
4606             /* No need to save/restore up to this paren */
4607             I32 parenfloor = scan->flags;
4608
4609             assert(next); /* keep Coverity happy */
4610             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4611                 next += ARG(next);
4612
4613             /* XXXX Probably it is better to teach regpush to support
4614                parenfloor > PL_regsize... */
4615             if (parenfloor > (I32)*PL_reglastparen)
4616                 parenfloor = *PL_reglastparen; /* Pessimization... */
4617
4618             ST.prev_curlyx= cur_curlyx;
4619             cur_curlyx = st;
4620             ST.cp = PL_savestack_ix;
4621
4622             /* these fields contain the state of the current curly.
4623              * they are accessed by subsequent WHILEMs */
4624             ST.parenfloor = parenfloor;
4625             ST.me = scan;
4626             ST.B = next;
4627             ST.minmod = minmod;
4628             minmod = 0;
4629             ST.count = -1;      /* this will be updated by WHILEM */
4630             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4631
4632             PL_reginput = locinput;
4633             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4634             /* NOTREACHED */
4635         }
4636
4637         case CURLYX_end: /* just finished matching all of A*B */
4638             cur_curlyx = ST.prev_curlyx;
4639             sayYES;
4640             /* NOTREACHED */
4641
4642         case CURLYX_end_fail: /* just failed to match all of A*B */
4643             regcpblow(ST.cp);
4644             cur_curlyx = ST.prev_curlyx;
4645             sayNO;
4646             /* NOTREACHED */
4647
4648
4649 #undef ST
4650 #define ST st->u.whilem
4651
4652         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4653         {
4654             /* see the discussion above about CURLYX/WHILEM */
4655             I32 n;
4656             int min = ARG1(cur_curlyx->u.curlyx.me);
4657             int max = ARG2(cur_curlyx->u.curlyx.me);
4658             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4659
4660             assert(cur_curlyx); /* keep Coverity happy */
4661             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4662             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4663             ST.cache_offset = 0;
4664             ST.cache_mask = 0;
4665
4666             PL_reginput = locinput;
4667
4668             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4669                   "%*s  whilem: matched %ld out of %d..%d\n",
4670                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4671             );
4672
4673             /* First just match a string of min A's. */
4674
4675             if (n < min) {
4676                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4677                 cur_curlyx->u.curlyx.lastloc = locinput;
4678                 REGCP_SET(ST.lastcp);
4679
4680                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4681                 /* NOTREACHED */
4682             }
4683
4684             /* If degenerate A matches "", assume A done. */
4685
4686             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4687                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4688                    "%*s  whilem: empty match detected, trying continuation...\n",
4689                    REPORT_CODE_OFF+depth*2, "")
4690                 );
4691                 goto do_whilem_B_max;
4692             }
4693
4694             /* super-linear cache processing */
4695
4696             if (scan->flags) {
4697
4698                 if (!PL_reg_maxiter) {
4699                     /* start the countdown: Postpone detection until we
4700                      * know the match is not *that* much linear. */
4701                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4702                     /* possible overflow for long strings and many CURLYX's */
4703                     if (PL_reg_maxiter < 0)
4704                         PL_reg_maxiter = I32_MAX;
4705                     PL_reg_leftiter = PL_reg_maxiter;
4706                 }
4707
4708                 if (PL_reg_leftiter-- == 0) {
4709                     /* initialise cache */
4710                     const I32 size = (PL_reg_maxiter + 7)/8;
4711                     if (PL_reg_poscache) {
4712                         if ((I32)PL_reg_poscache_size < size) {
4713                             Renew(PL_reg_poscache, size, char);
4714                             PL_reg_poscache_size = size;
4715                         }
4716                         Zero(PL_reg_poscache, size, char);
4717                     }
4718                     else {
4719                         PL_reg_poscache_size = size;
4720                         Newxz(PL_reg_poscache, size, char);
4721                     }
4722                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4723       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4724                               PL_colors[4], PL_colors[5])
4725                     );
4726                 }
4727
4728                 if (PL_reg_leftiter < 0) {
4729                     /* have we already failed at this position? */
4730                     I32 offset, mask;
4731                     offset  = (scan->flags & 0xf) - 1
4732                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4733                     mask    = 1 << (offset % 8);
4734                     offset /= 8;
4735                     if (PL_reg_poscache[offset] & mask) {
4736                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4737                             "%*s  whilem: (cache) already tried at this position...\n",
4738                             REPORT_CODE_OFF+depth*2, "")
4739                         );
4740                         sayNO; /* cache records failure */
4741                     }
4742                     ST.cache_offset = offset;
4743                     ST.cache_mask   = mask;
4744                 }
4745             }
4746
4747             /* Prefer B over A for minimal matching. */
4748
4749             if (cur_curlyx->u.curlyx.minmod) {
4750                 ST.save_curlyx = cur_curlyx;
4751                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4752                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4753                 REGCP_SET(ST.lastcp);
4754                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4755                 /* NOTREACHED */
4756             }
4757
4758             /* Prefer A over B for maximal matching. */
4759
4760             if (n < max) { /* More greed allowed? */
4761                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4762                 cur_curlyx->u.curlyx.lastloc = locinput;
4763                 REGCP_SET(ST.lastcp);
4764                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4765                 /* NOTREACHED */
4766             }
4767             goto do_whilem_B_max;
4768         }
4769         /* NOTREACHED */
4770
4771         case WHILEM_B_min: /* just matched B in a minimal match */
4772         case WHILEM_B_max: /* just matched B in a maximal match */
4773             cur_curlyx = ST.save_curlyx;
4774             sayYES;
4775             /* NOTREACHED */
4776
4777         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4778             cur_curlyx = ST.save_curlyx;
4779             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4780             cur_curlyx->u.curlyx.count--;
4781             CACHEsayNO;
4782             /* NOTREACHED */
4783
4784         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4785             /* FALL THROUGH */
4786         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4787             REGCP_UNWIND(ST.lastcp);
4788             regcppop(rex);
4789             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4790             cur_curlyx->u.curlyx.count--;
4791             CACHEsayNO;
4792             /* NOTREACHED */
4793
4794         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4795             REGCP_UNWIND(ST.lastcp);
4796             regcppop(rex);      /* Restore some previous $<digit>s? */
4797             PL_reginput = locinput;
4798             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4799                 "%*s  whilem: failed, trying continuation...\n",
4800                 REPORT_CODE_OFF+depth*2, "")
4801             );
4802           do_whilem_B_max:
4803             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4804                 && ckWARN(WARN_REGEXP)
4805                 && !(PL_reg_flags & RF_warned))
4806             {
4807                 PL_reg_flags |= RF_warned;
4808                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4809                      "Complex regular subexpression recursion",
4810                      REG_INFTY - 1);
4811             }
4812
4813             /* now try B */
4814             ST.save_curlyx = cur_curlyx;
4815             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4816             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4817             /* NOTREACHED */
4818
4819         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4820             cur_curlyx = ST.save_curlyx;
4821             REGCP_UNWIND(ST.lastcp);
4822             regcppop(rex);
4823
4824             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4825                 /* Maximum greed exceeded */
4826                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4827                     && ckWARN(WARN_REGEXP)
4828                     && !(PL_reg_flags & RF_warned))
4829                 {
4830                     PL_reg_flags |= RF_warned;
4831                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4832                         "%s limit (%d) exceeded",
4833                         "Complex regular subexpression recursion",
4834                         REG_INFTY - 1);
4835                 }
4836                 cur_curlyx->u.curlyx.count--;
4837                 CACHEsayNO;
4838             }
4839
4840             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4841                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4842             );
4843             /* Try grabbing another A and see if it helps. */
4844             PL_reginput = locinput;
4845             cur_curlyx->u.curlyx.lastloc = locinput;
4846             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4847             REGCP_SET(ST.lastcp);
4848             PUSH_STATE_GOTO(WHILEM_A_min,
4849                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4850             /* NOTREACHED */
4851
4852 #undef  ST
4853 #define ST st->u.branch
4854
4855         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4856             next = scan + ARG(scan);
4857             if (next == scan)
4858                 next = NULL;
4859             scan = NEXTOPER(scan);
4860             /* FALL THROUGH */
4861
4862         case BRANCH:        /*  /(...|A|...)/ */
4863             scan = NEXTOPER(scan); /* scan now points to inner node */
4864             ST.lastparen = *PL_reglastparen;
4865             ST.next_branch = next;
4866             REGCP_SET(ST.cp);
4867             PL_reginput = locinput;
4868
4869             /* Now go into the branch */
4870             if (has_cutgroup) {
4871                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4872             } else {
4873                 PUSH_STATE_GOTO(BRANCH_next, scan);
4874             }
4875             /* NOTREACHED */
4876         case CUTGROUP:
4877             PL_reginput = locinput;
4878             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4879                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4880             PUSH_STATE_GOTO(CUTGROUP_next,next);
4881             /* NOTREACHED */
4882         case CUTGROUP_next_fail:
4883             do_cutgroup = 1;
4884             no_final = 1;
4885             if (st->u.mark.mark_name)
4886                 sv_commit = st->u.mark.mark_name;
4887             sayNO;
4888             /* NOTREACHED */
4889         case BRANCH_next:
4890             sayYES;
4891             /* NOTREACHED */
4892         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4893             if (do_cutgroup) {
4894                 do_cutgroup = 0;
4895                 no_final = 0;
4896             }
4897             REGCP_UNWIND(ST.cp);
4898             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4899                 PL_regoffs[n].end = -1;
4900             *PL_reglastparen = n;
4901             /*dmq: *PL_reglastcloseparen = n; */
4902             scan = ST.next_branch;
4903             /* no more branches? */
4904             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4905                 DEBUG_EXECUTE_r({
4906                     PerlIO_printf( Perl_debug_log,
4907                         "%*s  %sBRANCH failed...%s\n",
4908                         REPORT_CODE_OFF+depth*2, "",
4909                         PL_colors[4],
4910                         PL_colors[5] );
4911                 });
4912                 sayNO_SILENT;
4913             }
4914             continue; /* execute next BRANCH[J] op */
4915             /* NOTREACHED */
4916
4917         case MINMOD:
4918             minmod = 1;
4919             break;
4920
4921 #undef  ST
4922 #define ST st->u.curlym
4923
4924         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4925
4926             /* This is an optimisation of CURLYX that enables us to push
4927              * only a single backtracking state, no matter how many matches
4928              * there are in {m,n}. It relies on the pattern being constant
4929              * length, with no parens to influence future backrefs
4930              */
4931
4932             ST.me = scan;
4933             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4934
4935             /* if paren positive, emulate an OPEN/CLOSE around A */
4936             if (ST.me->flags) {
4937                 U32 paren = ST.me->flags;
4938                 if (paren > PL_regsize)
4939                     PL_regsize = paren;
4940                 if (paren > *PL_reglastparen)
4941                     *PL_reglastparen = paren;
4942                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4943             }
4944             ST.A = scan;
4945             ST.B = next;
4946             ST.alen = 0;
4947             ST.count = 0;
4948             ST.minmod = minmod;
4949             minmod = 0;
4950             ST.c1 = CHRTEST_UNINIT;
4951             REGCP_SET(ST.cp);
4952
4953             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4954                 goto curlym_do_B;
4955
4956           curlym_do_A: /* execute the A in /A{m,n}B/  */
4957             PL_reginput = locinput;
4958             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4959             /* NOTREACHED */
4960
4961         case CURLYM_A: /* we've just matched an A */
4962             locinput = st->locinput;
4963             nextchr = UCHARAT(locinput);
4964
4965             ST.count++;
4966             /* after first match, determine A's length: u.curlym.alen */
4967             if (ST.count == 1) {
4968                 if (PL_reg_match_utf8) {
4969                     char *s = locinput;
4970                     while (s < PL_reginput) {
4971                         ST.alen++;
4972                         s += UTF8SKIP(s);
4973                     }
4974                 }
4975                 else {
4976                     ST.alen = PL_reginput - locinput;
4977                 }
4978                 if (ST.alen == 0)
4979                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4980             }
4981             DEBUG_EXECUTE_r(
4982                 PerlIO_printf(Perl_debug_log,
4983                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4984                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4985                           (IV) ST.count, (IV)ST.alen)
4986             );
4987
4988             locinput = PL_reginput;
4989
4990             if (cur_eval && cur_eval->u.eval.close_paren &&
4991                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4992                 goto fake_end;
4993
4994             {
4995                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4996                 if ( max == REG_INFTY || ST.count < max )
4997                     goto curlym_do_A; /* try to match another A */
4998             }
4999             goto curlym_do_B; /* try to match B */
5000
5001         case CURLYM_A_fail: /* just failed to match an A */
5002             REGCP_UNWIND(ST.cp);
5003
5004             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5005                 || (cur_eval && cur_eval->u.eval.close_paren &&
5006                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5007                 sayNO;
5008
5009           curlym_do_B: /* execute the B in /A{m,n}B/  */
5010             PL_reginput = locinput;
5011             if (ST.c1 == CHRTEST_UNINIT) {
5012                 /* calculate c1 and c2 for possible match of 1st char
5013                  * following curly */
5014                 ST.c1 = ST.c2 = CHRTEST_VOID;
5015                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5016                     regnode *text_node = ST.B;
5017                     if (! HAS_TEXT(text_node))
5018                         FIND_NEXT_IMPT(text_node);
5019                     /* this used to be
5020
5021                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5022
5023                         But the former is redundant in light of the latter.
5024
5025                         if this changes back then the macro for
5026                         IS_TEXT and friends need to change.
5027                      */
5028                     if (PL_regkind[OP(text_node)] == EXACT)
5029                     {
5030
5031                         ST.c1 = (U8)*STRING(text_node);
5032                         switch (OP(text_node)) {
5033                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5034                             case EXACTFA:
5035                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5036                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5037                             default: ST.c2 = ST.c1;
5038                         }
5039                     }
5040                 }
5041             }
5042
5043             DEBUG_EXECUTE_r(
5044                 PerlIO_printf(Perl_debug_log,
5045                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5046                     (int)(REPORT_CODE_OFF+(depth*2)),
5047                     "", (IV)ST.count)
5048                 );
5049             if (ST.c1 != CHRTEST_VOID
5050                     && UCHARAT(PL_reginput) != ST.c1
5051                     && UCHARAT(PL_reginput) != ST.c2)
5052             {
5053                 /* simulate B failing */
5054                 DEBUG_OPTIMISE_r(
5055                     PerlIO_printf(Perl_debug_log,
5056                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5057                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5058                         (IV)ST.c1,(IV)ST.c2
5059                 ));
5060                 state_num = CURLYM_B_fail;
5061                 goto reenter_switch;
5062             }
5063
5064             if (ST.me->flags) {
5065                 /* mark current A as captured */
5066                 I32 paren = ST.me->flags;
5067                 if (ST.count) {
5068                     PL_regoffs[paren].start
5069                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5070                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5071                     /*dmq: *PL_reglastcloseparen = paren; */
5072                 }
5073                 else
5074                     PL_regoffs[paren].end = -1;
5075                 if (cur_eval && cur_eval->u.eval.close_paren &&
5076                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5077                 {
5078                     if (ST.count)
5079                         goto fake_end;
5080                     else
5081                         sayNO;
5082                 }
5083             }
5084
5085             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5086             /* NOTREACHED */
5087
5088         case CURLYM_B_fail: /* just failed to match a B */
5089             REGCP_UNWIND(ST.cp);
5090             if (ST.minmod) {
5091                 I32 max = ARG2(ST.me);
5092                 if (max != REG_INFTY && ST.count == max)
5093                     sayNO;
5094                 goto curlym_do_A; /* try to match a further A */
5095             }
5096             /* backtrack one A */
5097             if (ST.count == ARG1(ST.me) /* min */)
5098                 sayNO;
5099             ST.count--;
5100             locinput = HOPc(locinput, -ST.alen);
5101             goto curlym_do_B; /* try to match B */
5102
5103 #undef ST
5104 #define ST st->u.curly
5105
5106 #define CURLY_SETPAREN(paren, success) \
5107     if (paren) { \
5108         if (success) { \
5109             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5110             PL_regoffs[paren].end = locinput - PL_bostr; \
5111             *PL_reglastcloseparen = paren; \
5112         } \
5113         else \
5114             PL_regoffs[paren].end = -1; \
5115     }
5116
5117         case STAR:              /*  /A*B/ where A is width 1 */
5118             ST.paren = 0;
5119             ST.min = 0;
5120             ST.max = REG_INFTY;
5121             scan = NEXTOPER(scan);
5122             goto repeat;
5123         case PLUS:              /*  /A+B/ where A is width 1 */
5124             ST.paren = 0;
5125             ST.min = 1;
5126             ST.max = REG_INFTY;
5127             scan = NEXTOPER(scan);
5128             goto repeat;
5129         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5130             ST.paren = scan->flags;     /* Which paren to set */
5131             if (ST.paren > PL_regsize)
5132                 PL_regsize = ST.paren;
5133             if (ST.paren > *PL_reglastparen)
5134                 *PL_reglastparen = ST.paren;
5135             ST.min = ARG1(scan);  /* min to match */
5136             ST.max = ARG2(scan);  /* max to match */
5137             if (cur_eval && cur_eval->u.eval.close_paren &&
5138                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5139                 ST.min=1;
5140                 ST.max=1;
5141             }
5142             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5143             goto repeat;
5144         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5145             ST.paren = 0;
5146             ST.min = ARG1(scan);  /* min to match */
5147             ST.max = ARG2(scan);  /* max to match */
5148             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5149           repeat:
5150             /*
5151             * Lookahead to avoid useless match attempts
5152             * when we know what character comes next.
5153             *
5154             * Used to only do .*x and .*?x, but now it allows
5155             * for )'s, ('s and (?{ ... })'s to be in the way
5156             * of the quantifier and the EXACT-like node.  -- japhy
5157             */
5158
5159             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5160                 sayNO;
5161             if (HAS_TEXT(next) || JUMPABLE(next)) {
5162                 U8 *s;
5163                 regnode *text_node = next;
5164
5165                 if (! HAS_TEXT(text_node))
5166                     FIND_NEXT_IMPT(text_node);
5167
5168                 if (! HAS_TEXT(text_node))
5169                     ST.c1 = ST.c2 = CHRTEST_VOID;
5170                 else {
5171                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5172                         ST.c1 = ST.c2 = CHRTEST_VOID;
5173                         goto assume_ok_easy;
5174                     }
5175                     else
5176                         s = (U8*)STRING(text_node);
5177
5178                     /*  Currently we only get here when
5179
5180                         PL_rekind[OP(text_node)] == EXACT
5181
5182                         if this changes back then the macro for IS_TEXT and
5183                         friends need to change. */
5184                     if (!UTF_PATTERN) {
5185                         ST.c1 = *s;
5186                         switch (OP(text_node)) {
5187                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5188                             case EXACTFA:
5189                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5190                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5191                             default: ST.c2 = ST.c1; break;
5192                         }
5193                     }
5194                     else { /* UTF_PATTERN */
5195                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5196                              STRLEN ulen1, ulen2;
5197                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5198                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5199
5200                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5201                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5202 #ifdef EBCDIC
5203                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5204                                                     ckWARN(WARN_UTF8) ?
5205                                                     0 : UTF8_ALLOW_ANY);
5206                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5207                                                     ckWARN(WARN_UTF8) ?
5208                                                     0 : UTF8_ALLOW_ANY);
5209 #else
5210                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5211                                                     uniflags);
5212                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5213                                                     uniflags);
5214 #endif
5215                         }
5216                         else {
5217                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5218                                                      uniflags);
5219                         }
5220                     }
5221                 }
5222             }
5223             else
5224                 ST.c1 = ST.c2 = CHRTEST_VOID;
5225         assume_ok_easy:
5226
5227             ST.A = scan;
5228             ST.B = next;
5229             PL_reginput = locinput;
5230             if (minmod) {
5231                 minmod = 0;
5232                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5233                     sayNO;
5234                 ST.count = ST.min;
5235                 locinput = PL_reginput;
5236                 REGCP_SET(ST.cp);
5237                 if (ST.c1 == CHRTEST_VOID)
5238                     goto curly_try_B_min;
5239
5240                 ST.oldloc = locinput;
5241
5242                 /* set ST.maxpos to the furthest point along the
5243                  * string that could possibly match */
5244                 if  (ST.max == REG_INFTY) {
5245                     ST.maxpos = PL_regeol - 1;
5246                     if (utf8_target)
5247                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5248                             ST.maxpos--;
5249                 }
5250                 else if (utf8_target) {
5251                     int m = ST.max - ST.min;
5252                     for (ST.maxpos = locinput;
5253                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5254                         ST.maxpos += UTF8SKIP(ST.maxpos);
5255                 }
5256                 else {
5257                     ST.maxpos = locinput + ST.max - ST.min;
5258                     if (ST.maxpos >= PL_regeol)
5259                         ST.maxpos = PL_regeol - 1;
5260                 }
5261                 goto curly_try_B_min_known;
5262
5263             }
5264             else {
5265                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5266                 locinput = PL_reginput;
5267                 if (ST.count < ST.min)
5268                     sayNO;
5269                 if ((ST.count > ST.min)
5270                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5271                 {
5272                     /* A{m,n} must come at the end of the string, there's
5273                      * no point in backing off ... */
5274                     ST.min = ST.count;
5275                     /* ...except that $ and \Z can match before *and* after
5276                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5277                        We may back off by one in this case. */
5278                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5279                         ST.min--;
5280                 }
5281                 REGCP_SET(ST.cp);
5282                 goto curly_try_B_max;
5283             }
5284             /* NOTREACHED */
5285
5286
5287         case CURLY_B_min_known_fail:
5288             /* failed to find B in a non-greedy match where c1,c2 valid */
5289             if (ST.paren && ST.count)
5290                 PL_regoffs[ST.paren].end = -1;
5291
5292             PL_reginput = locinput;     /* Could be reset... */
5293             REGCP_UNWIND(ST.cp);
5294             /* Couldn't or didn't -- move forward. */
5295             ST.oldloc = locinput;
5296             if (utf8_target)
5297                 locinput += UTF8SKIP(locinput);
5298             else
5299                 locinput++;
5300             ST.count++;
5301           curly_try_B_min_known:
5302              /* find the next place where 'B' could work, then call B */
5303             {
5304                 int n;
5305                 if (utf8_target) {
5306                     n = (ST.oldloc == locinput) ? 0 : 1;
5307                     if (ST.c1 == ST.c2) {
5308                         STRLEN len;
5309                         /* set n to utf8_distance(oldloc, locinput) */
5310                         while (locinput <= ST.maxpos &&
5311                                utf8n_to_uvchr((U8*)locinput,
5312                                               UTF8_MAXBYTES, &len,
5313                                               uniflags) != (UV)ST.c1) {
5314                             locinput += len;
5315                             n++;
5316                         }
5317                     }
5318                     else {
5319                         /* set n to utf8_distance(oldloc, locinput) */
5320                         while (locinput <= ST.maxpos) {
5321                             STRLEN len;
5322                             const UV c = utf8n_to_uvchr((U8*)locinput,
5323                                                   UTF8_MAXBYTES, &len,
5324                                                   uniflags);
5325                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5326                                 break;
5327                             locinput += len;
5328                             n++;
5329                         }
5330                     }
5331                 }
5332                 else {
5333                     if (ST.c1 == ST.c2) {
5334                         while (locinput <= ST.maxpos &&
5335                                UCHARAT(locinput) != ST.c1)
5336                             locinput++;
5337                     }
5338                     else {
5339                         while (locinput <= ST.maxpos
5340                                && UCHARAT(locinput) != ST.c1
5341                                && UCHARAT(locinput) != ST.c2)
5342                             locinput++;
5343                     }
5344                     n = locinput - ST.oldloc;
5345                 }
5346                 if (locinput > ST.maxpos)
5347                     sayNO;
5348                 /* PL_reginput == oldloc now */
5349                 if (n) {
5350                     ST.count += n;
5351                     if (regrepeat(rex, ST.A, n, depth) < n)
5352                         sayNO;
5353                 }
5354                 PL_reginput = locinput;
5355                 CURLY_SETPAREN(ST.paren, ST.count);
5356                 if (cur_eval && cur_eval->u.eval.close_paren &&
5357                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5358                     goto fake_end;
5359                 }
5360                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5361             }
5362             /* NOTREACHED */
5363
5364
5365         case CURLY_B_min_fail:
5366             /* failed to find B in a non-greedy match where c1,c2 invalid */
5367             if (ST.paren && ST.count)
5368                 PL_regoffs[ST.paren].end = -1;
5369
5370             REGCP_UNWIND(ST.cp);
5371             /* failed -- move forward one */
5372             PL_reginput = locinput;
5373             if (regrepeat(rex, ST.A, 1, depth)) {
5374                 ST.count++;
5375                 locinput = PL_reginput;
5376                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5377                         ST.count > 0)) /* count overflow ? */
5378                 {
5379                   curly_try_B_min:
5380                     CURLY_SETPAREN(ST.paren, ST.count);
5381                     if (cur_eval && cur_eval->u.eval.close_paren &&
5382                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5383                         goto fake_end;
5384                     }
5385                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5386                 }
5387             }
5388             sayNO;
5389             /* NOTREACHED */
5390
5391
5392         curly_try_B_max:
5393             /* a successful greedy match: now try to match B */
5394             if (cur_eval && cur_eval->u.eval.close_paren &&
5395                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5396                 goto fake_end;
5397             }
5398             {
5399                 UV c = 0;
5400                 if (ST.c1 != CHRTEST_VOID)
5401                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5402                                            UTF8_MAXBYTES, 0, uniflags)
5403                                 : (UV) UCHARAT(PL_reginput);
5404                 /* If it could work, try it. */
5405                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5406                     CURLY_SETPAREN(ST.paren, ST.count);
5407                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5408                     /* NOTREACHED */
5409                 }
5410             }
5411             /* FALL THROUGH */
5412         case CURLY_B_max_fail:
5413             /* failed to find B in a greedy match */
5414             if (ST.paren && ST.count)
5415                 PL_regoffs[ST.paren].end = -1;
5416
5417             REGCP_UNWIND(ST.cp);
5418             /*  back up. */
5419             if (--ST.count < ST.min)
5420                 sayNO;
5421             PL_reginput = locinput = HOPc(locinput, -1);
5422             goto curly_try_B_max;
5423
5424 #undef ST
5425
5426         case END:
5427             fake_end:
5428             if (cur_eval) {
5429                 /* we've just finished A in /(??{A})B/; now continue with B */
5430                 I32 tmpix;
5431                 st->u.eval.toggle_reg_flags
5432                             = cur_eval->u.eval.toggle_reg_flags;
5433                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5434
5435                 st->u.eval.prev_rex = rex_sv;           /* inner */
5436                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5437                 rex = (struct regexp *)SvANY(rex_sv);
5438                 rexi = RXi_GET(rex);
5439                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5440                 (void)ReREFCNT_inc(rex_sv);
5441                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5442
5443                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5444                 PL_reglastparen = &rex->lastparen;
5445                 PL_reglastcloseparen = &rex->lastcloseparen;
5446
5447                 REGCP_SET(st->u.eval.lastcp);
5448                 PL_reginput = locinput;
5449
5450                 /* Restore parens of the outer rex without popping the
5451                  * savestack */
5452                 tmpix = PL_savestack_ix;
5453                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5454                 regcppop(rex);
5455                 PL_savestack_ix = tmpix;
5456
5457                 st->u.eval.prev_eval = cur_eval;
5458                 cur_eval = cur_eval->u.eval.prev_eval;
5459                 DEBUG_EXECUTE_r(
5460                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5461                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5462                 if ( nochange_depth )
5463                     nochange_depth--;
5464
5465                 PUSH_YES_STATE_GOTO(EVAL_AB,
5466                         st->u.eval.prev_eval->u.eval.B); /* match B */
5467             }
5468
5469             if (locinput < reginfo->till) {
5470                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5471                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5472                                       PL_colors[4],
5473                                       (long)(locinput - PL_reg_starttry),
5474                                       (long)(reginfo->till - PL_reg_starttry),
5475                                       PL_colors[5]));
5476
5477                 sayNO_SILENT;           /* Cannot match: too short. */
5478             }
5479             PL_reginput = locinput;     /* put where regtry can find it */
5480             sayYES;                     /* Success! */
5481
5482         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5483             DEBUG_EXECUTE_r(
5484             PerlIO_printf(Perl_debug_log,
5485                 "%*s  %ssubpattern success...%s\n",
5486                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5487             PL_reginput = locinput;     /* put where regtry can find it */
5488             sayYES;                     /* Success! */
5489
5490 #undef  ST
5491 #define ST st->u.ifmatch
5492
5493         case SUSPEND:   /* (?>A) */
5494             ST.wanted = 1;
5495             PL_reginput = locinput;
5496             goto do_ifmatch;
5497
5498         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5499             ST.wanted = 0;
5500             goto ifmatch_trivial_fail_test;
5501
5502         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5503             ST.wanted = 1;
5504           ifmatch_trivial_fail_test:
5505             if (scan->flags) {
5506                 char * const s = HOPBACKc(locinput, scan->flags);
5507                 if (!s) {
5508                     /* trivial fail */
5509                     if (logical) {
5510                         logical = 0;
5511                         sw = 1 - cBOOL(ST.wanted);
5512                     }
5513                     else if (ST.wanted)
5514                         sayNO;
5515                     next = scan + ARG(scan);
5516                     if (next == scan)
5517                         next = NULL;
5518                     break;
5519                 }
5520                 PL_reginput = s;
5521             }
5522             else
5523                 PL_reginput = locinput;
5524
5525           do_ifmatch:
5526             ST.me = scan;
5527             ST.logical = logical;
5528             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5529
5530             /* execute body of (?...A) */
5531             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5532             /* NOTREACHED */
5533
5534         case IFMATCH_A_fail: /* body of (?...A) failed */
5535             ST.wanted = !ST.wanted;
5536             /* FALL THROUGH */
5537
5538         case IFMATCH_A: /* body of (?...A) succeeded */
5539             if (ST.logical) {
5540                 sw = cBOOL(ST.wanted);
5541             }
5542             else if (!ST.wanted)
5543                 sayNO;
5544
5545             if (OP(ST.me) == SUSPEND)
5546                 locinput = PL_reginput;
5547             else {
5548                 locinput = PL_reginput = st->locinput;
5549                 nextchr = UCHARAT(locinput);
5550             }
5551             scan = ST.me + ARG(ST.me);
5552             if (scan == ST.me)
5553                 scan = NULL;
5554             continue; /* execute B */
5555
5556 #undef ST
5557
5558         case LONGJMP:
5559             next = scan + ARG(scan);
5560             if (next == scan)
5561                 next = NULL;
5562             break;
5563         case COMMIT:
5564             reginfo->cutpoint = PL_regeol;
5565             /* FALLTHROUGH */
5566         case PRUNE:
5567             PL_reginput = locinput;
5568             if (!scan->flags)
5569                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5570             PUSH_STATE_GOTO(COMMIT_next,next);
5571             /* NOTREACHED */
5572         case COMMIT_next_fail:
5573             no_final = 1;
5574             /* FALLTHROUGH */
5575         case OPFAIL:
5576             sayNO;
5577             /* NOTREACHED */
5578
5579 #define ST st->u.mark
5580         case MARKPOINT:
5581             ST.prev_mark = mark_state;
5582             ST.mark_name = sv_commit = sv_yes_mark
5583                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5584             mark_state = st;
5585             ST.mark_loc = PL_reginput = locinput;
5586             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5587             /* NOTREACHED */
5588         case MARKPOINT_next:
5589             mark_state = ST.prev_mark;
5590             sayYES;
5591             /* NOTREACHED */
5592         case MARKPOINT_next_fail:
5593             if (popmark && sv_eq(ST.mark_name,popmark))
5594             {
5595                 if (ST.mark_loc > startpoint)
5596                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5597                 popmark = NULL; /* we found our mark */
5598                 sv_commit = ST.mark_name;
5599
5600                 DEBUG_EXECUTE_r({
5601                         PerlIO_printf(Perl_debug_log,
5602                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5603                             REPORT_CODE_OFF+depth*2, "",
5604                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5605                 });
5606             }
5607             mark_state = ST.prev_mark;
5608             sv_yes_mark = mark_state ?
5609                 mark_state->u.mark.mark_name : NULL;
5610             sayNO;
5611             /* NOTREACHED */
5612         case SKIP:
5613             PL_reginput = locinput;
5614             if (scan->flags) {
5615                 /* (*SKIP) : if we fail we cut here*/
5616                 ST.mark_name = NULL;
5617                 ST.mark_loc = locinput;
5618                 PUSH_STATE_GOTO(SKIP_next,next);
5619             } else {
5620                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5621                    otherwise do nothing.  Meaning we need to scan
5622                  */
5623                 regmatch_state *cur = mark_state;
5624                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5625
5626                 while (cur) {
5627                     if ( sv_eq( cur->u.mark.mark_name,
5628                                 find ) )
5629                     {
5630                         ST.mark_name = find;
5631                         PUSH_STATE_GOTO( SKIP_next, next );
5632                     }
5633                     cur = cur->u.mark.prev_mark;
5634                 }
5635             }
5636             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5637             break;
5638         case SKIP_next_fail:
5639             if (ST.mark_name) {
5640                 /* (*CUT:NAME) - Set up to search for the name as we
5641                    collapse the stack*/
5642                 popmark = ST.mark_name;
5643             } else {
5644                 /* (*CUT) - No name, we cut here.*/
5645                 if (ST.mark_loc > startpoint)
5646                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5647                 /* but we set sv_commit to latest mark_name if there
5648                    is one so they can test to see how things lead to this
5649                    cut */
5650                 if (mark_state)
5651                     sv_commit=mark_state->u.mark.mark_name;
5652             }
5653             no_final = 1;
5654             sayNO;
5655             /* NOTREACHED */
5656 #undef ST
5657         case FOLDCHAR:
5658             n = ARG(scan);
5659             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5660                 locinput += ln;
5661             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5662                 sayNO;
5663             } else  {
5664                 U8 folded[UTF8_MAXBYTES_CASE+1];
5665                 STRLEN foldlen;
5666                 const char * const l = locinput;
5667                 char *e = PL_regeol;
5668                 to_uni_fold(n, folded, &foldlen);
5669
5670                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5671                                l, &e, 0,  utf8_target)) {
5672                         sayNO;
5673                 }
5674                 locinput = e;
5675             }
5676             nextchr = UCHARAT(locinput);
5677             break;
5678         case LNBREAK:
5679             if ((n=is_LNBREAK(locinput,utf8_target))) {
5680                 locinput += n;
5681                 nextchr = UCHARAT(locinput);
5682             } else
5683                 sayNO;
5684             break;
5685
5686 #define CASE_CLASS(nAmE)                              \
5687         case nAmE:                                    \
5688             if (locinput >= PL_regeol)                \
5689                 sayNO;                                \
5690             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5691                 locinput += n;                        \
5692                 nextchr = UCHARAT(locinput);          \
5693             } else                                    \
5694                 sayNO;                                \
5695             break;                                    \
5696         case N##nAmE:                                 \
5697             if (locinput >= PL_regeol)                \
5698                 sayNO;                                \
5699             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5700                 sayNO;                                \
5701             } else {                                  \
5702                 locinput += UTF8SKIP(locinput);       \
5703                 nextchr = UCHARAT(locinput);          \
5704             }                                         \
5705             break
5706
5707         CASE_CLASS(VERTWS);
5708         CASE_CLASS(HORIZWS);
5709 #undef CASE_CLASS
5710
5711         default:
5712             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5713                           PTR2UV(scan), OP(scan));
5714             Perl_croak(aTHX_ "regexp memory corruption");
5715
5716         } /* end switch */
5717
5718         /* switch break jumps here */
5719         scan = next; /* prepare to execute the next op and ... */
5720         continue;    /* ... jump back to the top, reusing st */
5721         /* NOTREACHED */
5722
5723       push_yes_state:
5724         /* push a state that backtracks on success */
5725         st->u.yes.prev_yes_state = yes_state;
5726         yes_state = st;
5727         /* FALL THROUGH */
5728       push_state:
5729         /* push a new regex state, then continue at scan  */
5730         {
5731             regmatch_state *newst;
5732
5733             DEBUG_STACK_r({
5734                 regmatch_state *cur = st;
5735                 regmatch_state *curyes = yes_state;
5736                 int curd = depth;
5737                 regmatch_slab *slab = PL_regmatch_slab;
5738                 for (;curd > -1;cur--,curd--) {
5739                     if (cur < SLAB_FIRST(slab)) {
5740                         slab = slab->prev;
5741                         cur = SLAB_LAST(slab);
5742                     }
5743                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5744                         REPORT_CODE_OFF + 2 + depth * 2,"",
5745                         curd, PL_reg_name[cur->resume_state],
5746                         (curyes == cur) ? "yes" : ""
5747                     );
5748                     if (curyes == cur)
5749                         curyes = cur->u.yes.prev_yes_state;
5750                 }
5751             } else
5752                 DEBUG_STATE_pp("push")
5753             );
5754             depth++;
5755             st->locinput = locinput;
5756             newst = st+1;
5757             if (newst >  SLAB_LAST(PL_regmatch_slab))
5758                 newst = S_push_slab(aTHX);
5759             PL_regmatch_state = newst;
5760
5761             locinput = PL_reginput;
5762             nextchr = UCHARAT(locinput);
5763             st = newst;
5764             continue;
5765             /* NOTREACHED */
5766         }
5767     }
5768
5769     /*
5770     * We get here only if there's trouble -- normally "case END" is
5771     * the terminating point.
5772     */
5773     Perl_croak(aTHX_ "corrupted regexp pointers");
5774     /*NOTREACHED*/
5775     sayNO;
5776
5777 yes:
5778     if (yes_state) {
5779         /* we have successfully completed a subexpression, but we must now
5780          * pop to the state marked by yes_state and continue from there */
5781         assert(st != yes_state);
5782 #ifdef DEBUGGING
5783         while (st != yes_state) {
5784             st--;
5785             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5786                 PL_regmatch_slab = PL_regmatch_slab->prev;
5787                 st = SLAB_LAST(PL_regmatch_slab);
5788             }
5789             DEBUG_STATE_r({
5790                 if (no_final) {
5791                     DEBUG_STATE_pp("pop (no final)");
5792                 } else {
5793                     DEBUG_STATE_pp("pop (yes)");
5794                 }
5795             });
5796             depth--;
5797         }
5798 #else
5799         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5800             || yes_state > SLAB_LAST(PL_regmatch_slab))
5801         {
5802             /* not in this slab, pop slab */
5803             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5804             PL_regmatch_slab = PL_regmatch_slab->prev;
5805             st = SLAB_LAST(PL_regmatch_slab);
5806         }
5807         depth -= (st - yes_state);
5808 #endif
5809         st = yes_state;
5810         yes_state = st->u.yes.prev_yes_state;
5811         PL_regmatch_state = st;
5812
5813         if (no_final) {
5814             locinput= st->locinput;
5815             nextchr = UCHARAT(locinput);
5816         }
5817         state_num = st->resume_state + no_final;
5818         goto reenter_switch;
5819     }
5820
5821     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5822                           PL_colors[4], PL_colors[5]));
5823
5824     if (PL_reg_eval_set) {
5825         /* each successfully executed (?{...}) block does the equivalent of
5826          *   local $^R = do {...}
5827          * When popping the save stack, all these locals would be undone;
5828          * bypass this by setting the outermost saved $^R to the latest
5829          * value */
5830         if (oreplsv != GvSV(PL_replgv))
5831             sv_setsv(oreplsv, GvSV(PL_replgv));
5832     }
5833     result = 1;
5834     goto final_exit;
5835
5836 no:
5837     DEBUG_EXECUTE_r(
5838         PerlIO_printf(Perl_debug_log,
5839             "%*s  %sfailed...%s\n",
5840             REPORT_CODE_OFF+depth*2, "",
5841             PL_colors[4], PL_colors[5])
5842         );
5843
5844 no_silent:
5845     if (no_final) {
5846         if (yes_state) {
5847             goto yes;
5848         } else {
5849             goto final_exit;
5850         }
5851     }
5852     if (depth) {
5853         /* there's a previous state to backtrack to */
5854         st--;
5855         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5856             PL_regmatch_slab = PL_regmatch_slab->prev;
5857             st = SLAB_LAST(PL_regmatch_slab);
5858         }
5859         PL_regmatch_state = st;
5860         locinput= st->locinput;
5861         nextchr = UCHARAT(locinput);
5862
5863         DEBUG_STATE_pp("pop");
5864         depth--;
5865         if (yes_state == st)
5866             yes_state = st->u.yes.prev_yes_state;
5867
5868         state_num = st->resume_state + 1; /* failure = success + 1 */
5869         goto reenter_switch;
5870     }
5871     result = 0;
5872
5873   final_exit:
5874     if (rex->intflags & PREGf_VERBARG_SEEN) {
5875         SV *sv_err = get_sv("REGERROR", 1);
5876         SV *sv_mrk = get_sv("REGMARK", 1);
5877         if (result) {
5878             sv_commit = &PL_sv_no;
5879             if (!sv_yes_mark)
5880                 sv_yes_mark = &PL_sv_yes;
5881         } else {
5882             if (!sv_commit)
5883                 sv_commit = &PL_sv_yes;
5884             sv_yes_mark = &PL_sv_no;
5885         }
5886         sv_setsv(sv_err, sv_commit);
5887         sv_setsv(sv_mrk, sv_yes_mark);
5888     }
5889
5890     /* clean up; in particular, free all slabs above current one */
5891     LEAVE_SCOPE(oldsave);
5892
5893     return result;
5894 }
5895
5896 /*
5897  - regrepeat - repeatedly match something simple, report how many
5898  */
5899 /*
5900  * [This routine now assumes that it will only match on things of length 1.
5901  * That was true before, but now we assume scan - reginput is the count,
5902  * rather than incrementing count on every character.  [Er, except utf8.]]
5903  */
5904 STATIC I32
5905 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5906 {
5907     dVAR;
5908     register char *scan;
5909     register I32 c;
5910     register char *loceol = PL_regeol;
5911     register I32 hardcount = 0;
5912     register bool utf8_target = PL_reg_match_utf8;
5913     UV utf8_flags;
5914 #ifndef DEBUGGING
5915     PERL_UNUSED_ARG(depth);
5916 #endif
5917
5918     PERL_ARGS_ASSERT_REGREPEAT;
5919
5920     scan = PL_reginput;
5921     if (max == REG_INFTY)
5922         max = I32_MAX;
5923     else if (max < loceol - scan)
5924         loceol = scan + max;
5925     switch (OP(p)) {
5926     case REG_ANY:
5927         if (utf8_target) {
5928             loceol = PL_regeol;
5929             while (scan < loceol && hardcount < max && *scan != '\n') {
5930                 scan += UTF8SKIP(scan);
5931                 hardcount++;
5932             }
5933         } else {
5934             while (scan < loceol && *scan != '\n')
5935                 scan++;
5936         }
5937         break;
5938     case SANY:
5939         if (utf8_target) {
5940             loceol = PL_regeol;
5941             while (scan < loceol && hardcount < max) {
5942                 scan += UTF8SKIP(scan);
5943                 hardcount++;
5944             }
5945         }
5946         else
5947             scan = loceol;
5948         break;
5949     case CANY:
5950         scan = loceol;
5951         break;
5952     case EXACT:
5953         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5954          * means they match only characters in the string that can be expressed
5955          * as a single byte.  For non-utf8 strings, that means a simple match.
5956          * For utf8 strings, the character matched must be an invariant, or
5957          * downgradable to a single byte.  The pattern's utf8ness is
5958          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5959          * it is, it's an invariant */
5960
5961         c = (U8)*STRING(p);
5962         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5963
5964         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5965             while (scan < loceol && UCHARAT(scan) == c) {
5966                 scan++;
5967             }
5968         }
5969         else {
5970
5971             /* Here, the string is utf8, and the pattern char is different
5972              * in utf8 than not, so can't compare them directly.  Outside the
5973              * loop, find find the two utf8 bytes that represent c, and then
5974              * look for those in sequence in the utf8 string */
5975             U8 high = UTF8_TWO_BYTE_HI(c);
5976             U8 low = UTF8_TWO_BYTE_LO(c);
5977             loceol = PL_regeol;
5978
5979             while (hardcount < max
5980                     && scan + 1 < loceol
5981                     && UCHARAT(scan) == high
5982                     && UCHARAT(scan + 1) == low)
5983             {
5984                 scan += 2;
5985                 hardcount++;
5986             }
5987         }
5988         break;
5989     case EXACTFA:
5990         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5991         goto do_exactf;
5992
5993     case EXACTFL:
5994         PL_reg_flags |= RF_tainted;
5995         utf8_flags = FOLDEQ_UTF8_LOCALE;
5996         goto do_exactf;
5997
5998     case EXACTF:
5999     case EXACTFU:
6000         utf8_flags = 0;
6001
6002         /* The comments for the EXACT case above apply as well to these fold
6003          * ones */
6004
6005     do_exactf:
6006         c = (U8)*STRING(p);
6007         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6008
6009         if (utf8_target) { /* Use full Unicode fold matching */
6010             char *tmpeol = loceol;
6011             while (hardcount < max
6012                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6013                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6014             {
6015                 scan = tmpeol;
6016                 tmpeol = loceol;
6017                 hardcount++;
6018             }
6019
6020             /* XXX Note that the above handles properly the German sharp s in
6021              * the pattern matching ss in the string.  But it doesn't handle
6022              * properly cases where the string contains say 'LIGATURE ff' and
6023              * the pattern is 'f+'.  This would require, say, a new function or
6024              * revised interface to foldEQ_utf8(), in which the maximum number
6025              * of characters to match could be passed and it would return how
6026              * many actually did.  This is just one of many cases where
6027              * multi-char folds don't work properly, and so the fix is being
6028              * deferred */
6029         }
6030         else {
6031             U8 folded;
6032
6033             /* Here, the string isn't utf8 and c is a single byte; and either
6034              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6035              * doesn't affect c.  Can just do simple comparisons for exact or
6036              * fold matching. */
6037             switch (OP(p)) {
6038                 case EXACTF: folded = PL_fold[c]; break;
6039                 case EXACTFA:
6040                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6041                 case EXACTFL: folded = PL_fold_locale[c]; break;
6042                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6043             }
6044             while (scan < loceol &&
6045                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6046             {
6047                 scan++;
6048             }
6049         }
6050         break;
6051     case ANYOFV:
6052     case ANYOF:
6053         if (utf8_target || OP(p) == ANYOFV) {
6054             STRLEN inclasslen;
6055             loceol = PL_regeol;
6056             inclasslen = loceol - scan;
6057             while (hardcount < max
6058                    && ((inclasslen = loceol - scan) > 0)
6059                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6060             {
6061                 scan += inclasslen;
6062                 hardcount++;
6063             }
6064         } else {
6065             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6066                 scan++;
6067         }
6068         break;
6069     case ALNUMU:
6070         if (utf8_target) {
6071     utf8_wordchar:
6072             loceol = PL_regeol;
6073             LOAD_UTF8_CHARCLASS_ALNUM();
6074             while (hardcount < max && scan < loceol &&
6075                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6076             {
6077                 scan += UTF8SKIP(scan);
6078                 hardcount++;
6079             }
6080         } else {
6081             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6082                 scan++;
6083             }
6084         }
6085         break;
6086     case ALNUM:
6087         if (utf8_target)
6088             goto utf8_wordchar;
6089         while (scan < loceol && isALNUM((U8) *scan)) {
6090             scan++;
6091         }
6092         break;
6093     case ALNUMA:
6094         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6095             scan++;
6096         }
6097         break;
6098     case ALNUML:
6099         PL_reg_flags |= RF_tainted;
6100         if (utf8_target) {
6101             loceol = PL_regeol;
6102             while (hardcount < max && scan < loceol &&
6103                    isALNUM_LC_utf8((U8*)scan)) {
6104                 scan += UTF8SKIP(scan);
6105                 hardcount++;
6106             }
6107         } else {
6108             while (scan < loceol && isALNUM_LC(*scan))
6109                 scan++;
6110         }
6111         break;
6112     case NALNUMU:
6113         if (utf8_target) {
6114
6115     utf8_Nwordchar:
6116
6117             loceol = PL_regeol;
6118             LOAD_UTF8_CHARCLASS_ALNUM();
6119             while (hardcount < max && scan < loceol &&
6120                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6121             {
6122                 scan += UTF8SKIP(scan);
6123                 hardcount++;
6124             }
6125         } else {
6126             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6127                 scan++;
6128             }
6129         }
6130         break;
6131     case NALNUM:
6132         if (utf8_target)
6133             goto utf8_Nwordchar;
6134         while (scan < loceol && ! isALNUM((U8) *scan)) {
6135             scan++;
6136         }
6137         break;
6138     case NALNUMA:
6139         if (utf8_target) {
6140             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6141                 scan += UTF8SKIP(scan);
6142             }
6143         }
6144         else {
6145             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6146                 scan++;
6147             }
6148         }
6149         break;
6150     case NALNUML:
6151         PL_reg_flags |= RF_tainted;
6152         if (utf8_target) {
6153             loceol = PL_regeol;
6154             while (hardcount < max && scan < loceol &&
6155                    !isALNUM_LC_utf8((U8*)scan)) {
6156                 scan += UTF8SKIP(scan);
6157                 hardcount++;
6158             }
6159         } else {
6160             while (scan < loceol && !isALNUM_LC(*scan))
6161                 scan++;
6162         }
6163         break;
6164     case SPACEU:
6165         if (utf8_target) {
6166
6167     utf8_space:
6168
6169             loceol = PL_regeol;
6170             LOAD_UTF8_CHARCLASS_SPACE();
6171             while (hardcount < max && scan < loceol &&
6172                    (*scan == ' ' ||
6173                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6174             {
6175                 scan += UTF8SKIP(scan);
6176                 hardcount++;
6177             }
6178             break;
6179         }
6180         else {
6181             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6182                 scan++;
6183             }
6184             break;
6185         }
6186     case SPACE:
6187         if (utf8_target)
6188             goto utf8_space;
6189
6190         while (scan < loceol && isSPACE((U8) *scan)) {
6191             scan++;
6192         }
6193         break;
6194     case SPACEA:
6195         while (scan < loceol && isSPACE_A((U8) *scan)) {
6196             scan++;
6197         }
6198         break;
6199     case SPACEL:
6200         PL_reg_flags |= RF_tainted;
6201         if (utf8_target) {
6202             loceol = PL_regeol;
6203             while (hardcount < max && scan < loceol &&
6204                    isSPACE_LC_utf8((U8*)scan)) {
6205                 scan += UTF8SKIP(scan);
6206                 hardcount++;
6207             }
6208         } else {
6209             while (scan < loceol && isSPACE_LC(*scan))
6210                 scan++;
6211         }
6212         break;
6213     case NSPACEU:
6214         if (utf8_target) {
6215
6216     utf8_Nspace:
6217
6218             loceol = PL_regeol;
6219             LOAD_UTF8_CHARCLASS_SPACE();
6220             while (hardcount < max && scan < loceol &&
6221                    ! (*scan == ' ' ||
6222                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6223             {
6224                 scan += UTF8SKIP(scan);
6225                 hardcount++;
6226             }
6227             break;
6228         }
6229         else {
6230             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6231                 scan++;
6232             }
6233         }
6234         break;
6235     case NSPACE:
6236         if (utf8_target)
6237             goto utf8_Nspace;
6238
6239         while (scan < loceol && ! isSPACE((U8) *scan)) {
6240             scan++;
6241         }
6242         break;
6243     case NSPACEA:
6244         if (utf8_target) {
6245             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6246                 scan += UTF8SKIP(scan);
6247             }
6248         }
6249         else {
6250             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6251                 scan++;
6252             }
6253         }
6254         break;
6255     case NSPACEL:
6256         PL_reg_flags |= RF_tainted;
6257         if (utf8_target) {
6258             loceol = PL_regeol;
6259             while (hardcount < max && scan < loceol &&
6260                    !isSPACE_LC_utf8((U8*)scan)) {
6261                 scan += UTF8SKIP(scan);
6262                 hardcount++;
6263             }
6264         } else {
6265             while (scan < loceol && !isSPACE_LC(*scan))
6266                 scan++;
6267         }
6268         break;
6269     case DIGIT:
6270         if (utf8_target) {
6271             loceol = PL_regeol;
6272             LOAD_UTF8_CHARCLASS_DIGIT();
6273             while (hardcount < max && scan < loceol &&
6274                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6275                 scan += UTF8SKIP(scan);
6276                 hardcount++;
6277             }
6278         } else {
6279             while (scan < loceol && isDIGIT(*scan))
6280                 scan++;
6281         }
6282         break;
6283     case DIGITA:
6284         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6285             scan++;
6286         }
6287         break;
6288     case DIGITL:
6289         PL_reg_flags |= RF_tainted;
6290         if (utf8_target) {
6291             loceol = PL_regeol;
6292             while (hardcount < max && scan < loceol &&
6293                    isDIGIT_LC_utf8((U8*)scan)) {
6294                 scan += UTF8SKIP(scan);
6295                 hardcount++;
6296             }
6297         } else {
6298             while (scan < loceol && isDIGIT_LC(*scan))
6299                 scan++;
6300         }
6301         break;
6302     case NDIGIT:
6303         if (utf8_target) {
6304             loceol = PL_regeol;
6305             LOAD_UTF8_CHARCLASS_DIGIT();
6306             while (hardcount < max && scan < loceol &&
6307                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6308                 scan += UTF8SKIP(scan);
6309                 hardcount++;
6310             }
6311         } else {
6312             while (scan < loceol && !isDIGIT(*scan))
6313                 scan++;
6314         }
6315         break;
6316     case NDIGITA:
6317         if (utf8_target) {
6318             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6319                 scan += UTF8SKIP(scan);
6320             }
6321         }
6322         else {
6323             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6324                 scan++;
6325             }
6326         }
6327         break;
6328     case NDIGITL:
6329         PL_reg_flags |= RF_tainted;
6330         if (utf8_target) {
6331             loceol = PL_regeol;
6332             while (hardcount < max && scan < loceol &&
6333                    !isDIGIT_LC_utf8((U8*)scan)) {
6334                 scan += UTF8SKIP(scan);
6335                 hardcount++;
6336             }
6337         } else {
6338             while (scan < loceol && !isDIGIT_LC(*scan))
6339                 scan++;
6340         }
6341         break;
6342     case LNBREAK:
6343         if (utf8_target) {
6344             loceol = PL_regeol;
6345             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6346                 scan += c;
6347                 hardcount++;
6348             }
6349         } else {
6350             /*
6351               LNBREAK can match two latin chars, which is ok,
6352               because we have a null terminated string, but we
6353               have to use hardcount in this situation
6354             */
6355             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6356                 scan+=c;
6357                 hardcount++;
6358             }
6359         }
6360         break;
6361     case HORIZWS:
6362         if (utf8_target) {
6363             loceol = PL_regeol;
6364             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6365                 scan += c;
6366                 hardcount++;
6367             }
6368         } else {
6369             while (scan < loceol && is_HORIZWS_latin1(scan))
6370                 scan++;
6371         }
6372         break;
6373     case NHORIZWS:
6374         if (utf8_target) {
6375             loceol = PL_regeol;
6376             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6377                 scan += UTF8SKIP(scan);
6378                 hardcount++;
6379             }
6380         } else {
6381             while (scan < loceol && !is_HORIZWS_latin1(scan))
6382                 scan++;
6383
6384         }
6385         break;
6386     case VERTWS:
6387         if (utf8_target) {
6388             loceol = PL_regeol;
6389             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6390                 scan += c;
6391                 hardcount++;
6392             }
6393         } else {
6394             while (scan < loceol && is_VERTWS_latin1(scan))
6395                 scan++;
6396
6397         }
6398         break;
6399     case NVERTWS:
6400         if (utf8_target) {
6401             loceol = PL_regeol;
6402             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6403                 scan += UTF8SKIP(scan);
6404                 hardcount++;
6405             }
6406         } else {
6407             while (scan < loceol && !is_VERTWS_latin1(scan))
6408                 scan++;
6409
6410         }
6411         break;
6412
6413     default:            /* Called on something of 0 width. */
6414         break;          /* So match right here or not at all. */
6415     }
6416
6417     if (hardcount)
6418         c = hardcount;
6419     else
6420         c = scan - PL_reginput;
6421     PL_reginput = scan;
6422
6423     DEBUG_r({
6424         GET_RE_DEBUG_FLAGS_DECL;
6425         DEBUG_EXECUTE_r({
6426             SV * const prop = sv_newmortal();
6427             regprop(prog, prop, p);
6428             PerlIO_printf(Perl_debug_log,
6429                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6430                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6431         });
6432     });
6433
6434     return(c);
6435 }
6436
6437
6438 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6439 /*
6440 - regclass_swash - prepare the utf8 swash
6441 */
6442
6443 SV *
6444 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6445 {
6446     dVAR;
6447     SV *sw  = NULL;
6448     SV *si  = NULL;
6449     SV *alt = NULL;
6450     RXi_GET_DECL(prog,progi);
6451     const struct reg_data * const data = prog ? progi->data : NULL;
6452
6453     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6454
6455     assert(ANYOF_NONBITMAP(node));
6456
6457     if (data && data->count) {
6458         const U32 n = ARG(node);
6459
6460         if (data->what[n] == 's') {
6461             SV * const rv = MUTABLE_SV(data->data[n]);
6462             AV * const av = MUTABLE_AV(SvRV(rv));
6463             SV **const ary = AvARRAY(av);
6464             SV **a, **b;
6465
6466             /* See the end of regcomp.c:S_regclass() for
6467              * documentation of these array elements. */
6468
6469             si = *ary;
6470             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6471             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6472
6473             if (a)
6474                 sw = *a;
6475             else if (si && doinit) {
6476                 sw = swash_init("utf8", "", si, 1, 0);
6477                 (void)av_store(av, 1, sw);
6478             }
6479             if (b)
6480                 alt = *b;
6481         }
6482     }
6483
6484     if (listsvp)
6485         *listsvp = si;
6486     if (altsvp)
6487         *altsvp  = alt;
6488
6489     return sw;
6490 }
6491 #endif
6492
6493 /*
6494  - reginclass - determine if a character falls into a character class
6495
6496   n is the ANYOF regnode
6497   p is the target string
6498   lenp is pointer to the maximum number of bytes of how far to go in p
6499     (This is assumed wthout checking to always be at least the current
6500     character's size)
6501   utf8_target tells whether p is in UTF-8.
6502
6503   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6504   from a successful match, the value it points to will be updated to how many
6505   bytes in p were matched.  If there was no match, the value is undefined,
6506   possibly changed from the input.
6507
6508   Note that this can be a synthetic start class, a combination of various
6509   nodes, so things you think might be mutually exclusive, such as locale,
6510   aren't.  It can match both locale and non-locale
6511
6512  */
6513
6514 STATIC bool
6515 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6516 {
6517     dVAR;
6518     const char flags = ANYOF_FLAGS(n);
6519     bool match = FALSE;
6520     UV c = *p;
6521     STRLEN c_len = 0;
6522     STRLEN maxlen;
6523
6524     PERL_ARGS_ASSERT_REGINCLASS;
6525
6526     /* If c is not already the code point, get it */
6527     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6528         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6529                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6530                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6531                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6532                  * UTF8_ALLOW_FFFF */
6533         if (c_len == (STRLEN)-1)
6534             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6535     }
6536     else {
6537         c_len = 1;
6538     }
6539
6540     /* Use passed in max length, or one character if none passed in or less
6541      * than one character.  And assume will match just one character.  This is
6542      * overwritten later if matched more. */
6543     if (lenp) {
6544         maxlen = (*lenp > c_len) ? *lenp : c_len;
6545         *lenp = c_len;
6546
6547     }
6548     else {
6549         maxlen = c_len;
6550     }
6551
6552     /* If this character is potentially in the bitmap, check it */
6553     if (c < 256) {
6554         if (ANYOF_BITMAP_TEST(n, c))
6555             match = TRUE;
6556         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6557                 && ! utf8_target
6558                 && ! isASCII(c))
6559         {
6560             match = TRUE;
6561         }
6562
6563         else if (flags & ANYOF_LOCALE) {
6564             PL_reg_flags |= RF_tainted;
6565
6566             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6567                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6568             {
6569                 match = TRUE;
6570             }
6571             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6572                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6573                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6574                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6575                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6576                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6577                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6578                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6579                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6580                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6581                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6582                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6583                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6584                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6585                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6586                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6587                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6588                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6589                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6590                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6591                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6592                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6593                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6594                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6595                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6596                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6597                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6598                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6599                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6600                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6601                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6602                      ) /* How's that for a conditional? */
6603             ) {
6604                 match = TRUE;
6605             }
6606         }
6607     }
6608
6609     /* If the bitmap didn't (or couldn't) match, and something outside the
6610      * bitmap could match, try that.  Locale nodes specifiy completely the
6611      * behavior of code points in the bit map (otherwise, a utf8 target would
6612      * cause them to be treated as Unicode and not locale), except in
6613      * the very unlikely event when this node is a synthetic start class, which
6614      * could be a combination of locale and non-locale nodes.  So allow locale
6615      * to match for the synthetic start class, which will give a false
6616      * positive that will be resolved when the match is done again as not part
6617      * of the synthetic start class */
6618     if (!match) {
6619         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6620             match = TRUE;       /* Everything above 255 matches */
6621         }
6622         else if (ANYOF_NONBITMAP(n)
6623                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6624                      || (utf8_target
6625                          && (c >=256
6626                              || (! (flags & ANYOF_LOCALE))
6627                              || (flags & ANYOF_IS_SYNTHETIC)))))
6628         {
6629             AV *av;
6630             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6631
6632             if (sw) {
6633                 U8 * utf8_p;
6634                 if (utf8_target) {
6635                     utf8_p = (U8 *) p;
6636                 } else {
6637
6638                     /* Not utf8.  Convert as much of the string as available up
6639                      * to the limit of how far the (single) character in the
6640                      * pattern can possibly match (no need to go further).  If
6641                      * the node is a straight ANYOF or not folding, it can't
6642                      * match more than one.  Otherwise, It can match up to how
6643                      * far a single char can fold to.  Since not utf8, each
6644                      * character is a single byte, so the max it can be in
6645                      * bytes is the same as the max it can be in characters */
6646                     STRLEN len = (OP(n) == ANYOF
6647                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6648                                   ? 1
6649                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6650                                     ? maxlen
6651                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6652                     utf8_p = bytes_to_utf8(p, &len);
6653                 }
6654
6655                 if (swash_fetch(sw, utf8_p, TRUE))
6656                     match = TRUE;
6657                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6658
6659                     /* Here, we need to test if the fold of the target string
6660                      * matches.  The non-multi char folds have all been moved to
6661                      * the compilation phase, and the multi-char folds have
6662                      * been stored by regcomp into 'av'; we linearly check to
6663                      * see if any match the target string (folded).   We know
6664                      * that the originals were each one character, but we don't
6665                      * currently know how many characters/bytes each folded to,
6666                      * except we do know that there are small limits imposed by
6667                      * Unicode.  XXX A performance enhancement would be to have
6668                      * regcomp.c store the max number of chars/bytes that are
6669                      * in an av entry, as, say the 0th element.  Even better
6670                      * would be to have a hash of the few characters that can
6671                      * start a multi-char fold to the max number of chars of
6672                      * those folds.
6673                      *
6674                      * If there is a match, we will need to advance (if lenp is
6675                      * specified) the match pointer in the target string.  But
6676                      * what we are comparing here isn't that string directly,
6677                      * but its fold, whose length may differ from the original.
6678                      * As we go along in constructing the fold, therefore, we
6679                      * create a map so that we know how many bytes in the
6680                      * source to advance given that we have matched a certain
6681                      * number of bytes in the fold.  This map is stored in
6682                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6683                      * the fold of the first character that we are folding.
6684                      * Then map_fold_len_back[n] is set to the number of bytes
6685                      * in that first character.  Similarly let m be the
6686                      * corresponding number for the second character to be
6687                      * folded.  Then map_fold_len_back[n+m] is set to the
6688                      * number of bytes occupied by the first two source
6689                      * characters. ... */
6690                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6691                     U8 folded[UTF8_MAXBYTES_CASE+1];
6692                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6693                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6694                                                   chars */
6695
6696                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6697
6698                         /* Here, only need to fold the first char of the target
6699                          * string.  It the source wasn't utf8, is 1 byte long */
6700                         to_utf8_fold(utf8_p, folded, &foldlen);
6701                         total_foldlen = foldlen;
6702                         map_fold_len_back[foldlen] = (utf8_target)
6703                                                      ? UTF8SKIP(utf8_p)
6704                                                      : 1;
6705                     }
6706                     else {
6707
6708                         /* Here, need to fold more than the first char.  Do so
6709                          * up to the limits */
6710                         U8* source_ptr = utf8_p;    /* The source for the fold
6711                                                        is the regex target
6712                                                        string */
6713                         U8* folded_ptr = folded;
6714                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6715                                                        available byte in the
6716                                                        target string */
6717                         U8 i;
6718                         for (i = 0;
6719                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6720                              i++)
6721                         {
6722
6723                             /* Fold the next character */
6724                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6725                             STRLEN this_char_foldlen;
6726                             to_utf8_fold(source_ptr,
6727                                          this_char_folded,
6728                                          &this_char_foldlen);
6729
6730                             /* Bail if it would exceed the byte limit for
6731                              * folding a single char. */
6732                             if (this_char_foldlen + folded_ptr - folded >
6733                                                             UTF8_MAXBYTES_CASE)
6734                             {
6735                                 break;
6736                             }
6737
6738                             /* Add the fold of this character */
6739                             Copy(this_char_folded,
6740                                  folded_ptr,
6741                                  this_char_foldlen,
6742                                  U8);
6743                             source_ptr += UTF8SKIP(source_ptr);
6744                             folded_ptr += this_char_foldlen;
6745                             total_foldlen = folded_ptr - folded;
6746
6747                             /* Create map from the number of bytes in the fold
6748                              * back to the number of bytes in the source.  If
6749                              * the source isn't utf8, the byte count is just
6750                              * the number of characters so far */
6751                             map_fold_len_back[total_foldlen]
6752                                                       = (utf8_target)
6753                                                         ? source_ptr - utf8_p
6754                                                         : i + 1;
6755                         }
6756                         *folded_ptr = '\0';
6757                     }
6758
6759
6760                     /* Do the linear search to see if the fold is in the list
6761                      * of multi-char folds. */
6762                     if (av) {
6763                         I32 i;
6764                         for (i = 0; i <= av_len(av); i++) {
6765                             SV* const sv = *av_fetch(av, i, FALSE);
6766                             STRLEN len;
6767                             const char * const s = SvPV_const(sv, len);
6768
6769                             if (len <= total_foldlen
6770                                 && memEQ(s, (char*)folded, len)
6771
6772                                    /* If 0, means matched a partial char. See
6773                                     * [perl #90536] */
6774                                 && map_fold_len_back[len])
6775                             {
6776
6777                                 /* Advance the target string ptr to account for
6778                                  * this fold, but have to translate from the
6779                                  * folded length to the corresponding source
6780                                  * length. */
6781                                 if (lenp) {
6782                                     *lenp = map_fold_len_back[len];
6783                                 }
6784                                 match = TRUE;
6785                                 break;
6786                             }
6787                         }
6788                     }
6789                 }
6790
6791                 /* If we allocated a string above, free it */
6792                 if (! utf8_target) Safefree(utf8_p);
6793             }
6794         }
6795     }
6796
6797     return (flags & ANYOF_INVERT) ? !match : match;
6798 }
6799
6800 STATIC U8 *
6801 S_reghop3(U8 *s, I32 off, const U8* lim)
6802 {
6803     dVAR;
6804
6805     PERL_ARGS_ASSERT_REGHOP3;
6806
6807     if (off >= 0) {
6808         while (off-- && s < lim) {
6809             /* XXX could check well-formedness here */
6810             s += UTF8SKIP(s);
6811         }
6812     }
6813     else {
6814         while (off++ && s > lim) {
6815             s--;
6816             if (UTF8_IS_CONTINUED(*s)) {
6817                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6818                     s--;
6819             }
6820             /* XXX could check well-formedness here */
6821         }
6822     }
6823     return s;
6824 }
6825
6826 #ifdef XXX_dmq
6827 /* there are a bunch of places where we use two reghop3's that should
6828    be replaced with this routine. but since thats not done yet
6829    we ifdef it out - dmq
6830 */
6831 STATIC U8 *
6832 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6833 {
6834     dVAR;
6835
6836     PERL_ARGS_ASSERT_REGHOP4;
6837
6838     if (off >= 0) {
6839         while (off-- && s < rlim) {
6840             /* XXX could check well-formedness here */
6841             s += UTF8SKIP(s);
6842         }
6843     }
6844     else {
6845         while (off++ && s > llim) {
6846             s--;
6847             if (UTF8_IS_CONTINUED(*s)) {
6848                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6849                     s--;
6850             }
6851             /* XXX could check well-formedness here */
6852         }
6853     }
6854     return s;
6855 }
6856 #endif
6857
6858 STATIC U8 *
6859 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6860 {
6861     dVAR;
6862
6863     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6864
6865     if (off >= 0) {
6866         while (off-- && s < lim) {
6867             /* XXX could check well-formedness here */
6868             s += UTF8SKIP(s);
6869         }
6870         if (off >= 0)
6871             return NULL;
6872     }
6873     else {
6874         while (off++ && s > lim) {
6875             s--;
6876             if (UTF8_IS_CONTINUED(*s)) {
6877                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6878                     s--;
6879             }
6880             /* XXX could check well-formedness here */
6881         }
6882         if (off <= 0)
6883             return NULL;
6884     }
6885     return s;
6886 }
6887
6888 static void
6889 restore_pos(pTHX_ void *arg)
6890 {
6891     dVAR;
6892     regexp * const rex = (regexp *)arg;
6893     if (PL_reg_eval_set) {
6894         if (PL_reg_oldsaved) {
6895             rex->subbeg = PL_reg_oldsaved;
6896             rex->sublen = PL_reg_oldsavedlen;
6897 #ifdef PERL_OLD_COPY_ON_WRITE
6898             rex->saved_copy = PL_nrs;
6899 #endif
6900             RXp_MATCH_COPIED_on(rex);
6901         }
6902         PL_reg_magic->mg_len = PL_reg_oldpos;
6903         PL_reg_eval_set = 0;
6904         PL_curpm = PL_reg_oldcurpm;
6905     }
6906 }
6907
6908 STATIC void
6909 S_to_utf8_substr(pTHX_ register regexp *prog)
6910 {
6911     int i = 1;
6912
6913     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6914
6915     do {
6916         if (prog->substrs->data[i].substr
6917             && !prog->substrs->data[i].utf8_substr) {
6918             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6919             prog->substrs->data[i].utf8_substr = sv;
6920             sv_utf8_upgrade(sv);
6921             if (SvVALID(prog->substrs->data[i].substr)) {
6922                 if (SvTAIL(prog->substrs->data[i].substr)) {
6923                     /* Trim the trailing \n that fbm_compile added last
6924                        time.  */
6925                     SvCUR_set(sv, SvCUR(sv) - 1);
6926                     /* Whilst this makes the SV technically "invalid" (as its
6927                        buffer is no longer followed by "\0") when fbm_compile()
6928                        adds the "\n" back, a "\0" is restored.  */
6929                     fbm_compile(sv, FBMcf_TAIL);
6930                 } else
6931                     fbm_compile(sv, 0);
6932             }
6933             if (prog->substrs->data[i].substr == prog->check_substr)
6934                 prog->check_utf8 = sv;
6935         }
6936     } while (i--);
6937 }
6938
6939 STATIC void
6940 S_to_byte_substr(pTHX_ register regexp *prog)
6941 {
6942     dVAR;
6943     int i = 1;
6944
6945     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6946
6947     do {
6948         if (prog->substrs->data[i].utf8_substr
6949             && !prog->substrs->data[i].substr) {
6950             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6951             if (sv_utf8_downgrade(sv, TRUE)) {
6952                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6953                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
6954                         /* Trim the trailing \n that fbm_compile added last
6955                            time.  */
6956                         SvCUR_set(sv, SvCUR(sv) - 1);
6957                         fbm_compile(sv, FBMcf_TAIL);
6958                     } else
6959                         fbm_compile(sv, 0);
6960                 }
6961             } else {
6962                 SvREFCNT_dec(sv);
6963                 sv = &PL_sv_undef;
6964             }
6965             prog->substrs->data[i].substr = sv;
6966             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6967                 prog->check_substr = sv;
6968         }
6969     } while (i--);
6970 }
6971
6972 /*
6973  * Local variables:
6974  * c-indentation-style: bsd
6975  * c-basic-offset: 4
6976  * indent-tabs-mode: t
6977  * End:
6978  *
6979  * ex: set ts=8 sts=4 sw=4 noet:
6980  */