src/5014003/orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * be confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1       /* tainted information used? e.g. locale */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
     /* The RF_* values above are single-bit masks.  In this file RF_tainted and
      * RF_utf8 are OR-ed into PL_reg_flags; RF_warned presumably shares that
      * word, but its use is not visible here -- confirm before relying on it. */
  87
     /* Non-zero when the RF_utf8 bit is currently set in PL_reg_flags. */
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
     /* RS_* values.  NOTE(review): the variable that holds them is not visible in
      * this part of the file; the trailing comments are the only description. */
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
     /* Provide a default for STATIC when nothing earlier has defined it. */
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  98  * call if there are no complications: i.e., if everything matchable is
  99  * straightforward in the bitmap.
      *
      * When ANYOF_FLAGS(p) is zero, the value *(c) is looked up directly with
      * ANYOF_BITMAP_TEST(); otherwise reginclass() is called with 0 for both of
      * its trailing arguments. */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : (a) - (b))
     /* CHR_SVLEN(sv): sv_len_utf8(sv) when utf8_target is true, else SvCUR(sv).
      *      utf8_target is not a parameter; it must be in scope at the point of
      *      use.
      * CHR_DIST(a,b): utf8_distance(a,b) when PL_reg_match_utf8 is true, else
      *      the plain pointer difference (a) - (b).
      *
      * The pos/off/a/b arguments of the macros in this group are parenthesized
      * in the expansions, and each expansion is parenthesized as a whole, so
      * that an expression argument (or a use inside a larger expression) cannot
      * regroup with neighbouring operators.  Arguments may still be evaluated
      * more than once: do not pass expressions with side effects. */
 109
     /* HOPc(pos,off): pos moved by off, as a char*.  When PL_reg_match_utf8 is
      *      true the move is delegated to reghop3(), limited by PL_regeol for
      *      off >= 0 and by PL_bostr otherwise; when false it is plain pointer
      *      arithmetic with no bounds check. */
 110 #define HOPc(pos,off) \
 111         ((char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)(pos), (off), (U8*)((off) >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)((pos) + (off))))
     /* HOPBACKc(pos,off): pos moved backwards by off, as a char*.  When
      *      PL_reg_match_utf8 is true this is reghopmaybe3(pos, -off, PL_bostr);
      *      otherwise it is pos - off, or NULL if that would lie before
      *      PL_bostr. */
 114 #define HOPBACKc(pos, off) \
 115         ((char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)(pos), -(off), (U8*)PL_bostr) \
 117             : ((pos) - (off) >= PL_bostr)       \
 118                 ? (U8*)(pos) - (off)            \
 119                 : NULL))
 120
     /* HOP3(pos,off,lim): like HOPc but with an explicit limit handed to
      *      reghop3() and a U8* result; lim is unused when PL_reg_match_utf8 is
      *      false.
      * HOP3c: the same, cast to char*. */
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), (off), (U8*)(lim)) : (U8*)((pos) + (off)))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined
      *
      * LOAD_UTF8_CHARCLASS(class,str): if PL_utf8_<class> is still false/NULL,
      * call is_utf8_<class>() on str between ENTER/LEAVE with the regex context
      * saved, and assert that the call returned true.  Per the comments on
      * _CCC_TRY_CODE the point of the call is its side effect of loading the
      * swash for the class.  str should therefore be a string that is known to
      * match the class. */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 127
 128 /* Doesn't do an assert to verify that is correct.  Same as
      * LOAD_UTF8_CHARCLASS except that is_utf8_<class>() is always called on the
      * fixed string " " and its result is discarded. */
 129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 130     if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 131
     /* Convenience wrappers: each pairs a class with one sample string that is
      * expected to be a member of it. */
 132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 135
     /* NOTE: unlike the macros above, this one expands to a sequence of several
      * statements that is not wrapped in STMT_START/STMT_END, and the last one
      * has no trailing semicolon.  Use it only where a statement list is valid
      * (e.g. inside braces), never as the unbraced body of an if/else. */
 136 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 137         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 138         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 139         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 140             * assert should likely and hopefully fail on an EBCDIC machine */ \
 141         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 142                                                                             \
 143         /* No asserts are done for these, in case called on an early        \
 144             * Unicode version in which they map to nothing */               \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 152
 153 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
     /* PLACEHOLDER expands to nothing; it is passed as POS_OR_NEG below where an
      * empty macro argument is wanted. */
 154
 155 /* The actual code for CCC_TRY, which uses several variables from the routine
 156  * it's callable from.  It is designed to be the bulk of a case statement.
      * The variables it relies on at the expansion site are locinput, nextchr,
      * utf8_target and PL_regeol, plus the sayNO macro; it ends in "break".
 157  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 158  *      nextchr matches the class.
 159  * UTF8_TEST is the whole test string to use for utf8 targets
 160  * CLASS is the class-name suffix handed to LOAD_UTF8_CHARCLASS, which uses
 161  *      it to test for, and if not present to load in, the swash for the class
      * STR is the sample string handed to LOAD_UTF8_CHARCLASS along with CLASS
 162  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 163  *      UTF8_TEST test.
 164  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 165  * utf8 and a variant, load the swash if necessary and test using the utf8
 166  * test.  Advance to the next character if test is ok, otherwise fail; If not
 167  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 168  * fails, or advance to the next character */
 169
 170 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 171     if (locinput >= PL_regeol) {                                              \
 172         sayNO;                                                                \
 173     }                                                                         \
 174     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 175         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 176         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 177             sayNO;                                                            \
 178         }                                                                     \
 179         locinput += PL_utf8skip[nextchr];                                     \
 180         nextchr = UCHARAT(locinput);                                          \
 181         break;                                                                \
 182     }                                                                         \
 183     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 184         sayNO;                                                                \
 185     }                                                                         \
 186     nextchr = UCHARAT(++locinput);                                            \
 187     break;
 188
 189 /* Handle the non-locale cases for a character class and its complement.  It
 190  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 191  * This is because that code fails when the test succeeds, so we want to have
 192  * the test fail so that the code succeeds.  The swash is stored in a
 193  * predictable PL_ place.
      *
      * NAME is the case label that matches a member of the class (expanded with
      * "!"); NNAME is the label for the complement (expanded with PLACEHOLDER,
      * i.e. no negation).  Both use FUNC for the non-utf8 test and
      * swash_fetch(PL_utf8_CLASS, locinput, TRUE) for the utf8 test.  CLASS and
      * STR are forwarded unchanged to _CCC_TRY_CODE.  The expansion is two
      * complete "case ...: ... break;" arms. */
 194 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 195                            CLASS, STR)                                        \
 196     case NAME:                                                                \
 197         _CCC_TRY_CODE( !, FUNC,                                               \
 198                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 199                                             (U8*)locinput, TRUE)),            \
 200                           CLASS, STR)                                         \
 201     case NNAME:                                                               \
 202         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 203                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 204                                             (U8*)locinput, TRUE)),            \
 205                           CLASS, STR)                                         \
 206
 207 /* Generate the case statements for both locale and non-locale character
 208  * classes in regmatch for classes that don't have special unicode semantics.
 209  * Locales don't use an immediate swash, but an intermediary special locale
 210  * function that is called on the pointer to the current place in the input
 211  * string.  That function will resolve to needing the same swash.  One might
 212  * think that because we don't know what the locale will match, we shouldn't
 213  * check with the swash loading function that it loaded properly; ie, that we
 214  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 215  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 216  * irrelevant here.
      *
      * Parameters, as used by the expansion below:
      * NAME, NNAME, FUNC      non-locale match / complement labels and their
      *                        non-utf8 test; passed on to _CCC_TRY_NONLOCALE.
      * NAMEL, NNAMEL          locale match / complement labels.  Both arms set
      *                        RF_tainted in PL_reg_flags before testing.
      * LCFUNC, LCFUNC_utf8    locale tests: LCFUNC(nextchr) for the non-utf8
      *                        path, LCFUNC_utf8((U8*)locinput) for the utf8 one.
      * NAMEA, NNAMEA, FUNCA   labels whose test is FUNCA(nextchr) alone, with no
      *                        swash involved (presumably the ASCII-restricted
      *                        variants -- confirm against the opcode table).
      *                        The NNAMEA arm still advances by a whole utf8
      *                        character when utf8_target is true.
      * CLASS, STR             forwarded to _CCC_TRY_CODE / LOAD_UTF8_CHARCLASS. */
 217 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 218                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 219                 NAMEA, NNAMEA, FUNCA,                                         \
 220                 CLASS, STR)                                                   \
 221     case NAMEL:                                                               \
 222         PL_reg_flags |= RF_tainted;                                           \
 223         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 224     case NNAMEL:                                                              \
 225         PL_reg_flags |= RF_tainted;                                           \
 226         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 227                        CLASS, STR)                                            \
 228     case NAMEA:                                                               \
 229         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 230             sayNO;                                                            \
 231         }                                                                     \
 232         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 233         nextchr = UCHARAT(++locinput);                                        \
 234         break;                                                                \
 235     case NNAMEA:                                                              \
 236         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 237             sayNO;                                                            \
 238         }                                                                     \
 239         if (utf8_target) {                                                    \
 240             locinput += PL_utf8skip[nextchr];                                 \
 241             nextchr = UCHARAT(locinput);                                      \
 242         }                                                                     \
 243         else {                                                                \
 244             nextchr = UCHARAT(++locinput);                                    \
 245         }                                                                     \
 246         break;                                                                \
 247     /* Generate the non-locale cases */                                       \
 248     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 249
 250 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 251  * statements to handle separate Unicode semantics nodes.
      *
      * It expands to everything CCC_TRY generates for the shared parameters,
      * followed by one more _CCC_TRY_NONLOCALE pair for the labels NAMEU/NNAMEU
      * whose non-utf8 test is FUNCU.  The utf8 test and CLASS/STR are the same
      * as for NAME/NNAME. */
 252 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 253                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 254                   NAMEU, NNAMEU, FUNCU,                                        \
 255                   NAMEA, NNAMEA, FUNCA,                                        \
 256                   CLASS, STR)                                                  \
 257     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 258             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 259             NAMEA, NNAMEA, FUNCA,                                              \
 260             CLASS, STR)                                                        \
 261     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 262
 263 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 264
 265 /* for use after a quantifier and before an EXACT-like node -- japhy */
 266 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 267  *
 268  * NOTE that *nothing* that affects backtracking should be in here, specifically
 269  * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
 270  * node that is in between two EXACT like nodes when ascertaining what the required
 271  * "follow" character is. This should probably be moved to regex compile time
 272  * although it may be done at run time because of the REF possibility - more
 273  * investigation required. -- demerphq
      *
      * JUMPABLE(rn) evaluates OP(rn) repeatedly and reads the variable cur_eval
      * from the expansion site: a CLOSE node only counts as jumpable when there
      * is no current eval frame or the frame's close_paren differs from ARG(rn).
      * A CURLY-kind node only counts when ARG1(rn) > 0.
 274 */
 275 #define JUMPABLE(rn) (      \
 276     OP(rn) == OPEN ||       \
 277     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 278     OP(rn) == EVAL ||   \
 279     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 280     OP(rn) == PLUS || OP(rn) == MINMOD || \
 281     OP(rn) == KEEPS || \
 282     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 283 )
     /* True when the node's kind, per PL_regkind[], is EXACT. */
 284 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 285
     /* True when the node's kind is EXACT or REF. */
 286 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 287
 288 #if 0
 289 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 290    we don't need this definition.  Note that this disabled variant folds
     the EXACTFU/EXACTFA test into IS_TEXTF and defines no IS_TEXTFU, so it
     is not a drop-in replacement for the live definitions below. */
 291 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 292 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 293 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 294
 295 #else
 296 /* ... so we use this as it's faster.  Each macro is a direct opcode
     comparison on OP(rn); the REF-family opcodes are deliberately not
     covered here. */
 297 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 298 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 299 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 300 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 301
 302 #endif
 303
 304 /*
 305   Search for mandatory following text node; for lookahead, the text must
 306   follow but for lookbehind (rn->flags != 0) we skip to the next step.
     
     FIND_NEXT_IMPT(rn) modifies rn in place (rn must be an lvalue): while the
     node is JUMPABLE it is stepped forward according to its type --
       SUSPEND or any CURLY-kind node:  NEXTOPER(NEXTOPER(rn))
       PLUS:                            NEXTOPER(rn)
       IFMATCH with flags == 0:         NEXTOPER(NEXTOPER(rn))
       IFMATCH with flags != 0:         rn + ARG(rn)
       anything else:                   rn + NEXT_OFF(rn)
     The loop stops at the first node for which JUMPABLE is false.
 307 */
 308 #define FIND_NEXT_IMPT(rn) STMT_START { \
 309     while (JUMPABLE(rn)) { \
 310         const OPCODE type = OP(rn); \
 311         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 312             rn = NEXTOPER(NEXTOPER(rn)); \
 313         else if (type == PLUS) \
 314             rn = NEXTOPER(rn); \
 315         else if (type == IFMATCH) \
 316             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 317         else rn += NEXT_OFF(rn); \
 318     } \
 319 } STMT_END
 320
 321
 322 static void restore_pos(pTHX_ void *arg);
     /* Forward declaration; the definition is further down the file. */
 323
     /* Savestack element counts used by S_regcppush()/S_regcppop():
      *   REGCP_PAREN_ELEMS  items pushed per capture group (end offset, start
      *                      offset, start_tmp pointer, group number)
      *   REGCP_OTHER_ELEMS  items pushed once per checkpoint (PL_regoffs,
      *                      PL_regsize, *PL_reglastparen,
      *                      *PL_reglastcloseparen, PL_reginput)
      *   REGCP_FRAME_ELEMS  the single trailing SAVEt_REGCONTEXT cookie
      * The counts must be kept in step with the pushes in S_regcppush(). */
 324 #define REGCP_PAREN_ELEMS 4
 325 #define REGCP_OTHER_ELEMS 5
 326 #define REGCP_FRAME_ELEMS 1
 327 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 328  * are needed for the regexp context stack bookkeeping. */
 329
 330 STATIC CHECKPOINT
 331 S_regcppush(pTHX_ I32 parenfloor)
     /* Push the current capture-group state onto the savestack so that
      * S_regcppop() can restore it later.
      *
      * parenfloor: groups numbered parenfloor+1 .. PL_regsize have their
      *             offsets saved; groups at or below parenfloor are not pushed.
      *
      * Returns the value PL_savestack_ix had on entry.
      * Croaks if the element count is negative or does not survive being
      * shifted left by SAVE_TIGHT_SHIFT and back again. */
 332 {
 333     dVAR;
 334     const int retval = PL_savestack_ix;
 335     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 336     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
         /* The count is stored in the bits above SAVE_TIGHT_SHIFT of the cookie. */
 337     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 338     int p;
 339     GET_RE_DEBUG_FLAGS_DECL;
 340
 341     if (paren_elems_to_push < 0)
 342         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 343
         /* Round-trip check: bits lost by the shift mean the count would not fit
          * in the cookie. */
 344     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 345         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 346                    " out of range (%lu-%ld)",
 347                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 348
         /* Make room for everything pushed below, cookie included. */
 349     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 350
         /* Highest group first; S_regcppop() pops each group's four items in the
          * reverse of the order pushed here. */
 351     for (p = PL_regsize; p > parenfloor; p--) {
 352 /* REGCP_PAREN_ELEMS are pushed per pairs of parentheses. */
 353         SSPUSHINT(PL_regoffs[p].end);
 354         SSPUSHINT(PL_regoffs[p].start);
 355         SSPUSHPTR(PL_reg_start_tmp[p]);
 356         SSPUSHINT(p);
 357         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 358           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 359                       (UV)p, (IV)PL_regoffs[p].start,
 360                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 361                       (IV)PL_regoffs[p].end
 362         ));
 363     }
 364 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 365     SSPUSHPTR(PL_regoffs);
 366     SSPUSHINT(PL_regsize);
 367     SSPUSHINT(*PL_reglastparen);
 368     SSPUSHINT(*PL_reglastcloseparen);
 369     SSPUSHPTR(PL_reginput);
         /* Pushed last so that it is the first thing S_regcppop() sees. */
 370     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 371
 372     return retval;
 373 }
 374
 375 /* These are needed since we do not localize EVAL nodes:
      *
      * REGCP_SET(cp) records the current PL_savestack_ix in cp (an lvalue).
      * REGCP_UNWIND(cp) calls regcpblow(cp) to leave the save scope back to cp.
      * Each is preceded by a DEBUG_STATE_r trace; REGCP_UNWIND only prints when
      * cp differs from the current PL_savestack_ix.
      *
      * Both expand to two statements without a STMT_START/STMT_END wrapper and
      * without a trailing semicolon, so they must not be used as the unbraced
      * body of an if/else. */
 376 #define REGCP_SET(cp)                                           \
 377     DEBUG_STATE_r(                                              \
 378             PerlIO_printf(Perl_debug_log,                       \
 379                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 380                 (IV)PL_savestack_ix));                          \
 381     cp = PL_savestack_ix
 382
 383 #define REGCP_UNWIND(cp)                                        \
 384     DEBUG_STATE_r(                                              \
 385         if (cp != PL_savestack_ix)                              \
 386             PerlIO_printf(Perl_debug_log,                       \
 387                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 388                 (IV)(cp), (IV)PL_savestack_ix));                \
 389     regcpblow(cp)
 390
 391 STATIC char *
 392 S_regcppop(pTHX_ const regexp *rex)
     /* Undo one S_regcppush(): pop the saved capture-group state off the
      * savestack, in the reverse of the order in which it was pushed.
      *
      * rex: the regexp being executed; the only field read here is
      *      rex->nparens.
      *
      * Returns the input pointer that was saved by the matching push
      * (PL_reginput at that time). */
 393 {
 394     dVAR;
 395     UV i;
 396     char *input;
 397     GET_RE_DEBUG_FLAGS_DECL;
 398
 399     PERL_ARGS_ASSERT_REGCPPOP;
 400
 401     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 402     i = SSPOPUV;
 403     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 404     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 405     input = (char *) SSPOPPTR;
 406     *PL_reglastcloseparen = SSPOPINT;
 407     *PL_reglastparen = SSPOPINT;
 408     PL_regsize = SSPOPINT;
 409     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 410
         /* What is left in i is the number of per-group elements. */
 411     i -= REGCP_OTHER_ELEMS;
 412     /* Now restore the parentheses context. */
 413     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 414         I32 tmps;
 415         U32 paren = (U32)SSPOPINT;
 416         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 417         PL_regoffs[paren].start = SSPOPINT;
 418         tmps = SSPOPINT;
             /* The saved end offset is only reinstated for groups at or below
              * the *PL_reglastparen value restored above. */
 419         if (paren <= *PL_reglastparen)
 420             PL_regoffs[paren].end = tmps;
 421         DEBUG_BUFFERS_r(
 422             PerlIO_printf(Perl_debug_log,
 423                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 424                           (UV)paren, (IV)PL_regoffs[paren].start,
 425                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 426                           (IV)PL_regoffs[paren].end,
 427                           (paren > *PL_reglastparen ? "(no)" : ""));
 428         );
 429     }
 430     DEBUG_BUFFERS_r(
 431         if (*PL_reglastparen + 1 <= rex->nparens) {
 432             PerlIO_printf(Perl_debug_log,
 433                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 434                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 435         }
 436     );
 437 #if 1
 438     /* It would seem that the similar code in regtry()
 439      * already takes care of this, and in fact it is in
 440      * a better location too, since this code can be #if 0-ed out
 441      * but the code in regtry() is needed or otherwise tests
 442      * requiring null fields (pat.t#187 and split.t#{13,14}
 443      * (as of patchlevel 7877)  will fail.  Then again,
 444      * this code seems to be necessary or otherwise
 445      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 446      * --jhi updated by dapm */
         /* Mark every group above *PL_reglastparen as unset: the end offset is
          * always reset to -1, the start offset only for groups beyond
          * PL_regsize. */
 447     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 448         if (i > PL_regsize)
 449             PL_regoffs[i].start = -1;
 450         PL_regoffs[i].end = -1;
 451     }
 452 #endif
 453     return input;
 454 }
 455
 456 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Leaves the save scope back to cp via LEAVE_SCOPE only (no regcppop() call): ignores regcppush()ed data. */
 457
 458 /*
 459  * pregexec and friends
 460  */
 461
 462 #ifndef PERL_IN_XSUB_RE
 463 /*
 464  - pregexec - match a regexp against a string
      -
      - Thin wrapper: every argument is forwarded unchanged to regexec_flags(),
      - with NULL as its seventh argument and a flags word that is
      - REXEC_COPY_STR when nosave is zero and 0 otherwise.  The return value
      - is whatever regexec_flags() returns.  Only compiled when
      - PERL_IN_XSUB_RE is not defined.
 465  */
 466 I32
 467 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 468          char *strbeg, I32 minend, SV *screamer, U32 nosave)
     /* prog: the compiled regexp to run */
     /* stringarg: passed through as the second argument of regexec_flags();
        minend below is documented relative to it */
 469 /* strend: pointer to null at end of string */
 470 /* strbeg: real beginning of string */
 471 /* minend: end of match must be >=minend after stringarg. */
     /* screamer: passed through unchanged to regexec_flags() */
 472 /* nosave: For optimizations.  Non-zero suppresses REXEC_COPY_STR. */
 473 {
 474     PERL_ARGS_ASSERT_PREGEXEC;
 475
 476     return
 477         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 478                       nosave ? 0 : REXEC_COPY_STR);
 479 }
 480 #endif
 481
 482 /*
 483  * Need to implement the following flags for reg_anch:
 484  *
 485  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 486  * USE_INTUIT_ML
 487  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 488  * INTUIT_AUTORITATIVE_ML
 489  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 490  * INTUIT_ONCE_ML
 491  *
 492  * Another flag for this function: SECOND_TIME (so that float substrs
 493  * with giant delta may be not rechecked).
 494  */
 495
 496 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 497
 498 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 499    Otherwise, only SvCUR(sv) is used to get strbeg. */
 500
 501 /* XXXX We assume that strpos is strbeg unless sv. */
 502
 503 /* XXXX Some places assume that there is a fixed substring.
 504         An update may be needed if optimizer marks as "INTUITable"
 505         RExen without fixed substrings.  Similarly, it is assumed that
 506         lengths of all the strings are no more than minlen, thus they
 507         cannot come from lookahead.
 508         (Or minlen should take into account lookahead.)
 509   NOTE: Some of this comment is not correct. minlen does now take account
 510   of lookahead/behind. Further research is required. -- demerphq
 511
 512 */
 513
 514 /* A failure to find a constant substring means that there is no need to make
 515    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 516    finding a substring too deep into the string means that less calls to
 517    regtry() should be needed.
 518
 519    REx compiler's optimizer found 4 possible hints:
 520         a) Anchored substring;
 521         b) Fixed substring;
 522         c) Whether we are anchored (beginning-of-line or \G);
 523         d) First node (of those at offset 0) which may distinguish positions;
 524    We use a)b)d) and multiline-part of c), and try to find a position in the
 525    string which does not contradict any of them.
 526  */
 527
 528 /* Most of the decisions we make here should have been made at compile time.
 529    The nodes of the REx which we used for the search should have been
 530    deleted from the finite automaton. */
 531
 532 char *
 533 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 534                      char *strend, const U32 flags, re_scream_pos_data *data)
 535 {
 536     dVAR;
 537     struct regexp *const prog = (struct regexp *)SvANY(rx);
 538     register I32 start_shift = 0;
 539     /* Should be nonnegative! */
 540     register I32 end_shift   = 0;
 541     register char *s;
 542     register SV *check;
 543     char *strbeg;
 544     char *t;
 545     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 546     I32 ml_anch;
 547     register char *other_last = NULL;   /* other substr checked before this */
 548     char *check_at = NULL;              /* check substr found at this pos */
 549     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 550     RXi_GET_DECL(prog,progi);
 551 #ifdef DEBUGGING
 552     const char * const i_strpos = strpos;
 553 #endif
 554     GET_RE_DEBUG_FLAGS_DECL;
 555
 556     PERL_ARGS_ASSERT_RE_INTUIT_START;
 557
 558     RX_MATCH_UTF8_set(rx,utf8_target);
 559
 560     if (RX_UTF8(rx)) {
 561         PL_reg_flags |= RF_utf8;
 562     }
 563     DEBUG_EXECUTE_r(
 564         debug_start_match(rx, utf8_target, strpos, strend,
 565             sv ? "Guessing start of match in sv for"
 566                : "Guessing start of match in string for");
 567               );
 568
 569     /* CHR_DIST() would be more correct here but it makes things slow. */
 570     if (prog->minlen > strend - strpos) {
 571         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 572                               "String too short... [re_intuit_start]\n"));
 573         goto fail;
 574     }
 575
 576     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 577     PL_regeol = strend;
 578     if (utf8_target) {
 579         if (!prog->check_utf8 && prog->check_substr)
 580             to_utf8_substr(prog);
 581         check = prog->check_utf8;
 582     } else {
 583         if (!prog->check_substr && prog->check_utf8)
 584             to_byte_substr(prog);
 585         check = prog->check_substr;
 586     }
 587     if (check == &PL_sv_undef) {
 588         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 589                 "Non-utf8 string cannot match utf8 check string\n"));
 590         goto fail;
 591     }
 592     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 593         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 594                      || ( (prog->extflags & RXf_ANCH_BOL)
 595                           && !multiline ) );    /* Check after \n? */
 596
 597         if (!ml_anch) {
 598           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 599                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 600                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 601                && sv && !SvROK(sv)
 602                && (strpos != strbeg)) {
 603               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 604               goto fail;
 605           }
 606           if (prog->check_offset_min == prog->check_offset_max &&
 607               !(prog->extflags & RXf_CANY_SEEN)) {
 608             /* Substring at constant offset from beg-of-str... */
 609             I32 slen;
 610
 611             s = HOP3c(strpos, prog->check_offset_min, strend);
 612
 613             if (SvTAIL(check)) {
 614                 slen = SvCUR(check);    /* >= 1 */
 615
 616                 if ( strend - s > slen || strend - s < slen - 1
 617                      || (strend - s == slen && strend[-1] != '\n')) {
 618                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 619                     goto fail_finish;
 620                 }
 621                 /* Now should match s[0..slen-2] */
 622                 slen--;
 623                 if (slen && (*SvPVX_const(check) != *s
 624                              || (slen > 1
 625                                  && memNE(SvPVX_const(check), s, slen)))) {
 626                   report_neq:
 627                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 628                     goto fail_finish;
 629                 }
 630             }
 631             else if (*SvPVX_const(check) != *s
 632                      || ((slen = SvCUR(check)) > 1
 633                          && memNE(SvPVX_const(check), s, slen)))
 634                 goto report_neq;
 635             check_at = s;
 636             goto success_at_start;
 637           }
 638         }
 639         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 640         s = strpos;
 641         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 642         end_shift = prog->check_end_shift;
 643
 644         if (!ml_anch) {
 645             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 646                                          - (SvTAIL(check) != 0);
 647             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 648
 649             if (end_shift < eshift)
 650                 end_shift = eshift;
 651         }
 652     }
 653     else {                              /* Can match at random position */
 654         ml_anch = 0;
 655         s = strpos;
 656         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 657         end_shift = prog->check_end_shift;
 658
 659         /* end shift should be non negative here */
 660     }
 661
 662 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 663     if (end_shift < 0)
 664         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 665                    (IV)end_shift, RX_PRECOMP(prog));
 666 #endif
 667
 668   restart:
 669     /* Find a possible match in the region s..strend by looking for
 670        the "check" substring in the region corrected by start/end_shift. */
 671
 672     {
 673         I32 srch_start_shift = start_shift;
 674         I32 srch_end_shift = end_shift;
 675         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 676             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 677             srch_start_shift = strbeg - s;
 678         }
 679     DEBUG_OPTIMISE_MORE_r({
 680         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 681             (IV)prog->check_offset_min,
 682             (IV)srch_start_shift,
 683             (IV)srch_end_shift,
 684             (IV)prog->check_end_shift);
 685     });
 686
 687     if (flags & REXEC_SCREAM) {
 688         I32 p = -1;                     /* Internal iterator of scream. */
 689         I32 * const pp = data ? data->scream_pos : &p;
 690
 691         if (PL_screamfirst[BmRARE(check)] >= 0
 692             || ( BmRARE(check) == '\n'
 693                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 694                  && SvTAIL(check) ))
 695             s = screaminstr(sv, check,
 696                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 697         else
 698             goto fail_finish;
 699         /* we may be pointing at the wrong string */
 700         if (s && RXp_MATCH_COPIED(prog))
 701             s = strbeg + (s - SvPVX_const(sv));
 702         if (data)
 703             *data->scream_olds = s;
 704     }
 705     else {
 706         U8* start_point;
 707         U8* end_point;
 708         if (prog->extflags & RXf_CANY_SEEN) {
 709             start_point= (U8*)(s + srch_start_shift);
 710             end_point= (U8*)(strend - srch_end_shift);
 711         } else {
 712             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 713             end_point= HOP3(strend, -srch_end_shift, strbeg);
 714         }
 715         DEBUG_OPTIMISE_MORE_r({
 716             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 717                 (int)(end_point - start_point),
 718                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 719                 start_point);
 720         });
 721
 722         s = fbm_instr( start_point, end_point,
 723                       check, multiline ? FBMrf_MULTILINE : 0);
 724     }
 725     }
 726     /* Update the count-of-usability, remove useless subpatterns,
 727         unshift s.  */
 728
 729     DEBUG_EXECUTE_r({
 730         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 731             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 732         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 733                           (s ? "Found" : "Did not find"),
 734             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 735                 ? "anchored" : "floating"),
 736             quoted,
 737             RE_SV_TAIL(check),
 738             (s ? " at offset " : "...\n") );
 739     });
 740
 741     if (!s)
 742         goto fail_finish;
 743     /* Finish the diagnostic message */
 744     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 745
 746     /* XXX dmq: first branch is for positive lookbehind...
 747        Our check string is offset from the beginning of the pattern.
 748        So we need to do any stclass tests offset forward from that
 749        point. I think. :-(
 750      */
 751
 752
 753
 754     check_at=s;
 755
 756
 757     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 758        Start with the other substr.
 759        XXXX no SCREAM optimization yet - and a very coarse implementation
 760        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 761                 *always* match.  Probably should be marked during compile...
 762        Probably it is right to do no SCREAM here...
 763      */
 764
 765     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 766                 : (prog->float_substr && prog->anchored_substr))
 767     {
 768         /* Take into account the "other" substring. */
 769         /* XXXX May be hopelessly wrong for UTF... */
 770         if (!other_last)
 771             other_last = strpos;
 772         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 773           do_other_anchored:
 774             {
 775                 char * const last = HOP3c(s, -start_shift, strbeg);
 776                 char *last1, *last2;
 777                 char * const saved_s = s;
 778                 SV* must;
 779
 780                 t = s - prog->check_offset_max;
 781                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 782                     && (!utf8_target
 783                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 784                             && t > strpos)))
 785                     NOOP;
 786                 else
 787                     t = strpos;
 788                 t = HOP3c(t, prog->anchored_offset, strend);
 789                 if (t < other_last)     /* These positions already checked */
 790                     t = other_last;
 791                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 792                 if (last < last1)
 793                     last1 = last;
 794                 /* XXXX It is not documented what units *_offsets are in.
 795                    We assume bytes, but this is clearly wrong.
 796                    Meaning this code needs to be carefully reviewed for errors.
 797                    dmq.
 798                   */
 799
 800                 /* On end-of-str: see comment below. */
 801                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 802                 if (must == &PL_sv_undef) {
 803                     s = (char*)NULL;
 804                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 805                 }
 806                 else
 807                     s = fbm_instr(
 808                         (unsigned char*)t,
 809                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 810                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 811                         must,
 812                         multiline ? FBMrf_MULTILINE : 0
 813                     );
 814                 DEBUG_EXECUTE_r({
 815                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 816                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 817                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 818                         (s ? "Found" : "Contradicts"),
 819                         quoted, RE_SV_TAIL(must));
 820                 });
 821
 822
 823                 if (!s) {
 824                     if (last1 >= last2) {
 825                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 826                                                 ", giving up...\n"));
 827                         goto fail_finish;
 828                     }
 829                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 830                         ", trying floating at offset %ld...\n",
 831                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 832                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 833                     s = HOP3c(last, 1, strend);
 834                     goto restart;
 835                 }
 836                 else {
 837                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 838                           (long)(s - i_strpos)));
 839                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 840                     other_last = HOP3c(s, 1, strend);
 841                     s = saved_s;
 842                     if (t == strpos)
 843                         goto try_at_start;
 844                     goto try_at_offset;
 845                 }
 846             }
 847         }
 848         else {          /* Take into account the floating substring. */
 849             char *last, *last1;
 850             char * const saved_s = s;
 851             SV* must;
 852
 853             t = HOP3c(s, -start_shift, strbeg);
 854             last1 = last =
 855                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 856             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 857                 last = HOP3c(t, prog->float_max_offset, strend);
 858             s = HOP3c(t, prog->float_min_offset, strend);
 859             if (s < other_last)
 860                 s = other_last;
 861  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 862             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 863             /* fbm_instr() takes into account exact value of end-of-str
 864                if the check is SvTAIL(ed).  Since false positives are OK,
 865                and end-of-str is not later than strend we are OK. */
 866             if (must == &PL_sv_undef) {
 867                 s = (char*)NULL;
 868                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 869             }
 870             else
 871                 s = fbm_instr((unsigned char*)s,
 872                               (unsigned char*)last + SvCUR(must)
 873                                   - (SvTAIL(must)!=0),
 874                               must, multiline ? FBMrf_MULTILINE : 0);
 875             DEBUG_EXECUTE_r({
 876                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 877                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 878                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 879                     (s ? "Found" : "Contradicts"),
 880                     quoted, RE_SV_TAIL(must));
 881             });
 882             if (!s) {
 883                 if (last1 == last) {
 884                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 885                                             ", giving up...\n"));
 886                     goto fail_finish;
 887                 }
 888                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 889                     ", trying anchored starting at offset %ld...\n",
 890                     (long)(saved_s + 1 - i_strpos)));
 891                 other_last = last;
 892                 s = HOP3c(t, 1, strend);
 893                 goto restart;
 894             }
 895             else {
 896                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 897                       (long)(s - i_strpos)));
 898                 other_last = s; /* Fix this later. --Hugo */
 899                 s = saved_s;
 900                 if (t == strpos)
 901                     goto try_at_start;
 902                 goto try_at_offset;
 903             }
 904         }
 905     }
 906
 907
 908     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 909
 910     DEBUG_OPTIMISE_MORE_r(
 911         PerlIO_printf(Perl_debug_log,
 912             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 913             (IV)prog->check_offset_min,
 914             (IV)prog->check_offset_max,
 915             (IV)(s-strpos),
 916             (IV)(t-strpos),
 917             (IV)(t-s),
 918             (IV)(strend-strpos)
 919         )
 920     );
 921
 922     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 923         && (!utf8_target
 924             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 925                  && t > strpos)))
 926     {
 927         /* Fixed substring is found far enough so that the match
 928            cannot start at strpos. */
 929       try_at_offset:
 930         if (ml_anch && t[-1] != '\n') {
 931             /* Eventually fbm_*() should handle this, but often
 932                anchored_offset is not 0, so this check will not be wasted. */
 933             /* XXXX In the code below we prefer to look for "^" even in
 934                presence of anchored substrings.  And we search even
 935                beyond the found float position.  These pessimizations
 936                are historical artefacts only.  */
 937           find_anchor:
 938             while (t < strend - prog->minlen) {
 939                 if (*t == '\n') {
 940                     if (t < check_at - prog->check_offset_min) {
 941                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 942                             /* Since we moved from the found position,
 943                                we definitely contradict the found anchored
 944                                substr.  Due to the above check we do not
 945                                contradict "check" substr.
 946                                Thus we can arrive here only if check substr
 947                                is float.  Redo checking for "other"=="fixed".
 948                              */
 949                             strpos = t + 1;
 950                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 951                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 952                             goto do_other_anchored;
 953                         }
 954                         /* We don't contradict the found floating substring. */
 955                         /* XXXX Why not check for STCLASS? */
 956                         s = t + 1;
 957                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 958                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 959                         goto set_useful;
 960                     }
 961                     /* Position contradicts check-string */
 962                     /* XXXX probably better to look for check-string
 963                        than for "\n", so one should lower the limit for t? */
 964                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 965                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 966                     other_last = strpos = s = t + 1;
 967                     goto restart;
 968                 }
 969                 t++;
 970             }
 971             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 972                         PL_colors[0], PL_colors[1]));
 973             goto fail_finish;
 974         }
 975         else {
 976             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 977                         PL_colors[0], PL_colors[1]));
 978         }
 979         s = t;
 980       set_useful:
 981         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 982     }
 983     else {
 984         /* The found string does not prohibit matching at strpos,
 985            - no optimization of calling REx engine can be performed,
 986            unless it was an MBOL and we are not after MBOL,
 987            or a future STCLASS check will fail this. */
 988       try_at_start:
 989         /* Even in this situation we may use MBOL flag if strpos is offset
 990            wrt the start of the string. */
 991         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 992             && (strpos != strbeg) && strpos[-1] != '\n'
 993             /* May be due to an implicit anchor of m{.*foo}  */
 994             && !(prog->intflags & PREGf_IMPLICIT))
 995         {
 996             t = strpos;
 997             goto find_anchor;
 998         }
 999         DEBUG_EXECUTE_r( if (ml_anch)
1000             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1001                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1002         );
1003       success_at_start:
1004         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1005             && (utf8_target ? (
1006                 prog->check_utf8                /* Could be deleted already */
1007                 && --BmUSEFUL(prog->check_utf8) < 0
1008                 && (prog->check_utf8 == prog->float_utf8)
1009             ) : (
1010                 prog->check_substr              /* Could be deleted already */
1011                 && --BmUSEFUL(prog->check_substr) < 0
1012                 && (prog->check_substr == prog->float_substr)
1013             )))
1014         {
1015             /* If flags & SOMETHING - do not do it many times on the same match */
1016             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1017             /* XXX Does the destruction order has to change with utf8_target? */
1018             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1019             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1020             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1021             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1022             check = NULL;                       /* abort */
1023             s = strpos;
1024             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1025                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1026             if (prog->intflags & PREGf_IMPLICIT)
1027                 prog->extflags &= ~RXf_ANCH_MBOL;
1028             /* XXXX This is a remnant of the old implementation.  It
1029                     looks wasteful, since now INTUIT can use many
1030                     other heuristics. */
1031             prog->extflags &= ~RXf_USE_INTUIT;
1032             /* XXXX What other flags might need to be cleared in this branch? */
1033         }
1034         else
1035             s = strpos;
1036     }
1037
1038     /* Last resort... */
1039     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1040     /* trie stclasses are too expensive to use here, we are better off to
1041        leave it to regmatch itself */
1042     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1043         /* minlen == 0 is possible if regstclass is \b or \B,
1044            and the fixed substr is ''$.
1045            Since minlen is already taken into account, s+1 is before strend;
1046            accidentally, minlen >= 1 guaranties no false positives at s + 1
1047            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1048            regstclass does not come from lookahead...  */
1049         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1050            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1051         const U8* const str = (U8*)STRING(progi->regstclass);
1052         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1053                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1054                     : 1);
1055         char * endpos;
1056         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1057             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1058         else if (prog->float_substr || prog->float_utf8)
1059             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1060         else
1061             endpos= strend;
1062
1063         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1064                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1065
1066         t = s;
1067         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1068         if (!s) {
1069 #ifdef DEBUGGING
1070             const char *what = NULL;
1071 #endif
1072             if (endpos == strend) {
1073                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1074                                 "Could not match STCLASS...\n") );
1075                 goto fail;
1076             }
1077             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1078                                    "This position contradicts STCLASS...\n") );
1079             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1080                 goto fail;
1081             /* Contradict one of substrings */
1082             if (prog->anchored_substr || prog->anchored_utf8) {
1083                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1084                     DEBUG_EXECUTE_r( what = "anchored" );
1085                   hop_and_restart:
1086                     s = HOP3c(t, 1, strend);
1087                     if (s + start_shift + end_shift > strend) {
1088                         /* XXXX Should be taken into account earlier? */
1089                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1090                                                "Could not match STCLASS...\n") );
1091                         goto fail;
1092                     }
1093                     if (!check)
1094                         goto giveup;
1095                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1096                                 "Looking for %s substr starting at offset %ld...\n",
1097                                  what, (long)(s + start_shift - i_strpos)) );
1098                     goto restart;
1099                 }
1100                 /* Have both, check_string is floating */
1101                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1102                     goto retry_floating_check;
1103                 /* Recheck anchored substring, but not floating... */
1104                 s = check_at;
1105                 if (!check)
1106                     goto giveup;
1107                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1108                           "Looking for anchored substr starting at offset %ld...\n",
1109                           (long)(other_last - i_strpos)) );
1110                 goto do_other_anchored;
1111             }
1112             /* Another way we could have checked stclass at the
1113                current position only: */
1114             if (ml_anch) {
1115                 s = t = t + 1;
1116                 if (!check)
1117                     goto giveup;
1118                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119                           "Looking for /%s^%s/m starting at offset %ld...\n",
1120                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1121                 goto try_at_offset;
1122             }
1123             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1124                 goto fail;
1125             /* Check is floating substring. */
1126           retry_floating_check:
1127             t = check_at - start_shift;
1128             DEBUG_EXECUTE_r( what = "floating" );
1129             goto hop_and_restart;
1130         }
1131         if (t != s) {
1132             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1133                         "By STCLASS: moving %ld --> %ld\n",
1134                                   (long)(t - i_strpos), (long)(s - i_strpos))
1135                    );
1136         }
1137         else {
1138             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1139                                   "Does not contradict STCLASS...\n");
1140                    );
1141         }
1142     }
1143   giveup:
1144     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1145                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1146                           PL_colors[5], (long)(s - i_strpos)) );
1147     return s;
1148
1149   fail_finish:                          /* Substring not found */
1150     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1151         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1152   fail:
1153     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1154                           PL_colors[4], PL_colors[5]));
1155     return NULL;
1156 }
1157
/* DECL_TRIE_TYPE(scan)
 *
 * Declares a const local named `trie_type` of an anonymous enum type with
 * the values trie_plain, trie_utf8, trie_utf8_fold and trie_latin_utf8_fold,
 * and initializes it from the node `scan`:
 *
 *   scan->flags != EXACT:  trie_utf8_fold        if utf8_target is true,
 *                          trie_latin_utf8_fold  else if UTF_PATTERN is true,
 *                          trie_plain            otherwise;
 *   scan->flags == EXACT:  trie_utf8             if utf8_target is true,
 *                          trie_plain            otherwise.
 *
 * `utf8_target` is not a macro parameter; it must be visible at the
 * expansion site.  The expansion is a declaration with no trailing
 * semicolon, so the caller supplies the ';'.
 */
1158 #define DECL_TRIE_TYPE(scan) \
1159     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1160                     trie_type = (scan->flags != EXACT) \
1161                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1162                               : (utf8_target ? trie_utf8 : trie_plain)
1163
/* REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,
 *                      uvc, charid, foldlen, foldbuf, uniflags)
 *
 * Reads one code point into `uvc` in the manner selected by `trie_type`,
 * then translates it into the trie character id `charid`.  The macro assigns
 * to uscan, len, uvc, charid and foldlen, so those arguments must be
 * lvalues; several arguments are evaluated more than once.
 *
 * Step 1 - obtain uvc and len:
 *   trie_utf8_fold:
 *     - foldlen > 0: decode the next code point from `uscan` with
 *       utf8n_to_uvuni(), subtract the decoded length from foldlen, advance
 *       uscan by it, and set len to 0 (presumably so the caller does not
 *       advance `uc` while folded output is still pending - confirm against
 *       the call sites).
 *     - otherwise: decode a code point from `uc` (setting len), fold it with
 *       to_uni_fold() into `foldbuf` (which sets foldlen), then subtract
 *       UNISKIP(uvc) from foldlen and point uscan at foldbuf + UNISKIP(uvc).
 *   trie_latin_utf8_fold:
 *     - foldlen > 0: same as the corresponding branch above.
 *     - otherwise: len is set to 1 and the single byte *uc is folded with
 *       to_uni_fold(); foldlen and uscan are adjusted as above.
 *   trie_utf8:   decode a code point from `uc` with utf8n_to_uvuni(); no
 *                folding.
 *   trie_plain:  uvc is the single byte *uc and len is 1.
 *   The switch has no default case, so any other trie_type value leaves
 *   uvc and len untouched.
 *
 * Step 2 - obtain charid:
 *   - uvc < 256: charid = trie->charmap[uvc].
 *   - otherwise: charid starts at 0; if `widecharmap` is non-NULL it is
 *     looked up with hv_fetch() using the raw bytes of uvc (sizeof(UV)) as
 *     the key, and on a hit charid becomes (U16)SvIV of the stored value.
 */
1164 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1165 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1166     switch (trie_type) {                                                    \
1167     case trie_utf8_fold:                                                    \
1168         if ( foldlen>0 ) {                                                  \
1169             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1170             foldlen -= len;                                                 \
1171             uscan += len;                                                   \
1172             len=0;                                                          \
1173         } else {                                                            \
1174             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1175             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1176             foldlen -= UNISKIP( uvc );                                      \
1177             uscan = foldbuf + UNISKIP( uvc );                               \
1178         }                                                                   \
1179         break;                                                              \
1180     case trie_latin_utf8_fold:                                              \
1181         if ( foldlen>0 ) {                                                  \
1182             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1183             foldlen -= len;                                                 \
1184             uscan += len;                                                   \
1185             len=0;                                                          \
1186         } else {                                                            \
1187             len = 1;                                                        \
1188             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1189             foldlen -= UNISKIP( uvc );                                      \
1190             uscan = foldbuf + UNISKIP( uvc );                               \
1191         }                                                                   \
1192         break;                                                              \
1193     case trie_utf8:                                                         \
1194         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1195         break;                                                              \
1196     case trie_plain:                                                        \
1197         uvc = (UV)*uc;                                                      \
1198         len = 1;                                                            \
1199     }                                                                       \
1200     if (uvc < 256) {                                                        \
1201         charid = trie->charmap[ uvc ];                                      \
1202     }                                                                       \
1203     else {                                                                  \
1204         charid = 0;                                                         \
1205         if (widecharmap) {                                                  \
1206             SV** const svpp = hv_fetch(widecharmap,                         \
1207                         (char*)&uvc, sizeof(UV), 0);                        \
1208             if (svpp)                                                       \
1209                 charid = (U16)SvIV(*svpp);                                  \
1210         }                                                                   \
1211     }                                                                       \
1212 } STMT_END
1213
/* REXEC_FBC_EXACTISH_SCAN(CoNd)
 *
 * Scans forward from `s` up to and including `e`, incrementing `s` by one
 * on each iteration.  At each position it jumps to the label `got_it` when
 * all of the following hold, tested in this order:
 *   1. CoNd is true;
 *   2. ln == 1, or folder(s, pat_string, ln) returns true;
 *   3. reginfo is NULL, or regtry(reginfo, &s) succeeds.
 * If the loop finishes without jumping, `s` has moved past `e`.
 *
 * s, e, ln, folder, pat_string, reginfo and the got_it label are not macro
 * parameters; they must exist at the expansion site.
 */
1214 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1215 STMT_START {                                              \
1216     while (s <= e) {                                      \
1217         if ( (CoNd)                                       \
1218              && (ln == 1 || folder(s, pat_string, ln))    \
1219              && (!reginfo || regtry(reginfo, &s)) )       \
1220             goto got_it;                                  \
1221         s++;                                              \
1222     }                                                     \
1223 } STMT_END
1224
/* REXEC_FBC_UTF8_SCAN(CoDe)
 *
 * Runs the statement(s) CoDe once per position, stepping `s` by
 * UTF8SKIP(s) each time.  The step is stored in `uskip` as part of the loop
 * test, and the loop continues only while s + uskip <= strend.
 *
 * CoDe is pasted directly into the body of a `while` loop ahead of the
 * `s += uskip` step, so a `continue` inside CoDe would skip the advance and
 * a `break` would leave the scan.  s, uskip and strend must exist at the
 * expansion site.
 */
1225 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1226 STMT_START {                                          \
1227     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1228         CoDe                                          \
1229         s += uskip;                                   \
1230     }                                                 \
1231 } STMT_END
1232
/* REXEC_FBC_SCAN(CoDe)
 *
 * Runs the statement(s) CoDe once per position while s < strend,
 * incrementing `s` by one after each run.
 *
 * CoDe is pasted directly into the body of a `while` loop ahead of the
 * `s++` step, so a `continue` inside CoDe would skip the increment.  s and
 * strend must exist at the expansion site.
 */
1233 #define REXEC_FBC_SCAN(CoDe)                          \
1234 STMT_START {                                          \
1235     while (s < strend) {                              \
1236         CoDe                                          \
1237         s++;                                          \
1238     }                                                 \
1239 } STMT_END
1240
/* REXEC_FBC_UTF8_CLASS_SCAN(CoNd)
 *
 * Uses REXEC_FBC_UTF8_SCAN to visit each position and test CoNd there:
 *   - CoNd true and tmp true: jump to got_it if reginfo is NULL or
 *     regtry(reginfo, &s) succeeds; if regtry fails, tmp = doevery.
 *   - CoNd true and tmp false: tmp = doevery (no attempt is made).
 *   - CoNd false: tmp = 1.
 *
 * Consequence: when doevery is 0, only the first position in a run of
 * consecutive positions satisfying CoNd is attempted; a position that fails
 * CoNd re-arms the next attempt.  The initial value of tmp comes from the
 * expansion site, as do doevery, reginfo, s and the got_it label.
 */
1241 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1242 REXEC_FBC_UTF8_SCAN(                                  \
1243     if (CoNd) {                                       \
1244         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1245             goto got_it;                              \
1246         else                                          \
1247             tmp = doevery;                            \
1248     }                                                 \
1249     else                                              \
1250         tmp = 1;                                      \
1251 )
1252
/* Byte-wise counterpart of REXEC_FBC_UTF8_CLASS_SCAN, built on
 * REXEC_FBC_SCAN.  Uses the caller's tmp, doevery and reginfo and the got_it
 * label.
 *
 * At each byte:
 *   - CoNd true:  if tmp is non-zero and (reginfo is NULL or regtry()
 *                 succeeds) jump to got_it; otherwise set tmp = doevery.
 *   - CoNd false: set tmp = 1. */
1253 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1254 REXEC_FBC_SCAN(                                       \
1255     if (CoNd) {                                       \
1256         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1257             goto got_it;                              \
1258         else                                          \
1259             tmp = doevery;                            \
1260     }                                                 \
1261     else                                              \
1262         tmp = 1;                                      \
1263 )
1264
/* Jump to got_it when reginfo is NULL or regtry(reginfo, &s) succeeds.
 * Expands to a bare `if` with no trailing semicolon and no do/while wrapper:
 * the user supplies the `;`, and it should only be used as a complete
 * statement inside braces (as the IF_SUCCESS / IF_FAIL arguments below do). */
1265 #define REXEC_FBC_TRYIT               \
1266 if ((!reginfo || regtry(reginfo, &s))) \
1267     goto got_it
1268
1269 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1270     if (utf8_target) {                                             \
1271         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1272     }                                                          \
1273     else {                                                     \
1274         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1275     }
1276
1277 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1278     if (utf8_target) {                                             \
1279         UtFpReLoAd;                                            \
1280         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1281     }                                                          \
1282     else {                                                     \
1283         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1284     }
1285
1286 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1287     PL_reg_flags |= RF_tainted;                                \
1288     if (utf8_target) {                                             \
1289         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1290     }                                                          \
1291     else {                                                     \
1292         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1293     }
1294
/* Convenience wrapper around dump_exec_pos(): li, s and doutf8 are passed
 * through, while PL_regeol, PL_bostr and PL_reg_starttry are supplied as the
 * third, fourth and fifth arguments. */
1295 #define DUMP_EXEC_POS(li,s,doutf8) \
1296     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1297
1298
/* UTF-8 branch of a boundary scan for tests that only need single bytes
 * (TEST_NON_UTF8 is applied to the byte at s, never to a decoded code point).
 *
 * tmp is seeded with TEST_NON_UTF8 of the byte before s, or of '\n' when s is
 * at PL_bostr.  Then, for each UTF-8 character, if the result of the test on
 * the current byte differs from tmp, tmp is flipped and IF_SUCCESS runs;
 * otherwise IF_FAIL runs.
 *
 * Expands to several statements including the final `;` and has no
 * do/while wrapper: it is meant to be passed as the UTF8_CODE argument of
 * FBC_BOUND_COMMON, which places it inside braces.
 *
 * The last line deliberately has no line continuation, so the definition
 * ends here rather than depending on the following line being blank. */
1299 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1300         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1301         tmp = TEST_NON_UTF8(tmp);                                              \
1302         REXEC_FBC_UTF8_SCAN(                                                   \
1303             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1304                 tmp = !tmp;                                                    \
1305                 IF_SUCCESS;                                                    \
1306             }                                                                  \
1307             else {                                                             \
1308                 IF_FAIL;                                                       \
1309             }                                                                  \
1310         );
1311
/* UTF-8 branch of a boundary scan for tests that need the full code point.
 *
 * tmp is first set to the character before s: '\n' when s is at PL_bostr,
 * otherwise the code point decoded (utf8n_to_uvchr) from the position one
 * character back (reghop3).  TeSt1_UtF8 is then evaluated and stored in tmp;
 * the call sites pass an expression written in terms of tmp.  After
 * LOAD_UTF8_CHARCLASS_ALNUM(), each UTF-8 character is visited: if
 * TeSt2_UtF8 (an expression written in terms of s at the call sites) differs
 * from tmp, tmp is flipped and IF_SUCCESS runs; otherwise IF_FAIL runs.
 *
 * Expands to several statements including the final `;` and has no
 * do/while wrapper: it is meant to be passed as the UTF8_CODE argument of
 * FBC_BOUND_COMMON, which places it inside braces.
 *
 * The last line deliberately has no line continuation, so the definition
 * ends here rather than depending on the following line being blank. */
1312 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1313         if (s == PL_bostr) {                                                   \
1314             tmp = '\n';                                                        \
1315         }                                                                      \
1316         else {                                                                 \
1317             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1318             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1319         }                                                                      \
1320         tmp = TeSt1_UtF8;                                                      \
1321         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1322         REXEC_FBC_UTF8_SCAN(                                                   \
1323             if (tmp == ! (TeSt2_UtF8)) { \
1324                 tmp = !tmp;                                                    \
1325                 IF_SUCCESS;                                                    \
1326             }                                                                  \
1327             else {                                                             \
1328                 IF_FAIL;                                                       \
1329             }                                                                  \
1330         );
1331
1332 /* The only difference between the BOUND and NBOUND cases is that
1333  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1334  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1335  * with the other one being empty */
/* BOUND variant using UTF8_LOAD for the UTF-8 branch: REXEC_FBC_TRYIT is the
 * IF_SUCCESS action and PLACEHOLDER the IF_FAIL action, for both the UTF-8
 * and the non-UTF-8 branch.  PLACEHOLDER is defined outside this section
 * (presumably it expands to nothing -- confirm at its definition). */
1336 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1337     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1338
/* BOUND variant using UTF8_NOLOAD for the UTF-8 branch, so TEST_NON_UTF8 is
 * used for UTF-8 targets as well.  TEST1_UTF8 and TEST2_UTF8 are accepted
 * for signature parity with FBC_BOUND but are not used in the expansion. */
1339 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1340     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1341
/* NBOUND variant using UTF8_LOAD: the mirror image of FBC_BOUND, with
 * PLACEHOLDER as the IF_SUCCESS action and REXEC_FBC_TRYIT as the IF_FAIL
 * action, so a match is tried where the two neighbouring tests agree. */
1342 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1343     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1344
/* NBOUND variant using UTF8_NOLOAD: PLACEHOLDER is the IF_SUCCESS action and
 * REXEC_FBC_TRYIT the IF_FAIL action.  TEST1_UTF8 and TEST2_UTF8 are accepted
 * for signature parity with FBC_NBOUND but are not used in the expansion. */
1345 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1346     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1347
1348
1349 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1350  * be passed in completely with the variable name being tested, which isn't
1351  * such a clean interface, but this is easier to read than it was before.  We
1352  * are looking for the boundary (or non-boundary) between a word and non-word
1353  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1354  * must be different.  Find the "wordness" of the character just prior to this
1355  * one, and compare it with the wordness of this one.  If they differ, we have
1356  * a boundary.  At the beginning of the string, pretend that the previous
1357  * character was a new-line */
1358 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1359     if (utf8_target) {                                                         \
1360                 UTF8_CODE \
1361     }                                                                          \
1362     else {  /* Not utf8 */                                                     \
1363         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1364         tmp = TEST_NON_UTF8(tmp);                                              \
1365         REXEC_FBC_SCAN(                                                        \
1366             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1367                 tmp = !tmp;                                                    \
1368                 IF_SUCCESS;                                                    \
1369             }                                                                  \
1370             else {                                                             \
1371                 IF_FAIL;                                                       \
1372             }                                                                  \
1373         );                                                                     \
1374     }                                                                          \
1375     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1376         goto got_it;
1377
1378 /* We know what class REx starts with.  Try to find this position... */
1379 /* if reginfo is NULL, it's a dry run */
1380 /* annoyingly all the vars in this routine have different names from their counterparts
1381    in regmatch. /grrr */
1382
1383 STATIC char *
1384 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1385     const char *strend, regmatch_info *reginfo)
1386 {
1387         dVAR;
1388         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1389         char *pat_string;   /* The pattern's exactish string */
1390         char *pat_end;      /* ptr to end char of pat_string */
1391         re_fold_t folder;       /* Function for computing non-utf8 folds */
1392         const U8 *fold_array;   /* array for folding ords < 256 */
1393         STRLEN ln;
1394         STRLEN lnc;
1395         register STRLEN uskip;
1396         U8 c1;
1397         U8 c2;
1398         char *e;
1399         register I32 tmp = 1;   /* Scratch variable? */
1400         register const bool utf8_target = PL_reg_match_utf8;
1401         UV utf8_fold_flags = 0;
1402         RXi_GET_DECL(prog,progi);
1403
1404         PERL_ARGS_ASSERT_FIND_BYCLASS;
1405
1406         /* We know what class it must start with. */
1407         switch (OP(c)) {
1408         case ANYOFV:
1409         case ANYOF:
1410             if (utf8_target || OP(c) == ANYOFV) {
1411                 STRLEN inclasslen = strend - s;
1412                 REXEC_FBC_UTF8_CLASS_SCAN(
1413                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1414             }
1415             else {
1416                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1417             }
1418             break;
1419         case CANY:
1420             REXEC_FBC_SCAN(
1421                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1422                     goto got_it;
1423                 else
1424                     tmp = doevery;
1425             );
1426             break;
1427
1428         case EXACTFA:
1429             if (UTF_PATTERN || utf8_target) {
1430                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1431                 goto do_exactf_utf8;
1432             }
1433             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1434             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1435             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1436
1437         case EXACTFU:
1438             if (UTF_PATTERN || utf8_target) {
1439                 utf8_fold_flags = 0;
1440                 goto do_exactf_utf8;
1441             }
1442             fold_array = PL_fold_latin1;
1443             folder = foldEQ_latin1;
1444             /* XXX This uses the full utf8 fold because if the pattern contains
1445              * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
1446              * There could be a new node type, say EXACTFU_SS, which is
1447              * generated by regcomp only if there is an 'ss', and then every
1448              * other case could goto do_exactf_non_utf8;*/
1449             goto do_exactf_utf8;
1450
1451         case EXACTF:
1452             if (UTF_PATTERN || utf8_target) {
1453                 utf8_fold_flags = 0;
1454                 goto do_exactf_utf8;
1455             }
1456             fold_array = PL_fold;
1457             folder = foldEQ;
1458             goto do_exactf_non_utf8;
1459
1460         case EXACTFL:
1461             if (UTF_PATTERN || utf8_target) {
1462                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1463                 goto do_exactf_utf8;
1464             }
1465             fold_array = PL_fold_locale;
1466             folder = foldEQ_locale;
1467
1468             /* FALL THROUGH */
1469
1470         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1471
1472             /* The idea in the non-utf8 EXACTF* cases is to first find the
1473              * first character of the EXACTF* node and then, if necessary,
1474              * case-insensitively compare the full text of the node.  c1 is the
1475              * first character.  c2 is its fold.  This logic will not work for
1476              * Unicode semantics and the german sharp ss, which hence should
1477              * not be compiled into a node that gets here. */
1478             pat_string = STRING(c);
1479             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1480
1481             e = HOP3c(strend, -((I32)ln), s);
1482
1483             if (!reginfo && e < s) {
1484                 e = s;                  /* Due to minlen logic of intuit() */
1485             }
1486
1487             c1 = *pat_string;
1488             c2 = fold_array[c1];
1489             if (c1 == c2) { /* If char and fold are the same */
1490                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1491             }
1492             else {
1493                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1494             }
1495             break;
1496
1497         do_exactf_utf8:
1498
1499             /* If one of the operands is in utf8, we can't use the simpler
1500              * folding above, due to the fact that many different characters
1501              * can have the same fold, or portion of a fold, or different-
1502              * length fold */
1503             pat_string = STRING(c);
1504             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1505             pat_end = pat_string + ln;
1506             lnc = (UTF_PATTERN) /* length to match in characters */
1507                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1508                     : ln;
1509
1510             /* Set the end position to the final character available */
1511             e = HOP3c(strend, -1, s);
1512
1513             if (!reginfo && e < s) {
1514                 e = s;                  /* Due to minlen logic of intuit() */
1515             }
1516
1517             while (s <= e) {
1518                 char *my_strend= (char *)strend;
1519                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1520                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1521                     && (!reginfo || regtry(reginfo, &s)) )
1522                 {
1523                     goto got_it;
1524                 }
1525                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1526             }
1527             break;
1528         case BOUNDL:
1529             PL_reg_flags |= RF_tainted;
1530             FBC_BOUND(isALNUM_LC,
1531                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1532                       isALNUM_LC_utf8((U8*)s));
1533             break;
1534         case NBOUNDL:
1535             PL_reg_flags |= RF_tainted;
1536             FBC_NBOUND(isALNUM_LC,
1537                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1538                        isALNUM_LC_utf8((U8*)s));
1539             break;
1540         case BOUND:
1541             FBC_BOUND(isWORDCHAR,
1542                       isALNUM_uni(tmp),
1543                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1544             break;
1545         case BOUNDA:
1546             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1547                              isWORDCHAR_A(tmp),
1548                              isWORDCHAR_A((U8*)s));
1549             break;
1550         case NBOUND:
1551             FBC_NBOUND(isWORDCHAR,
1552                        isALNUM_uni(tmp),
1553                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1554             break;
1555         case NBOUNDA:
1556             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1557                               isWORDCHAR_A(tmp),
1558                               isWORDCHAR_A((U8*)s));
1559             break;
1560         case BOUNDU:
1561             FBC_BOUND(isWORDCHAR_L1,
1562                       isALNUM_uni(tmp),
1563                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1564             break;
1565         case NBOUNDU:
1566             FBC_NBOUND(isWORDCHAR_L1,
1567                        isALNUM_uni(tmp),
1568                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1569             break;
1570         case ALNUML:
1571             REXEC_FBC_CSCAN_TAINT(
1572                 isALNUM_LC_utf8((U8*)s),
1573                 isALNUM_LC(*s)
1574             );
1575             break;
1576         case ALNUMU:
1577             REXEC_FBC_CSCAN_PRELOAD(
1578                 LOAD_UTF8_CHARCLASS_ALNUM(),
1579                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1580                 isWORDCHAR_L1((U8) *s)
1581             );
1582             break;
1583         case ALNUM:
1584             REXEC_FBC_CSCAN_PRELOAD(
1585                 LOAD_UTF8_CHARCLASS_ALNUM(),
1586                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1587                 isWORDCHAR((U8) *s)
1588             );
1589             break;
1590         case ALNUMA:
1591             /* Don't need to worry about utf8, as it can match only a single
1592              * byte invariant character */
1593             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1594             break;
1595         case NALNUMU:
1596             REXEC_FBC_CSCAN_PRELOAD(
1597                 LOAD_UTF8_CHARCLASS_ALNUM(),
1598                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1599                 ! isWORDCHAR_L1((U8) *s)
1600             );
1601             break;
1602         case NALNUM:
1603             REXEC_FBC_CSCAN_PRELOAD(
1604                 LOAD_UTF8_CHARCLASS_ALNUM(),
1605                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1606                 ! isALNUM(*s)
1607             );
1608             break;
1609         case NALNUMA:
1610             REXEC_FBC_CSCAN(
1611                 !isWORDCHAR_A(*s),
1612                 !isWORDCHAR_A(*s)
1613             );
1614             break;
1615         case NALNUML:
1616             REXEC_FBC_CSCAN_TAINT(
1617                 !isALNUM_LC_utf8((U8*)s),
1618                 !isALNUM_LC(*s)
1619             );
1620             break;
1621         case SPACEU:
1622             REXEC_FBC_CSCAN_PRELOAD(
1623                 LOAD_UTF8_CHARCLASS_SPACE(),
1624                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1625                 isSPACE_L1((U8) *s)
1626             );
1627             break;
1628         case SPACE:
1629             REXEC_FBC_CSCAN_PRELOAD(
1630                 LOAD_UTF8_CHARCLASS_SPACE(),
1631                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1632                 isSPACE((U8) *s)
1633             );
1634             break;
1635         case SPACEA:
1636             /* Don't need to worry about utf8, as it can match only a single
1637              * byte invariant character */
1638             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1639             break;
1640         case SPACEL:
1641             REXEC_FBC_CSCAN_TAINT(
1642                 isSPACE_LC_utf8((U8*)s),
1643                 isSPACE_LC(*s)
1644             );
1645             break;
1646         case NSPACEU:
1647             REXEC_FBC_CSCAN_PRELOAD(
1648                 LOAD_UTF8_CHARCLASS_SPACE(),
1649                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1650                 ! isSPACE_L1((U8) *s)
1651             );
1652             break;
1653         case NSPACE:
1654             REXEC_FBC_CSCAN_PRELOAD(
1655                 LOAD_UTF8_CHARCLASS_SPACE(),
1656                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1657                 ! isSPACE((U8) *s)
1658             );
1659             break;
1660         case NSPACEA:
1661             REXEC_FBC_CSCAN(
1662                 !isSPACE_A(*s),
1663                 !isSPACE_A(*s)
1664             );
1665             break;
1666         case NSPACEL:
1667             REXEC_FBC_CSCAN_TAINT(
1668                 !isSPACE_LC_utf8((U8*)s),
1669                 !isSPACE_LC(*s)
1670             );
1671             break;
1672         case DIGIT:
1673             REXEC_FBC_CSCAN_PRELOAD(
1674                 LOAD_UTF8_CHARCLASS_DIGIT(),
1675                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1676                 isDIGIT(*s)
1677             );
1678             break;
1679         case DIGITA:
1680             /* Don't need to worry about utf8, as it can match only a single
1681              * byte invariant character */
1682             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1683             break;
1684         case DIGITL:
1685             REXEC_FBC_CSCAN_TAINT(
1686                 isDIGIT_LC_utf8((U8*)s),
1687                 isDIGIT_LC(*s)
1688             );
1689             break;
1690         case NDIGIT:
1691             REXEC_FBC_CSCAN_PRELOAD(
1692                 LOAD_UTF8_CHARCLASS_DIGIT(),
1693                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1694                 !isDIGIT(*s)
1695             );
1696             break;
1697         case NDIGITA:
1698             REXEC_FBC_CSCAN(
1699                 !isDIGIT_A(*s),
1700                 !isDIGIT_A(*s)
1701             );
1702             break;
1703         case NDIGITL:
1704             REXEC_FBC_CSCAN_TAINT(
1705                 !isDIGIT_LC_utf8((U8*)s),
1706                 !isDIGIT_LC(*s)
1707             );
1708             break;
1709         case LNBREAK:
1710             REXEC_FBC_CSCAN(
1711                 is_LNBREAK_utf8(s),
1712                 is_LNBREAK_latin1(s)
1713             );
1714             break;
1715         case VERTWS:
1716             REXEC_FBC_CSCAN(
1717                 is_VERTWS_utf8(s),
1718                 is_VERTWS_latin1(s)
1719             );
1720             break;
1721         case NVERTWS:
1722             REXEC_FBC_CSCAN(
1723                 !is_VERTWS_utf8(s),
1724                 !is_VERTWS_latin1(s)
1725             );
1726             break;
1727         case HORIZWS:
1728             REXEC_FBC_CSCAN(
1729                 is_HORIZWS_utf8(s),
1730                 is_HORIZWS_latin1(s)
1731             );
1732             break;
1733         case NHORIZWS:
1734             REXEC_FBC_CSCAN(
1735                 !is_HORIZWS_utf8(s),
1736                 !is_HORIZWS_latin1(s)
1737             );
1738             break;
1739         case AHOCORASICKC:
1740         case AHOCORASICK:
1741             {
1742                 DECL_TRIE_TYPE(c);
1743                 /* what trie are we using right now */
1744                 reg_ac_data *aho
1745                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1746                 reg_trie_data *trie
1747                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1748                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1749
1750                 const char *last_start = strend - trie->minlen;
1751 #ifdef DEBUGGING
1752                 const char *real_start = s;
1753 #endif
1754                 STRLEN maxlen = trie->maxlen;
1755                 SV *sv_points;
1756                 U8 **points; /* map of where we were in the input string
1757                                 when reading a given char. For ASCII this
1758                                 is unnecessary overhead as the relationship
1759                                 is always 1:1, but for Unicode, especially
1760                                 case folded Unicode this is not true. */
1761                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1762                 U8 *bitmap=NULL;
1763
1764
1765                 GET_RE_DEBUG_FLAGS_DECL;
1766
1767                 /* We can't just allocate points here. We need to wrap it in
1768                  * an SV so it gets freed properly if there is a croak while
1769                  * running the match */
1770                 ENTER;
1771                 SAVETMPS;
1772                 sv_points=newSV(maxlen * sizeof(U8 *));
1773                 SvCUR_set(sv_points,
1774                     maxlen * sizeof(U8 *));
1775                 SvPOK_on(sv_points);
1776                 sv_2mortal(sv_points);
1777                 points=(U8**)SvPV_nolen(sv_points );
1778                 if ( trie_type != trie_utf8_fold
1779                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1780                 {
1781                     if (trie->bitmap)
1782                         bitmap=(U8*)trie->bitmap;
1783                     else
1784                         bitmap=(U8*)ANYOF_BITMAP(c);
1785                 }
1786                 /* this is the Aho-Corasick algorithm modified a touch
1787                    to include special handling for long "unknown char"
1788                    sequences. The basic idea being that we use AC as long
1789                    as we are dealing with a possible matching char, when
1790                    we encounter an unknown char (and we have not encountered
1791                    an accepting state) we scan forward until we find a legal
1792                    starting char.
1793                    AC matching is basically that of trie matching, except
1794                    that when we encounter a failing transition, we fall back
1795                    to the current states "fail state", and try the current char
1796                    again, a process we repeat until we reach the root state,
1797                    state 1, or a legal transition. If we fail on the root state
1798                    then we can either terminate if we have reached an accepting
1799                    state previously, or restart the entire process from the beginning
1800                    if we have not.
1801
1802                  */
1803                 while (s <= last_start) {
1804                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1805                     U8 *uc = (U8*)s;
1806                     U16 charid = 0;
1807                     U32 base = 1;
1808                     U32 state = 1;
1809                     UV uvc = 0;
1810                     STRLEN len = 0;
1811                     STRLEN foldlen = 0;
1812                     U8 *uscan = (U8*)NULL;
1813                     U8 *leftmost = NULL;
1814 #ifdef DEBUGGING
1815                     U32 accepted_word= 0;
1816 #endif
1817                     U32 pointpos = 0;
1818
1819                     while ( state && uc <= (U8*)strend ) {
1820                         int failed=0;
1821                         U32 word = aho->states[ state ].wordnum;
1822
1823                         if( state==1 ) {
1824                             if ( bitmap ) {
1825                                 DEBUG_TRIE_EXECUTE_r(
1826                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1827                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1828                                             (char *)uc, utf8_target );
1829                                         PerlIO_printf( Perl_debug_log,
1830                                             " Scanning for legal start char...\n");
1831                                     }
1832                                 );
1833                                 if (utf8_target) {
1834                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1835                                         uc += UTF8SKIP(uc);
1836                                     }
1837                                 } else {
1838                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1839                                         uc++;
1840                                     }
1841                                 }
1842                                 s= (char *)uc;
1843                             }
1844                             if (uc >(U8*)last_start) break;
1845                         }
1846
1847                         if ( word ) {
1848                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1849                             if (!leftmost || lpos < leftmost) {
1850                                 DEBUG_r(accepted_word=word);
1851                                 leftmost= lpos;
1852                             }
1853                             if (base==0) break;
1854
1855                         }
1856                         points[pointpos++ % maxlen]= uc;
1857                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1858                                              uscan, len, uvc, charid, foldlen,
1859                                              foldbuf, uniflags);
1860                         DEBUG_TRIE_EXECUTE_r({
1861                             dump_exec_pos( (char *)uc, c, strend, real_start,
1862                                 s,   utf8_target );
1863                             PerlIO_printf(Perl_debug_log,
1864                                 " Charid:%3u CP:%4"UVxf" ",
1865                                  charid, uvc);
1866                         });
1867
1868                         do {
1869 #ifdef DEBUGGING
1870                             word = aho->states[ state ].wordnum;
1871 #endif
1872                             base = aho->states[ state ].trans.base;
1873
1874                             DEBUG_TRIE_EXECUTE_r({
1875                                 if (failed)
1876                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1877                                         s,   utf8_target );
1878                                 PerlIO_printf( Perl_debug_log,
1879                                     "%sState: %4"UVxf", word=%"UVxf,
1880                                     failed ? " Fail transition to " : "",
1881                                     (UV)state, (UV)word);
1882                             });
1883                             if ( base ) {
1884                                 U32 tmp;
1885                                 I32 offset;
1886                                 if (charid &&
1887                                      ( ((offset = base + charid
1888                                         - 1 - trie->uniquecharcount)) >= 0)
1889                                      && ((U32)offset < trie->lasttrans)
1890                                      && trie->trans[offset].check == state
1891                                      && (tmp=trie->trans[offset].next))
1892                                 {
1893                                     DEBUG_TRIE_EXECUTE_r(
1894                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1895                                     state = tmp;
1896                                     break;
1897                                 }
1898                                 else {
1899                                     DEBUG_TRIE_EXECUTE_r(
1900                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1901                                     failed = 1;
1902                                     state = aho->fail[state];
1903                                 }
1904                             }
1905                             else {
1906                                 /* we must be accepting here */
1907                                 DEBUG_TRIE_EXECUTE_r(
1908                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1909                                 failed = 1;
1910                                 break;
1911                             }
1912                         } while(state);
1913                         uc += len;
1914                         if (failed) {
1915                             if (leftmost)
1916                                 break;
1917                             if (!state) state = 1;
1918                         }
1919                     }
1920                     if ( aho->states[ state ].wordnum ) {
1921                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1922                         if (!leftmost || lpos < leftmost) {
1923                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1924                             leftmost = lpos;
1925                         }
1926                     }
1927                     if (leftmost) {
1928                         s = (char*)leftmost;
1929                         DEBUG_TRIE_EXECUTE_r({
1930                             PerlIO_printf(
1931                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1932                                 (UV)accepted_word, (IV)(s - real_start)
1933                             );
1934                         });
1935                         if (!reginfo || regtry(reginfo, &s)) {
1936                             FREETMPS;
1937                             LEAVE;
1938                             goto got_it;
1939                         }
1940                         s = HOPc(s,1);
1941                         DEBUG_TRIE_EXECUTE_r({
1942                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1943                         });
1944                     } else {
1945                         DEBUG_TRIE_EXECUTE_r(
1946                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1947                         break;
1948                     }
1949                 }
1950                 FREETMPS;
1951                 LEAVE;
1952             }
1953             break;
1954         default:
1955             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1956             break;
1957         }
1958         return 0;
1959       got_it:
1960         return s;
1961 }
1962
1963
1964 /*
1965  - regexec_flags - match a regexp against a string
      *
      * Returns 1 when a match is found and 0 otherwise.  Croaks when the
      * compiled regexp or the start position is NULL, or when the program
      * does not begin with REG_MAGIC.
      *
      * Outline, in the order the code tries things:
      *  1. Fail at once when fewer bytes remain than prog->minlen (the bound
      *     is lowered when prog->check_offset_min is negative).
      *  2. Initialise the PL_reg* globals and reginfo for this attempt.
      *  3. When RXf_GPOS_SEEN is set, compute reginfo.ganch from one of:
      *     startpos + gofs (REXEC_IGNOREPOS), the sv's
      *     PERL_MAGIC_regex_global magic, the data argument, or strbeg.
      *  4. When PL_curpm's regexp is rx, park prog->offs in 'swap' and give
      *     prog a fresh zeroed offsets array, so a failed attempt can be
      *     rolled back without clobbering the previous match's results.
      *  5. Unless REXEC_CHECKED is set, run re_intuit_start() when a check
      *     substring exists; a NULL result means the match fails.
      *  6. Dispatch on the kind of pattern: anchored, RXf_GPOS_CHECK, then
      *     the unanchored cases (PREGf_SKIP single-character scan,
      *     anchored/floating substring search, start class through
      *     find_byclass(), and finally trying every position).
      *
      * On success (label got_it) the parked offsets are freed, the taint flag
      * is recorded and, unless REXEC_NOT_FIRST is set, prog->subbeg/sublen
      * are set up (from a copy of the string when REXEC_COPY_STR is set).
      * On failure (label phooey) prog->offs is restored from 'swap'.
1966  */
1967 I32
1968 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1969               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1970 /* strend: pointer to null at end of string */
1971 /* strbeg: real beginning of string */
1972 /* minend: end of match must be >=minend after stringarg. */
1973 /* data: May be used for some additional optimizations.
1974          Currently it's only used, converted with PTR2UV, for transmitting
1975          the ganch offset when doing a /g match. This will change */
1976 /* flags: REXEC_* bits.  This function itself tests REXEC_IGNOREPOS,
              REXEC_CHECKED, REXEC_SCREAM, REXEC_NOT_FIRST and REXEC_COPY_STR,
              and passes the whole word on to re_intuit_start(). */
1977 {
1978     dVAR;
1979     struct regexp *const prog = (struct regexp *)SvANY(rx);
1980     /*register*/ char *s;
1981     register regnode *c;
1982     /*register*/ char *startpos = stringarg;
1983     I32 minlen;         /* must match at least this many chars */
1984     I32 dontbother = 0; /* how many characters not to try at end */
1985     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1986     I32 scream_pos = -1;                /* Internal iterator of scream. */
1987     char *scream_olds = NULL;
1988     const bool utf8_target = cBOOL(DO_UTF8(sv));
1989     I32 multiline;
1990     RXi_GET_DECL(prog,progi);
1991     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1992     regexp_paren_pair *swap = NULL;
1993     GET_RE_DEBUG_FLAGS_DECL;
1994
1995     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1996     PERL_UNUSED_ARG(data);
         /* NOTE(review): despite the line above, data is read in the
          * RXf_GPOS_SEEN block further down. */
1997
1998     /* Be paranoid... */
1999     if (prog == NULL || startpos == NULL) {
2000         Perl_croak(aTHX_ "NULL regexp parameter");
2001         return 0;
2002     }
2003
2004     multiline = prog->extflags & RXf_PMf_MULTILINE;
2005     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2006
2007     RX_MATCH_UTF8_set(rx, utf8_target);
2008     DEBUG_EXECUTE_r(
2009         debug_start_match(rx, utf8_target, startpos, strend,
2010         "Matching");
2011     );
2012
2013     minlen = prog->minlen;
2014
         /* Cheap rejection: fewer bytes remain than the pattern's minimum
          * length (the bound is relaxed when check_offset_min is negative).
          * NOTE(review): this jumps to phooey before PL_reg_eval_set is
          * reset below, so phooey tests whatever value it held on entry --
          * confirm that is intended. */
2015     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2016         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2017                               "String too short [regexec_flags]...\n"));
2018         goto phooey;
2019     }
2020
2021
2022     /* Check validity of program. */
2023     if (UCHARAT(progi->program) != REG_MAGIC) {
2024         Perl_croak(aTHX_ "corrupted regexp program");
2025     }
2026
2027     PL_reg_flags = 0;
2028     PL_reg_eval_set = 0;
2029     PL_reg_maxiter = 0;
2030
2031     if (RX_UTF8(rx))
2032         PL_reg_flags |= RF_utf8;
2033
2034     /* Mark beginning of line for ^ and lookbehind. */
2035     reginfo.bol = startpos; /* XXX not used ??? */
2036     PL_bostr  = strbeg;
2037     reginfo.sv = sv;
2038
2039     /* Mark end of line for $ (and such) */
2040     PL_regeol = strend;
2041
2042     /* see how far we have to get to not match where we matched before */
2043     reginfo.till = startpos+minend;
2044
2045     /* If there is a "must appear" string, look for it. */
2046     s = startpos;
2047
2048     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2049         MAGIC *mg;
2050         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2051             reginfo.ganch = startpos + prog->gofs;
2052             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2053               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2054         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2055                   && SvMAGIC(sv)
2056                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2057                   && mg->mg_len >= 0) {
2058             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2059             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2060                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2061
                 /* For a pattern anchored at \G the only candidate start is
                  * ganch - gofs; fail if we are already past ganch or if that
                  * candidate would lie before the start of the string. */
2062             if (prog->extflags & RXf_ANCH_GPOS) {
2063                 if (s > reginfo.ganch)
2064                     goto phooey;
2065                 s = reginfo.ganch - prog->gofs;
2066                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2067                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2068                 if (s < strbeg)
2069                     goto phooey;
2070             }
2071         }
2072         else if (data) {
2073             reginfo.ganch = strbeg + PTR2UV(data);
2074             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2075                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2076
2077         } else {                                /* pos() not defined */
2078             reginfo.ganch = strbeg;
2079             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2080                  "GPOS: reginfo.ganch = strbeg\n"));
2081         }
2082     }
2083     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2084         /* We have to be careful. If the previous successful match
2085            was from this regex we don't want a subsequent partially
2086            successful match to clobber the old results.
2087            So when we detect this possibility we add a swap buffer
2088            to the re, and switch the buffer each match. If we fail
2089            we switch it back, otherwise we leave it swapped.
2090         */
2091         swap = prog->offs;
2092         /* do we need a save destructor here for eval dies? */
2093         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2094     }
         /* Let re_intuit_start() pick (or rule out) a starting position, unless
          * the caller says that check was already done (REXEC_CHECKED). */
2095     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2096         re_scream_pos_data d;
2097
2098         d.scream_olds = &scream_olds;
2099         d.scream_pos = &scream_pos;
2100         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2101         if (!s) {
2102             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2103             goto phooey;        /* not present */
2104         }
2105     }
2106
2107
2108
2109     /* Simplest case:  anchored match need be tried only once. */
2110     /*  [unless only anchor is BOL and multiline is set] */
2111     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2112         if (s == startpos && regtry(&reginfo, &startpos))
2113             goto got_it;
2114         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2115                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2116         {
2117             char *end;
2118
2119             if (minlen)
2120                 dontbother = minlen - 1;
2121             end = HOP3c(strend, -dontbother, strbeg) - 1;
2122             /* for multiline we only have to try after newlines */
2123             if (prog->check_substr || prog->check_utf8) {
2124                 /* because of the goto we can not easily reuse the macros for bifurcating the
2125                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
                     /* Both loops below are entered in the middle (at the
                      * after_try_* label) when s == startpos, because that
                      * position was already tried by the regtry() call above. */
2126                 if (utf8_target) {
2127                     if (s == startpos)
2128                         goto after_try_utf8;
2129                     while (1) {
2130                         if (regtry(&reginfo, &s)) {
2131                             goto got_it;
2132                         }
2133                       after_try_utf8:
2134                         if (s > end) {
2135                             goto phooey;
2136                         }
2137                         if (prog->extflags & RXf_USE_INTUIT) {
2138                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2139                             if (!s) {
2140                                 goto phooey;
2141                             }
2142                         }
2143                         else {
2144                             s += UTF8SKIP(s);
2145                         }
2146                     }
2147                 } /* end search for check string in unicode */
2148                 else {
2149                     if (s == startpos) {
2150                         goto after_try_latin;
2151                     }
2152                     while (1) {
2153                         if (regtry(&reginfo, &s)) {
2154                             goto got_it;
2155                         }
2156                       after_try_latin:
2157                         if (s > end) {
2158                             goto phooey;
2159                         }
2160                         if (prog->extflags & RXf_USE_INTUIT) {
2161                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2162                             if (!s) {
2163                                 goto phooey;
2164                             }
2165                         }
2166                         else {
2167                             s++;
2168                         }
2169                     }
2170                 } /* end search for check string in latin*/
2171             } /* end search for check string */
2172             else { /* search for newline */
2173                 if (s > startpos) {
2174                     /*XXX: The s-- is almost definitely wrong here under unicode - demerphq*/
2175                     s--;
2176                 }
2177                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2178                 while (s < end) {
2179                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
                             /* s has already been advanced past the newline,
                              * so the attempt starts right after it */
2180                         if (regtry(&reginfo, &s))
2181                             goto got_it;
2182                     }
2183                 }
2184             } /* end search for newline */
2185         } /* end anchored/multiline check string search */
2186         goto phooey;
2187     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2188     {
2189         /* the warning about reginfo.ganch being used without initialization
2190            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2191            and we only enter this block when the same bit is set. */
2192         char *tmp_s = reginfo.ganch - prog->gofs;
2193
2194         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2195             goto got_it;
2196         goto phooey;
2197     }
2198
2199     /* Messy cases:  unanchored match. */
2200     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2201         /* we have /x+whatever/ */
2202         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2203         char ch;
2204 #ifdef DEBUGGING
2205         int did_match = 0;
2206 #endif
             /* Make sure the substring exists in the encoding of the target
              * before reading its first byte. */
2207         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2208             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2209         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2210
             /* In both scans below, after a failed attempt at a position that
              * starts with ch, the rest of that run of ch is stepped over
              * before scanning resumes. */
2211         if (utf8_target) {
2212             REXEC_FBC_SCAN(
2213                 if (*s == ch) {
2214                     DEBUG_EXECUTE_r( did_match = 1 );
2215                     if (regtry(&reginfo, &s)) goto got_it;
2216                     s += UTF8SKIP(s);
2217                     while (s < strend && *s == ch)
2218                         s += UTF8SKIP(s);
2219                 }
2220             );
2221         }
2222         else {
2223             REXEC_FBC_SCAN(
2224                 if (*s == ch) {
2225                     DEBUG_EXECUTE_r( did_match = 1 );
2226                     if (regtry(&reginfo, &s)) goto got_it;
2227                     s++;
2228                     while (s < strend && *s == ch)
2229                         s++;
2230                 }
2231             );
2232         }
2233         DEBUG_EXECUTE_r(if (!did_match)
2234                 PerlIO_printf(Perl_debug_log,
2235                                   "Did not find anchored character...\n")
2236                );
2237     }
2238     else if (prog->anchored_substr != NULL
2239               || prog->anchored_utf8 != NULL
2240               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2241                   && prog->float_max_offset < strend - s)) {
2242         SV *must;
2243         I32 back_max;
2244         I32 back_min;
2245         char *last;
2246         char *last1;            /* Last position checked before */
2247 #ifdef DEBUGGING
2248         int did_match = 0;
2249 #endif
             /* Pick the substring to search for ('must') and the range of
              * offsets (back_min..back_max) at which it may sit relative to
              * the start of a match. */
2250         if (prog->anchored_substr || prog->anchored_utf8) {
2251             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2252                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2253             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2254             back_max = back_min = prog->anchored_offset;
2255         } else {
2256             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2257                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2258             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2259             back_max = prog->float_max_offset;
2260             back_min = prog->float_min_offset;
2261         }
2262
2263
2264         if (must == &PL_sv_undef)
2265             /* could not downgrade utf8 check substring, so must fail */
2266             goto phooey;
2267
2268         if (back_min<0) {
2269             last = strend;
2270         } else {
2271             last = HOP3c(strend,        /* Cannot start after this */
2272                   -(I32)(CHR_SVLEN(must)
2273                          - (SvTAIL(must) != 0) + back_min), strbeg);
2274         }
2275         if (s > PL_bostr)
2276             last1 = HOPc(s, -1);
2277         else
2278             last1 = s - 1;      /* bogus */
2279
2280         /* XXXX check_substr already used to find "s", can optimize if
2281            check_substr==must. */
2282         scream_pos = -1;
2283         dontbother = end_shift;
2284         strend = HOPc(strend, -dontbother);
             /* Each iteration finds the next occurrence of 'must' (through
              * screaminstr() or fbm_instr()), then tries every start position
              * from which that occurrence is reachable and which was not
              * already covered by an earlier occurrence (tracked in last1). */
2285         while ( (s <= last) &&
2286                 ((flags & REXEC_SCREAM)
2287                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2288                                     end_shift, &scream_pos, 0))
2289                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2290                                   (unsigned char*)strend, must,
2291                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2292             /* we may be pointing at the wrong string */
2293             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2294                 s = strbeg + (s - SvPVX_const(sv));
2295             DEBUG_EXECUTE_r( did_match = 1 );
2296             if (HOPc(s, -back_max) > last1) {
2297                 last1 = HOPc(s, -back_min);
2298                 s = HOPc(s, -back_max);
2299             }
2300             else {
2301                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2302
2303                 last1 = HOPc(s, -back_min);
2304                 s = t;
2305             }
2306             if (utf8_target) {
2307                 while (s <= last1) {
2308                     if (regtry(&reginfo, &s))
2309                         goto got_it;
2310                     s += UTF8SKIP(s);
2311                 }
2312             }
2313             else {
2314                 while (s <= last1) {
2315                     if (regtry(&reginfo, &s))
2316                         goto got_it;
2317                     s++;
2318                 }
2319             }
2320         }
2321         DEBUG_EXECUTE_r(if (!did_match) {
2322             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2323                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2324             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2325                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2326                                ? "anchored" : "floating"),
2327                 quoted, RE_SV_TAIL(must));
2328         });
2329         goto phooey;
2330     }
2331     else if ( (c = progi->regstclass) ) {
             /* The pattern has a start class: let find_byclass() locate
              * candidate positions and try the match at each of them. */
2332         if (minlen) {
2333             const OPCODE op = OP(progi->regstclass);
2334             /* don't bother with what can't match */
2335             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2336                 strend = HOPc(strend, -(minlen - 1));
2337         }
2338         DEBUG_EXECUTE_r({
2339             SV * const prop = sv_newmortal();
2340             regprop(prog, prop, c);
2341             {
2342                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2343                     s,strend-s,60);
2344                 PerlIO_printf(Perl_debug_log,
2345                     "Matching stclass %.*s against %s (%d bytes)\n",
2346                     (int)SvCUR(prop), SvPVX_const(prop),
2347                      quoted, (int)(strend - s));
2348             }
2349         });
2350         if (find_byclass(prog, c, s, strend, &reginfo))
2351             goto got_it;
2352         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2353     }
2354     else {
2355         dontbother = 0;
2356         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2357             /* Trim the end. */
2358             char *last;
2359             SV* float_real;
2360
2361             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2362                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2363             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2364
                 /* Find the last occurrence of the floating substring; start
                  * positions too close to the end to reach it are skipped
                  * through dontbother below. */
2365             if (flags & REXEC_SCREAM) {
2366                 last = screaminstr(sv, float_real, s - strbeg,
2367                                    end_shift, &scream_pos, 1); /* last one */
2368                 if (!last)
2369                     last = scream_olds; /* Only one occurrence. */
2370                 /* we may be pointing at the wrong string */
2371                 else if (RXp_MATCH_COPIED(prog))
2372                     s = strbeg + (s - SvPVX_const(sv));
2373             }
2374             else {
2375                 STRLEN len;
2376                 const char * const little = SvPV_const(float_real, len);
2377
2378                 if (SvTAIL(float_real)) {
2379                     if (memEQ(strend - len + 1, little, len - 1))
2380                         last = strend - len + 1;
2381                     else if (!multiline)
2382                         last = memEQ(strend - len, little, len)
2383                             ? strend - len : NULL;
2384                     else
2385                         goto find_last;
2386                 } else {
2387                   find_last:
2388                     if (len)
2389                         last = rninstr(s, strend, little, little + len);
2390                     else
2391                         last = strend;  /* matching "$" */
2392                 }
2393             }
2394             if (last == NULL) {
2395                 DEBUG_EXECUTE_r(
2396                     PerlIO_printf(Perl_debug_log,
2397                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2398                         PL_colors[4], PL_colors[5]));
2399                 goto phooey; /* Should not happen! */
2400             }
2401             dontbother = strend - last + prog->float_min_offset;
2402         }
2403         if (minlen && (dontbother < minlen))
2404             dontbother = minlen - 1;
2405         strend -= dontbother;              /* this one's always in bytes! */
2406         /* We don't know much -- general case. */
2407         if (utf8_target) {
2408             for (;;) {
2409                 if (regtry(&reginfo, &s))
2410                     goto got_it;
2411                 if (s >= strend)
2412                     break;
2413                 s += UTF8SKIP(s);
2414             };
2415         }
2416         else {
2417             do {
2418                 if (regtry(&reginfo, &s))
2419                     goto got_it;
2420             } while (s++ < strend);
2421         }
2422     }
2423
2424     /* Failure. */
2425     goto phooey;
2426
2427 got_it:
         /* Success: the offsets parked in 'swap' (if any) are no longer
          * needed, because prog->offs now holds the results of this match. */
2428     Safefree(swap);
2429     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2430
2431     if (PL_reg_eval_set)
2432         restore_pos(aTHX_ prog);
2433     if (RXp_PAREN_NAMES(prog))
2434         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2435
2436     /* make sure $`, $&, $', and $digit will work later */
2437     if ( !(flags & REXEC_NOT_FIRST) ) {
2438         RX_MATCH_COPY_FREE(rx);
2439         if (flags & REXEC_COPY_STR) {
                 /* bytes to keep: from startpos to the original end of the
                  * string (PL_regeol), plus whatever precedes stringarg */
2440             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2441 #ifdef PERL_OLD_COPY_ON_WRITE
2442             if ((SvIsCOW(sv)
2443                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2444                 if (DEBUG_C_TEST) {
2445                     PerlIO_printf(Perl_debug_log,
2446                                   "Copy on write: regexp capture, type %d\n",
2447                                   (int) SvTYPE(sv));
2448                 }
2449                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2450                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2451                 assert (SvPOKp(prog->saved_copy));
2452             } else
2453 #endif
2454             {
2455                 RX_MATCH_COPIED_on(rx);
2456                 s = savepvn(strbeg, i);
2457                 prog->subbeg = s;
2458             }
2459             prog->sublen = i;
2460         }
2461         else {
2462             prog->subbeg = strbeg;
2463             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2464         }
2465     }
2466
2467     return 1;
2468
2469 phooey:
         /* Failure: undo the pos() changes made for code blocks and put back
          * the offsets of the previous successful match. */
2470     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2471                           PL_colors[4], PL_colors[5]));
2472     if (PL_reg_eval_set)
2473         restore_pos(aTHX_ prog);
2474     if (swap) {
2475         /* we failed :-( roll it back */
2476         Safefree(prog->offs);
2477         prog->offs = swap;
2478     }
2479
2480     return 0;
2481 }
2482
2483
2484 /*
2485  - regtry - try match at specific point
      *
      * Attempts one match of reginfo->prog starting at *startpos by running
      * regmatch() from progi->program + 1.
      *
      * Returns 1 on success, after storing the end of the match in
      * PL_regoffs[0].end.  Returns 0 on failure, after unwinding to the
      * checkpoint taken with REGCP_SET; in that case, if reginfo->cutpoint
      * was set during the attempt, *startpos is overwritten with it.
      *
      * When the pattern has RXf_EVAL_SEEN set and PL_reg_eval_set is still
      * zero, the large block below first prepares the environment for
      * embedded code: it saves stack/tmps state, makes reginfo->sv the
      * current $_, finds or attaches PERL_MAGIC_regex_global magic on it,
      * and switches PL_curpm to PL_reg_curpm.  PL_reg_eval_set is set there,
      * so that block does not run again while it stays set.
2486  */
2487 STATIC I32                      /* 0 failure, 1 success */
2488 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2489 {
2490     dVAR;
2491     CHECKPOINT lastcp;
2492     REGEXP *const rx = reginfo->prog;
2493     regexp *const prog = (struct regexp *)SvANY(rx);
2494     RXi_GET_DECL(prog,progi);
2495     GET_RE_DEBUG_FLAGS_DECL;
2496
2497     PERL_ARGS_ASSERT_REGTRY;
2498
2499     reginfo->cutpoint=NULL;
2500
2501     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2502         MAGIC *mg;
2503
2504         PL_reg_eval_set = RS_init;
2505         DEBUG_EXECUTE_r(DEBUG_s(
2506             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2507                           (IV)(PL_stack_sp - PL_stack_base));
2508             ));
2509         SAVESTACK_CXPOS();
2510         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2511         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2512         SAVETMPS;
2513         /* Apparently this is not needed, judging by wantarray. */
2514         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2515            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2516
2517         if (reginfo->sv) {
2518             /* Make $_ available to executed code. */
2519             if (reginfo->sv != DEFSV) {
2520                 SAVE_DEFSV;
2521                 DEFSV_set(reginfo->sv);
2522             }
2523
                 /* Reuse the sv's existing regex_global magic if it has any;
                  * otherwise attach a new one with mg_len = -1. */
2524             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2525                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2526                 /* prepare for quick setting of pos */
2527 #ifdef PERL_OLD_COPY_ON_WRITE
2528                 if (SvIsCOW(reginfo->sv))
2529                     sv_force_normal_flags(reginfo->sv, 0);
2530 #endif
2531                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2532                                  &PL_vtbl_mglob, NULL, 0);
2533                 mg->mg_len = -1;
2534             }
                 /* Remember the magic and its current length, and register
                  * restore_pos as a save-stack destructor for this scope. */
2535             PL_reg_magic    = mg;
2536             PL_reg_oldpos   = mg->mg_len;
2537             SAVEDESTRUCTOR_X(restore_pos, prog);
2538         }
             /* PL_reg_curpm is allocated once, on first use. */
2539         if (!PL_reg_curpm) {
2540             Newxz(PL_reg_curpm, 1, PMOP);
2541 #ifdef USE_ITHREADS
2542             {
2543                 SV* const repointer = &PL_sv_undef;
2544                 /* this regexp is also owned by the new PL_reg_curpm, which
2545                    will try to free it.  */
2546                 av_push(PL_regex_padav, repointer);
2547                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2548                 PL_regex_pad = AvARRAY(PL_regex_padav);
2549             }
2550 #endif
2551         }
2552 #ifdef USE_ITHREADS
2553         /* It seems that non-ithreads works both with and without this code.
2554            So for efficiency reasons it seems best not to have the code
2555            compiled when it is not needed.  */
2556         /* This is safe against NULLs: */
2557         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2558         /* PM_reg_curpm owns a reference to this regexp.  */
2559         (void)ReREFCNT_inc(rx);
2560 #endif
             /* Point PL_reg_curpm at this regexp and make it the current PMOP,
              * remembering the previous one in PL_reg_oldcurpm. */
2561         PM_SETRE(PL_reg_curpm, rx);
2562         PL_reg_oldcurpm = PL_curpm;
2563         PL_curpm = PL_reg_curpm;
2564         if (RXp_MATCH_COPIED(prog)) {
2565             /*  Here is a serious problem: we cannot rewrite subbeg,
2566                 since it may be needed if this match fails.  Thus
2567                 $` inside (?{}) could fail... */
2568             PL_reg_oldsaved = prog->subbeg;
2569             PL_reg_oldsavedlen = prog->sublen;
2570 #ifdef PERL_OLD_COPY_ON_WRITE
2571             PL_nrs = prog->saved_copy;
2572 #endif
2573             RXp_MATCH_COPIED_off(prog);
2574         }
2575         else
2576             PL_reg_oldsaved = NULL;
2577         prog->subbeg = PL_bostr;
2578         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2579     }
         /* Per-attempt state: record where this attempt starts and reset the
          * paren bookkeeping that regmatch() updates through the PL_reg*
          * pointers. */
2580     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2581     prog->offs[0].start = *startpos - PL_bostr;
2582     PL_reginput = *startpos;
2583     PL_reglastparen = &prog->lastparen;
2584     PL_reglastcloseparen = &prog->lastcloseparen;
2585     prog->lastparen = 0;
2586     prog->lastcloseparen = 0;
2587     PL_regsize = 0;
2588     PL_regoffs = prog->offs;
         /* Make sure PL_reg_start_tmp has more than nparens entries; when it
          * is too small the new size is nparens*3/2 + 3. */
2589     if (PL_reg_start_tmpl <= prog->nparens) {
2590         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2591         if(PL_reg_start_tmp)
2592             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2593         else
2594             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2595     }
2596
2597     /* XXXX What this code is doing here?!!!  There should be no need
2598        to do this again and again, PL_reglastparen should take care of
2599        this!  --ilya*/
2600
2601     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2602      * Actually, the code in regcppop() (which Ilya may be meaning by
2603      * PL_reglastparen), is not needed at all by the test suite
2604      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2605      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2606      * Meanwhile, this code *is* needed for the
2607      * above-mentioned test suite tests to succeed.  The common theme
2608      * on those tests seems to be returning null fields from matches.
2609      * --jhi updated by dapm */
2610 #if 1
2611     if (prog->nparens) {
2612         regexp_paren_pair *pp = PL_regoffs;
2613         register I32 i;
             /* prog->lastparen was zeroed above and PL_reglastparen points at
              * it, so this visits pp[1] .. pp[nparens] and marks every capture
              * pair as unset (-1). */
2614         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2615             ++pp;
2616             pp->start = -1;
2617             pp->end = -1;
2618         }
2619     }
2620 #endif
2621     REGCP_SET(lastcp);
2622     if (regmatch(reginfo, progi->program + 1)) {
2623         PL_regoffs[0].end = PL_reginput - PL_bostr;
2624         return 1;
2625     }
         /* Failure: a cutpoint recorded during the attempt is handed back to
          * the caller through *startpos before the checkpoint is unwound. */
2626     if (reginfo->cutpoint)
2627         *startpos= reginfo->cutpoint;
2628     REGCP_UNWIND(lastcp);
2629     return 0;
2630 }
2631
2632
     /* Shorthand exits: each expands to a goto to a label named yes, no or
      * no_silent.  Those labels are not defined in this part of the file --
      * presumably they live in regmatch(); confirm there. */
2633 #define sayYES goto yes
2634 #define sayNO goto no
2635 #define sayNO_SILENT goto no_silent
2636
2637 /* we don't use STMT_START/END here because it leads to
2638    "unreachable code" warnings, which are bogus, but distracting. */
     /* CACHEsayNO ORs ST.cache_mask into PL_reg_poscache[ST.cache_offset] when
      * the mask is non-zero, then does sayNO.  Because the expansion is an
      * `if` statement followed by a separate goto, not a single statement,
      * it must not be used as the unbraced body of an if/else or loop. */
2639 #define CACHEsayNO \
2640     if (ST.cache_mask) \
2641        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2642     sayNO
2643
2644 /* this is used to determine how far from the left messages like
2645    'failed...' are printed. It should be set such that messages
2646    are inline with the regop output that created them.
2647 */
2648 #define REPORT_CODE_OFF 32
2649
2650
     /* Sentinel values for the c1/c2 "next char" test; both are negative, so
      * they differ from any non-negative character value. */
2651 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2652 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2653
     /* Addresses of the first and last state slots in a slab; a slab's states
      * array is indexed 0 .. PERL_REGMATCH_SLAB_SLOTS-1 here. */
2654 #define SLAB_FIRST(s) (&(s)->states[0])
2655 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2656
2657 /* grab a new slab and return the first slot in it */
     /* Moves PL_regmatch_slab on to the slab that follows the current one.
      * A slab that is already linked there is reused as it is; only when
      * there is none is a new slab allocated with Newx and linked in after
      * the current slab (its prev set to the current slab, its next to NULL).
      * Returns the address of the first state slot of the slab that has just
      * become current.  The slots themselves are not initialised here. */
2658
2659 STATIC regmatch_state *
2660 S_push_slab(pTHX)
2661 {
2662 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2663     dMY_CXT;
2664 #endif
2665     regmatch_slab *s = PL_regmatch_slab->next;
2666     if (!s) {
             /* end of the chain: allocate a slab and append it */
2667         Newx(s, 1, regmatch_slab);
2668         s->prev = PL_regmatch_slab;
2669         s->next = NULL;
2670         PL_regmatch_slab->next = s;
2671     }
2672     PL_regmatch_slab = s;
2673     return SLAB_FIRST(s);
2674 }
2675
2676
2677 /* push a new state then goto it:
        records 'node' in scan and 'state' in st->resume_state, then jumps
        to the push_state label.  The expansion is three statements and
        already ends in a semicolon, so it must not be used as the
        unbraced body of an if/else/loop. */
2678
2679 #define PUSH_STATE_GOTO(state, node) \
2680     scan = node; \
2681     st->resume_state = state; \
2682     goto push_state;
2683
2684 /* push a new state with success backtracking, then goto it:
        identical to PUSH_STATE_GOTO except that it jumps to the
        push_yes_state label.  The same multi-statement caveat applies. */
2685
2686 #define PUSH_YES_STATE_GOTO(state, node) \
2687     scan = node; \
2688     st->resume_state = state; \
2689     goto push_yes_state;
2690
2691
2692
2693 /*
2694
2695 regmatch() - main matching routine
2696
2697 This is basically one big switch statement in a loop. We execute an op,
2698 set 'next' to point to the next op, and continue. If we come to a point which
2699 we may need to backtrack to on failure such as (A|B|C), we push a
2700 backtrack state onto the backtrack stack. On failure, we pop the top
2701 state, and re-enter the loop at the state indicated. If there are no more
2702 states to pop, we return failure.
2703
2704 Sometimes we also need to backtrack on success; for example /A+/, where
2705 after successfully matching one A, we need to go back and try to
2706 match another one; similarly for lookahead assertions: if the assertion
2707 completes successfully, we backtrack to the state just before the assertion
2708 and then carry on.  In these cases, the pushed state is marked as
2709 'backtrack on success too'. This marking is in fact done by a chain of
2710 pointers, each pointing to the previous 'yes' state. On success, we pop to
2711 the nearest yes state, discarding any intermediate failure-only states.
2712 Sometimes a yes state is pushed just to force some cleanup code to be
2713 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2714 it to free the inner regex.
2715
2716 Note that failure backtracking rewinds the cursor position, while
2717 success backtracking leaves it alone.
2718
2719 A pattern is complete when the END op is executed, while a subpattern
2720 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2721 ops trigger the "pop to last yes state if any, otherwise return true"
2722 behaviour.
2723
2724 A common convention in this function is to use A and B to refer to the two
2725 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2726 the subpattern to be matched possibly multiple times, while B is the entire
2727 rest of the pattern. Variable and state names reflect this convention.
2728
2729 The states in the main switch are the union of ops and failure/success of
2730 substates associated with that op.  For example, IFMATCH is the op
2731 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2732 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2733 successfully matched A and IFMATCH_A_fail is a state saying that we have
2734 just failed to match A. Resume states always come in pairs. The backtrack
2735 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2736 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2737 on success or failure.
2738
2739 The struct that holds a backtracking state is actually a big union, with
2740 one variant for each major type of op. The variable st points to the
2741 top-most backtrack struct. To make the code clearer, within each
2742 block of code we #define ST to alias the relevant union.
2743
2744 Here's a concrete example of a (vastly oversimplified) IFMATCH
2745 implementation:
2746
2747     switch (state) {
2748     ....
2749
2750 #define ST st->u.ifmatch
2751
2752     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2753         ST.foo = ...; // some state we wish to save
2754         ...
2755         // push a yes backtrack state with a resume value of
2756         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2757         // first node of A:
2758         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2759         // NOTREACHED
2760
2761     case IFMATCH_A: // we have successfully executed A; now continue with B
2762         next = B;
2763         bar = ST.foo; // do something with the preserved value
2764         break;
2765
2766     case IFMATCH_A_fail: // A failed, so the assertion failed
2767         ...;   // do some housekeeping, then ...
2768         sayNO; // propagate the failure
2769
2770 #undef ST
2771
2772     ...
2773     }
2774
2775 For any old-timers reading this who are familiar with the old recursive
2776 approach, the code above is equivalent to:
2777
2778     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2779     {
2780         int foo = ...
2781         ...
2782         if (regmatch(A)) {
2783             next = B;
2784             bar = foo;
2785             break;
2786         }
2787         ...;   // do some housekeeping, then ...
2788         sayNO; // propagate the failure
2789     }
2790
2791 The topmost backtrack state, pointed to by st, is usually free. If you
2792 want to claim it, populate any ST.foo fields in it with values you wish to
2793 save, then do one of
2794
2795         PUSH_STATE_GOTO(resume_state, node);
2796         PUSH_YES_STATE_GOTO(resume_state, node);
2797
2798 which sets that backtrack state's resume value to 'resume_state', pushes a
2799 new free entry to the top of the backtrack stack, then goes to 'node'.
2800 On backtracking, the free slot is popped, and the saved state becomes the
2801 new free state. An ST.foo field in this new top state can be temporarily
2802 accessed to retrieve values, but once the main loop is re-entered, it
2803 becomes available for reuse.
2804
2805 Note that the depth of the backtrack stack constantly increases during the
2806 left-to-right execution of the pattern, rather than going up and down with
2807 the pattern nesting. For example the stack is at its maximum at Z at the
2808 end of the pattern, rather than at X in the following:
2809
2810     /(((X)+)+)+....(Y)+....Z/
2811
2812 The only exceptions to this are lookahead/behind assertions and the cut,
2813 (?>A), which pop all the backtrack states associated with A before
2814 continuing.
2815
2816 Backtrack state structs are allocated in slabs of about 4K in size.
2817 PL_regmatch_state and st always point to the currently active state,
2818 and PL_regmatch_slab points to the slab currently containing
2819 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2820 allocated, and is never freed until interpreter destruction. When the slab
2821 is full, a new one is allocated and chained to the end. At exit from
2822 regmatch(), slabs allocated since entry are freed.
2823
2824 */
2825
2826
/* DEBUG_STATE_pp(pp): inside DEBUG_STATE_r, dump the current execution
 * position and then print one line made of: depth*2 columns of
 * indentation, the text pp, the PL_reg_name[] entry for
 * st->resume_state, and a "[Y]", "[M]" or "[YM]" tag when st is the
 * current yes_state and/or mark_state.
 * pp is pasted between two string literals, so it must itself be a
 * string literal.  The expansion refers to locinput, scan, utf8_target,
 * depth, st, yes_state and mark_state, which must all be in scope at the
 * point of use. */
2827 #define DEBUG_STATE_pp(pp)                                  \
2828     DEBUG_STATE_r({                                         \
2829         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2830         PerlIO_printf(Perl_debug_log,                       \
2831             "    %*s"pp" %s%s%s%s%s\n",                     \
2832             depth*2, "",                                    \
2833             PL_reg_name[st->resume_state],                     \
2834             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2835             ((st==yes_state) ? "Y" : ""),                   \
2836             ((st==mark_state) ? "M" : ""),                  \
2837             ((st==yes_state||st==mark_state) ? "]" : "")    \
2838         );                                                  \
2839     });
2840
2841
/* REG_NODE_NUM(x): distance of x from 'prog' (a name that must be in
 * scope where the macro is used) as an int, or -1 when x is NULL. */
2842 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2843
2844 #ifdef DEBUGGING
2845
2846 STATIC void
2847 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2848     const char *start, const char *end, const char *blurb)
2849 {
2850     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2851
2852     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2853
2854     if (!PL_colorset)
2855             reginitcolors();
2856     {
2857         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2858             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2859
2860         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2861             start, end - start, 60);
2862
2863         PerlIO_printf(Perl_debug_log,
2864             "%s%s REx%s %s against %s\n",
2865                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2866
2867         if (utf8_target||utf8_pat)
2868             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2869                 utf8_pat ? "pattern" : "",
2870                 utf8_pat && utf8_target ? " and " : "",
2871                 utf8_target ? "string" : ""
2872             );
2873     }
2874 }
2875
/* S_dump_exec_pos()
 * Debugging helper: print to Perl_debug_log a one-line picture of the
 * current match position.  The line holds the offset of locinput from
 * loc_bostr, then '<', some text preceding locinput, some text
 * following it, '>', padding, and a final '|'.  No newline is written.
 *
 *   locinput          position being displayed
 *   scan              current regnode; only its opcode is inspected
 *   loc_regeol        upper bound of the text shown after locinput
 *   loc_bostr         lower bound of the text shown before locinput, and
 *                     the origin of the printed offset
 *   loc_reg_starttry  where the current attempt started; text before it
 *                     and text after it are rendered with different
 *                     colour pairs
 *   utf8_target       when true, lengths are adjusted so that the shown
 *                     pieces do not begin on a UTF-8 continuation byte
 */
2876 STATIC void
2877 S_dump_exec_pos(pTHX_ const char *locinput,
2878                       const regnode *scan,
2879                       const char *loc_regeol,
2880                       const char *loc_bostr,
2881                       const char *loc_reg_starttry,
2882                       const bool utf8_target)
2883 {
    /* colouring is considered on if any of these colour strings is
       non-empty */
2884     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2885     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
    /* l: bytes after locinput budgeted for display, at most taill */
2886     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2887     /* The part of the string before starttry has one color
2888        (pref0_len chars), between starttry and current
2889        position another one (pref_len - pref0_len chars),
2890        after the current position the third one.
2891        We assume that pref0_len <= pref_len, otherwise we
2892        decrease pref0_len.  */
    /* pref_len: bytes before locinput to show; whatever is left of a
       (5 + taill) budget after l, but never reaching before loc_bostr */
2893     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2894         ? (5 + taill) - l : locinput - loc_bostr;
2895     int pref0_len;
2896
2897     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2898
    /* widen the prefix until it no longer starts on a continuation byte.
       NOTE(review): this loop has no lower bound of its own; it relies
       on the target not having continuation bytes at its very start */
2899     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2900         pref_len++;
2901     pref0_len = pref_len  - (locinput - loc_reg_starttry);
    /* if the prefix did not use up its share, give the remainder of the
       budget to the tail */
2902     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2903         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2904               ? (5 + taill) - pref_len : loc_regeol - locinput);
    /* shorten the tail until it no longer ends just before a
       continuation byte */
2905     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2906         l--;
    /* clamp pref0_len into [0, pref_len] */
2907     if (pref0_len < 0)
2908         pref0_len = 0;
2909     if (pref0_len > pref_len)
2910         pref0_len = pref_len;
2911     {
        /* the pieces are rendered as Unicode only for a UTF-8 target
           whose current op is not CANY */
2912         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2913
        /* s0: shown text that lies before loc_reg_starttry */
2914         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2915             (locinput - pref_len),pref0_len, 60, 4, 5);
2916
        /* s1: shown text between loc_reg_starttry and locinput */
2917         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2918                     (locinput - pref_len + pref0_len),
2919                     pref_len - pref0_len, 60, 2, 3);
2920
        /* s2: text from locinput onwards.  Note that this is given the
           whole remaining length with a separate limit of 10, rather
           than l */
2921         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2922                     locinput, loc_regeol - locinput, 10, 0, 1);
2923
2924         const STRLEN tlen=len0+len1+len2;
        /* "> <" marks the current position only when colouring is off;
           the trailing %*s pads the three pieces out to 19 columns */
2925         PerlIO_printf(Perl_debug_log,
2926                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2927                     (IV)(locinput - loc_bostr),
2928                     len0, s0,
2929                     len1, s1,
2930                     (docolor ? "" : "> <"),
2931                     len2, s2,
2932                     (int)(tlen > 19 ? 0 :  19 - tlen),
2933                     "");
2934     }
2935 }
2936
2937 #endif
2938
2939 /* reg_check_named_buff_matched()
2940  * Checks to see if a named buffer has matched. The data array of
2941  * buffer numbers corresponding to the buffer is expected to reside
2942  * in the regexp->data->data array in the slot stored in the ARG() of
2943  * node involved. Note that this routine doesn't actually care about the
2944  * name, that information is not preserved from compilation to execution.
2945  * Returns the index of the leftmost defined buffer with the given name
2946  * or 0 if non of the buffers matched.
2947  */
2948 STATIC I32
2949 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2950 {
2951     I32 n;
2952     RXi_GET_DECL(rex,rexi);
2953     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2954     I32 *nums=(I32*)SvPVX(sv_dat);
2955
2956     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2957
2958     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2959         if ((I32)*PL_reglastparen >= nums[n] &&
2960             PL_regoffs[nums[n]].end != -1)
2961         {
2962             return nums[n];
2963         }
2964     }
2965     return 0;
2966 }
2967
2968
2969 /* free all slabs above current one  - called during LEAVE_SCOPE */
2970
2971 STATIC void
2972 S_clear_backtrack_stack(pTHX_ void *p)
2973 {
2974     regmatch_slab *s = PL_regmatch_slab->next;
2975     PERL_UNUSED_ARG(p);
2976
2977     if (!s)
2978         return;
2979     PL_regmatch_slab->next = NULL;
2980     while (s) {
2981         regmatch_slab * const osl = s;
2982         s = s->next;
2983         Safefree(osl);
2984     }
2985 }
2986
2987
/* SETREX(Re1,Re2): assign Re2 to Re1; additionally, when PL_reg_eval_set
 * is true, first install Re2 on PL_reg_curpm via PM_SETRE.
 * The expansion is an 'if' statement followed by a separate assignment,
 * so only the PM_SETRE part is conditional, and the macro must not be
 * used as the unbraced body of an if/else/loop.  Re2 is evaluated more
 * than once. */
2988 #define SETREX(Re1,Re2) \
2989     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2990     Re1 = (Re2)
2991
2992 STATIC I32                      /* 0 failure, 1 success */
2993 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2994 {
2995 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2996     dMY_CXT;
2997 #endif
2998     dVAR;
2999     register const bool utf8_target = PL_reg_match_utf8;
3000     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3001     REGEXP *rex_sv = reginfo->prog;
3002     regexp *rex = (struct regexp *)SvANY(rex_sv);
3003     RXi_GET_DECL(rex,rexi);
3004     I32 oldsave;
3005     /* the current state. This is a cached copy of PL_regmatch_state */
3006     register regmatch_state *st;
3007     /* cache heavily used fields of st in registers */
3008     register regnode *scan;
3009     register regnode *next;
3010     register U32 n = 0; /* general value; init to avoid compiler warning */
3011     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3012     register char *locinput = PL_reginput;
3013     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3014
3015     bool result = 0;        /* return value of S_regmatch */
3016     int depth = 0;          /* depth of backtrack stack */
3017     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3018     const U32 max_nochange_depth =
3019         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3020         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3021     regmatch_state *yes_state = NULL; /* state to pop to on success of
3022                                                             subpattern */
3023     /* mark_state piggy backs on the yes_state logic so that when we unwind
3024        the stack on success we can update the mark_state as we go */
3025     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3026     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3027     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3028     U32 state_num;
3029     bool no_final = 0;      /* prevent failure from backtracking? */
3030     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3031     char *startpoint = PL_reginput;
3032     SV *popmark = NULL;     /* are we looking for a mark? */
3033     SV *sv_commit = NULL;   /* last mark name seen in failure */
3034     SV *sv_yes_mark = NULL; /* last mark name we have seen
3035                                during a successful match */
3036     U32 lastopen = 0;       /* last open we saw */
3037     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3038     SV* const oreplsv = GvSV(PL_replgv);
3039     /* these three flags are set by various ops to signal information to
3040      * the very next op. They have a useful lifetime of exactly one loop
3041      * iteration, and are not preserved or restored by state pushes/pops
3042      */
3043     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3044     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3045     int logical = 0;        /* the following EVAL is:
3046                                 0: (?{...})
3047                                 1: (?(?{...})X|Y)
3048                                 2: (??{...})
3049                                or the following IFMATCH/UNLESSM is:
3050                                 false: plain (?=foo)
3051                                 true:  used as a condition: (?(?=foo))
3052                             */
3053 #ifdef DEBUGGING
3054     GET_RE_DEBUG_FLAGS_DECL;
3055 #endif
3056
3057     PERL_ARGS_ASSERT_REGMATCH;
3058
3059     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3060             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3061     }));
3062     /* on first ever call to regmatch, allocate first slab */
3063     if (!PL_regmatch_slab) {
3064         Newx(PL_regmatch_slab, 1, regmatch_slab);
3065         PL_regmatch_slab->prev = NULL;
3066         PL_regmatch_slab->next = NULL;
3067         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3068     }
3069
3070     oldsave = PL_savestack_ix;
3071     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3072     SAVEVPTR(PL_regmatch_slab);
3073     SAVEVPTR(PL_regmatch_state);
3074
3075     /* grab next free state slot */
3076     st = ++PL_regmatch_state;
3077     if (st >  SLAB_LAST(PL_regmatch_slab))
3078         st = PL_regmatch_state = S_push_slab(aTHX);
3079
3080     /* Note that nextchr is a byte even in UTF */
3081     nextchr = UCHARAT(locinput);
3082     scan = prog;
3083     while (scan != NULL) {
3084
3085         DEBUG_EXECUTE_r( {
3086             SV * const prop = sv_newmortal();
3087             regnode *rnext=regnext(scan);
3088             DUMP_EXEC_POS( locinput, scan, utf8_target );
3089             regprop(rex, prop, scan);
3090
3091             PerlIO_printf(Perl_debug_log,
3092                     "%3"IVdf":%*s%s(%"IVdf")\n",
3093                     (IV)(scan - rexi->program), depth*2, "",
3094                     SvPVX_const(prop),
3095                     (PL_regkind[OP(scan)] == END || !rnext) ?
3096                         0 : (IV)(rnext - rexi->program));
3097         });
3098
3099         next = scan + NEXT_OFF(scan);
3100         if (next == scan)
3101             next = NULL;
3102         state_num = OP(scan);
3103
3104       reenter_switch:
3105
3106         assert(PL_reglastparen == &rex->lastparen);
3107         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3108         assert(PL_regoffs == rex->offs);
3109
3110         switch (state_num) {
3111         case BOL:
3112             if (locinput == PL_bostr)
3113             {
3114                 /* reginfo->till = reginfo->bol; */
3115                 break;
3116             }
3117             sayNO;
3118         case MBOL:
3119             if (locinput == PL_bostr ||
3120                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3121             {
3122                 break;
3123             }
3124             sayNO;
3125         case SBOL:
3126             if (locinput == PL_bostr)
3127                 break;
3128             sayNO;
3129         case GPOS:
3130             if (locinput == reginfo->ganch)
3131                 break;
3132             sayNO;
3133
3134         case KEEPS:
3135             /* update the startpoint */
3136             st->u.keeper.val = PL_regoffs[0].start;
3137             PL_reginput = locinput;
3138             PL_regoffs[0].start = locinput - PL_bostr;
3139             PUSH_STATE_GOTO(KEEPS_next, next);
3140             /*NOT-REACHED*/
3141         case KEEPS_next_fail:
3142             /* rollback the start point change */
3143             PL_regoffs[0].start = st->u.keeper.val;
3144             sayNO_SILENT;
3145             /*NOT-REACHED*/
3146         case EOL:
3147                 goto seol;
3148         case MEOL:
3149             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3150                 sayNO;
3151             break;
3152         case SEOL:
3153           seol:
3154             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3155                 sayNO;
3156             if (PL_regeol - locinput > 1)
3157                 sayNO;
3158             break;
3159         case EOS:
3160             if (PL_regeol != locinput)
3161                 sayNO;
3162             break;
3163         case SANY:
3164             if (!nextchr && locinput >= PL_regeol)
3165                 sayNO;
3166             if (utf8_target) {
3167                 locinput += PL_utf8skip[nextchr];
3168                 if (locinput > PL_regeol)
3169                     sayNO;
3170                 nextchr = UCHARAT(locinput);
3171             }
3172             else
3173                 nextchr = UCHARAT(++locinput);
3174             break;
3175         case CANY:
3176             if (!nextchr && locinput >= PL_regeol)
3177                 sayNO;
3178             nextchr = UCHARAT(++locinput);
3179             break;
3180         case REG_ANY:
3181             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3182                 sayNO;
3183             if (utf8_target) {
3184                 locinput += PL_utf8skip[nextchr];
3185                 if (locinput > PL_regeol)
3186                     sayNO;
3187                 nextchr = UCHARAT(locinput);
3188             }
3189             else
3190                 nextchr = UCHARAT(++locinput);
3191             break;
3192
3193 #undef  ST
3194 #define ST st->u.trie
3195         case TRIEC:
3196             /* In this case the charclass data is available inline so
3197                we can fail fast without a lot of extra overhead.
3198              */
3199             if (scan->flags == EXACT || !utf8_target) {
3200                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3201                     DEBUG_EXECUTE_r(
3202                         PerlIO_printf(Perl_debug_log,
3203                                   "%*s  %sfailed to match trie start class...%s\n",
3204                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3205                     );
3206                     sayNO_SILENT;
3207                     /* NOTREACHED */
3208                 }
3209             }
3210             /* FALL THROUGH */
3211         case TRIE:
3212             /* the basic plan of execution of the trie is:
3213              * At the beginning, run through all the states, and
3214              * find the longest-matching word. Also remember the position
3215              * of the shortest matching word. For example, this pattern:
3216              *    1  2 3 4    5
3217              *    ab|a|x|abcd|abc
3218              * when matched against the string "abcde", will generate
3219              * accept states for all words except 3, with the longest
3220              * matching word being 4, and the shortest being 1 (with
3221              * the position being after char 1 of the string).
3222              *
3223              * Then for each matching word, in word order (i.e. 1,2,4,5),
3224              * we run the remainder of the pattern; on each try setting
3225              * the current position to the character following the word,
3226              * returning to try the next word on failure.
3227              *
3228              * We avoid having to build a list of words at runtime by
3229              * using a compile-time structure, wordinfo[].prev, which
3230              * gives, for each word, the previous accepting word (if any).
3231              * In the case above it would contain the mappings 1->2, 2->0,
3232              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3233              * the longest word (4 above), a list of all words, by
3234              * following the list of prev pointers; this gives us the
3235              * unordered list 4,5,1,2. Then given the current word we have
3236              * just tried, we can go through the list and find the
3237              * next-biggest word to try (so if we just failed on word 2,
3238              * the next in the list is 4).
3239              *
3240              * Since at runtime we don't record the matching position in
3241              * the string for each word, we have to work that out for
3242              * each word we're about to process. The wordinfo table holds
3243              * the character length of each word; given that we recorded
3244              * at the start: the position of the shortest word and its
3245              * length in chars, we just need to move the pointer the
3246              * difference between the two char lengths. Depending on
3247              * Unicode status and folding, that's cheap or expensive.
3248              *
3249              * This algorithm is optimised for the case where there are only a
3250              * small number of accept states, i.e. 0,1, or maybe 2.
3251              * With lots of accepts states, and having to try all of them,
3252              * it becomes quadratic on number of accept states to find all
3253              * the next words.
3254              */
3255
3256             {
3257                 /* what type of TRIE am I? (utf8 makes this contextual) */
3258                 DECL_TRIE_TYPE(scan);
3259
3260                 /* what trie are we using right now */
3261                 reg_trie_data * const trie
3262                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3263                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3264                 U32 state = trie->startstate;
3265
3266                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3267                     !TRIE_BITMAP_TEST(trie,*locinput)
3268                 ) {
3269                     if (trie->states[ state ].wordnum) {
3270                          DEBUG_EXECUTE_r(
3271                             PerlIO_printf(Perl_debug_log,
3272                                           "%*s  %smatched empty string...%s\n",
3273                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3274                         );
3275                         if (!trie->jump)
3276                             break;
3277                     } else {
3278                         DEBUG_EXECUTE_r(
3279                             PerlIO_printf(Perl_debug_log,
3280                                           "%*s  %sfailed to match trie start class...%s\n",
3281                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3282                         );
3283                         sayNO_SILENT;
3284                    }
3285                 }
3286
3287             {
3288                 U8 *uc = ( U8* )locinput;
3289
3290                 STRLEN len = 0;
3291                 STRLEN foldlen = 0;
3292                 U8 *uscan = (U8*)NULL;
3293                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3294                 U32 charcount = 0; /* how many input chars we have matched */
3295                 U32 accepted = 0; /* have we seen any accepting states? */
3296
3297                 ST.B = next;
3298                 ST.jump = trie->jump;
3299                 ST.me = scan;
3300                 ST.firstpos = NULL;
3301                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3302                 ST.nextword = 0;
3303
3304                 /* fully traverse the TRIE; note the position of the
3305                    shortest accept state and the wordnum of the longest
3306                    accept state */
3307
3308                 while ( state && uc <= (U8*)PL_regeol ) {
3309                     U32 base = trie->states[ state ].trans.base;
3310                     UV uvc = 0;
3311                     U16 charid = 0;
3312                     U16 wordnum;
3313                     wordnum = trie->states[ state ].wordnum;
3314
3315                     if (wordnum) { /* it's an accept state */
3316                         if (!accepted) {
3317                             accepted = 1;
3318                             /* record first match position */
3319                             if (ST.longfold) {
3320                                 ST.firstpos = (U8*)locinput;
3321                                 ST.firstchars = 0;
3322                             }
3323                             else {
3324                                 ST.firstpos = uc;
3325                                 ST.firstchars = charcount;
3326                             }
3327                         }
3328                         if (!ST.nextword || wordnum < ST.nextword)
3329                             ST.nextword = wordnum;
3330                         ST.topword = wordnum;
3331                     }
3332
3333                     DEBUG_TRIE_EXECUTE_r({
3334                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3335                                 PerlIO_printf( Perl_debug_log,
3336                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3337                                     2+depth * 2, "", PL_colors[4],
3338                                     (UV)state, (accepted ? 'Y' : 'N'));
3339                     });
3340
3341                     /* read a char and goto next state */
3342                     if ( base ) {
3343                         I32 offset;
3344                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3345                                              uscan, len, uvc, charid, foldlen,
3346                                              foldbuf, uniflags);
3347                         charcount++;
3348                         if (foldlen>0)
3349                             ST.longfold = TRUE;
3350                         if (charid &&
3351                              ( ((offset =
3352                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3353
3354                              && ((U32)offset < trie->lasttrans)
3355                              && trie->trans[offset].check == state)
3356                         {
3357                             state = trie->trans[offset].next;
3358                         }
3359                         else {
3360                             state = 0;
3361                         }
3362                         uc += len;
3363
3364                     }
3365                     else {
3366                         state = 0;
3367                     }
3368                     DEBUG_TRIE_EXECUTE_r(
3369                         PerlIO_printf( Perl_debug_log,
3370                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3371                             charid, uvc, (UV)state, PL_colors[5] );
3372                     );
3373                 }
3374                 if (!accepted)
3375                    sayNO;
3376
3377                 /* calculate total number of accept states */
3378                 {
3379                     U16 w = ST.topword;
3380                     accepted = 0;
3381                     while (w) {
3382                         w = trie->wordinfo[w].prev;
3383                         accepted++;
3384                     }
3385                     ST.accepted = accepted;
3386                 }
3387
3388                 DEBUG_EXECUTE_r(
3389                     PerlIO_printf( Perl_debug_log,
3390                         "%*s  %sgot %"IVdf" possible matches%s\n",
3391                         REPORT_CODE_OFF + depth * 2, "",
3392                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3393                 );
3394                 goto trie_first_try; /* jump into the fail handler */
3395             }}
3396             /* NOTREACHED */
3397
3398         case TRIE_next_fail: /* we failed - try next alternative */
3399             if ( ST.jump) {
3400                 REGCP_UNWIND(ST.cp);
3401                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3402                     PL_regoffs[n].end = -1;
3403                 *PL_reglastparen = n;
3404             }
3405             if (!--ST.accepted) {
3406                 DEBUG_EXECUTE_r({
3407                     PerlIO_printf( Perl_debug_log,
3408                         "%*s  %sTRIE failed...%s\n",
3409                         REPORT_CODE_OFF+depth*2, "",
3410                         PL_colors[4],
3411                         PL_colors[5] );
3412                 });
3413                 sayNO_SILENT;
3414             }
3415             {
3416                 /* Find next-highest word to process.  Note that this code
3417                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3418                 register U16 min = 0;
3419                 register U16 word;
3420                 register U16 const nextword = ST.nextword;
3421                 register reg_trie_wordinfo * const wordinfo
3422                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3423                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3424                     if (word > nextword && (!min || word < min))
3425                         min = word;
3426                 }
3427                 ST.nextword = min;
3428             }
3429
3430           trie_first_try:
3431             if (do_cutgroup) {
3432                 do_cutgroup = 0;
3433                 no_final = 0;
3434             }
3435
3436             if ( ST.jump) {
3437                 ST.lastparen = *PL_reglastparen;
3438                 REGCP_SET(ST.cp);
3439             }
3440
3441             /* find start char of end of current word */
3442             {
3443                 U32 chars; /* how many chars to skip */
3444                 U8 *uc = ST.firstpos;
3445                 reg_trie_data * const trie
3446                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3447
3448                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3449                             >=  ST.firstchars);
3450                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3451                             - ST.firstchars;
3452
3453                 if (ST.longfold) {
3454                     /* the hard option - fold each char in turn and find
3455                      * its folded length (which may be different) */
3456                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3457                     STRLEN foldlen;
3458                     STRLEN len;
3459                     UV uvc;
3460                     U8 *uscan;
3461
3462                     while (chars) {
3463                         if (utf8_target) {
3464                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3465                                                     uniflags);
3466                             uc += len;
3467                         }
3468                         else {
3469                             uvc = *uc;
3470                             uc++;
3471                         }
3472                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3473                         uscan = foldbuf;
3474                         while (foldlen) {
3475                             if (!--chars)
3476                                 break;
3477                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3478                                             uniflags);
3479                             uscan += len;
3480                             foldlen -= len;
3481                         }
3482                     }
3483                 }
3484                 else {
3485                     if (utf8_target)
3486                         while (chars--)
3487                             uc += UTF8SKIP(uc);
3488                     else
3489                         uc += chars;
3490                 }
3491                 PL_reginput = (char *)uc;
3492             }
3493
3494             scan = (ST.jump && ST.jump[ST.nextword])
3495                         ? ST.me + ST.jump[ST.nextword]
3496                         : ST.B;
3497
3498             DEBUG_EXECUTE_r({
3499                 PerlIO_printf( Perl_debug_log,
3500                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3501                     REPORT_CODE_OFF+depth*2, "",
3502                     PL_colors[4],
3503                     ST.nextword,
3504                     PL_colors[5]
3505                     );
3506             });
3507
3508             if (ST.accepted > 1 || has_cutgroup) {
3509                 PUSH_STATE_GOTO(TRIE_next, scan);
3510                 /* NOTREACHED */
3511             }
3512             /* only one choice left - just continue */
3513             DEBUG_EXECUTE_r({
3514                 AV *const trie_words
3515                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3516                 SV ** const tmp = av_fetch( trie_words,
3517                     ST.nextword-1, 0 );
3518                 SV *sv= tmp ? sv_newmortal() : NULL;
3519
3520                 PerlIO_printf( Perl_debug_log,
3521                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3522                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3523                     ST.nextword,
3524                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3525                             PL_colors[0], PL_colors[1],
3526                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3527                         )
3528                     : "not compiled under -Dr",
3529                     PL_colors[5] );
3530             });
3531
3532             locinput = PL_reginput;
3533             nextchr = UCHARAT(locinput);
3534             continue; /* execute rest of RE */
3535             /* NOTREACHED */
3536 #undef  ST
3537
3538         case EXACT: {
3539             char *s = STRING(scan);
3540             ln = STR_LEN(scan);
3541             if (utf8_target != UTF_PATTERN) {
3542                 /* The target and the pattern have differing utf8ness. */
3543                 char *l = locinput;
3544                 const char * const e = s + ln;
3545
3546                 if (utf8_target) {
3547                     /* The target is utf8, the pattern is not utf8. */
3548                     while (s < e) {
3549                         STRLEN ulen;
3550                         if (l >= PL_regeol)
3551                              sayNO;
3552                         if (NATIVE_TO_UNI(*(U8*)s) !=
3553                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3554                                             uniflags))
3555                              sayNO;
3556                         l += ulen;
3557                         s ++;
3558                     }
3559                 }
3560                 else {
3561                     /* The target is not utf8, the pattern is utf8. */
3562                     while (s < e) {
3563                         STRLEN ulen;
3564                         if (l >= PL_regeol)
3565                             sayNO;
3566                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3567                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3568                                            uniflags))
3569                             sayNO;
3570                         s += ulen;
3571                         l ++;
3572                     }
3573                 }
3574                 locinput = l;
3575                 nextchr = UCHARAT(locinput);
3576                 break;
3577             }
3578             /* The target and the pattern have the same utf8ness. */
3579             /* Inline the first character, for speed. */
3580             if (UCHARAT(s) != nextchr)
3581                 sayNO;
3582             if (PL_regeol - locinput < ln)
3583                 sayNO;
3584             if (ln > 1 && memNE(s, locinput, ln))
3585                 sayNO;
3586             locinput += ln;
3587             nextchr = UCHARAT(locinput);
3588             break;
3589             }
3590         case EXACTFL: {
3591             re_fold_t folder;
3592             const U8 * fold_array;
3593             const char * s;
3594             U32 fold_utf8_flags;
3595
3596             PL_reg_flags |= RF_tainted;
3597             folder = foldEQ_locale;
3598             fold_array = PL_fold_locale;
3599             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3600             goto do_exactf;
3601
3602         case EXACTFU:
3603             folder = foldEQ_latin1;
3604             fold_array = PL_fold_latin1;
3605             fold_utf8_flags = 0;
3606             goto do_exactf;
3607
3608         case EXACTFA:
3609             folder = foldEQ_latin1;
3610             fold_array = PL_fold_latin1;
3611             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3612             goto do_exactf;
3613
3614         case EXACTF:
3615             folder = foldEQ;
3616             fold_array = PL_fold;
3617             fold_utf8_flags = 0;
3618
3619           do_exactf:
3620             s = STRING(scan);
3621             ln = STR_LEN(scan);
3622
3623             if (utf8_target || UTF_PATTERN) {
3624               /* Either the target or the pattern is utf8. */
3625                 const char * const l = locinput;
3626                 char *e = PL_regeol;
3627
3628                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3629                                l, &e, 0,  utf8_target, fold_utf8_flags))
3630                 {
3631                     sayNO;
3632                 }
3633                 locinput = e;
3634                 nextchr = UCHARAT(locinput);
3635                 break;
3636             }
3637
3638             /* Neither the target nor the pattern are utf8 */
3639             if (UCHARAT(s) != nextchr &&
3640                 UCHARAT(s) != fold_array[nextchr])
3641             {
3642                 sayNO;
3643             }
3644             if (PL_regeol - locinput < ln)
3645                 sayNO;
3646             if (ln > 1 && ! folder(s, locinput, ln))
3647                 sayNO;
3648             locinput += ln;
3649             nextchr = UCHARAT(locinput);
3650             break;
3651         }
3652
3653         /* XXX Could improve efficiency by separating these all out using a
3654          * macro or in-line function.  At that point regcomp.c would no longer
3655          * have to set the FLAGS fields of these */
3656         case BOUNDL:
3657         case NBOUNDL:
3658             PL_reg_flags |= RF_tainted;
3659             /* FALL THROUGH */
3660         case BOUND:
3661         case BOUNDU:
3662         case BOUNDA:
3663         case NBOUND:
3664         case NBOUNDU:
3665         case NBOUNDA:
3666             /* was last char in word? */
3667             if (utf8_target
3668                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3669                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3670             {
3671                 if (locinput == PL_bostr)
3672                     ln = '\n';
3673                 else {
3674                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3675
3676                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3677                 }
3678                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3679                     ln = isALNUM_uni(ln);
3680                     LOAD_UTF8_CHARCLASS_ALNUM();
3681                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3682                 }
3683                 else {
3684                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3685                     n = isALNUM_LC_utf8((U8*)locinput);
3686                 }
3687             }
3688             else {
3689
3690                 /* Here the string isn't utf8, or is utf8 and only ascii
3691                  * characters are to match \w.  In the latter case looking at
3692                  * the byte just prior to the current one may be just the final
3693                  * byte of a multi-byte character.  This is ok.  There are two
3694                  * cases:
3695                  * 1) it is a single byte character, and then the test is doing
3696                  *      just what it's supposed to.
3697                  * 2) it is a multi-byte character, in which case the final
3698                  *      byte is never mistakable for ASCII, and so the test
3699                  *      will say it is not a word character, which is the
3700                  *      correct answer. */
3701                 ln = (locinput != PL_bostr) ?
3702                     UCHARAT(locinput - 1) : '\n';
3703                 switch (FLAGS(scan)) {
3704                     case REGEX_UNICODE_CHARSET:
3705                         ln = isWORDCHAR_L1(ln);
3706                         n = isWORDCHAR_L1(nextchr);
3707                         break;
3708                     case REGEX_LOCALE_CHARSET:
3709                         ln = isALNUM_LC(ln);
3710                         n = isALNUM_LC(nextchr);
3711                         break;
3712                     case REGEX_DEPENDS_CHARSET:
3713                         ln = isALNUM(ln);
3714                         n = isALNUM(nextchr);
3715                         break;
3716                     case REGEX_ASCII_RESTRICTED_CHARSET:
3717                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3718                         ln = isWORDCHAR_A(ln);
3719                         n = isWORDCHAR_A(nextchr);
3720                         break;
3721                     default:
3722                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3723                         break;
3724                 }
3725             }
3726             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3727              * regcomp.sym */
3728             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3729                     sayNO;
3730             break;
3731         case ANYOFV:
3732         case ANYOF:
3733             if (utf8_target || state_num == ANYOFV) {
3734                 STRLEN inclasslen = PL_regeol - locinput;
3735                 if (locinput >= PL_regeol)
3736                     sayNO;
3737
3738                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3739                     sayNO;
3740                 locinput += inclasslen;
3741                 nextchr = UCHARAT(locinput);
3742                 break;
3743             }
3744             else {
3745                 if (nextchr < 0)
3746                     nextchr = UCHARAT(locinput);
3747                 if (!nextchr && locinput >= PL_regeol)
3748                     sayNO;
3749                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3750                     sayNO;
3751                 nextchr = UCHARAT(++locinput);
3752                 break;
3753             }
3754             break;
3755         /* Special char classes - The defines start on line 129 or so */
3756         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3757                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3758                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3759                   ALNUMA, NALNUMA, isWORDCHAR_A,
3760                   alnum, "a");
3761
3762         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3763                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3764                   SPACEU, NSPACEU, isSPACE_L1,
3765                   SPACEA, NSPACEA, isSPACE_A,
3766                   space, " ");
3767
3768         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3769                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3770                 DIGITA, NDIGITA, isDIGIT_A,
3771                 digit, "0");
3772
3773         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3774                        a Unicode extended Grapheme Cluster */
3775             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3776               extended Grapheme Cluster is:
3777
3778                CR LF
3779                | Prepend* Begin Extend*
3780                | .
3781
3782                Begin is (Hangul-syllable | ! Control)
3783                Extend is (Grapheme_Extend | Spacing_Mark)
3784                Control is [ GCB_Control CR LF ]
3785
3786                The discussion below shows how the code for CLUMP is derived
3787                from this regex.  Note that most of these concepts are from
3788                property values of the Grapheme Cluster Boundary (GCB) property.
3789                No code point can have multiple property values for a given
3790                property.  Thus a code point in Prepend can't be in Control, but
3791                it must be in !Control.  This is why Control above includes
3792                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3793                property separately, and so can't be in GCB_Control, even though
3794                they logically are controls.  Control is not the same as gc=cc,
3795                but includes format and other characters as well.
3796
3797                The Unicode definition of Hangul-syllable is:
3798                   (L+
3799                    | (L* ( ( V | LV ) V* | LVT ) T*)
3800                    | T+
3801                   )
3802                Each of these is a value for the GCB property, and hence must be
3803                disjoint, so the order they are tested is immaterial, so the
3804                above can safely be changed to
3805                    T+
3806                    | L+
3807                    | (L* ( LVT | ( V | LV ) V*) T*)
3808
3809                The last two terms can be combined like this:
3810                    L* ( L
3811                         | (( LVT | ( V | LV ) V*) T*))
3812
3813                And refactored into this:
3814                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3815
3816                That means that if we have seen any L's at all we can quit
3817                there, but if the next character is an LVT, a V or an LV we
3818                should keep going.
3819
3820                There is a subtlety with Prepend* which showed up in testing.
3821                Note that the Begin, and only the Begin is required in:
3822                 | Prepend* Begin Extend*
3823                Also, Begin contains '! Control'.  A Prepend must be a '!
3824                Control', which means it must be a Begin.  What it comes down to
3825                is that if we match Prepend* and then find no suitable Begin
3826                afterwards, we can backtrack the last Prepend, and that one will
3827                be a suitable Begin.
3828             */
3829
3830             if (locinput >= PL_regeol)
3831                 sayNO;
3832             if  (! utf8_target) {
3833
3834                 /* Match either CR LF  or '.', as all the other possibilities
3835                  * require utf8 */
3836                 locinput++;         /* Match the . or CR */
3837                 if (nextchr == '\r'
3838                     && locinput < PL_regeol
3839                     && UCHARAT(locinput) == '\n') locinput++;
3840             }
3841             else {
3842
3843                 /* Utf8: See if is ( CR LF ); already know that locinput <
3844                  * PL_regeol, so locinput+1 is in bounds */
3845                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3846                     locinput += 2;
3847                 }
3848                 else {
3849                     /* In case have to backtrack to beginning, then match '.' */
3850                     char *starting = locinput;
3851
3852                     /* In case have to backtrack the last prepend */
3853                     char *previous_prepend = 0;
3854
3855                     LOAD_UTF8_CHARCLASS_GCB();
3856
3857                     /* Match (prepend)* */
3858                     while (locinput < PL_regeol
3859                            && swash_fetch(PL_utf8_X_prepend,
3860                                           (U8*)locinput, utf8_target))
3861                     {
3862                         previous_prepend = locinput;
3863                         locinput += UTF8SKIP(locinput);
3864                     }
3865
3866                     /* As noted above, if we matched a prepend character, but
3867                      * the next thing won't match, back off the last prepend we
3868                      * matched, as it is guaranteed to match the begin */
3869                     if (previous_prepend
3870                         && (locinput >=  PL_regeol
3871                             || ! swash_fetch(PL_utf8_X_begin,
3872                                              (U8*)locinput, utf8_target)))
3873                     {
3874                         locinput = previous_prepend;
3875                     }
3876
3877                     /* Note that here we know PL_regeol > locinput, as we
3878                      * tested that upon input to this switch case, and if we
3879                      * moved locinput forward, we tested the result just above
3880                      * and it either passed, or we backed off so that it will
3881                      * now pass */
3882                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3883
3884                         /* Here did not match the required 'Begin' in the
3885                          * second term.  So just match the very first
3886                          * character, the '.' of the final term of the regex */
3887                         locinput = starting + UTF8SKIP(starting);
3888                     } else {
3889
3890                         /* Here is the beginning of a character that can have
3891                          * an extender.  It is either a hangul syllable, or a
3892                          * non-control */
3893                         if (swash_fetch(PL_utf8_X_non_hangul,
3894                                         (U8*)locinput, utf8_target))
3895                         {
3896
3897                             /* Here not a Hangul syllable, must be a
3898                              * ('! Control') */
3899                             locinput += UTF8SKIP(locinput);
3900                         } else {
3901
3902                             /* Here is a Hangul syllable.  It can be composed
3903                              * of several individual characters.  One
3904                              * possibility is T+ */
3905                             if (swash_fetch(PL_utf8_X_T,
3906                                             (U8*)locinput, utf8_target))
3907                             {
3908                                 while (locinput < PL_regeol
3909                                         && swash_fetch(PL_utf8_X_T,
3910                                                         (U8*)locinput, utf8_target))
3911                                 {
3912                                     locinput += UTF8SKIP(locinput);
3913                                 }
3914                             } else {
3915
3916                                 /* Here, not T+, but is a Hangul.  That means
3917                                  * it is one of the others: L, LV, LVT or V,
3918                                  * and matches:
3919                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3920
3921                                 /* Match L*           */
3922                                 while (locinput < PL_regeol
3923                                         && swash_fetch(PL_utf8_X_L,
3924                                                         (U8*)locinput, utf8_target))
3925                                 {
3926                                     locinput += UTF8SKIP(locinput);
3927                                 }
3928
3929                                 /* Here, have exhausted L*.  If the next
3930                                  * character is not an LV, LVT nor V, it means
3931                                  * we had to have at least one L, so matches L+
3932                                  * in the original equation, we have a complete
3933                                  * hangul syllable.  Are done. */
3934
3935                                 if (locinput < PL_regeol
3936                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3937                                                     (U8*)locinput, utf8_target))
3938                                 {
3939
3940                                     /* Otherwise keep going.  Must be LV, LVT
3941                                      * or V.  See if LVT */
3942                                     if (swash_fetch(PL_utf8_X_LVT,
3943                                                     (U8*)locinput, utf8_target))
3944                                     {
3945                                         locinput += UTF8SKIP(locinput);
3946                                     } else {
3947
3948                                         /* Must be  V or LV.  Take it, then
3949                                          * match V*     */
3950                                         locinput += UTF8SKIP(locinput);
3951                                         while (locinput < PL_regeol
3952                                                 && swash_fetch(PL_utf8_X_V,
3953                                                          (U8*)locinput, utf8_target))
3954                                         {
3955                                             locinput += UTF8SKIP(locinput);
3956                                         }
3957                                     }
3958
3959                                     /* And any of LV, LVT, or V can be followed
3960                                      * by T*            */
3961                                     while (locinput < PL_regeol
3962                                            && swash_fetch(PL_utf8_X_T,
3963                                                            (U8*)locinput,
3964                                                            utf8_target))
3965                                     {
3966                                         locinput += UTF8SKIP(locinput);
3967                                     }
3968                                 }
3969                             }
3970                         }
3971
3972                         /* Match any extender */
3973                         while (locinput < PL_regeol
3974                                 && swash_fetch(PL_utf8_X_extend,
3975                                                 (U8*)locinput, utf8_target))
3976                         {
3977                             locinput += UTF8SKIP(locinput);
3978                         }
3979                     }
3980                 }
3981                 if (locinput > PL_regeol) sayNO;
3982             }
3983             nextchr = UCHARAT(locinput);
3984             break;
3985
3986         case NREFFL:
3987         {   /* The capture buffer cases.  The ones beginning with N for the
3988                named buffers just convert to the equivalent numbered and
3989                pretend they were called as the corresponding numbered buffer
3990                op.  */
3991             /* don't initialize these in the declaration, it makes C++
3992                unhappy */
3993             char *s;
3994             char type;
3995             re_fold_t folder;
3996             const U8 *fold_array;
3997             UV utf8_fold_flags;
3998
3999             PL_reg_flags |= RF_tainted;
4000             folder = foldEQ_locale;
4001             fold_array = PL_fold_locale;
4002             type = REFFL;
4003             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4004             goto do_nref;
4005
4006         case NREFFA:
4007             folder = foldEQ_latin1;
4008             fold_array = PL_fold_latin1;
4009             type = REFFA;
4010             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4011             goto do_nref;
4012
4013         case NREFFU:
4014             folder = foldEQ_latin1;
4015             fold_array = PL_fold_latin1;
4016             type = REFFU;
4017             utf8_fold_flags = 0;
4018             goto do_nref;
4019
4020         case NREFF:
4021             folder = foldEQ;
4022             fold_array = PL_fold;
4023             type = REFF;
4024             utf8_fold_flags = 0;
4025             goto do_nref;
4026
4027         case NREF:
4028             type = REF;
4029             folder = NULL;
4030             fold_array = NULL;
4031             utf8_fold_flags = 0;
4032           do_nref:
4033
4034             /* For the named back references, find the corresponding buffer
4035              * number */
4036             n = reg_check_named_buff_matched(rex,scan);
4037
4038             if ( ! n ) {
4039                 sayNO;
4040             }
4041             goto do_nref_ref_common;
4042
4043         case REFFL:
4044             PL_reg_flags |= RF_tainted;
4045             folder = foldEQ_locale;
4046             fold_array = PL_fold_locale;
4047             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4048             goto do_ref;
4049
4050         case REFFA:
4051             folder = foldEQ_latin1;
4052             fold_array = PL_fold_latin1;
4053             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4054             goto do_ref;
4055
4056         case REFFU:
4057             folder = foldEQ_latin1;
4058             fold_array = PL_fold_latin1;
4059             utf8_fold_flags = 0;
4060             goto do_ref;
4061
4062         case REFF:
4063             folder = foldEQ;
4064             fold_array = PL_fold;
4065             utf8_fold_flags = 0;
4066             goto do_ref;
4067
4068         case REF:
4069             folder = NULL;
4070             fold_array = NULL;
4071             utf8_fold_flags = 0;
4072
4073           do_ref:
4074             type = OP(scan);
4075             n = ARG(scan);  /* which paren pair */
4076
4077           do_nref_ref_common:
4078             ln = PL_regoffs[n].start;
4079             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4080             if (*PL_reglastparen < n || ln == -1)
4081                 sayNO;                  /* Do not match unless seen CLOSEn. */
4082             if (ln == PL_regoffs[n].end)
4083                 break;
4084
4085             s = PL_bostr + ln;
4086             if (type != REF     /* REF can do byte comparison */
4087                 && (utf8_target || type == REFFU))
4088             { /* XXX handle REFFL better */
4089                 char * limit = PL_regeol;
4090
4091                 /* This call case insensitively compares the entire buffer
4092                     * at s, with the current input starting at locinput, but
4093                     * not going off the end given by PL_regeol, and returns in
4094                     * limit upon success, how much of the current input was
4095                     * matched */
4096                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4097                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4098                 {
4099                     sayNO;
4100                 }
4101                 locinput = limit;
4102                 nextchr = UCHARAT(locinput);
4103                 break;
4104             }
4105
4106             /* Not utf8:  Inline the first character, for speed. */
4107             if (UCHARAT(s) != nextchr &&
4108                 (type == REF ||
4109                  UCHARAT(s) != fold_array[nextchr]))
4110                 sayNO;
4111             ln = PL_regoffs[n].end - ln;
4112             if (locinput + ln > PL_regeol)
4113                 sayNO;
4114             if (ln > 1 && (type == REF
4115                            ? memNE(s, locinput, ln)
4116                            : ! folder(s, locinput, ln)))
4117                 sayNO;
4118             locinput += ln;
4119             nextchr = UCHARAT(locinput);
4120             break;
4121         }
4122         case NOTHING:
4123         case TAIL:
4124             break;
4125         case BACK:
4126             break;
4127
4128 #undef  ST
4129 #define ST st->u.eval
4130         {
4131             SV *ret;
4132             REGEXP *re_sv;
4133             regexp *re;
4134             regexp_internal *rei;
4135             regnode *startpoint;
4136
4137         case GOSTART:
4138         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4139             if (cur_eval && cur_eval->locinput==locinput) {
4140                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4141                     Perl_croak(aTHX_ "Infinite recursion in regex");
4142                 if ( ++nochange_depth > max_nochange_depth )
4143                     Perl_croak(aTHX_
4144                         "Pattern subroutine nesting without pos change"
4145                         " exceeded limit in regex");
4146             } else {
4147                 nochange_depth = 0;
4148             }
4149             re_sv = rex_sv;
4150             re = rex;
4151             rei = rexi;
4152             (void)ReREFCNT_inc(rex_sv);
4153             if (OP(scan)==GOSUB) {
4154                 startpoint = scan + ARG2L(scan);
4155                 ST.close_paren = ARG(scan);
4156             } else {
4157                 startpoint = rei->program+1;
4158                 ST.close_paren = 0;
4159             }
4160             goto eval_recurse_doit;
4161             /* NOTREACHED */
4162         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4163             if (cur_eval && cur_eval->locinput==locinput) {
4164                 if ( ++nochange_depth > max_nochange_depth )
4165                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4166             } else {
4167                 nochange_depth = 0;
4168             }
4169             {
4170                 /* execute the code in the {...} */
4171                 dSP;
4172                 SV ** const before = SP;
4173                 OP_4tree * const oop = PL_op;
4174                 COP * const ocurcop = PL_curcop;
4175                 PAD *old_comppad;
4176                 char *saved_regeol = PL_regeol;
4177                 struct re_save_state saved_state;
4178
4179                 /* To not corrupt the existing regex state while executing the
4180                  * eval we would normally put it on the save stack, like with
4181                  * save_re_context. However, re-evals have a weird scoping so we
4182                  * can't just add ENTER/LEAVE here. With that, things like
4183                  *
4184                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4185                  *
4186                  * would break, as they expect the localisation to be unwound
4187                  * only when the re-engine backtracks through the bit that
4188                  * localised it.
4189                  *
4190                  * What we do instead is just saving the state in a local c
4191                  * variable.
4192                  */
4193                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4194
4195                 n = ARG(scan);
4196                 PL_op = (OP_4tree*)rexi->data->data[n];
4197                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4198                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4199                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4200                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4201
4202                 if (sv_yes_mark) {
4203                     SV *sv_mrk = get_sv("REGMARK", 1);
4204                     sv_setsv(sv_mrk, sv_yes_mark);
4205                 }
4206
4207                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4208                 SPAGAIN;
4209                 if (SP == before)
4210                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4211                 else {
4212                     ret = POPs;
4213                     PUTBACK;
4214                 }
4215
4216                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4217
4218                 PL_op = oop;
4219                 PAD_RESTORE_LOCAL(old_comppad);
4220                 PL_curcop = ocurcop;
4221                 PL_regeol = saved_regeol;
4222                 if (!logical) {
4223                     /* /(?{...})/ */
4224                     sv_setsv(save_scalar(PL_replgv), ret);
4225                     break;
4226                 }
4227             }
4228             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4229                 logical = 0;
4230                 {
4231                     /* extract RE object from returned value; compiling if
4232                      * necessary */
4233                     MAGIC *mg = NULL;
4234                     REGEXP *rx = NULL;
4235
4236                     if (SvROK(ret)) {
4237                         SV *const sv = SvRV(ret);
4238
4239                         if (SvTYPE(sv) == SVt_REGEXP) {
4240                             rx = (REGEXP*) sv;
4241                         } else if (SvSMAGICAL(sv)) {
4242                             mg = mg_find(sv, PERL_MAGIC_qr);
4243                             assert(mg);
4244                         }
4245                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4246                         rx = (REGEXP*) ret;
4247                     } else if (SvSMAGICAL(ret)) {
4248                         if (SvGMAGICAL(ret)) {
4249                             /* I don't believe that there is ever qr magic
4250                                here.  */
4251                             assert(!mg_find(ret, PERL_MAGIC_qr));
4252                             sv_unmagic(ret, PERL_MAGIC_qr);
4253                         }
4254                         else {
4255                             mg = mg_find(ret, PERL_MAGIC_qr);
4256                             /* testing suggests mg only ends up non-NULL for
4257                                scalars that were upgraded and compiled in the
4258                                else block below. In turn, this is only
4259                                triggered in the "postponed utf8 string" tests
4260                                in t/op/pat.t  */
4261                         }
4262                     }
4263
4264                     if (mg) {
4265                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4266                         assert(rx);
4267                     }
4268                     if (rx) {
4269                         rx = reg_temp_copy(NULL, rx);
4270                     }
4271                     else {
4272                         U32 pm_flags = 0;
4273                         const I32 osize = PL_regsize;
4274
4275                         if (DO_UTF8(ret)) {
4276                             assert (SvUTF8(ret));
4277                         } else if (SvUTF8(ret)) {
4278                             /* Not doing UTF-8, despite what the SV says. Is
4279                                this only if we're trapped in use 'bytes'?  */
4280                             /* Make a copy of the octet sequence, but without
4281                                the flag on, as the compiler now honours the
4282                                SvUTF8 flag on ret.  */
4283                             STRLEN len;
4284                             const char *const p = SvPV(ret, len);
4285                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4286                         }
4287                         rx = CALLREGCOMP(ret, pm_flags);
4288                         if (!(SvFLAGS(ret)
4289                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4290                                  | SVs_GMG))) {
4291                             /* This isn't a first class regexp. Instead, it's
4292                                caching a regexp onto an existing, Perl visible
4293                                scalar.  */
4294                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4295                         }
4296                         PL_regsize = osize;
4297                     }
4298                     re_sv = rx;
4299                     re = (struct regexp *)SvANY(rx);
4300                 }
4301                 RXp_MATCH_COPIED_off(re);
4302                 re->subbeg = rex->subbeg;
4303                 re->sublen = rex->sublen;
4304                 rei = RXi_GET(re);
4305                 DEBUG_EXECUTE_r(
4306                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4307                         "Matching embedded");
4308                 );
4309                 startpoint = rei->program + 1;
4310                 ST.close_paren = 0; /* only used for GOSUB */
4311                 /* borrowed from regtry */
4312                 if (PL_reg_start_tmpl <= re->nparens) {
4313                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4314                     if(PL_reg_start_tmp)
4315                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4316                     else
4317                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4318                 }
4319
4320         eval_recurse_doit: /* Share code with GOSUB below this line */
4321                 /* run the pattern returned from (??{...}) */
4322                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4323                 REGCP_SET(ST.lastcp);
4324
4325                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4326
4327                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (I don't know why) :dmq */
4328                 PL_reglastparen = &re->lastparen;
4329                 PL_reglastcloseparen = &re->lastcloseparen;
4330                 re->lastparen = 0;
4331                 re->lastcloseparen = 0;
4332
4333                 PL_reginput = locinput;
4334                 PL_regsize = 0;
4335
4336                 /* XXXX This is too dramatic a measure... */
4337                 PL_reg_maxiter = 0;
4338
4339                 ST.toggle_reg_flags = PL_reg_flags;
4340                 if (RX_UTF8(re_sv))
4341                     PL_reg_flags |= RF_utf8;
4342                 else
4343                     PL_reg_flags &= ~RF_utf8;
4344                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4345
4346                 ST.prev_rex = rex_sv;
4347                 ST.prev_curlyx = cur_curlyx;
4348                 SETREX(rex_sv,re_sv);
4349                 rex = re;
4350                 rexi = rei;
4351                 cur_curlyx = NULL;
4352                 ST.B = next;
4353                 ST.prev_eval = cur_eval;
4354                 cur_eval = st;
4355                 /* now continue from first node in postponed RE */
4356                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4357                 /* NOTREACHED */
4358             }
4359             /* logical is 1,   /(?(?{...})X|Y)/ */
4360             sw = cBOOL(SvTRUE(ret));
4361             logical = 0;
4362             break;
4363         }
4364
4365         case EVAL_AB: /* cleanup after a successful (??{A})B */
4366             /* note: this is called twice; first after popping B, then A */
4367             PL_reg_flags ^= ST.toggle_reg_flags;
4368             ReREFCNT_dec(rex_sv);
4369             SETREX(rex_sv,ST.prev_rex);
4370             rex = (struct regexp *)SvANY(rex_sv);
4371             rexi = RXi_GET(rex);
4372             regcpblow(ST.cp);
4373             cur_eval = ST.prev_eval;
4374             cur_curlyx = ST.prev_curlyx;
4375
4376             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4377             PL_reglastparen = &rex->lastparen;
4378             PL_reglastcloseparen = &rex->lastcloseparen;
4379             /* also update PL_regoffs */
4380             PL_regoffs = rex->offs;
4381
4382             /* XXXX This is too dramatic a measure... */
4383             PL_reg_maxiter = 0;
4384             if ( nochange_depth )
4385                 nochange_depth--;
4386             sayYES;
4387
4388
4389         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4390             /* note: this is called twice; first after popping B, then A */
4391             PL_reg_flags ^= ST.toggle_reg_flags;
4392             ReREFCNT_dec(rex_sv);
4393             SETREX(rex_sv,ST.prev_rex);
4394             rex = (struct regexp *)SvANY(rex_sv);
4395             rexi = RXi_GET(rex);
4396             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4397             PL_reglastparen = &rex->lastparen;
4398             PL_reglastcloseparen = &rex->lastcloseparen;
4399
4400             PL_reginput = locinput;
4401             REGCP_UNWIND(ST.lastcp);
4402             regcppop(rex);
4403             cur_eval = ST.prev_eval;
4404             cur_curlyx = ST.prev_curlyx;
4405             /* XXXX This is too dramatic a measure... */
4406             PL_reg_maxiter = 0;
4407             if ( nochange_depth )
4408                 nochange_depth--;
4409             sayNO_SILENT;
4410 #undef ST
4411
4412         case OPEN:
4413             n = ARG(scan);  /* which paren pair */
4414             PL_reg_start_tmp[n] = locinput;
4415             if (n > PL_regsize)
4416                 PL_regsize = n;
4417             lastopen = n;
4418             break;
4419         case CLOSE:
4420             n = ARG(scan);  /* which paren pair */
4421             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4422             PL_regoffs[n].end = locinput - PL_bostr;
4423             /*if (n > PL_regsize)
4424                 PL_regsize = n;*/
4425             if (n > *PL_reglastparen)
4426                 *PL_reglastparen = n;
4427             *PL_reglastcloseparen = n;
4428             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4429                 goto fake_end;
4430             }
4431             break;
4432         case ACCEPT:
4433             if (ARG(scan)){
4434                 regnode *cursor;
4435                 for (cursor=scan;
4436                      cursor && OP(cursor)!=END;
4437                      cursor=regnext(cursor))
4438                 {
4439                     if ( OP(cursor)==CLOSE ){
4440                         n = ARG(cursor);
4441                         if ( n <= lastopen ) {
4442                             PL_regoffs[n].start
4443                                 = PL_reg_start_tmp[n] - PL_bostr;
4444                             PL_regoffs[n].end = locinput - PL_bostr;
4445                             /*if (n > PL_regsize)
4446                             PL_regsize = n;*/
4447                             if (n > *PL_reglastparen)
4448                                 *PL_reglastparen = n;
4449                             *PL_reglastcloseparen = n;
4450                             if ( n == ARG(scan) || (cur_eval &&
4451                                 cur_eval->u.eval.close_paren == n))
4452                                 break;
4453                         }
4454                     }
4455                 }
4456             }
4457             goto fake_end;
4458             /*NOTREACHED*/
4459         case GROUPP:
4460             n = ARG(scan);  /* which paren pair */
4461             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4462             break;
4463         case NGROUPP:
4464             /* reg_check_named_buff_matched returns 0 for no match */
4465             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4466             break;
4467         case INSUBP:
4468             n = ARG(scan);
4469             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4470             break;
4471         case DEFINEP:
4472             sw = 0;
4473             break;
4474         case IFTHEN:
4475             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4476             if (sw)
4477                 next = NEXTOPER(NEXTOPER(scan));
4478             else {
4479                 next = scan + ARG(scan);
4480                 if (OP(next) == IFTHEN) /* Fake one. */
4481                     next = NEXTOPER(NEXTOPER(next));
4482             }
4483             break;
4484         case LOGICAL:
4485             logical = scan->flags;
4486             break;
4487
4488 /*******************************************************************
4489
4490 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4491 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4492 STAR/PLUS/CURLY/CURLYN are used instead.)
4493
4494 A*B is compiled as <CURLYX><A><WHILEM><B>
4495
4496 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4497 state, which contains the current count, initialised to -1. It also sets
4498 cur_curlyx to point to this state, with any previous value saved in the
4499 state block.
4500
4501 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4502 since the pattern may possibly match zero times (i.e. it's a while {} loop
4503 rather than a do {} while loop).
4504
4505 Each entry to WHILEM represents a successful match of A. The count in the
4506 CURLYX block is incremented, another WHILEM state is pushed, and execution
4507 passes to A or B depending on greediness and the current count.
4508
4509 For example, if matching against the string a1a2a3b (where the aN are
4510 substrings that match /A/), then the match progresses as follows: (the
4511 pushed states are interspersed with the bits of strings matched so far):
4512
4513     <CURLYX cnt=-1>
4514     <CURLYX cnt=0><WHILEM>
4515     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4516     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4517     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4518     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4519
4520 (Contrast this with something like CURLYM, which maintains only a single
4521 backtrack state:
4522
4523     <CURLYM cnt=0> a1
4524     a1 <CURLYM cnt=1> a2
4525     a1 a2 <CURLYM cnt=2> a3
4526     a1 a2 a3 <CURLYM cnt=3> b
4527 )
4528
4529 Each WHILEM state block marks a point to backtrack to upon partial failure
4530 of A or B, and also contains some minor state data related to that
4531 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4532 overall state, such as the count, and pointers to the A and B ops.
4533
4534 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4535 must always point to the *current* CURLYX block, the rules are:
4536
4537 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4538 and set cur_curlyx to point to the new block.
4539
4540 When popping the CURLYX block after a successful or unsuccessful match,
4541 restore the previous cur_curlyx.
4542
4543 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4544 to the outer one saved in the CURLYX block.
4545
4546 When popping the WHILEM block after a successful or unsuccessful B match,
4547 restore the previous cur_curlyx.
4548
4549 Here's an example for the pattern (AI* BI)*BO
4550 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4551
4552 cur_
4553 curlyx backtrack stack
4554 ------ ---------------
4555 NULL
4556 CO     <CO prev=NULL> <WO>
4557 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4558 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4559 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4560
4561 At this point the pattern succeeds, and we work back down the stack to
4562 clean up, restoring as we go:
4563
4564 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4565 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4566 CO     <CO prev=NULL> <WO>
4567 NULL
4568
4569 *******************************************************************/
4570
4571 #define ST st->u.curlyx
4572
4573         case CURLYX:    /* start of /A*B/  (for complex A) */
4574         {
4575             /* No need to save/restore up to this paren */
4576             I32 parenfloor = scan->flags;
4577
4578             assert(next); /* keep Coverity happy */
4579             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4580                 next += ARG(next);
4581
4582             /* XXXX Probably it is better to teach regpush to support
4583                parenfloor > PL_regsize... */
4584             if (parenfloor > (I32)*PL_reglastparen)
4585                 parenfloor = *PL_reglastparen; /* Pessimization... */
4586
4587             ST.prev_curlyx= cur_curlyx;
4588             cur_curlyx = st;
4589             ST.cp = PL_savestack_ix;
4590
4591             /* these fields contain the state of the current curly.
4592              * they are accessed by subsequent WHILEMs */
4593             ST.parenfloor = parenfloor;
4594             ST.me = scan;
4595             ST.B = next;
4596             ST.minmod = minmod;
4597             minmod = 0;
4598             ST.count = -1;      /* this will be updated by WHILEM */
4599             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4600
4601             PL_reginput = locinput;
4602             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4603             /* NOTREACHED */
4604         }
4605
4606         case CURLYX_end: /* just finished matching all of A*B */
4607             cur_curlyx = ST.prev_curlyx;
4608             sayYES;
4609             /* NOTREACHED */
4610
4611         case CURLYX_end_fail: /* just failed to match all of A*B */
4612             regcpblow(ST.cp);
4613             cur_curlyx = ST.prev_curlyx;
4614             sayNO;
4615             /* NOTREACHED */
4616
4617
4618 #undef ST
4619 #define ST st->u.whilem
4620
4621         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4622         {
4623             /* see the discussion above about CURLYX/WHILEM */
4624             I32 n;
4625             int min = ARG1(cur_curlyx->u.curlyx.me);
4626             int max = ARG2(cur_curlyx->u.curlyx.me);
4627             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4628
4629             assert(cur_curlyx); /* keep Coverity happy */
4630             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4631             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4632             ST.cache_offset = 0;
4633             ST.cache_mask = 0;
4634
4635             PL_reginput = locinput;
4636
4637             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4638                   "%*s  whilem: matched %ld out of %d..%d\n",
4639                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4640             );
4641
4642             /* First just match a string of min A's. */
4643
4644             if (n < min) {
4645                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4646                 cur_curlyx->u.curlyx.lastloc = locinput;
4647                 REGCP_SET(ST.lastcp);
4648
4649                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4650                 /* NOTREACHED */
4651             }
4652
4653             /* If degenerate A matches "", assume A done. */
4654
4655             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4656                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4657                    "%*s  whilem: empty match detected, trying continuation...\n",
4658                    REPORT_CODE_OFF+depth*2, "")
4659                 );
4660                 goto do_whilem_B_max;
4661             }
4662
4663             /* super-linear cache processing */
4664
4665             if (scan->flags) {
4666
4667                 if (!PL_reg_maxiter) {
4668                     /* start the countdown: Postpone detection until we
4669                      * know the match is not anywhere near linear. */
4670                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4671                     /* possible overflow for long strings and many CURLYX's */
4672                     if (PL_reg_maxiter < 0)
4673                         PL_reg_maxiter = I32_MAX;
4674                     PL_reg_leftiter = PL_reg_maxiter;
4675                 }
4676
4677                 if (PL_reg_leftiter-- == 0) {
4678                     /* initialise cache */
4679                     const I32 size = (PL_reg_maxiter + 7)/8;
4680                     if (PL_reg_poscache) {
4681                         if ((I32)PL_reg_poscache_size < size) {
4682                             Renew(PL_reg_poscache, size, char);
4683                             PL_reg_poscache_size = size;
4684                         }
4685                         Zero(PL_reg_poscache, size, char);
4686                     }
4687                     else {
4688                         PL_reg_poscache_size = size;
4689                         Newxz(PL_reg_poscache, size, char);
4690                     }
4691                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4692       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4693                               PL_colors[4], PL_colors[5])
4694                     );
4695                 }
4696
4697                 if (PL_reg_leftiter < 0) {
4698                     /* have we already failed at this position? */
4699                     I32 offset, mask;
4700                     offset  = (scan->flags & 0xf) - 1
4701                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4702                     mask    = 1 << (offset % 8);
4703                     offset /= 8;
4704                     if (PL_reg_poscache[offset] & mask) {
4705                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4706                             "%*s  whilem: (cache) already tried at this position...\n",
4707                             REPORT_CODE_OFF+depth*2, "")
4708                         );
4709                         sayNO; /* cache records failure */
4710                     }
4711                     ST.cache_offset = offset;
4712                     ST.cache_mask   = mask;
4713                 }
4714             }
4715
4716             /* Prefer B over A for minimal matching. */
4717
4718             if (cur_curlyx->u.curlyx.minmod) {
4719                 ST.save_curlyx = cur_curlyx;
4720                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4721                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4722                 REGCP_SET(ST.lastcp);
4723                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4724                 /* NOTREACHED */
4725             }
4726
4727             /* Prefer A over B for maximal matching. */
4728
4729             if (n < max) { /* More greed allowed? */
4730                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4731                 cur_curlyx->u.curlyx.lastloc = locinput;
4732                 REGCP_SET(ST.lastcp);
4733                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4734                 /* NOTREACHED */
4735             }
4736             goto do_whilem_B_max;
4737         }
4738         /* NOTREACHED */
4739
4740         case WHILEM_B_min: /* just matched B in a minimal match */
4741         case WHILEM_B_max: /* just matched B in a maximal match */
4742             cur_curlyx = ST.save_curlyx;
4743             sayYES;
4744             /* NOTREACHED */
4745
4746         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4747             cur_curlyx = ST.save_curlyx;
4748             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4749             cur_curlyx->u.curlyx.count--;
4750             CACHEsayNO;
4751             /* NOTREACHED */
4752
4753         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4754             /* FALL THROUGH */
4755         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4756             REGCP_UNWIND(ST.lastcp);
4757             regcppop(rex);
4758             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4759             cur_curlyx->u.curlyx.count--;
4760             CACHEsayNO;
4761             /* NOTREACHED */
4762
4763         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4764             REGCP_UNWIND(ST.lastcp);
4765             regcppop(rex);      /* Restore some previous $<digit>s? */
4766             PL_reginput = locinput;
4767             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4768                 "%*s  whilem: failed, trying continuation...\n",
4769                 REPORT_CODE_OFF+depth*2, "")
4770             );
4771           do_whilem_B_max:
4772             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4773                 && ckWARN(WARN_REGEXP)
4774                 && !(PL_reg_flags & RF_warned))
4775             {
4776                 PL_reg_flags |= RF_warned;
4777                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4778                      "Complex regular subexpression recursion",
4779                      REG_INFTY - 1);
4780             }
4781
4782             /* now try B */
4783             ST.save_curlyx = cur_curlyx;
4784             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4785             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4786             /* NOTREACHED */
4787
4788         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4789             cur_curlyx = ST.save_curlyx;
4790             REGCP_UNWIND(ST.lastcp);
4791             regcppop(rex);
4792
4793             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4794                 /* Maximum greed exceeded */
4795                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4796                     && ckWARN(WARN_REGEXP)
4797                     && !(PL_reg_flags & RF_warned))
4798                 {
4799                     PL_reg_flags |= RF_warned;
4800                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4801                         "%s limit (%d) exceeded",
4802                         "Complex regular subexpression recursion",
4803                         REG_INFTY - 1);
4804                 }
4805                 cur_curlyx->u.curlyx.count--;
4806                 CACHEsayNO;
4807             }
4808
4809             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4810                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4811             );
4812             /* Try grabbing another A and see if it helps. */
4813             PL_reginput = locinput;
4814             cur_curlyx->u.curlyx.lastloc = locinput;
4815             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4816             REGCP_SET(ST.lastcp);
4817             PUSH_STATE_GOTO(WHILEM_A_min,
4818                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4819             /* NOTREACHED */
4820
4821 #undef  ST
4822 #define ST st->u.branch
4823
4824         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4825             next = scan + ARG(scan);
4826             if (next == scan)
4827                 next = NULL;
4828             scan = NEXTOPER(scan);
4829             /* FALL THROUGH */
4830
4831         case BRANCH:        /*  /(...|A|...)/ */
4832             scan = NEXTOPER(scan); /* scan now points to inner node */
4833             ST.lastparen = *PL_reglastparen;
4834             ST.next_branch = next;
4835             REGCP_SET(ST.cp);
4836             PL_reginput = locinput;
4837
4838             /* Now go into the branch */
4839             if (has_cutgroup) {
4840                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4841             } else {
4842                 PUSH_STATE_GOTO(BRANCH_next, scan);
4843             }
4844             /* NOTREACHED */
4845         case CUTGROUP:
4846             PL_reginput = locinput;
4847             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4848                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4849             PUSH_STATE_GOTO(CUTGROUP_next,next);
4850             /* NOTREACHED */
4851         case CUTGROUP_next_fail:
4852             do_cutgroup = 1;
4853             no_final = 1;
4854             if (st->u.mark.mark_name)
4855                 sv_commit = st->u.mark.mark_name;
4856             sayNO;
4857             /* NOTREACHED */
4858         case BRANCH_next:
4859             sayYES;
4860             /* NOTREACHED */
4861         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4862             if (do_cutgroup) {
4863                 do_cutgroup = 0;
4864                 no_final = 0;
4865             }
4866             REGCP_UNWIND(ST.cp);
4867             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4868                 PL_regoffs[n].end = -1;
4869             *PL_reglastparen = n;
4870             /*dmq: *PL_reglastcloseparen = n; */
4871             scan = ST.next_branch;
4872             /* no more branches? */
4873             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4874                 DEBUG_EXECUTE_r({
4875                     PerlIO_printf( Perl_debug_log,
4876                         "%*s  %sBRANCH failed...%s\n",
4877                         REPORT_CODE_OFF+depth*2, "",
4878                         PL_colors[4],
4879                         PL_colors[5] );
4880                 });
4881                 sayNO_SILENT;
4882             }
4883             continue; /* execute next BRANCH[J] op */
4884             /* NOTREACHED */
4885
4886         case MINMOD:
4887             minmod = 1;
4888             break;
4889
4890 #undef  ST
4891 #define ST st->u.curlym
4892
4893         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4894
4895             /* This is an optimisation of CURLYX that enables us to push
4896              * only a single backtracking state, no matter how many matches
4897              * there are in {m,n}. It relies on the pattern being constant
4898              * length, with no parens to influence future backrefs
4899              */
4900
4901             ST.me = scan;
4902             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4903
4904             /* if paren positive, emulate an OPEN/CLOSE around A */
4905             if (ST.me->flags) {
4906                 U32 paren = ST.me->flags;
4907                 if (paren > PL_regsize)
4908                     PL_regsize = paren;
4909                 if (paren > *PL_reglastparen)
4910                     *PL_reglastparen = paren;
4911                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4912             }
4913             ST.A = scan;
4914             ST.B = next;
4915             ST.alen = 0;
4916             ST.count = 0;
4917             ST.minmod = minmod;
4918             minmod = 0;
4919             ST.c1 = CHRTEST_UNINIT;
4920             REGCP_SET(ST.cp);
4921
4922             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4923                 goto curlym_do_B;
4924
4925           curlym_do_A: /* execute the A in /A{m,n}B/  */
4926             PL_reginput = locinput;
4927             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4928             /* NOTREACHED */
4929
4930         case CURLYM_A: /* we've just matched an A */
4931             locinput = st->locinput;
4932             nextchr = UCHARAT(locinput);
4933
4934             ST.count++;
4935             /* after first match, determine A's length: u.curlym.alen */
4936             if (ST.count == 1) {
4937                 if (PL_reg_match_utf8) {
4938                     char *s = locinput;
4939                     while (s < PL_reginput) {
4940                         ST.alen++;
4941                         s += UTF8SKIP(s);
4942                     }
4943                 }
4944                 else {
4945                     ST.alen = PL_reginput - locinput;
4946                 }
4947                 if (ST.alen == 0)
4948                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4949             }
4950             DEBUG_EXECUTE_r(
4951                 PerlIO_printf(Perl_debug_log,
4952                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4953                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4954                           (IV) ST.count, (IV)ST.alen)
4955             );
4956
4957             locinput = PL_reginput;
4958
4959             if (cur_eval && cur_eval->u.eval.close_paren &&
4960                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4961                 goto fake_end;
4962
4963             {
4964                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4965                 if ( max == REG_INFTY || ST.count < max )
4966                     goto curlym_do_A; /* try to match another A */
4967             }
4968             goto curlym_do_B; /* try to match B */
4969
4970         case CURLYM_A_fail: /* just failed to match an A */
4971             REGCP_UNWIND(ST.cp);
4972
4973             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4974                 || (cur_eval && cur_eval->u.eval.close_paren &&
4975                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4976                 sayNO;
4977
4978           curlym_do_B: /* execute the B in /A{m,n}B/  */
4979             PL_reginput = locinput;
4980             if (ST.c1 == CHRTEST_UNINIT) {
4981                 /* calculate c1 and c2 for possible match of 1st char
4982                  * following curly */
4983                 ST.c1 = ST.c2 = CHRTEST_VOID;
4984                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4985                     regnode *text_node = ST.B;
4986                     if (! HAS_TEXT(text_node))
4987                         FIND_NEXT_IMPT(text_node);
4988                     /* this used to be
4989
4990                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4991
4992                         But the former is redundant in light of the latter.
4993
4994                         if this changes back then the macro for
4995                         IS_TEXT and friends need to change.
4996                      */
4997                     if (PL_regkind[OP(text_node)] == EXACT)
4998                     {
4999
5000                         ST.c1 = (U8)*STRING(text_node);
5001                         switch (OP(text_node)) {
5002                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5003                             case EXACTFA:
5004                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5005                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5006                             default: ST.c2 = ST.c1;
5007                         }
5008                     }
5009                 }
5010             }
5011
5012             DEBUG_EXECUTE_r(
5013                 PerlIO_printf(Perl_debug_log,
5014                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5015                     (int)(REPORT_CODE_OFF+(depth*2)),
5016                     "", (IV)ST.count)
5017                 );
5018             if (ST.c1 != CHRTEST_VOID
5019                     && UCHARAT(PL_reginput) != ST.c1
5020                     && UCHARAT(PL_reginput) != ST.c2)
5021             {
5022                 /* simulate B failing */
5023                 DEBUG_OPTIMISE_r(
5024                     PerlIO_printf(Perl_debug_log,
5025                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5026                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5027                         (IV)ST.c1,(IV)ST.c2
5028                 ));
5029                 state_num = CURLYM_B_fail;
5030                 goto reenter_switch;
5031             }
5032
5033             if (ST.me->flags) {
5034                 /* mark current A as captured */
5035                 I32 paren = ST.me->flags;
5036                 if (ST.count) {
5037                     PL_regoffs[paren].start
5038                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5039                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5040                     /*dmq: *PL_reglastcloseparen = paren; */
5041                 }
5042                 else
5043                     PL_regoffs[paren].end = -1;
5044                 if (cur_eval && cur_eval->u.eval.close_paren &&
5045                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5046                 {
5047                     if (ST.count)
5048                         goto fake_end;
5049                     else
5050                         sayNO;
5051                 }
5052             }
5053
5054             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5055             /* NOTREACHED */
5056
5057         case CURLYM_B_fail: /* just failed to match a B */
5058             REGCP_UNWIND(ST.cp);
5059             if (ST.minmod) {
5060                 I32 max = ARG2(ST.me);
5061                 if (max != REG_INFTY && ST.count == max)
5062                     sayNO;
5063                 goto curlym_do_A; /* try to match a further A */
5064             }
5065             /* backtrack one A */
5066             if (ST.count == ARG1(ST.me) /* min */)
5067                 sayNO;
5068             ST.count--;
5069             locinput = HOPc(locinput, -ST.alen);
5070             goto curlym_do_B; /* try to match B */
5071
5072 #undef ST
5073 #define ST st->u.curly
5074
/* CURLY_SETPAREN(paren, success)
 *
 * Capture-group bookkeeping for the width-1 quantifier cases that follow
 * (they invoke it as CURLY_SETPAREN(ST.paren, ST.count)).
 *
 *   paren   - capture group number; 0 means "no group" and turns the
 *             macro into a no-op.
 *   success - true to record the group, false to invalidate it.
 *
 * On success the group is set to span from one HOPc() step before
 * locinput up to locinput (both stored as offsets from PL_bostr), and
 * *PL_reglastcloseparen is set to paren.  Otherwise only the group's end
 * offset is reset to -1.
 *
 * The macro reads locinput from the enclosing scope and evaluates paren
 * more than once, so pass a side-effect-free expression.  The body is
 * wrapped in do { } while (0) so the expansion is a single statement and
 * cannot steal or orphan an else that belongs to the caller.
 */
#define CURLY_SETPAREN(paren, success) \
    do { \
        if (paren) { \
            if (success) { \
                PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
                PL_regoffs[paren].end = locinput - PL_bostr; \
                *PL_reglastcloseparen = paren; \
            } \
            else \
                PL_regoffs[paren].end = -1; \
        } \
    } while (0)
5085
5086         case STAR:              /*  /A*B/ where A is width 1 */
5087             ST.paren = 0;
5088             ST.min = 0;
5089             ST.max = REG_INFTY;
5090             scan = NEXTOPER(scan);
5091             goto repeat;
5092         case PLUS:              /*  /A+B/ where A is width 1 */
5093             ST.paren = 0;
5094             ST.min = 1;
5095             ST.max = REG_INFTY;
5096             scan = NEXTOPER(scan);
5097             goto repeat;
5098         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5099             ST.paren = scan->flags;     /* Which paren to set */
5100             if (ST.paren > PL_regsize)
5101                 PL_regsize = ST.paren;
5102             if (ST.paren > *PL_reglastparen)
5103                 *PL_reglastparen = ST.paren;
5104             ST.min = ARG1(scan);  /* min to match */
5105             ST.max = ARG2(scan);  /* max to match */
5106             if (cur_eval && cur_eval->u.eval.close_paren &&
5107                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5108                 ST.min=1;
5109                 ST.max=1;
5110             }
5111             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5112             goto repeat;
5113         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5114             ST.paren = 0;
5115             ST.min = ARG1(scan);  /* min to match */
5116             ST.max = ARG2(scan);  /* max to match */
5117             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5118           repeat:
5119             /*
5120             * Lookahead to avoid useless match attempts
5121             * when we know what character comes next.
5122             *
5123             * Used to only do .*x and .*?x, but now it allows
5124             * for )'s, ('s and (?{ ... })'s to be in the way
5125             * of the quantifier and the EXACT-like node.  -- japhy
5126             */
5127
5128             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5129                 sayNO;
5130             if (HAS_TEXT(next) || JUMPABLE(next)) {
5131                 U8 *s;
5132                 regnode *text_node = next;
5133
5134                 if (! HAS_TEXT(text_node))
5135                     FIND_NEXT_IMPT(text_node);
5136
5137                 if (! HAS_TEXT(text_node))
5138                     ST.c1 = ST.c2 = CHRTEST_VOID;
5139                 else {
5140                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5141                         ST.c1 = ST.c2 = CHRTEST_VOID;
5142                         goto assume_ok_easy;
5143                     }
5144                     else
5145                         s = (U8*)STRING(text_node);
5146
5147                     /*  Currently we only get here when
5148
5149                         PL_regkind[OP(text_node)] == EXACT
5150
5151                         if this changes back then the macro for IS_TEXT and
5152                         friends need to change. */
5153                     if (!UTF_PATTERN) {
5154                         ST.c1 = *s;
5155                         switch (OP(text_node)) {
5156                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5157                             case EXACTFA:
5158                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5159                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5160                             default: ST.c2 = ST.c1; break;
5161                         }
5162                     }
5163                     else { /* UTF_PATTERN */
5164                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5165                              STRLEN ulen1, ulen2;
5166                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5167                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5168
5169                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5170                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5171 #ifdef EBCDIC
5172                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5173                                                     ckWARN(WARN_UTF8) ?
5174                                                     0 : UTF8_ALLOW_ANY);
5175                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5176                                                     ckWARN(WARN_UTF8) ?
5177                                                     0 : UTF8_ALLOW_ANY);
5178 #else
5179                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5180                                                     uniflags);
5181                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5182                                                     uniflags);
5183 #endif
5184                         }
5185                         else {
5186                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5187                                                      uniflags);
5188                         }
5189                     }
5190                 }
5191             }
5192             else
5193                 ST.c1 = ST.c2 = CHRTEST_VOID;
5194         assume_ok_easy:
5195
5196             ST.A = scan;
5197             ST.B = next;
5198             PL_reginput = locinput;
5199             if (minmod) {
5200                 minmod = 0;
5201                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5202                     sayNO;
5203                 ST.count = ST.min;
5204                 locinput = PL_reginput;
5205                 REGCP_SET(ST.cp);
5206                 if (ST.c1 == CHRTEST_VOID)
5207                     goto curly_try_B_min;
5208
5209                 ST.oldloc = locinput;
5210
5211                 /* set ST.maxpos to the furthest point along the
5212                  * string that could possibly match */
5213                 if  (ST.max == REG_INFTY) {
5214                     ST.maxpos = PL_regeol - 1;
5215                     if (utf8_target)
5216                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5217                             ST.maxpos--;
5218                 }
5219                 else if (utf8_target) {
5220                     int m = ST.max - ST.min;
5221                     for (ST.maxpos = locinput;
5222                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5223                         ST.maxpos += UTF8SKIP(ST.maxpos);
5224                 }
5225                 else {
5226                     ST.maxpos = locinput + ST.max - ST.min;
5227                     if (ST.maxpos >= PL_regeol)
5228                         ST.maxpos = PL_regeol - 1;
5229                 }
5230                 goto curly_try_B_min_known;
5231
5232             }
5233             else {
5234                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5235                 locinput = PL_reginput;
5236                 if (ST.count < ST.min)
5237                     sayNO;
5238                 if ((ST.count > ST.min)
5239                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5240                 {
5241                     /* A{m,n} must come at the end of the string, there's
5242                      * no point in backing off ... */
5243                     ST.min = ST.count;
5244                     /* ...except that $ and \Z can match before *and* after
5245                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5246                        We may back off by one in this case. */
5247                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5248                         ST.min--;
5249                 }
5250                 REGCP_SET(ST.cp);
5251                 goto curly_try_B_max;
5252             }
5253             /* NOTREACHED */
5254
5255
5256         case CURLY_B_min_known_fail:
5257             /* failed to find B in a non-greedy match where c1,c2 valid */
5258             if (ST.paren && ST.count)
5259                 PL_regoffs[ST.paren].end = -1;
5260
5261             PL_reginput = locinput;     /* Could be reset... */
5262             REGCP_UNWIND(ST.cp);
5263             /* Couldn't or didn't -- move forward. */
5264             ST.oldloc = locinput;
5265             if (utf8_target)
5266                 locinput += UTF8SKIP(locinput);
5267             else
5268                 locinput++;
5269             ST.count++;
5270           curly_try_B_min_known:
5271              /* find the next place where 'B' could work, then call B */
5272             {
5273                 int n;
5274                 if (utf8_target) {
5275                     n = (ST.oldloc == locinput) ? 0 : 1;
5276                     if (ST.c1 == ST.c2) {
5277                         STRLEN len;
5278                         /* set n to utf8_distance(oldloc, locinput) */
5279                         while (locinput <= ST.maxpos &&
5280                                utf8n_to_uvchr((U8*)locinput,
5281                                               UTF8_MAXBYTES, &len,
5282                                               uniflags) != (UV)ST.c1) {
5283                             locinput += len;
5284                             n++;
5285                         }
5286                     }
5287                     else {
5288                         /* set n to utf8_distance(oldloc, locinput) */
5289                         while (locinput <= ST.maxpos) {
5290                             STRLEN len;
5291                             const UV c = utf8n_to_uvchr((U8*)locinput,
5292                                                   UTF8_MAXBYTES, &len,
5293                                                   uniflags);
5294                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5295                                 break;
5296                             locinput += len;
5297                             n++;
5298                         }
5299                     }
5300                 }
5301                 else {
5302                     if (ST.c1 == ST.c2) {
5303                         while (locinput <= ST.maxpos &&
5304                                UCHARAT(locinput) != ST.c1)
5305                             locinput++;
5306                     }
5307                     else {
5308                         while (locinput <= ST.maxpos
5309                                && UCHARAT(locinput) != ST.c1
5310                                && UCHARAT(locinput) != ST.c2)
5311                             locinput++;
5312                     }
5313                     n = locinput - ST.oldloc;
5314                 }
5315                 if (locinput > ST.maxpos)
5316                     sayNO;
5317                 /* PL_reginput == oldloc now */
5318                 if (n) {
5319                     ST.count += n;
5320                     if (regrepeat(rex, ST.A, n, depth) < n)
5321                         sayNO;
5322                 }
5323                 PL_reginput = locinput;
5324                 CURLY_SETPAREN(ST.paren, ST.count);
5325                 if (cur_eval && cur_eval->u.eval.close_paren &&
5326                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5327                     goto fake_end;
5328                 }
5329                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5330             }
5331             /* NOTREACHED */
5332
5333
5334         case CURLY_B_min_fail:
5335             /* failed to find B in a non-greedy match where c1,c2 invalid */
5336             if (ST.paren && ST.count)
5337                 PL_regoffs[ST.paren].end = -1;
5338
5339             REGCP_UNWIND(ST.cp);
5340             /* failed -- move forward one */
5341             PL_reginput = locinput;
5342             if (regrepeat(rex, ST.A, 1, depth)) {
5343                 ST.count++;
5344                 locinput = PL_reginput;
5345                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5346                         ST.count > 0)) /* count overflow ? */
5347                 {
5348                   curly_try_B_min:
5349                     CURLY_SETPAREN(ST.paren, ST.count);
5350                     if (cur_eval && cur_eval->u.eval.close_paren &&
5351                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5352                         goto fake_end;
5353                     }
5354                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5355                 }
5356             }
5357             sayNO;
5358             /* NOTREACHED */
5359
5360
5361         curly_try_B_max:
5362             /* a successful greedy match: now try to match B */
5363             if (cur_eval && cur_eval->u.eval.close_paren &&
5364                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5365                 goto fake_end;
5366             }
5367             {
5368                 UV c = 0;
5369                 if (ST.c1 != CHRTEST_VOID)
5370                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5371                                            UTF8_MAXBYTES, 0, uniflags)
5372                                 : (UV) UCHARAT(PL_reginput);
5373                 /* If it could work, try it. */
5374                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5375                     CURLY_SETPAREN(ST.paren, ST.count);
5376                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5377                     /* NOTREACHED */
5378                 }
5379             }
5380             /* FALL THROUGH */
5381         case CURLY_B_max_fail:
5382             /* failed to find B in a greedy match */
5383             if (ST.paren && ST.count)
5384                 PL_regoffs[ST.paren].end = -1;
5385
5386             REGCP_UNWIND(ST.cp);
5387             /*  back up. */
5388             if (--ST.count < ST.min)
5389                 sayNO;
5390             PL_reginput = locinput = HOPc(locinput, -1);
5391             goto curly_try_B_max;
5392
5393 #undef ST
5394
5395         case END:
5396             fake_end:
5397             if (cur_eval) {
5398                 /* we've just finished A in /(??{A})B/; now continue with B */
5399                 I32 tmpix;
5400                 st->u.eval.toggle_reg_flags
5401                             = cur_eval->u.eval.toggle_reg_flags;
5402                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5403
5404                 st->u.eval.prev_rex = rex_sv;           /* inner */
5405                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5406                 rex = (struct regexp *)SvANY(rex_sv);
5407                 rexi = RXi_GET(rex);
5408                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5409                 (void)ReREFCNT_inc(rex_sv);
5410                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5411
5412                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5413                 PL_reglastparen = &rex->lastparen;
5414                 PL_reglastcloseparen = &rex->lastcloseparen;
5415
5416                 REGCP_SET(st->u.eval.lastcp);
5417                 PL_reginput = locinput;
5418
5419                 /* Restore parens of the outer rex without popping the
5420                  * savestack */
5421                 tmpix = PL_savestack_ix;
5422                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5423                 regcppop(rex);
5424                 PL_savestack_ix = tmpix;
5425
5426                 st->u.eval.prev_eval = cur_eval;
5427                 cur_eval = cur_eval->u.eval.prev_eval;
5428                 DEBUG_EXECUTE_r(
5429                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5430                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5431                 if ( nochange_depth )
5432                     nochange_depth--;
5433
5434                 PUSH_YES_STATE_GOTO(EVAL_AB,
5435                         st->u.eval.prev_eval->u.eval.B); /* match B */
5436             }
5437
5438             if (locinput < reginfo->till) {
5439                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5440                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5441                                       PL_colors[4],
5442                                       (long)(locinput - PL_reg_starttry),
5443                                       (long)(reginfo->till - PL_reg_starttry),
5444                                       PL_colors[5]));
5445
5446                 sayNO_SILENT;           /* Cannot match: too short. */
5447             }
5448             PL_reginput = locinput;     /* put where regtry can find it */
5449             sayYES;                     /* Success! */
5450
5451         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5452             DEBUG_EXECUTE_r(
5453             PerlIO_printf(Perl_debug_log,
5454                 "%*s  %ssubpattern success...%s\n",
5455                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5456             PL_reginput = locinput;     /* put where regtry can find it */
5457             sayYES;                     /* Success! */
5458
5459 #undef  ST
5460 #define ST st->u.ifmatch
5461
5462         case SUSPEND:   /* (?>A) */
5463             ST.wanted = 1;
5464             PL_reginput = locinput;
5465             goto do_ifmatch;
5466
5467         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5468             ST.wanted = 0;
5469             goto ifmatch_trivial_fail_test;
5470
5471         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5472             ST.wanted = 1;
5473           ifmatch_trivial_fail_test:
5474             if (scan->flags) {
5475                 char * const s = HOPBACKc(locinput, scan->flags);
5476                 if (!s) {
5477                     /* trivial fail */
5478                     if (logical) {
5479                         logical = 0;
5480                         sw = 1 - cBOOL(ST.wanted);
5481                     }
5482                     else if (ST.wanted)
5483                         sayNO;
5484                     next = scan + ARG(scan);
5485                     if (next == scan)
5486                         next = NULL;
5487                     break;
5488                 }
5489                 PL_reginput = s;
5490             }
5491             else
5492                 PL_reginput = locinput;
5493
5494           do_ifmatch:
5495             ST.me = scan;
5496             ST.logical = logical;
5497             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5498
5499             /* execute body of (?...A) */
5500             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5501             /* NOTREACHED */
5502
5503         case IFMATCH_A_fail: /* body of (?...A) failed */
5504             ST.wanted = !ST.wanted;
5505             /* FALL THROUGH */
5506
5507         case IFMATCH_A: /* body of (?...A) succeeded */
5508             if (ST.logical) {
5509                 sw = cBOOL(ST.wanted);
5510             }
5511             else if (!ST.wanted)
5512                 sayNO;
5513
5514             if (OP(ST.me) == SUSPEND)
5515                 locinput = PL_reginput;
5516             else {
5517                 locinput = PL_reginput = st->locinput;
5518                 nextchr = UCHARAT(locinput);
5519             }
5520             scan = ST.me + ARG(ST.me);
5521             if (scan == ST.me)
5522                 scan = NULL;
5523             continue; /* execute B */
5524
5525 #undef ST
5526
5527         case LONGJMP:
5528             next = scan + ARG(scan);
5529             if (next == scan)
5530                 next = NULL;
5531             break;
5532         case COMMIT:
5533             reginfo->cutpoint = PL_regeol;
5534             /* FALLTHROUGH */
5535         case PRUNE:
5536             PL_reginput = locinput;
5537             if (!scan->flags)
5538                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5539             PUSH_STATE_GOTO(COMMIT_next,next);
5540             /* NOTREACHED */
5541         case COMMIT_next_fail:
5542             no_final = 1;
5543             /* FALLTHROUGH */
5544         case OPFAIL:
5545             sayNO;
5546             /* NOTREACHED */
5547
5548 #define ST st->u.mark
5549         case MARKPOINT:
5550             ST.prev_mark = mark_state;
5551             ST.mark_name = sv_commit = sv_yes_mark
5552                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5553             mark_state = st;
5554             ST.mark_loc = PL_reginput = locinput;
5555             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5556             /* NOTREACHED */
5557         case MARKPOINT_next:
5558             mark_state = ST.prev_mark;
5559             sayYES;
5560             /* NOTREACHED */
5561         case MARKPOINT_next_fail:
5562             if (popmark && sv_eq(ST.mark_name,popmark))
5563             {
5564                 if (ST.mark_loc > startpoint)
5565                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5566                 popmark = NULL; /* we found our mark */
5567                 sv_commit = ST.mark_name;
5568
5569                 DEBUG_EXECUTE_r({
5570                         PerlIO_printf(Perl_debug_log,
5571                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5572                             REPORT_CODE_OFF+depth*2, "",
5573                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5574                 });
5575             }
5576             mark_state = ST.prev_mark;
5577             sv_yes_mark = mark_state ?
5578                 mark_state->u.mark.mark_name : NULL;
5579             sayNO;
5580             /* NOTREACHED */
5581         case SKIP:
5582             PL_reginput = locinput;
5583             if (scan->flags) {
5584                 /* (*SKIP) : if we fail we cut here*/
5585                 ST.mark_name = NULL;
5586                 ST.mark_loc = locinput;
5587                 PUSH_STATE_GOTO(SKIP_next,next);
5588             } else {
5589                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5590                    otherwise do nothing.  Meaning we need to scan
5591                  */
5592                 regmatch_state *cur = mark_state;
5593                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5594
5595                 while (cur) {
5596                     if ( sv_eq( cur->u.mark.mark_name,
5597                                 find ) )
5598                     {
5599                         ST.mark_name = find;
5600                         PUSH_STATE_GOTO( SKIP_next, next );
5601                     }
5602                     cur = cur->u.mark.prev_mark;
5603                 }
5604             }
5605             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5606             break;
5607         case SKIP_next_fail:
5608             if (ST.mark_name) {
5609                 /* (*CUT:NAME) - Set up to search for the name as we
5610                    collapse the stack*/
5611                 popmark = ST.mark_name;
5612             } else {
5613                 /* (*CUT) - No name, we cut here.*/
5614                 if (ST.mark_loc > startpoint)
5615                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5616                 /* but we set sv_commit to latest mark_name if there
5617                    is one so they can test to see how things lead to this
5618                    cut */
5619                 if (mark_state)
5620                     sv_commit=mark_state->u.mark.mark_name;
5621             }
5622             no_final = 1;
5623             sayNO;
5624             /* NOTREACHED */
5625 #undef ST
5626         case FOLDCHAR:
5627             n = ARG(scan);
5628             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5629                 locinput += ln;
5630             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5631                 sayNO;
5632             } else  {
5633                 U8 folded[UTF8_MAXBYTES_CASE+1];
5634                 STRLEN foldlen;
5635                 const char * const l = locinput;
5636                 char *e = PL_regeol;
5637                 to_uni_fold(n, folded, &foldlen);
5638
5639                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5640                                l, &e, 0,  utf8_target)) {
5641                         sayNO;
5642                 }
5643                 locinput = e;
5644             }
5645             nextchr = UCHARAT(locinput);
5646             break;
5647         case LNBREAK:
5648             if ((n=is_LNBREAK(locinput,utf8_target))) {
5649                 locinput += n;
5650                 nextchr = UCHARAT(locinput);
5651             } else
5652                 sayNO;
5653             break;
5654
5655 #define CASE_CLASS(nAmE)                              \
5656         case nAmE:                                    \
5657             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5658                 locinput += n;                        \
5659                 nextchr = UCHARAT(locinput);          \
5660             } else                                    \
5661                 sayNO;                                \
5662             break;                                    \
5663         case N##nAmE:                                 \
5664             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5665                 sayNO;                                \
5666             } else {                                  \
5667                 locinput += UTF8SKIP(locinput);       \
5668                 nextchr = UCHARAT(locinput);          \
5669             }                                         \
5670             break
5671
5672         CASE_CLASS(VERTWS);
5673         CASE_CLASS(HORIZWS);
5674 #undef CASE_CLASS
5675
5676         default:
5677             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5678                           PTR2UV(scan), OP(scan));
5679             Perl_croak(aTHX_ "regexp memory corruption");
5680
5681         } /* end switch */
5682
5683         /* switch break jumps here */
5684         scan = next; /* prepare to execute the next op and ... */
5685         continue;    /* ... jump back to the top, reusing st */
5686         /* NOTREACHED */
5687
5688       push_yes_state:
5689         /* push a state that backtracks on success */
5690         st->u.yes.prev_yes_state = yes_state;
5691         yes_state = st;
5692         /* FALL THROUGH */
5693       push_state:
5694         /* push a new regex state, then continue at scan  */
5695         {
5696             regmatch_state *newst;
5697
5698             DEBUG_STACK_r({
5699                 regmatch_state *cur = st;
5700                 regmatch_state *curyes = yes_state;
5701                 int curd = depth;
5702                 regmatch_slab *slab = PL_regmatch_slab;
5703                 for (;curd > -1;cur--,curd--) {
5704                     if (cur < SLAB_FIRST(slab)) {
5705                         slab = slab->prev;
5706                         cur = SLAB_LAST(slab);
5707                     }
5708                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5709                         REPORT_CODE_OFF + 2 + depth * 2,"",
5710                         curd, PL_reg_name[cur->resume_state],
5711                         (curyes == cur) ? "yes" : ""
5712                     );
5713                     if (curyes == cur)
5714                         curyes = cur->u.yes.prev_yes_state;
5715                 }
5716             } else
5717                 DEBUG_STATE_pp("push")
5718             );
5719             depth++;
5720             st->locinput = locinput;
5721             newst = st+1;
5722             if (newst >  SLAB_LAST(PL_regmatch_slab))
5723                 newst = S_push_slab(aTHX);
5724             PL_regmatch_state = newst;
5725
5726             locinput = PL_reginput;
5727             nextchr = UCHARAT(locinput);
5728             st = newst;
5729             continue;
5730             /* NOTREACHED */
5731         }
5732     }
5733
5734     /*
5735     * We get here only if there's trouble -- normally "case END" is
5736     * the terminating point.
5737     */
5738     Perl_croak(aTHX_ "corrupted regexp pointers");
5739     /*NOTREACHED*/
5740     sayNO;
5741
5742 yes:
5743     if (yes_state) {
5744         /* we have successfully completed a subexpression, but we must now
5745          * pop to the state marked by yes_state and continue from there */
5746         assert(st != yes_state);
5747 #ifdef DEBUGGING
5748         while (st != yes_state) {
5749             st--;
5750             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5751                 PL_regmatch_slab = PL_regmatch_slab->prev;
5752                 st = SLAB_LAST(PL_regmatch_slab);
5753             }
5754             DEBUG_STATE_r({
5755                 if (no_final) {
5756                     DEBUG_STATE_pp("pop (no final)");
5757                 } else {
5758                     DEBUG_STATE_pp("pop (yes)");
5759                 }
5760             });
5761             depth--;
5762         }
5763 #else
5764         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5765             || yes_state > SLAB_LAST(PL_regmatch_slab))
5766         {
5767             /* not in this slab, pop slab */
5768             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5769             PL_regmatch_slab = PL_regmatch_slab->prev;
5770             st = SLAB_LAST(PL_regmatch_slab);
5771         }
5772         depth -= (st - yes_state);
5773 #endif
5774         st = yes_state;
5775         yes_state = st->u.yes.prev_yes_state;
5776         PL_regmatch_state = st;
5777
5778         if (no_final) {
5779             locinput= st->locinput;
5780             nextchr = UCHARAT(locinput);
5781         }
5782         state_num = st->resume_state + no_final;
5783         goto reenter_switch;
5784     }
5785
5786     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5787                           PL_colors[4], PL_colors[5]));
5788
5789     if (PL_reg_eval_set) {
5790         /* each successfully executed (?{...}) block does the equivalent of
5791          *   local $^R = do {...}
5792          * When popping the save stack, all these locals would be undone;
5793          * bypass this by setting the outermost saved $^R to the latest
5794          * value */
5795         if (oreplsv != GvSV(PL_replgv))
5796             sv_setsv(oreplsv, GvSV(PL_replgv));
5797     }
5798     result = 1;
5799     goto final_exit;
5800
5801 no:
5802     DEBUG_EXECUTE_r(
5803         PerlIO_printf(Perl_debug_log,
5804             "%*s  %sfailed...%s\n",
5805             REPORT_CODE_OFF+depth*2, "",
5806             PL_colors[4], PL_colors[5])
5807         );
5808
5809 no_silent:
5810     if (no_final) {
5811         if (yes_state) {
5812             goto yes;
5813         } else {
5814             goto final_exit;
5815         }
5816     }
5817     if (depth) {
5818         /* there's a previous state to backtrack to */
5819         st--;
5820         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5821             PL_regmatch_slab = PL_regmatch_slab->prev;
5822             st = SLAB_LAST(PL_regmatch_slab);
5823         }
5824         PL_regmatch_state = st;
5825         locinput= st->locinput;
5826         nextchr = UCHARAT(locinput);
5827
5828         DEBUG_STATE_pp("pop");
5829         depth--;
5830         if (yes_state == st)
5831             yes_state = st->u.yes.prev_yes_state;
5832
5833         state_num = st->resume_state + 1; /* failure = success + 1 */
5834         goto reenter_switch;
5835     }
5836     result = 0;
5837
5838   final_exit:
5839     if (rex->intflags & PREGf_VERBARG_SEEN) {
5840         SV *sv_err = get_sv("REGERROR", 1);
5841         SV *sv_mrk = get_sv("REGMARK", 1);
5842         if (result) {
5843             sv_commit = &PL_sv_no;
5844             if (!sv_yes_mark)
5845                 sv_yes_mark = &PL_sv_yes;
5846         } else {
5847             if (!sv_commit)
5848                 sv_commit = &PL_sv_yes;
5849             sv_yes_mark = &PL_sv_no;
5850         }
5851         sv_setsv(sv_err, sv_commit);
5852         sv_setsv(sv_mrk, sv_yes_mark);
5853     }
5854
5855     /* clean up; in particular, free all slabs above current one */
5856     LEAVE_SCOPE(oldsave);
5857
5858     return result;
5859 }
5860
5861 /*
5862  - regrepeat - repeatedly match something simple, report how many
5863  *
5864  * Starting at PL_reginput, match node 'p' of 'prog' as many times in a row
5865  * as possible, but at most 'max' times (REG_INFTY is taken as I32_MAX), then
5866  * advance PL_reginput past the matched text and return the number of matches.
5867  * That is scan - PL_reginput when every match is one byte wide; otherwise
5868  * (utf8 targets, LNBREAK) it is 'hardcount'.  'depth' only indents debug output. */
5869 STATIC I32
5870 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5871 {
5872     dVAR;
5873     register char *scan;                /* current position in the target */
5874     register I32 c;
5875     register char *loceol = PL_regeol;  /* where the scanning loops stop */
5876     register I32 hardcount = 0;         /* count of matches wider than a byte */
5877     register bool utf8_target = PL_reg_match_utf8;
5878     UV utf8_flags;
5879 #ifndef DEBUGGING
5880     PERL_UNUSED_ARG(depth);
5881 #endif
5882
5883     PERL_ARGS_ASSERT_REGREPEAT;
5884
5885     scan = PL_reginput;
5886     if (max == REG_INFTY)
5887         max = I32_MAX;
5888     else if (max < loceol - scan)
5889         loceol = scan + max;    /* byte limit; the hardcount loops undo it */
5890     switch (OP(p)) {
5891     case REG_ANY:
5892         if (utf8_target) {
5893             loceol = PL_regeol;
5894             while (scan < loceol && hardcount < max && *scan != '\n') {
5895                 scan += UTF8SKIP(scan);
5896                 hardcount++;
5897             }
5898         } else {
5899             while (scan < loceol && *scan != '\n')
5900                 scan++;
5901         }
5902         break;
5903     case SANY:
5904         if (utf8_target) {
5905             loceol = PL_regeol;
5906             while (scan < loceol && hardcount < max) {
5907                 scan += UTF8SKIP(scan);
5908                 hardcount++;
5909             }
5910         }
5911         else
5912             scan = loceol;
5913         break;
5914     case CANY:
5915         scan = loceol;
5916         break;
5917     case EXACT:
5918         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5919          * means they match only characters in the string that can be expressed
5920          * as a single byte.  For non-utf8 strings, that means a simple match.
5921          * For utf8 strings, the character matched must be an invariant, or
5922          * downgradable to a single byte.  The pattern's utf8ness is
5923          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5924          * it is, it's an invariant */
5925
5926         c = (U8)*STRING(p);
5927         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5928
5929         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5930             while (scan < loceol && UCHARAT(scan) == c) {
5931                 scan++;
5932             }
5933         }
5934         else {
5935
5936             /* Here, the string is utf8, and the pattern char is different
5937              * in utf8 than not, so can't compare them directly.  Outside the
5938              * loop, find the two utf8 bytes that represent c, and then
5939              * look for those in sequence in the utf8 string */
5940             U8 high = UTF8_TWO_BYTE_HI(c);
5941             U8 low = UTF8_TWO_BYTE_LO(c);
5942             loceol = PL_regeol;
5943
5944             while (hardcount < max
5945                     && scan + 1 < loceol
5946                     && UCHARAT(scan) == high
5947                     && UCHARAT(scan + 1) == low)
5948             {
5949                 scan += 2;
5950                 hardcount++;
5951             }
5952         }
5953         break;
5954     case EXACTFA:
5955         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5956         goto do_exactf;
5957
5958     case EXACTFL:
5959         PL_reg_flags |= RF_tainted;
5960         utf8_flags = FOLDEQ_UTF8_LOCALE;
5961         goto do_exactf;
5962
5963     case EXACTF:
5964     case EXACTFU:
5965         utf8_flags = 0;
5966
5967         /* The comments for the EXACT case above apply as well to these fold
5968          * ones */
5969
5970     do_exactf:
5971         c = (U8)*STRING(p);
5972         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5973
5974         if (utf8_target) { /* Use full Unicode fold matching */
5975             char *tmpeol = loceol;
5976             while (hardcount < max
5977                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
5978                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
5979             {
5980                 scan = tmpeol;
5981                 tmpeol = loceol;
5982                 hardcount++;
5983             }
5984
5985             /* XXX Note that the above handles properly the German sharp s in
5986              * the pattern matching ss in the string.  But it doesn't handle
5987              * properly cases where the string contains say 'LIGATURE ff' and
5988              * the pattern is 'f+'.  This would require, say, a new function or
5989              * revised interface to foldEQ_utf8(), in which the maximum number
5990              * of characters to match could be passed and it would return how
5991              * many actually did.  This is just one of many cases where
5992              * multi-char folds don't work properly, and so the fix is being
5993              * deferred */
5994         }
5995         else {
5996             U8 folded;
5997
5998             /* Here, the string isn't utf8 and c is a single byte; and either
5999              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6000              * doesn't affect c.  Can just do simple comparisons for exact or
6001              * fold matching. */
6002             switch (OP(p)) {
6003                 case EXACTF: folded = PL_fold[c]; break;
6004                 case EXACTFA:
6005                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6006                 case EXACTFL: folded = PL_fold_locale[c]; break;
6007                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6008             }
6009             while (scan < loceol &&
6010                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6011             {
6012                 scan++;
6013             }
6014         }
6015         break;
6016     case ANYOFV:
6017     case ANYOF:
6018         if (utf8_target || OP(p) == ANYOFV) {
6019             STRLEN inclasslen;
6020             loceol = PL_regeol;
6021             inclasslen = loceol - scan;
6022             while (hardcount < max
6023                    && ((inclasslen = loceol - scan) > 0)
6024                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6025             {
6026                 scan += inclasslen;
6027                 hardcount++;
6028             }
6029         } else {
6030             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6031                 scan++;
6032         }
6033         break;
6034     case ALNUMU:
6035         if (utf8_target) {
6036     utf8_wordchar:
6037             loceol = PL_regeol;
6038             LOAD_UTF8_CHARCLASS_ALNUM();
6039             while (hardcount < max && scan < loceol &&
6040                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6041             {
6042                 scan += UTF8SKIP(scan);
6043                 hardcount++;
6044             }
6045         } else {
6046             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6047                 scan++;
6048             }
6049         }
6050         break;
6051     case ALNUM:
6052         if (utf8_target)
6053             goto utf8_wordchar;
6054         while (scan < loceol && isALNUM((U8) *scan)) {
6055             scan++;
6056         }
6057         break;
6058     case ALNUMA:
6059         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6060             scan++;
6061         }
6062         break;
6063     case ALNUML:
6064         PL_reg_flags |= RF_tainted;
6065         if (utf8_target) {
6066             loceol = PL_regeol;
6067             while (hardcount < max && scan < loceol &&
6068                    isALNUM_LC_utf8((U8*)scan)) {
6069                 scan += UTF8SKIP(scan);
6070                 hardcount++;
6071             }
6072         } else {
6073             while (scan < loceol && isALNUM_LC(*scan))
6074                 scan++;
6075         }
6076         break;
6077     case NALNUMU:
6078         if (utf8_target) {
6079
6080     utf8_Nwordchar:
6081
6082             loceol = PL_regeol;
6083             LOAD_UTF8_CHARCLASS_ALNUM();
6084             while (hardcount < max && scan < loceol &&
6085                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6086             {
6087                 scan += UTF8SKIP(scan);
6088                 hardcount++;
6089             }
6090         } else {
6091             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6092                 scan++;
6093             }
6094         }
6095         break;
6096     case NALNUM:
6097         if (utf8_target)
6098             goto utf8_Nwordchar;
6099         while (scan < loceol && ! isALNUM((U8) *scan)) {
6100             scan++;
6101         }
6102         break;
6103     case NALNUMA:
6104         if (utf8_target) {
6105             loceol = PL_regeol; /* steps are whole chars: count them, not bytes */
6106             while (hardcount < max && scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6107                 scan += UTF8SKIP(scan);
6108                 hardcount++;
6109             }
6110         } else {
6111             while (scan < loceol && ! isWORDCHAR_A((U8) *scan))
6112                 scan++;
6113         }
6114         break;
6115     case NALNUML:
6116         PL_reg_flags |= RF_tainted;
6117         if (utf8_target) {
6118             loceol = PL_regeol;
6119             while (hardcount < max && scan < loceol &&
6120                    !isALNUM_LC_utf8((U8*)scan)) {
6121                 scan += UTF8SKIP(scan);
6122                 hardcount++;
6123             }
6124         } else {
6125             while (scan < loceol && !isALNUM_LC(*scan))
6126                 scan++;
6127         }
6128         break;
6129     case SPACEU:
6130         if (utf8_target) {
6131
6132     utf8_space:
6133
6134             loceol = PL_regeol;
6135             LOAD_UTF8_CHARCLASS_SPACE();
6136             while (hardcount < max && scan < loceol &&
6137                    (*scan == ' ' ||
6138                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6139             {
6140                 scan += UTF8SKIP(scan);
6141                 hardcount++;
6142             }
6143             break;
6144         }
6145         else {
6146             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6147                 scan++;
6148             }
6149             break;
6150         }
6151     case SPACE:
6152         if (utf8_target)
6153             goto utf8_space;
6154
6155         while (scan < loceol && isSPACE((U8) *scan)) {
6156             scan++;
6157         }
6158         break;
6159     case SPACEA:
6160         while (scan < loceol && isSPACE_A((U8) *scan)) {
6161             scan++;
6162         }
6163         break;
6164     case SPACEL:
6165         PL_reg_flags |= RF_tainted;
6166         if (utf8_target) {
6167             loceol = PL_regeol;
6168             while (hardcount < max && scan < loceol &&
6169                    isSPACE_LC_utf8((U8*)scan)) {
6170                 scan += UTF8SKIP(scan);
6171                 hardcount++;
6172             }
6173         } else {
6174             while (scan < loceol && isSPACE_LC(*scan))
6175                 scan++;
6176         }
6177         break;
6178     case NSPACEU:
6179         if (utf8_target) {
6180
6181     utf8_Nspace:
6182
6183             loceol = PL_regeol;
6184             LOAD_UTF8_CHARCLASS_SPACE();
6185             while (hardcount < max && scan < loceol &&
6186                    ! (*scan == ' ' ||
6187                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6188             {
6189                 scan += UTF8SKIP(scan);
6190                 hardcount++;
6191             }
6192             break;
6193         }
6194         else {
6195             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6196                 scan++;
6197             }
6198         }
6199         break;
6200     case NSPACE:
6201         if (utf8_target)
6202             goto utf8_Nspace;
6203
6204         while (scan < loceol && ! isSPACE((U8) *scan)) {
6205             scan++;
6206         }
6207         break;
6208     case NSPACEA:
6209         if (utf8_target) {
6210             loceol = PL_regeol; /* steps are whole chars: count them, not bytes */
6211             while (hardcount < max && scan < loceol && ! isSPACE_A((U8) *scan)) {
6212                 scan += UTF8SKIP(scan);
6213                 hardcount++;
6214             }
6215         } else {
6216             while (scan < loceol && ! isSPACE_A((U8) *scan))
6217                 scan++;
6218         }
6219         break;
6220     case NSPACEL:
6221         PL_reg_flags |= RF_tainted;
6222         if (utf8_target) {
6223             loceol = PL_regeol;
6224             while (hardcount < max && scan < loceol &&
6225                    !isSPACE_LC_utf8((U8*)scan)) {
6226                 scan += UTF8SKIP(scan);
6227                 hardcount++;
6228             }
6229         } else {
6230             while (scan < loceol && !isSPACE_LC(*scan))
6231                 scan++;
6232         }
6233         break;
6234     case DIGIT:
6235         if (utf8_target) {
6236             loceol = PL_regeol;
6237             LOAD_UTF8_CHARCLASS_DIGIT();
6238             while (hardcount < max && scan < loceol &&
6239                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6240                 scan += UTF8SKIP(scan);
6241                 hardcount++;
6242             }
6243         } else {
6244             while (scan < loceol && isDIGIT(*scan))
6245                 scan++;
6246         }
6247         break;
6248     case DIGITA:
6249         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6250             scan++;
6251         }
6252         break;
6253     case DIGITL:
6254         PL_reg_flags |= RF_tainted;
6255         if (utf8_target) {
6256             loceol = PL_regeol;
6257             while (hardcount < max && scan < loceol &&
6258                    isDIGIT_LC_utf8((U8*)scan)) {
6259                 scan += UTF8SKIP(scan);
6260                 hardcount++;
6261             }
6262         } else {
6263             while (scan < loceol && isDIGIT_LC(*scan))
6264                 scan++;
6265         }
6266         break;
6267     case NDIGIT:
6268         if (utf8_target) {
6269             loceol = PL_regeol;
6270             LOAD_UTF8_CHARCLASS_DIGIT();
6271             while (hardcount < max && scan < loceol &&
6272                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6273                 scan += UTF8SKIP(scan);
6274                 hardcount++;
6275             }
6276         } else {
6277             while (scan < loceol && !isDIGIT(*scan))
6278                 scan++;
6279         }
6280         break;
6281     case NDIGITA:
6282         if (utf8_target) {
6283             loceol = PL_regeol; /* steps are whole chars: count them, not bytes */
6284             while (hardcount < max && scan < loceol && ! isDIGIT_A((U8) *scan)) {
6285                 scan += UTF8SKIP(scan);
6286                 hardcount++;
6287             }
6288         } else {
6289             while (scan < loceol && ! isDIGIT_A((U8) *scan))
6290                 scan++;
6291         }
6292         break;
6293     case NDIGITL:
6294         PL_reg_flags |= RF_tainted;
6295         if (utf8_target) {
6296             loceol = PL_regeol;
6297             while (hardcount < max && scan < loceol &&
6298                    !isDIGIT_LC_utf8((U8*)scan)) {
6299                 scan += UTF8SKIP(scan);
6300                 hardcount++;
6301             }
6302         } else {
6303             while (scan < loceol && !isDIGIT_LC(*scan))
6304                 scan++;
6305         }
6306         break;
6307     case LNBREAK:
6308         if (utf8_target) {
6309             loceol = PL_regeol;
6310             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6311                 scan += c;
6312                 hardcount++;
6313             }
6314         } else {
6315             /*
6316               LNBREAK can match two latin chars, which is ok,
6317               because we have a null terminated string, but we
6318               have to use hardcount in this situation
6319             */
6320             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6321                 scan+=c;
6322                 hardcount++;
6323             }
6324         }
6325         break;
6326     case HORIZWS:
6327         if (utf8_target) {
6328             loceol = PL_regeol;
6329             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6330                 scan += c;
6331                 hardcount++;
6332             }
6333         } else {
6334             while (scan < loceol && is_HORIZWS_latin1(scan))
6335                 scan++;
6336         }
6337         break;
6338     case NHORIZWS:
6339         if (utf8_target) {
6340             loceol = PL_regeol;
6341             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6342                 scan += UTF8SKIP(scan);
6343                 hardcount++;
6344             }
6345         } else {
6346             while (scan < loceol && !is_HORIZWS_latin1(scan))
6347                 scan++;
6348
6349         }
6350         break;
6351     case VERTWS:
6352         if (utf8_target) {
6353             loceol = PL_regeol;
6354             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6355                 scan += c;
6356                 hardcount++;
6357             }
6358         } else {
6359             while (scan < loceol && is_VERTWS_latin1(scan))
6360                 scan++;
6361
6362         }
6363         break;
6364     case NVERTWS:
6365         if (utf8_target) {
6366             loceol = PL_regeol;
6367             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6368                 scan += UTF8SKIP(scan);
6369                 hardcount++;
6370             }
6371         } else {
6372             while (scan < loceol && !is_VERTWS_latin1(scan))
6373                 scan++;
6374
6375         }
6376         break;
6377
6378     default:            /* Called on something of 0 width. */
6379         break;          /* So match right here or not at all. */
6380     }
6381     /* A non-zero hardcount is the answer; else every match was one byte wide */
6382     if (hardcount)
6383         c = hardcount;
6384     else
6385         c = scan - PL_reginput;
6386     PL_reginput = scan;
6387
6388     DEBUG_r({
6389         GET_RE_DEBUG_FLAGS_DECL;
6390         DEBUG_EXECUTE_r({
6391             SV * const prop = sv_newmortal();
6392             regprop(prog, prop, p);
6393             PerlIO_printf(Perl_debug_log,
6394                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6395                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6396         });
6397     });
6398
6399     return(c);
6400 }
6401
6402
6403 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6404 /*
6405 - regclass_swash - fetch, and if requested create, the utf8 swash of an
6406 -                  ANYOF node
6407 *
6408 * The swash lives in the 's' entry of prog's data array that ARG(node) selects.
6409 * A swash already stored there is returned as is; otherwise, when 'doinit' is
6410 * true and the entry's first element is non-NULL, one is built from it with
6411 * swash_init(), stored back into the entry and returned.  In every other case
6412 * NULL is returned.
6413 *
6414 * If non-NULL, *listsvp is set to the entry's first element and *altsvp to
6415 * its third element when that one is an AV; otherwise they are set to NULL.
6416 */
6417 SV *
6418 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6419 {
6420     dVAR;
6421     SV *swash = NULL;
6422     SV *list_sv = NULL;
6423     SV *alt_av = NULL;
6424     RXi_GET_DECL(prog,progi);
6425     const struct reg_data * const data = prog ? progi->data : NULL;
6426
6427     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6428
6429     assert(ANYOF_NONBITMAP(node));
6430
6431     if (data && data->count && data->what[ARG(node)] == 's') {
6432         SV * const entry_ref = MUTABLE_SV(data->data[ARG(node)]);
6433         AV * const entry = MUTABLE_AV(SvRV(entry_ref));
6434         SV ** const elems = AvARRAY(entry);
6435         /* See the end of regcomp.c:S_regclass() for documentation of
6436          * these array elements. */
6437         SV ** const swash_slot = SvROK(elems[1]) ? &elems[1] : NULL;
6438         SV ** const alt_slot = (SvTYPE(elems[2]) == SVt_PVAV) ? &elems[2] : NULL;
6439
6440         list_sv = elems[0];
6441         if (swash_slot) {
6442             swash = *swash_slot;        /* already built earlier */
6443         }
6444         else if (list_sv && doinit) {
6445             swash = swash_init("utf8", "", list_sv, 1, 0);
6446             (void)av_store(entry, 1, swash);    /* keep it for next time */
6447         }
6448         if (alt_slot)
6449             alt_av = *alt_slot;
6450     }
6451
6452     /* Both out-parameters are optional */
6453     if (listsvp) *listsvp = list_sv;
6454     if (altsvp)  *altsvp  = alt_av;
6455     return swash;
6456 }
6456 #endif
6457
6458 /*
6459  - reginclass - determine if a character falls into a character class
6460
6461   n is the ANYOF regnode
6462   p is the target string
6463   lenp is pointer to the maximum number of bytes of how far to go in p
6464     (This is assumed wthout checking to always be at least the current
6465     character's size)
6466   utf8_target tells whether p is in UTF-8.
6467
6468   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6469   from a successful match, the value it points to will be updated to how many
6470   bytes in p were matched.  If there was no match, the value is undefined,
6471   possibly changed from the input.
6472
6473   Note that this can be a synthetic start class, a combination of various
6474   nodes, so things you think might be mutually exclusive, such as locale,
6475   aren't.  It can match both locale and non-locale
6476
6477  */
6478
6479 STATIC bool
6480 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6481 {
6482     dVAR;
6483     const char flags = ANYOF_FLAGS(n);
6484     bool match = FALSE;
6485     UV c = *p;
6486     STRLEN c_len = 0;
6487     STRLEN maxlen;
6488
6489     PERL_ARGS_ASSERT_REGINCLASS;
6490
6491     /* If c is not already the code point, get it */
6492     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6493         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6494                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6495                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6496                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6497                  * UTF8_ALLOW_FFFF */
6498         if (c_len == (STRLEN)-1)
6499             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6500     }
6501     else {
6502         c_len = 1;
6503     }
6504
6505     /* Use passed in max length, or one character if none passed in or less
6506      * than one character.  And assume will match just one character.  This is
6507      * overwritten later if matched more. */
6508     if (lenp) {
6509         maxlen = (*lenp > c_len) ? *lenp : c_len;
6510         *lenp = c_len;
6511
6512     }
6513     else {
6514         maxlen = c_len;
6515     }
6516
6517     /* If this character is potentially in the bitmap, check it */
6518     if (c < 256) {
6519         if (ANYOF_BITMAP_TEST(n, c))
6520             match = TRUE;
6521         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6522                 && ! utf8_target
6523                 && ! isASCII(c))
6524         {
6525             match = TRUE;
6526         }
6527
6528         else if (flags & ANYOF_LOCALE) {
6529             PL_reg_flags |= RF_tainted;
6530
6531             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6532                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6533             {
6534                 match = TRUE;
6535             }
6536             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6537                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6538                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6539                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6540                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6541                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6542                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6543                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6544                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6545                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6546                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6547                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6548                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6549                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6550                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6551                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6552                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6553                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6554                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6555                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6556                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6557                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6558                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6559                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6560                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6561                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6562                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6563                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6564                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6565                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6566                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6567                      ) /* How's that for a conditional? */
6568             ) {
6569                 match = TRUE;
6570             }
6571         }
6572     }
6573
6574     /* If the bitmap didn't (or couldn't) match, and something outside the
6575      * bitmap could match, try that.  Locale nodes specifiy completely the
6576      * behavior of code points in the bit map (otherwise, a utf8 target would
6577      * cause them to be treated as Unicode and not locale), except in
6578      * the very unlikely event when this node is a synthetic start class, which
6579      * could be a combination of locale and non-locale nodes.  So allow locale
6580      * to match for the synthetic start class, which will give a false
6581      * positive that will be resolved when the match is done again as not part
6582      * of the synthetic start class */
6583     if (!match) {
6584         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6585             match = TRUE;       /* Everything above 255 matches */
6586         }
6587         else if (ANYOF_NONBITMAP(n)
6588                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6589                      || (utf8_target
6590                          && (c >=256
6591                              || (! (flags & ANYOF_LOCALE))
6592                              || (flags & ANYOF_IS_SYNTHETIC)))))
6593         {
6594             AV *av;
6595             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6596
6597             if (sw) {
6598                 U8 * utf8_p;
6599                 if (utf8_target) {
6600                     utf8_p = (U8 *) p;
6601                 } else {
6602
6603                     /* Not utf8.  Convert as much of the string as available up
6604                      * to the limit of how far the (single) character in the
6605                      * pattern can possibly match (no need to go further).  If
6606                      * the node is a straight ANYOF or not folding, it can't
6607                      * match more than one.  Otherwise, It can match up to how
6608                      * far a single char can fold to.  Since not utf8, each
6609                      * character is a single byte, so the max it can be in
6610                      * bytes is the same as the max it can be in characters */
6611                     STRLEN len = (OP(n) == ANYOF
6612                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6613                                   ? 1
6614                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6615                                     ? maxlen
6616                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6617                     utf8_p = bytes_to_utf8(p, &len);
6618                 }
6619
6620                 if (swash_fetch(sw, utf8_p, TRUE))
6621                     match = TRUE;
6622                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6623
6624                     /* Here, we need to test if the fold of the target string
6625                      * matches.  The non-multi char folds have all been moved to
6626                      * the compilation phase, and the multi-char folds have
6627                      * been stored by regcomp into 'av'; we linearly check to
6628                      * see if any match the target string (folded).   We know
6629                      * that the originals were each one character, but we don't
6630                      * currently know how many characters/bytes each folded to,
6631                      * except we do know that there are small limits imposed by
6632                      * Unicode.  XXX A performance enhancement would be to have
6633                      * regcomp.c store the max number of chars/bytes that are
6634                      * in an av entry, as, say the 0th element.  Even better
6635                      * would be to have a hash of the few characters that can
6636                      * start a multi-char fold to the max number of chars of
6637                      * those folds.
6638                      *
6639                      * If there is a match, we will need to advance (if lenp is
6640                      * specified) the match pointer in the target string.  But
6641                      * what we are comparing here isn't that string directly,
6642                      * but its fold, whose length may differ from the original.
6643                      * As we go along in constructing the fold, therefore, we
6644                      * create a map so that we know how many bytes in the
6645                      * source to advance given that we have matched a certain
6646                      * number of bytes in the fold.  This map is stored in
6647                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6648                      * the fold of the first character that we are folding.
6649                      * Then map_fold_len_back[n] is set to the number of bytes
6650                      * in that first character.  Similarly let m be the
6651                      * corresponding number for the second character to be
6652                      * folded.  Then map_fold_len_back[n+m] is set to the
6653                      * number of bytes occupied by the first two source
6654                      * characters. ... */
6655                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6656                     U8 folded[UTF8_MAXBYTES_CASE+1];
6657                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6658                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6659                                                   chars */
6660
6661                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6662
6663                         /* Here, only need to fold the first char of the target
6664                          * string.  It the source wasn't utf8, is 1 byte long */
6665                         to_utf8_fold(utf8_p, folded, &foldlen);
6666                         total_foldlen = foldlen;
6667                         map_fold_len_back[foldlen] = (utf8_target)
6668                                                      ? UTF8SKIP(utf8_p)
6669                                                      : 1;
6670                     }
6671                     else {
6672
6673                         /* Here, need to fold more than the first char.  Do so
6674                          * up to the limits */
6675                         U8* source_ptr = utf8_p;    /* The source for the fold
6676                                                        is the regex target
6677                                                        string */
6678                         U8* folded_ptr = folded;
6679                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6680                                                        available byte in the
6681                                                        target string */
6682                         U8 i;
6683                         for (i = 0;
6684                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6685                              i++)
6686                         {
6687
6688                             /* Fold the next character */
6689                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6690                             STRLEN this_char_foldlen;
6691                             to_utf8_fold(source_ptr,
6692                                          this_char_folded,
6693                                          &this_char_foldlen);
6694
6695                             /* Bail if it would exceed the byte limit for
6696                              * folding a single char. */
6697                             if (this_char_foldlen + folded_ptr - folded >
6698                                                             UTF8_MAXBYTES_CASE)
6699                             {
6700                                 break;
6701                             }
6702
6703                             /* Add the fold of this character */
6704                             Copy(this_char_folded,
6705                                  folded_ptr,
6706                                  this_char_foldlen,
6707                                  U8);
6708                             source_ptr += UTF8SKIP(source_ptr);
6709                             folded_ptr += this_char_foldlen;
6710                             total_foldlen = folded_ptr - folded;
6711
6712                             /* Create map from the number of bytes in the fold
6713                              * back to the number of bytes in the source.  If
6714                              * the source isn't utf8, the byte count is just
6715                              * the number of characters so far */
6716                             map_fold_len_back[total_foldlen]
6717                                                       = (utf8_target)
6718                                                         ? source_ptr - utf8_p
6719                                                         : i + 1;
6720                         }
6721                         *folded_ptr = '\0';
6722                     }
6723
6724
6725                     /* Do the linear search to see if the fold is in the list
6726                      * of multi-char folds. */
6727                     if (av) {
6728                         I32 i;
6729                         for (i = 0; i <= av_len(av); i++) {
6730                             SV* const sv = *av_fetch(av, i, FALSE);
6731                             STRLEN len;
6732                             const char * const s = SvPV_const(sv, len);
6733
6734                             if (len <= total_foldlen
6735                                 && memEQ(s, (char*)folded, len)
6736
6737                                    /* If 0, means matched a partial char. See
6738                                     * [perl #90536] */
6739                                 && map_fold_len_back[len])
6740                             {
6741
6742                                 /* Advance the target string ptr to account for
6743                                  * this fold, but have to translate from the
6744                                  * folded length to the corresponding source
6745                                  * length. */
6746                                 if (lenp) {
6747                                     *lenp = map_fold_len_back[len];
6748                                 }
6749                                 match = TRUE;
6750                                 break;
6751                             }
6752                         }
6753                     }
6754                 }
6755
6756                 /* If we allocated a string above, free it */
6757                 if (! utf8_target) Safefree(utf8_p);
6758             }
6759         }
6760     }
6761
6762     return (flags & ANYOF_INVERT) ? !match : match;
6763 }
6764
6765 STATIC U8 *
6766 S_reghop3(U8 *s, I32 off, const U8* lim)
6767 {
6768     dVAR;
6769
6770     PERL_ARGS_ASSERT_REGHOP3;
6771
6772     if (off >= 0) {
6773         while (off-- && s < lim) {
6774             /* XXX could check well-formedness here */
6775             s += UTF8SKIP(s);
6776         }
6777     }
6778     else {
6779         while (off++ && s > lim) {
6780             s--;
6781             if (UTF8_IS_CONTINUED(*s)) {
6782                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6783                     s--;
6784             }
6785             /* XXX could check well-formedness here */
6786         }
6787     }
6788     return s;
6789 }
6790
#ifdef XXX_dmq
/* There are a bunch of places where two reghop3 calls are used that should
   be replaced with this routine, but since that's not done yet it is
   ifdef'd out - dmq

   Move 's' by 'off' characters: forwards (bounded by 'rlim') when 'off' is
   positive, backwards (bounded by 'llim') when it is negative.  Returns the
   resulting position.
*/
STATIC U8 *
S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
{
    dVAR;

    PERL_ARGS_ASSERT_REGHOP4;

    if (off >= 0) {
        /* Forward: step over one whole character per remaining hop */
        for (; off != 0 && s < rlim; off--) {
            /* XXX could check well-formedness here */
            s += UTF8SKIP(s);
        }
        return s;
    }

    /* Backward: step back a byte, then keep backing up over continuation
     * bytes so that we land on the start of the character */
    for (; off != 0 && s > llim; off++) {
        s--;
        if (! UTF8_IS_CONTINUED(*s))
            continue;
        while (s > llim && UTF8_IS_CONTINUATION(*s))
            s--;
        /* XXX could check well-formedness here */
    }
    return s;
}
#endif
6822
6823 STATIC U8 *
6824 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6825 {
6826     dVAR;
6827
6828     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6829
6830     if (off >= 0) {
6831         while (off-- && s < lim) {
6832             /* XXX could check well-formedness here */
6833             s += UTF8SKIP(s);
6834         }
6835         if (off >= 0)
6836             return NULL;
6837     }
6838     else {
6839         while (off++ && s > lim) {
6840             s--;
6841             if (UTF8_IS_CONTINUED(*s)) {
6842                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6843                     s--;
6844             }
6845             /* XXX could check well-formedness here */
6846         }
6847         if (off <= 0)
6848             return NULL;
6849     }
6850     return s;
6851 }
6852
6853 static void
6854 restore_pos(pTHX_ void *arg)
6855 {
6856     dVAR;
6857     regexp * const rex = (regexp *)arg;
6858     if (PL_reg_eval_set) {
6859         if (PL_reg_oldsaved) {
6860             rex->subbeg = PL_reg_oldsaved;
6861             rex->sublen = PL_reg_oldsavedlen;
6862 #ifdef PERL_OLD_COPY_ON_WRITE
6863             rex->saved_copy = PL_nrs;
6864 #endif
6865             RXp_MATCH_COPIED_on(rex);
6866         }
6867         PL_reg_magic->mg_len = PL_reg_oldpos;
6868         PL_reg_eval_set = 0;
6869         PL_curpm = PL_reg_oldcurpm;
6870     }
6871 }
6872
6873 STATIC void
6874 S_to_utf8_substr(pTHX_ register regexp *prog)
6875 {
6876     int i = 1;
6877
6878     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6879
6880     do {
6881         if (prog->substrs->data[i].substr
6882             && !prog->substrs->data[i].utf8_substr) {
6883             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6884             prog->substrs->data[i].utf8_substr = sv;
6885             sv_utf8_upgrade(sv);
6886             if (SvVALID(prog->substrs->data[i].substr)) {
6887                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6888                 if (flags & FBMcf_TAIL) {
6889                     /* Trim the trailing \n that fbm_compile added last
6890                        time.  */
6891                     SvCUR_set(sv, SvCUR(sv) - 1);
6892                     /* Whilst this makes the SV technically "invalid" (as its
6893                        buffer is no longer followed by "\0") when fbm_compile()
6894                        adds the "\n" back, a "\0" is restored.  */
6895                 }
6896                 fbm_compile(sv, flags);
6897             }
6898             if (prog->substrs->data[i].substr == prog->check_substr)
6899                 prog->check_utf8 = sv;
6900         }
6901     } while (i--);
6902 }
6903
6904 STATIC void
6905 S_to_byte_substr(pTHX_ register regexp *prog)
6906 {
6907     dVAR;
6908     int i = 1;
6909
6910     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6911
6912     do {
6913         if (prog->substrs->data[i].utf8_substr
6914             && !prog->substrs->data[i].substr) {
6915             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6916             if (sv_utf8_downgrade(sv, TRUE)) {
6917                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6918                     const U8 flags
6919                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6920                     if (flags & FBMcf_TAIL) {
6921                         /* Trim the trailing \n that fbm_compile added last
6922                            time.  */
6923                         SvCUR_set(sv, SvCUR(sv) - 1);
6924                     }
6925                     fbm_compile(sv, flags);
6926                 }
6927             } else {
6928                 SvREFCNT_dec(sv);
6929                 sv = &PL_sv_undef;
6930             }
6931             prog->substrs->data[i].substr = sv;
6932             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6933                 prog->check_substr = sv;
6934         }
6935     } while (i--);
6936 }
6937
6938 /*
6939  * Local variables:
6940  * c-indentation-style: bsd
6941  * c-basic-offset: 4
6942  * indent-tabs-mode: t
6943  * End:
6944  *
6945  * ex: set ts=8 sts=4 sw=4 noet:
6946  */