src/5013011/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * be confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1       /* tainted information used? e.g. locale */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
     /* RF_tainted and RF_utf8 are OR'ed into PL_reg_flags elsewhere in this file.
      * UTF_PATTERN is true exactly when the RF_utf8 bit is set there. */
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
     /* NOTE(review): the RS_* values are not used in the part of the file
      * visible here; their trailing comments are the only description. */
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
     /* Fallback: make STATIC mean 'static' unless it has already been defined. */
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  99  * call if there are no complications: i.e., if everything matchable is
 100  * straightforward in the bitmap.
      *
      * When ANYOF_FLAGS(p) is zero, the byte *(c) is tested directly with
      * ANYOF_BITMAP_TEST(); otherwise reginclass(prog,p,c,0,0) is called. */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102                                               : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
     /* Length of sv: sv_len_utf8(sv) when the enclosing routine's local
      * 'utf8_target' is true, SvCUR(sv) otherwise. */
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
     /* Distance between a and b: utf8_distance(a,b) when PL_reg_match_utf8 is
      * set, the plain pointer difference a - b otherwise. */
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : (a) - (b))
 110
     /* Position-hopping helpers.  Every macro parameter and every replacement
      * list is parenthesized so that compound arguments (e.g. an 'off' of the
      * form x - y) and postfix uses of the result expand as intended.  The
      * arguments may still be evaluated more than once, so do not pass
      * expressions with side effects.
      *
      * HOPc(pos,off): move pos by off.  When PL_reg_match_utf8 is set this goes
      * through reghop3(), limited by PL_regeol for off >= 0 and by PL_bostr for
      * off < 0; otherwise it is plain pointer arithmetic.  Result is char*. */
 111 #define HOPc(pos,off) \
 112         ((char *)(PL_reg_match_utf8 \
 113             ? reghop3((U8*)(pos), (off), (U8*)((off) >= 0 ? PL_regeol : PL_bostr)) \
 114             : (U8*)((pos) + (off))))
     /* HOPBACKc(pos,off): move pos back by off.  When PL_reg_match_utf8 is set
      * this is reghopmaybe3(pos, -off, PL_bostr); otherwise it is pos - off, or
      * NULL if that would lie before PL_bostr.  Result is char*. */
 115 #define HOPBACKc(pos, off) \
 116         ((char*)(PL_reg_match_utf8\
 117             ? reghopmaybe3((U8*)(pos), -(off), (U8*)PL_bostr) \
 118             : ((pos) - (off) >= PL_bostr)           \
 119                 ? (U8*)(pos) - (off)                \
 120                 : NULL))
 121
     /* HOP3(pos,off,lim): like HOPc but with an explicit limit 'lim' handed to
      * reghop3(); yields U8*.  HOP3c is the same with the result cast to char*. */
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), (off), (U8*)(lim)) : (U8*)((pos) + (off)))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defines
      *
      * LOAD_UTF8_CHARCLASS(class,str): if PL_utf8_<class> is still unset, call
      * is_utf8_<class>(str) between ENTER/save_re_context() and LEAVE, and
      * assert that it returned true.  Presumably that call populates
      * PL_utf8_<class> as a side effect (not visible here -- TODO confirm), so
      * 'str' should be a sample character known to belong to the class. */
 126 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 127     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 128
 129 /* Doesn't do an assert to verify that it is correct: same as
      * LOAD_UTF8_CHARCLASS, but always probes with " " and discards the result. */
 130 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 131     if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 132
     /* Convenience wrappers supplying a sample member of each class. */
 133 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 134 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 135 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 136
     /* Loads every class listed below in one go.  Note that the expansion is a
      * sequence of statements, not a single STMT_START/STMT_END statement, and
      * that the last line deliberately has no trailing semicolon. */
 137 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 138         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 139         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 140         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 141             * assert should likely and hopefully fail on an EBCDIC machine */ \
 142         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 143                                                                             \
 144         /* No asserts are done for these, in case called on an early        \
 145             * Unicode version in which they map to nothing */               \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 152         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 153
 154 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 155
 156 /* The actual code for CCC_TRY, which uses several variables from the routine
 157  * it's callable from (locinput, nextchr, utf8_target, PL_regeol) as well as
      * sayNO and 'break'.  It is designed to be the bulk of a case statement.
 158  * FUNC is the macro or function to call on non-utf8 targets that indicates if
 159  *      nextchr matches the class.
 160  * UTF8_TEST is the whole test string to use for utf8 targets
 161  * CLASS and STR are handed to LOAD_UTF8_CHARCLASS to test whether the swash
 162  *      for the class is present and, if not, to load it
 163  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 164  *      UTF8_TEST test.
 165  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 166  * utf8 and a variant, load the swash if necessary and test using the utf8
 167  * test.  Advance to the next character if test is ok, otherwise fail; If not
 168  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 169  * fails, or advance to the next character.
      * Note that the code fails (sayNO) when POS_OR_NEG applied to the test is
      * true, so the positive form of a class is generated by passing '!'. */
 170
 171 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 172     if (locinput >= PL_regeol) {                                              \
 173         sayNO;                                                                \
 174     }                                                                         \
 175     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 176         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 177         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 178             sayNO;                                                            \
 179         }                                                                     \
 180         locinput += PL_utf8skip[nextchr];                                     \
 181         nextchr = UCHARAT(locinput);                                          \
 182         break;                                                                \
 183     }                                                                         \
 184     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 185         sayNO;                                                                \
 186     }                                                                         \
 187     nextchr = UCHARAT(++locinput);                                            \
 188     break;
 189
 190 /* Handle the non-locale cases for a character class and its complement.  It
 191  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 192  * This is because that code fails when the test succeeds, so we want to have
 193  * the test fail so that the code succeeds.  The swash is stored in a
 194  * predictable PL_ place
      *
      * NAME  is the case label for the class itself (generated with '!').
      * NNAME is the case label for the complemented class (generated with
      *       PLACEHOLDER, i.e. no negation).
      * FUNC  is the non-utf8 test applied to nextchr.
      * CLASS, STR are passed through to _CCC_TRY_CODE; CLASS also selects the
      *       swash PL_utf8_<CLASS> used by the utf8 test
      *       cBOOL(swash_fetch(PL_utf8_<CLASS>, (U8*)locinput, TRUE)). */
 195 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 196                            CLASS, STR)                                        \
 197     case NAME:                                                                \
 198         _CCC_TRY_CODE( !, FUNC,                                               \
 199                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 200                                             (U8*)locinput, TRUE)),            \
 201                           CLASS, STR)                                         \
 202     case NNAME:                                                               \
 203         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 204                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 205                                             (U8*)locinput, TRUE)),            \
 206                           CLASS, STR)                                         \
 207
 208 /* Generate the case statements for both locale and non-locale character
 209  * classes in regmatch for classes that don't have special unicode semantics.
 210  * Locales don't use an immediate swash, but an intermediary special locale
 211  * function that is called on the pointer to the current place in the input
 212  * string.  That function will resolve to needing the same swash.  One might
 213  * think that because we don't know what the locale will match, we shouldn't
 214  * check with the swash loading function that it loaded properly; ie, that we
 215  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 216  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 217  * irrelevant here.
      *
      * Cases generated, in order:
      *   NAMEL / NNAMEL  locale class and its complement: set RF_tainted in
      *                   PL_reg_flags, then use LCFUNC for the non-utf8 test and
      *                   LCFUNC_utf8((U8*)locinput) for the utf8 test.
      *   NAMEA           fail at end-of-string or if FUNCA(nextchr) is false;
      *                   otherwise advance by one byte.
      *   NNAMEA          fail at end-of-string or if FUNCA(nextchr) is true;
      *                   otherwise advance by PL_utf8skip[nextchr] bytes for a
      *                   utf8 target, or by one byte for a non-utf8 target.
      *   NAME / NNAME    the non-locale cases, via _CCC_TRY_NONLOCALE with FUNC.
      * CLASS and STR are forwarded to the swash-loading code. */
 218 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 219                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 220                 NAMEA, NNAMEA, FUNCA,                                         \
 221                 CLASS, STR)                                                   \
 222     case NAMEL:                                                               \
 223         PL_reg_flags |= RF_tainted;                                           \
 224         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 225     case NNAMEL:                                                              \
 226         PL_reg_flags |= RF_tainted;                                           \
 227         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 228                        CLASS, STR)                                            \
 229     case NAMEA:                                                               \
 230         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 231             sayNO;                                                            \
 232         }                                                                     \
 233         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 234         nextchr = UCHARAT(++locinput);                                        \
 235         break;                                                                \
 236     case NNAMEA:                                                              \
 237         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 238             sayNO;                                                            \
 239         }                                                                     \
 240         if (utf8_target) {                                                    \
 241             locinput += PL_utf8skip[nextchr];                                 \
 242             nextchr = UCHARAT(locinput);                                      \
 243         }                                                                     \
 244         else {                                                                \
 245             nextchr = UCHARAT(++locinput);                                    \
 246         }                                                                     \
 247         break;                                                                \
 248     /* Generate the non-locale cases */                                       \
 249     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 250
 251 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 252  * statements to handle separate Unicode semantics nodes:
      *   NAMEU / NNAMEU  case labels for the class and its complement
      *   FUNCU           the non-utf8 test used for those cases
      * It expands to CCC_TRY with the remaining arguments, followed by one more
      * _CCC_TRY_NONLOCALE pair for NAMEU/NNAMEU using the same CLASS and STR. */
 253 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 254                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 255                   NAMEU, NNAMEU, FUNCU,                                        \
 256                   NAMEA, NNAMEA, FUNCA,                                        \
 257                   CLASS, STR)                                                  \
 258     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 259             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 260             NAMEA, NNAMEA, FUNCA,                                              \
 261             CLASS, STR)                                                        \
 262     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 263
 264 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 265
 266 /* for use after a quantifier and before an EXACT-like node -- japhy */
 267 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 268  *
 269  * NOTE that *nothing* that affects backtracking should be in here, specifically
 270  * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
 271  * node that is in between two EXACT like nodes when ascertaining what the required
 272  * "follow" character is. This should probably be moved to regex compile time
 273  * although it may be done at run time because of the REF possibility - more
 274  * investigation required. -- demerphq
      *
      * As written, JUMPABLE(rn) is true for OPEN, EVAL, SUSPEND, IFMATCH, PLUS,
      * MINMOD and KEEPS nodes; for a CLOSE node unless cur_eval is set and its
      * u.eval.close_paren equals ARG(rn); and for any node of kind CURLY whose
      * ARG1(rn) is greater than zero.  It reads 'cur_eval' from the enclosing
      * scope and evaluates rn many times.
 275 */
 276 #define JUMPABLE(rn) (      \
 277     OP(rn) == OPEN ||       \
 278     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 279     OP(rn) == EVAL ||   \
 280     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 281     OP(rn) == PLUS || OP(rn) == MINMOD || \
 282     OP(rn) == KEEPS || \
 283     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 284 )
     /* True when the node's kind (per PL_regkind) is EXACT. */
 285 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 286
     /* True when the node's kind is EXACT or REF. */
 287 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 288
 289 #if 0
 290 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 291    we don't need this definition.  (Disabled; note that this branch does not
        define IS_TEXTFU, which the live branch below does.) */
 292 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 293 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 294 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 295
 296 #else
 297 /* ... so we use this as it's faster. */
 298 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 299 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 300 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 301 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 302
 303 #endif
 304
 305 /*
 306   Search for mandatory following text node; for lookahead, the text must
 307   follow but for lookbehind (rn->flags != 0) we skip to the next step.
     
       rn is advanced in place (it must be a modifiable lvalue) for as long as
       JUMPABLE(rn) holds:
         - SUSPEND, or any node of kind CURLY: rn = NEXTOPER(NEXTOPER(rn))
         - PLUS:                               rn = NEXTOPER(rn)
         - IFMATCH with flags == 0:            rn = NEXTOPER(NEXTOPER(rn))
         - IFMATCH with flags != 0:            rn = rn + ARG(rn)
         - anything else:                      rn += NEXT_OFF(rn)
 308 */
 309 #define FIND_NEXT_IMPT(rn) STMT_START { \
 310     while (JUMPABLE(rn)) { \
 311         const OPCODE type = OP(rn); \
 312         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 313             rn = NEXTOPER(NEXTOPER(rn)); \
 314         else if (type == PLUS) \
 315             rn = NEXTOPER(rn); \
 316         else if (type == IFMATCH) \
 317             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 318         else rn += NEXT_OFF(rn); \
 319     } \
 320 } STMT_END
 321
 322
 323 static void restore_pos(pTHX_ void *arg);
 324
 325 #define REGCP_PAREN_ELEMS 4
 326 #define REGCP_OTHER_ELEMS 5
 327 #define REGCP_FRAME_ELEMS 1
 328 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 329  * are needed for the regexp context stack bookkeeping.
      *
      * As used by S_regcppush() below:
      *   REGCP_PAREN_ELEMS - savestack slots pushed per paren pair
      *                       (end, start, start_tmp pointer, paren number)
      *   REGCP_OTHER_ELEMS - slots pushed unconditionally (PL_regoffs,
      *                       PL_regsize, *PL_reglastparen,
      *                       *PL_reglastcloseparen, PL_reginput)
      *   REGCP_FRAME_ELEMS - the one extra slot holding the magic cookie */
 330
     /* Save the current regex capture state on the savestack.
      *
      * parenfloor: parens numbered above this (up to PL_regsize) are saved.
      * Returns the value PL_savestack_ix had on entry.
      *
      * Croaks if PL_regsize < parenfloor, or if the element count does not
      * survive being shifted by SAVE_TIGHT_SHIFT for storage in the cookie.
      * The push order here must stay the exact mirror of the pop order in
      * S_regcppop(). */
 331 STATIC CHECKPOINT
 332 S_regcppush(pTHX_ I32 parenfloor)
 333 {
 334     dVAR;
 335     const int retval = PL_savestack_ix;
 336     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 337     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 338     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 339     int p;
 340     GET_RE_DEBUG_FLAGS_DECL;
 341
 342     if (paren_elems_to_push < 0)
 343         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 344
         /* Shifting back must give the same count, else high bits were lost. */
 345     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 346         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 347                    " out of range (%lu-%ld)",
 348                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 349
         /* Reserve room for everything pushed below, cookie included. */
 350     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 351
         /* Highest paren first, so S_regcppop() sees them lowest first. */
 352     for (p = PL_regsize; p > parenfloor; p--) {
 353 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 354         SSPUSHINT(PL_regoffs[p].end);
 355         SSPUSHINT(PL_regoffs[p].start);
 356         SSPUSHPTR(PL_reg_start_tmp[p]);
 357         SSPUSHINT(p);
 358         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 359           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 360                       (UV)p, (IV)PL_regoffs[p].start,
 361                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 362                       (IV)PL_regoffs[p].end
 363         ));
 364     }
 365 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 366     SSPUSHPTR(PL_regoffs);
 367     SSPUSHINT(PL_regsize);
 368     SSPUSHINT(*PL_reglastparen);
 369     SSPUSHINT(*PL_reglastcloseparen);
 370     SSPUSHPTR(PL_reginput);
         /* The cookie carries the save type plus total_elems in its upper bits. */
 371     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 372
 373     return retval;
 374 }
 375
 376 /* These are needed since we do not localize EVAL nodes:
      *
      * REGCP_SET(cp) stores the current PL_savestack_ix into cp (after an
      * optional debug trace); REGCP_UNWIND(cp) calls regcpblow(cp) (after an
      * optional debug trace when cp differs from PL_savestack_ix).
      *
      * NOTE(review): both expand to two statements without a
      * STMT_START/STMT_END wrapper, so they are only safe where a plain
      * statement sequence is allowed (not as the unbraced body of an if/else).
      */
 377 #define REGCP_SET(cp)                                           \
 378     DEBUG_STATE_r(                                              \
 379             PerlIO_printf(Perl_debug_log,                       \
 380                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 381                 (IV)PL_savestack_ix));                          \
 382     cp = PL_savestack_ix
 383
 384 #define REGCP_UNWIND(cp)                                        \
 385     DEBUG_STATE_r(                                              \
 386         if (cp != PL_savestack_ix)                              \
 387             PerlIO_printf(Perl_debug_log,                       \
 388                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 389                 (IV)(cp), (IV)PL_savestack_ix));                \
 390     regcpblow(cp)
 391
 392 STATIC char *
 393 S_regcppop(pTHX_ const regexp *rex)
     /* Restore the capture state saved by the most recent S_regcppush().
      * rex: the regexp being run; its nparens bounds the final clean-up loop.
      * Returns the PL_reginput pointer that was saved by the push.
      * The pop order here must stay the exact mirror of the push order in
      * S_regcppush(). */
 394 {
 395     dVAR;
 396     UV i;
 397     char *input;
 398     GET_RE_DEBUG_FLAGS_DECL;
 399
 400     PERL_ARGS_ASSERT_REGCPPOP;
 401
 402     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 403     i = SSPOPUV;
 404     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 405     i >>= SAVE_TIGHT_SHIFT; /* Total elements pushed: paren elements plus REGCP_OTHER_ELEMS. */
 406     input = (char *) SSPOPPTR;
 407     *PL_reglastcloseparen = SSPOPINT;
 408     *PL_reglastparen = SSPOPINT;
 409     PL_regsize = SSPOPINT;
 410     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 411
         /* What remains in i is the number of per-paren elements. */
 412     i -= REGCP_OTHER_ELEMS;
 413     /* Now restore the parentheses context.  i is unsigned, so this relies on
          * the remaining count being an exact multiple of REGCP_PAREN_ELEMS. */
 414     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 415         I32 tmps;
 416         U32 paren = (U32)SSPOPINT;
 417         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 418         PL_regoffs[paren].start = SSPOPINT;
 419         tmps = SSPOPINT;
             /* The saved end offset is only reinstated for parens at or below
              * the restored *PL_reglastparen. */
 420         if (paren <= *PL_reglastparen)
 421             PL_regoffs[paren].end = tmps;
 422         DEBUG_BUFFERS_r(
 423             PerlIO_printf(Perl_debug_log,
 424                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 425                           (UV)paren, (IV)PL_regoffs[paren].start,
 426                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 427                           (IV)PL_regoffs[paren].end,
 428                           (paren > *PL_reglastparen ? "(no)" : ""));
 429         );
 430     }
 431     DEBUG_BUFFERS_r(
 432         if (*PL_reglastparen + 1 <= rex->nparens) {
 433             PerlIO_printf(Perl_debug_log,
 434                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 435                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 436         }
 437     );
 438 #if 1
 439     /* It would seem that the similar code in regtry()
 440      * already takes care of this, and in fact it is in
 441      * a better location too, since this code can be #if 0-ed out
 442      * but the code in regtry() is needed, or otherwise tests
 443      * requiring null fields (pat.t#187 and split.t#{13,14}
 444      * (as of patchlevel 7877)) will fail.  Then again,
 445      * this code seems to be necessary, or otherwise
 446      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 447      * --jhi updated by dapm */
         /* Mark every paren above *PL_reglastparen as unset: end always becomes
          * -1, start only for parens above PL_regsize. */
 448     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 449         if (i > PL_regsize)
 450             PL_regoffs[i].start = -1;
 451         PL_regoffs[i].end = -1;
 452     }
 453 #endif
 454     return input;
 455 }
 456
 457 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 458
 459 /*
 460  * pregexec and friends
 461  */
 462
 463 #ifndef PERL_IN_XSUB_RE
 464 /*
 465  - pregexec - match a regexp against a string
      -
      - Thin wrapper: forwards its arguments to regexec_flags(), passing NULL for
      - that function's seventh argument and a flags value of 0 when nosave is
      - nonzero, REXEC_COPY_STR otherwise.  Returns whatever regexec_flags()
      - returns.  Only compiled when PERL_IN_XSUB_RE is not defined.
 466  */
 467 I32
 468 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 469          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 470 /* strend: pointer to null at end of string */
 471 /* strbeg: real beginning of string */
 472 /* minend: end of match must be >=minend after stringarg. */
 473 /* nosave: For optimizations. */
 474 {
 475     PERL_ARGS_ASSERT_PREGEXEC;
 476
 477     return
 478         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 479                       nosave ? 0 : REXEC_COPY_STR);
 480 }
 481 #endif
 482
 483 /*
 484  * Need to implement the following flags for reg_anch:
 485  *
 486  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 487  * USE_INTUIT_ML
 488  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 489  * INTUIT_AUTORITATIVE_ML
 490  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 491  * INTUIT_ONCE_ML
 492  *
 493  * Another flag for this function: SECOND_TIME (so that float substrs
 494  * with giant delta may be not rechecked).
 495  */
 496
 497 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 498
 499 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 500    Otherwise, only SvCUR(sv) is used to get strbeg. */
 501
 502 /* XXXX We assume that strpos is strbeg unless sv. */
 503
 504 /* XXXX Some places assume that there is a fixed substring.
 505         An update may be needed if optimizer marks as "INTUITable"
 506         RExen without fixed substrings.  Similarly, it is assumed that
 507         lengths of all the strings are no more than minlen, thus they
 508         cannot come from lookahead.
 509         (Or minlen should take into account lookahead.)
 510   NOTE: Some of this comment is not correct. minlen does now take account
 511   of lookahead/behind. Further research is required. -- demerphq
 512
 513 */
 514
 515 /* A failure to find a constant substring means that there is no need to make
 516    an expensive call to the REx engine, thus we celebrate a failure.  Similarly,
 517    finding a substring too deep into the string means that fewer calls to
 518    regtry() should be needed.
 519
 520    REx compiler's optimizer found 4 possible hints:
 521         a) Anchored substring;
 522         b) Fixed substring;
 523         c) Whether we are anchored (beginning-of-line or \G);
 524         d) First node (of those at offset 0) which may distinguish positions;
 525    We use a)b)d) and multiline-part of c), and try to find a position in the
 526    string which does not contradict any of them.
 527  */
 528
 529 /* Most of the decisions we make here should have been made at compile time.
 530    The nodes of the REx which we used for the search should have been
 531    deleted from the finite automaton. */
 532
 533 char *
 534 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 535                      char *strend, const U32 flags, re_scream_pos_data *data)
 536 {
 537     dVAR;
 538     struct regexp *const prog = (struct regexp *)SvANY(rx);
 539     register I32 start_shift = 0;
 540     /* Should be nonnegative! */
 541     register I32 end_shift   = 0;
 542     register char *s;
 543     register SV *check;
 544     char *strbeg;
 545     char *t;
 546     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 547     I32 ml_anch;
 548     register char *other_last = NULL;   /* other substr checked before this */
 549     char *check_at = NULL;              /* check substr found at this pos */
 550     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 551     RXi_GET_DECL(prog,progi);
 552 #ifdef DEBUGGING
 553     const char * const i_strpos = strpos;
 554 #endif
 555     GET_RE_DEBUG_FLAGS_DECL;
 556
 557     PERL_ARGS_ASSERT_RE_INTUIT_START;
 558
 559     RX_MATCH_UTF8_set(rx,utf8_target);
 560
 561     if (RX_UTF8(rx)) {
 562         PL_reg_flags |= RF_utf8;
 563     }
 564     DEBUG_EXECUTE_r(
 565         debug_start_match(rx, utf8_target, strpos, strend,
 566             sv ? "Guessing start of match in sv for"
 567                : "Guessing start of match in string for");
 568               );
 569
 570     /* CHR_DIST() would be more correct here but it makes things slow. */
 571     if (prog->minlen > strend - strpos) {
 572         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 573                               "String too short... [re_intuit_start]\n"));
 574         goto fail;
 575     }
 576
 577     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 578     PL_regeol = strend;
 579     if (utf8_target) {
 580         if (!prog->check_utf8 && prog->check_substr)
 581             to_utf8_substr(prog);
 582         check = prog->check_utf8;
 583     } else {
 584         if (!prog->check_substr && prog->check_utf8)
 585             to_byte_substr(prog);
 586         check = prog->check_substr;
 587     }
 588     if (check == &PL_sv_undef) {
 589         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 590                 "Non-utf8 string cannot match utf8 check string\n"));
 591         goto fail;
 592     }
 593     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 594         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 595                      || ( (prog->extflags & RXf_ANCH_BOL)
 596                           && !multiline ) );    /* Check after \n? */
 597
 598         if (!ml_anch) {
 599           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 600                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 601                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 602                && sv && !SvROK(sv)
 603                && (strpos != strbeg)) {
 604               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 605               goto fail;
 606           }
 607           if (prog->check_offset_min == prog->check_offset_max &&
 608               !(prog->extflags & RXf_CANY_SEEN)) {
 609             /* Substring at constant offset from beg-of-str... */
 610             I32 slen;
 611
 612             s = HOP3c(strpos, prog->check_offset_min, strend);
 613
 614             if (SvTAIL(check)) {
 615                 slen = SvCUR(check);    /* >= 1 */
 616
 617                 if ( strend - s > slen || strend - s < slen - 1
 618                      || (strend - s == slen && strend[-1] != '\n')) {
 619                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 620                     goto fail_finish;
 621                 }
 622                 /* Now should match s[0..slen-2] */
 623                 slen--;
 624                 if (slen && (*SvPVX_const(check) != *s
 625                              || (slen > 1
 626                                  && memNE(SvPVX_const(check), s, slen)))) {
 627                   report_neq:
 628                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 629                     goto fail_finish;
 630                 }
 631             }
 632             else if (*SvPVX_const(check) != *s
 633                      || ((slen = SvCUR(check)) > 1
 634                          && memNE(SvPVX_const(check), s, slen)))
 635                 goto report_neq;
 636             check_at = s;
 637             goto success_at_start;
 638           }
 639         }
 640         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 641         s = strpos;
 642         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 643         end_shift = prog->check_end_shift;
 644
 645         if (!ml_anch) {
 646             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 647                                          - (SvTAIL(check) != 0);
 648             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 649
 650             if (end_shift < eshift)
 651                 end_shift = eshift;
 652         }
 653     }
 654     else {                              /* Can match at random position */
 655         ml_anch = 0;
 656         s = strpos;
 657         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 658         end_shift = prog->check_end_shift;
 659
 660         /* end shift should be non negative here */
 661     }
 662
 663 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 664     if (end_shift < 0)
 665         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 666                    (IV)end_shift, RX_PRECOMP(prog));
 667 #endif
 668
 669   restart:
 670     /* Find a possible match in the region s..strend by looking for
 671        the "check" substring in the region corrected by start/end_shift. */
 672
 673     {
 674         I32 srch_start_shift = start_shift;
 675         I32 srch_end_shift = end_shift;
 676         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 677             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 678             srch_start_shift = strbeg - s;
 679         }
 680     DEBUG_OPTIMISE_MORE_r({
 681         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 682             (IV)prog->check_offset_min,
 683             (IV)srch_start_shift,
 684             (IV)srch_end_shift,
 685             (IV)prog->check_end_shift);
 686     });
 687
 688     if (flags & REXEC_SCREAM) {
 689         I32 p = -1;                     /* Internal iterator of scream. */
 690         I32 * const pp = data ? data->scream_pos : &p;
 691
 692         if (PL_screamfirst[BmRARE(check)] >= 0
 693             || ( BmRARE(check) == '\n'
 694                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 695                  && SvTAIL(check) ))
 696             s = screaminstr(sv, check,
 697                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 698         else
 699             goto fail_finish;
 700         /* we may be pointing at the wrong string */
 701         if (s && RXp_MATCH_COPIED(prog))
 702             s = strbeg + (s - SvPVX_const(sv));
 703         if (data)
 704             *data->scream_olds = s;
 705     }
 706     else {
 707         U8* start_point;
 708         U8* end_point;
 709         if (prog->extflags & RXf_CANY_SEEN) {
 710             start_point= (U8*)(s + srch_start_shift);
 711             end_point= (U8*)(strend - srch_end_shift);
 712         } else {
 713             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 714             end_point= HOP3(strend, -srch_end_shift, strbeg);
 715         }
 716         DEBUG_OPTIMISE_MORE_r({
 717             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 718                 (int)(end_point - start_point),
 719                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 720                 start_point);
 721         });
 722
 723         s = fbm_instr( start_point, end_point,
 724                       check, multiline ? FBMrf_MULTILINE : 0);
 725     }
 726     }
 727     /* Update the count-of-usability, remove useless subpatterns,
 728         unshift s.  */
 729
 730     DEBUG_EXECUTE_r({
 731         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 732             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 733         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 734                           (s ? "Found" : "Did not find"),
 735             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 736                 ? "anchored" : "floating"),
 737             quoted,
 738             RE_SV_TAIL(check),
 739             (s ? " at offset " : "...\n") );
 740     });
 741
 742     if (!s)
 743         goto fail_finish;
 744     /* Finish the diagnostic message */
 745     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 746
 747     /* XXX dmq: first branch is for positive lookbehind...
 748        Our check string is offset from the beginning of the pattern.
 749        So we need to do any stclass tests offset forward from that
 750        point. I think. :-(
 751      */
 752
 753
 754
 755     check_at=s;
 756
 757
 758     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 759        Start with the other substr.
 760        XXXX no SCREAM optimization yet - and a very coarse implementation
 761        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 762                 *always* match.  Probably should be marked during compile...
 763        Probably it is right to do no SCREAM here...
 764      */
 765
 766     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 767                 : (prog->float_substr && prog->anchored_substr))
 768     {
 769         /* Take into account the "other" substring. */
 770         /* XXXX May be hopelessly wrong for UTF... */
 771         if (!other_last)
 772             other_last = strpos;
 773         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 774           do_other_anchored:
 775             {
 776                 char * const last = HOP3c(s, -start_shift, strbeg);
 777                 char *last1, *last2;
 778                 char * const saved_s = s;
 779                 SV* must;
 780
 781                 t = s - prog->check_offset_max;
 782                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 783                     && (!utf8_target
 784                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 785                             && t > strpos)))
 786                     NOOP;
 787                 else
 788                     t = strpos;
 789                 t = HOP3c(t, prog->anchored_offset, strend);
 790                 if (t < other_last)     /* These positions already checked */
 791                     t = other_last;
 792                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 793                 if (last < last1)
 794                     last1 = last;
 795                 /* XXXX It is not documented what units *_offsets are in.
 796                    We assume bytes, but this is clearly wrong.
 797                    Meaning this code needs to be carefully reviewed for errors.
 798                    dmq.
 799                   */
 800
 801                 /* On end-of-str: see comment below. */
 802                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 803                 if (must == &PL_sv_undef) {
 804                     s = (char*)NULL;
 805                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 806                 }
 807                 else
 808                     s = fbm_instr(
 809                         (unsigned char*)t,
 810                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 811                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 812                         must,
 813                         multiline ? FBMrf_MULTILINE : 0
 814                     );
 815                 DEBUG_EXECUTE_r({
 816                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 817                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 818                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 819                         (s ? "Found" : "Contradicts"),
 820                         quoted, RE_SV_TAIL(must));
 821                 });
 822
 823
 824                 if (!s) {
 825                     if (last1 >= last2) {
 826                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 827                                                 ", giving up...\n"));
 828                         goto fail_finish;
 829                     }
 830                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 831                         ", trying floating at offset %ld...\n",
 832                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 833                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 834                     s = HOP3c(last, 1, strend);
 835                     goto restart;
 836                 }
 837                 else {
 838                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 839                           (long)(s - i_strpos)));
 840                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 841                     other_last = HOP3c(s, 1, strend);
 842                     s = saved_s;
 843                     if (t == strpos)
 844                         goto try_at_start;
 845                     goto try_at_offset;
 846                 }
 847             }
 848         }
 849         else {          /* Take into account the floating substring. */
 850             char *last, *last1;
 851             char * const saved_s = s;
 852             SV* must;
 853
 854             t = HOP3c(s, -start_shift, strbeg);
 855             last1 = last =
 856                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 857             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 858                 last = HOP3c(t, prog->float_max_offset, strend);
 859             s = HOP3c(t, prog->float_min_offset, strend);
 860             if (s < other_last)
 861                 s = other_last;
 862  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 863             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 864             /* fbm_instr() takes into account exact value of end-of-str
 865                if the check is SvTAIL(ed).  Since false positives are OK,
 866                and end-of-str is not later than strend we are OK. */
 867             if (must == &PL_sv_undef) {
 868                 s = (char*)NULL;
 869                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 870             }
 871             else
 872                 s = fbm_instr((unsigned char*)s,
 873                               (unsigned char*)last + SvCUR(must)
 874                                   - (SvTAIL(must)!=0),
 875                               must, multiline ? FBMrf_MULTILINE : 0);
 876             DEBUG_EXECUTE_r({
 877                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 878                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 879                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 880                     (s ? "Found" : "Contradicts"),
 881                     quoted, RE_SV_TAIL(must));
 882             });
 883             if (!s) {
 884                 if (last1 == last) {
 885                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 886                                             ", giving up...\n"));
 887                     goto fail_finish;
 888                 }
 889                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 890                     ", trying anchored starting at offset %ld...\n",
 891                     (long)(saved_s + 1 - i_strpos)));
 892                 other_last = last;
 893                 s = HOP3c(t, 1, strend);
 894                 goto restart;
 895             }
 896             else {
 897                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 898                       (long)(s - i_strpos)));
 899                 other_last = s; /* Fix this later. --Hugo */
 900                 s = saved_s;
 901                 if (t == strpos)
 902                     goto try_at_start;
 903                 goto try_at_offset;
 904             }
 905         }
 906     }
 907
 908
 909     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 910
 911     DEBUG_OPTIMISE_MORE_r(
 912         PerlIO_printf(Perl_debug_log,
 913             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 914             (IV)prog->check_offset_min,
 915             (IV)prog->check_offset_max,
 916             (IV)(s-strpos),
 917             (IV)(t-strpos),
 918             (IV)(t-s),
 919             (IV)(strend-strpos)
 920         )
 921     );
 922
 923     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 924         && (!utf8_target
 925             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 926                  && t > strpos)))
 927     {
 928         /* Fixed substring is found far enough so that the match
 929            cannot start at strpos. */
 930       try_at_offset:
 931         if (ml_anch && t[-1] != '\n') {
 932             /* Eventually fbm_*() should handle this, but often
 933                anchored_offset is not 0, so this check will not be wasted. */
 934             /* XXXX In the code below we prefer to look for "^" even in
 935                presence of anchored substrings.  And we search even
 936                beyond the found float position.  These pessimizations
 937                are historical artefacts only.  */
 938           find_anchor:
 939             while (t < strend - prog->minlen) {
 940                 if (*t == '\n') {
 941                     if (t < check_at - prog->check_offset_min) {
 942                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 943                             /* Since we moved from the found position,
 944                                we definitely contradict the found anchored
 945                                substr.  Due to the above check we do not
 946                                contradict "check" substr.
 947                                Thus we can arrive here only if check substr
 948                                is float.  Redo checking for "other"=="fixed".
 949                              */
 950                             strpos = t + 1;
 951                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 952                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 953                             goto do_other_anchored;
 954                         }
 955                         /* We don't contradict the found floating substring. */
 956                         /* XXXX Why not check for STCLASS? */
 957                         s = t + 1;
 958                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 959                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 960                         goto set_useful;
 961                     }
 962                     /* Position contradicts check-string */
 963                     /* XXXX probably better to look for check-string
 964                        than for "\n", so one should lower the limit for t? */
 965                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 966                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 967                     other_last = strpos = s = t + 1;
 968                     goto restart;
 969                 }
 970                 t++;
 971             }
 972             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 973                         PL_colors[0], PL_colors[1]));
 974             goto fail_finish;
 975         }
 976         else {
 977             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 978                         PL_colors[0], PL_colors[1]));
 979         }
 980         s = t;
 981       set_useful:
 982         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 983     }
 984     else {
 985         /* The found string does not prohibit matching at strpos,
 986            - no optimization of calling REx engine can be performed,
 987            unless it was an MBOL and we are not after MBOL,
 988            or a future STCLASS check will fail this. */
 989       try_at_start:
 990         /* Even in this situation we may use MBOL flag if strpos is offset
 991            wrt the start of the string. */
 992         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 993             && (strpos != strbeg) && strpos[-1] != '\n'
 994             /* May be due to an implicit anchor of m{.*foo}  */
 995             && !(prog->intflags & PREGf_IMPLICIT))
 996         {
 997             t = strpos;
 998             goto find_anchor;
 999         }
1000         DEBUG_EXECUTE_r( if (ml_anch)
1001             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1002                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1003         );
1004       success_at_start:
1005         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1006             && (utf8_target ? (
1007                 prog->check_utf8                /* Could be deleted already */
1008                 && --BmUSEFUL(prog->check_utf8) < 0
1009                 && (prog->check_utf8 == prog->float_utf8)
1010             ) : (
1011                 prog->check_substr              /* Could be deleted already */
1012                 && --BmUSEFUL(prog->check_substr) < 0
1013                 && (prog->check_substr == prog->float_substr)
1014             )))
1015         {
1016             /* If flags & SOMETHING - do not do it many times on the same match */
1017             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1018             /* XXX Does the destruction order have to change with utf8_target? */
1019             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1020             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1021             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1022             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1023             check = NULL;                       /* abort */
1024             s = strpos;
1025             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1026                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1027             if (prog->intflags & PREGf_IMPLICIT)
1028                 prog->extflags &= ~RXf_ANCH_MBOL;
1029             /* XXXX This is a remnant of the old implementation.  It
1030                     looks wasteful, since now INTUIT can use many
1031                     other heuristics. */
1032             prog->extflags &= ~RXf_USE_INTUIT;
1033             /* XXXX What other flags might need to be cleared in this branch? */
1034         }
1035         else
1036             s = strpos;
1037     }
1038
1039     /* Last resort... */
1040     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1041     /* trie stclasses are too expensive to use here, we are better off to
1042        leave it to regmatch itself */
1043     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1044         /* minlen == 0 is possible if regstclass is \b or \B,
1045            and the fixed substr is ''$.
1046            Since minlen is already taken into account, s+1 is before strend;
1047            accidentally, minlen >= 1 guarantees no false positives at s + 1
1048            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1049            regstclass does not come from lookahead...  */
1050         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1051            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1052         const U8* const str = (U8*)STRING(progi->regstclass);
1053         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1054                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1055                     : 1);
1056         char * endpos;
1057         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1058             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1059         else if (prog->float_substr || prog->float_utf8)
1060             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1061         else
1062             endpos= strend;
1063
1064         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1065                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1066
1067         t = s;
1068         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1069         if (!s) {
1070 #ifdef DEBUGGING
1071             const char *what = NULL;
1072 #endif
1073             if (endpos == strend) {
1074                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1075                                 "Could not match STCLASS...\n") );
1076                 goto fail;
1077             }
1078             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1079                                    "This position contradicts STCLASS...\n") );
1080             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1081                 goto fail;
1082             /* Contradict one of substrings */
1083             if (prog->anchored_substr || prog->anchored_utf8) {
1084                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1085                     DEBUG_EXECUTE_r( what = "anchored" );
1086                   hop_and_restart:
1087                     s = HOP3c(t, 1, strend);
1088                     if (s + start_shift + end_shift > strend) {
1089                         /* XXXX Should be taken into account earlier? */
1090                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1091                                                "Could not match STCLASS...\n") );
1092                         goto fail;
1093                     }
1094                     if (!check)
1095                         goto giveup;
1096                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1097                                 "Looking for %s substr starting at offset %ld...\n",
1098                                  what, (long)(s + start_shift - i_strpos)) );
1099                     goto restart;
1100                 }
1101                 /* Have both, check_string is floating */
1102                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1103                     goto retry_floating_check;
1104                 /* Recheck anchored substring, but not floating... */
1105                 s = check_at;
1106                 if (!check)
1107                     goto giveup;
1108                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1109                           "Looking for anchored substr starting at offset %ld...\n",
1110                           (long)(other_last - i_strpos)) );
1111                 goto do_other_anchored;
1112             }
1113             /* Another way we could have checked stclass at the
1114                current position only: */
1115             if (ml_anch) {
1116                 s = t = t + 1;
1117                 if (!check)
1118                     goto giveup;
1119                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1120                           "Looking for /%s^%s/m starting at offset %ld...\n",
1121                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1122                 goto try_at_offset;
1123             }
1124             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1125                 goto fail;
1126             /* Check is floating substring. */
1127           retry_floating_check:
1128             t = check_at - start_shift;
1129             DEBUG_EXECUTE_r( what = "floating" );
1130             goto hop_and_restart;
1131         }
1132         if (t != s) {
1133             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1134                         "By STCLASS: moving %ld --> %ld\n",
1135                                   (long)(t - i_strpos), (long)(s - i_strpos))
1136                    );
1137         }
1138         else {
1139             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1140                                   "Does not contradict STCLASS...\n");
1141                    );
1142         }
1143     }
1144   giveup:
1145     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1146                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1147                           PL_colors[5], (long)(s - i_strpos)) );
1148     return s;
1149
1150   fail_finish:                          /* Substring not found */
1151     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1152         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1153   fail:
1154     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1155                           PL_colors[4], PL_colors[5]));
1156     return NULL;
1157 }
1158
/* DECL_TRIE_TYPE(scan): declares a const local named `trie_type` whose value
 * selects the character-reading mode used by REXEC_TRIE_READ_CHAR.
 *
 *   scan->flags != EXACT:  utf8_target            -> trie_utf8_fold
 *                          else if UTF_PATTERN    -> trie_latin_utf8_fold
 *                          else                   -> trie_plain
 *   scan->flags == EXACT:  utf8_target            -> trie_utf8
 *                          else                   -> trie_plain
 *
 * The expansion reads `utf8_target` (and UTF_PATTERN) from the enclosing
 * scope, uses `scan` unparenthesised (pass a plain identifier), and has no
 * trailing semicolon: the caller supplies it.  Because it is a declaration,
 * it must appear where a declaration is legal.
 */
1159 #define DECL_TRIE_TYPE(scan) \
1160     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1161                     trie_type = (scan->flags != EXACT) \
1162                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1163                               : (utf8_target ? trie_utf8 : trie_plain)
1164
/* REXEC_TRIE_READ_CHAR: fetch the next code point into `uvc` and translate it
 * into a trie character id in `charid`.
 *
 * Step 1 - read, depending on `trie_type`:
 *   trie_utf8_fold:
 *     - foldlen > 0 (presumably bytes of an earlier fold are still pending in
 *       foldbuf - confirm against callers): decode the next code point from
 *       `uscan` with utf8n_to_uvuni(), subtract the decoded length from
 *       `foldlen`, advance `uscan` by it, then set `len` to 0.
 *     - otherwise: decode from `uc` (this sets `len`), fold the result with
 *       to_uni_fold() into `foldbuf` (this sets `foldlen`), then subtract
 *       UNISKIP(uvc) from `foldlen` and point `uscan` at
 *       foldbuf + UNISKIP(uvc).
 *   trie_latin_utf8_fold:
 *     - foldlen > 0: identical to the pending branch above.
 *     - otherwise: `len` is set to 1 and the single byte *(U8*)uc is folded
 *       with to_uni_fold(); `foldlen` / `uscan` are adjusted as above.
 *   trie_utf8:  decode from `uc` with utf8n_to_uvuni(); no folding.
 *   trie_plain: uvc = *uc and len = 1.
 *
 * Step 2 - map `uvc` to `charid`:
 *   - uvc < 256: charid = trie->charmap[uvc].
 *   - otherwise charid = 0, unless `widecharmap` is non-NULL and hv_fetch()
 *     finds an entry keyed by the sizeof(UV) raw bytes of `uvc`, in which case
 *     charid = (U16)SvIV() of that entry.
 *     (NOTE(review): charid 0 looks like "not in the trie" - confirm.)
 *
 * `uscan`, `len`, `uvc`, `charid` and `foldlen` are assigned to, so they must
 * be lvalues; several arguments are evaluated more than once.  The body is
 * wrapped in STMT_START/STMT_END so it can be used as a single statement.
 */
1165 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1166 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1167     switch (trie_type) {                                                    \
1168     case trie_utf8_fold:                                                    \
1169         if ( foldlen>0 ) {                                                  \
1170             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1171             foldlen -= len;                                                 \
1172             uscan += len;                                                   \
1173             len=0;                                                          \
1174         } else {                                                            \
1175             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1176             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1177             foldlen -= UNISKIP( uvc );                                      \
1178             uscan = foldbuf + UNISKIP( uvc );                               \
1179         }                                                                   \
1180         break;                                                              \
1181     case trie_latin_utf8_fold:                                              \
1182         if ( foldlen>0 ) {                                                  \
1183             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1184             foldlen -= len;                                                 \
1185             uscan += len;                                                   \
1186             len=0;                                                          \
1187         } else {                                                            \
1188             len = 1;                                                        \
1189             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1190             foldlen -= UNISKIP( uvc );                                      \
1191             uscan = foldbuf + UNISKIP( uvc );                               \
1192         }                                                                   \
1193         break;                                                              \
1194     case trie_utf8:                                                         \
1195         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1196         break;                                                              \
1197     case trie_plain:                                                        \
1198         uvc = (UV)*uc;                                                      \
1199         len = 1;                                                            \
1200     }                                                                       \
1201     if (uvc < 256) {                                                        \
1202         charid = trie->charmap[ uvc ];                                      \
1203     }                                                                       \
1204     else {                                                                  \
1205         charid = 0;                                                         \
1206         if (widecharmap) {                                                  \
1207             SV** const svpp = hv_fetch(widecharmap,                         \
1208                         (char*)&uvc, sizeof(UV), 0);                        \
1209             if (svpp)                                                       \
1210                 charid = (U16)SvIV(*svpp);                                  \
1211         }                                                                   \
1212     }                                                                       \
1213 } STMT_END
1214
/* REXEC_FBC_EXACTISH_SCAN(CoNd): scan forward from `s` while s <= e, one
 * position per iteration (s++).  At each position control jumps to the
 * `got_it` label when all of the following hold, tested in this order:
 *   1. CoNd is true;
 *   2. ln == 1, or folder(s, pat_string, ln) returns true;
 *   3. reginfo is NULL, or regtry(reginfo, &s) succeeds.
 * If the loop ends without a hit, execution falls through past the macro.
 *
 * Relies on `s`, `e`, `ln`, `folder`, `pat_string`, `reginfo` and the label
 * `got_it` being defined at the expansion site.
 */
1215 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1216 STMT_START {                                              \
1217     while (s <= e) {                                      \
1218         if ( (CoNd)                                       \
1219              && (ln == 1 || folder(s, pat_string, ln))    \
1220              && (!reginfo || regtry(reginfo, &s)) )       \
1221             goto got_it;                                  \
1222         s++;                                              \
1223     }                                                     \
1224 } STMT_END
1225
/* REXEC_FBC_UTF8_SCAN(CoDe): run the statement(s) CoDe at successive
 * positions of `s`, advancing by UTF8SKIP(s) each time.  The loop condition
 * stores UTF8SKIP(s) in `uskip` and continues only while s + uskip <= strend,
 * so CoDe is not run at a position whose skip length would pass `strend`.
 *
 * Relies on `s`, `uskip` and `strend` from the expansion site.  CoDe is
 * pasted verbatim before `s += uskip;`, so it must be complete statements.
 */
1226 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1227 STMT_START {                                          \
1228     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1229         CoDe                                          \
1230         s += uskip;                                   \
1231     }                                                 \
1232 } STMT_END
1233
/* REXEC_FBC_SCAN(CoDe): run the statement(s) CoDe at every position of `s`
 * while s < strend, advancing one element (s++) per iteration.
 *
 * Relies on `s` and `strend` from the expansion site.  CoDe is pasted
 * verbatim before `s++;`, so it must be complete statements.
 */
1234 #define REXEC_FBC_SCAN(CoDe)                          \
1235 STMT_START {                                          \
1236     while (s < strend) {                              \
1237         CoDe                                          \
1238         s++;                                          \
1239     }                                                 \
1240 } STMT_END
1241
/* REXEC_FBC_UTF8_CLASS_SCAN(CoNd): REXEC_FBC_UTF8_SCAN loop that tests CoNd
 * at each position and uses the flag `tmp` to decide whether a match attempt
 * is made there:
 *   - CoNd true and tmp non-zero: jump to `got_it` if reginfo is NULL or
 *     regtry(reginfo, &s) succeeds; if regtry fails, tmp = doevery.
 *   - CoNd true and tmp zero: no attempt is made; tmp = doevery.
 *   - CoNd false: tmp = 1.
 * So after a position satisfying CoNd, the following positions that also
 * satisfy it are attempted only when `doevery` is non-zero; a position that
 * fails CoNd re-enables attempts.
 *
 * Relies on `tmp`, `doevery`, `reginfo`, `s`, `uskip`, `strend` and the label
 * `got_it` from the expansion site.
 */
1242 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1243 REXEC_FBC_UTF8_SCAN(                                  \
1244     if (CoNd) {                                       \
1245         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1246             goto got_it;                              \
1247         else                                          \
1248             tmp = doevery;                            \
1249     }                                                 \
1250     else                                              \
1251         tmp = 1;                                      \
1252 )
1253
1254 #define REXEC_FBC_CLASS_SCAN(CoNd) /* class scan stepping via REXEC_FBC_SCAN (s++) */ \
1255 REXEC_FBC_SCAN(                                       \
1256     if (CoNd) {                                       \
1257         if (tmp && (!reginfo || regtry(reginfo, &s))) /* tmp gates whether s is tried at all */ \
1258             goto got_it;                              \
1259         else                                          \
1260             tmp = doevery; /* doevery == 0: following matches are not tried until a miss */ \
1261     }                                                 \
1262     else                                              \
1263         tmp = 1;           /* a non-matching position re-enables the next try */ \
1264 )
1265
1266 #define REXEC_FBC_TRYIT /* expands to a bare if: no braces and no trailing semicolon */ \
1267 if ((!reginfo || regtry(reginfo, &s))) /* NULL reginfo: jump without calling regtry */ \
1268     goto got_it
1269
1270 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) /* class scan; condition chosen by utf8_target */ \
1271     if (utf8_target) {                                             \
1272         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); /* CoNdUtF8 is used only on this path */ \
1273     }                                                          \
1274     else {                                                     \
1275         REXEC_FBC_CLASS_SCAN(CoNd);          /* CoNd is used only on this path */ \
1276     }
1277
1278 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) /* as REXEC_FBC_CSCAN plus a setup statement */ \
1279     if (utf8_target) {                                             \
1280         UtFpReLoAd;       /* runs once before the scan and only on the utf8 path */ \
1281         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1282     }                                                          \
1283     else {                                                     \
1284         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1285     }
1286
1287 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) /* two statements: not wrapped in STMT_START/STMT_END */ \
1288     PL_reg_flags |= RF_tainted; /* set before scanning whether or not anything matches */ \
1289     if (utf8_target) {                                             \
1290         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1291     }                                                          \
1292     else {                                                     \
1293         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1294     }
1295
1296 #define DUMP_EXEC_POS(li,s,doutf8) /* dump_exec_pos with PL_regeol, PL_bostr, PL_reg_starttry filled in */ \
1297     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1298
1299
1300 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) /* utf8-target boundary scan using only the byte test */ \
1301         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; /* byte before s, or '\n' at string start */ \
1302         tmp = TEST_NON_UTF8(tmp); /* tmp now holds the test result for the previous position */ \
1303         REXEC_FBC_UTF8_SCAN(                                                   \
1304             if (tmp == ! TEST_NON_UTF8((U8) *s)) { /* NOTE(review): assumes the test yields exactly 0 or 1 - confirm */ \
1305                 tmp = !tmp;   /* results differ: flip tmp to the result for s */ \
1306                 IF_SUCCESS;                                                    \
1307             }                                                                  \
1308             else {                                                             \
1309                 IF_FAIL;                                                       \
1310             }                                                                  \
1311         );                                                                     \
1312
1313 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) /* utf8-target boundary scan with caller-supplied tests */ \
1314         if (s == PL_bostr) {                                                   \
1315             tmp = '\n';   /* nothing precedes s: use '\n' as the previous character */ \
1316         }                                                                      \
1317         else {                                                                 \
1318             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); /* hop back by 1 with PL_bostr as the limit */ \
1319             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); /* decode what r points at into tmp */ \
1320         }                                                                      \
1321         tmp = TeSt1_UtF8; /* the visible callers pass an expression that reads tmp */ \
1322         LOAD_UTF8_CHARCLASS_ALNUM(); /* invoked unconditionally, whichever tests were passed in */ \
1323         REXEC_FBC_UTF8_SCAN(                                                   \
1324             if (tmp == ! (TeSt2_UtF8)) { /* the visible callers pass an expression that reads s */ \
1325                 tmp = !tmp;   /* results differ: flip tmp to the result for s */ \
1326                 IF_SUCCESS;                                                    \
1327             }                                                                  \
1328             else {                                                             \
1329                 IF_FAIL;                                                       \
1330             }                                                                  \
1331         );                                                                     \
1332
1333 /* The only difference between the BOUND and NBOUND cases is that
1334  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1335  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1336  * with the other one being empty */
1337 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) /* REXEC_FBC_TRYIT goes in the IF_SUCCESS slot; utf8 path is UTF8_LOAD */ \
1338     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1339
1340 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) /* TEST1_UTF8 and TEST2_UTF8 are accepted but not used here */ \
1341     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1342
1343 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) /* REXEC_FBC_TRYIT goes in the IF_FAIL slot; utf8 path is UTF8_LOAD */ \
1344     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1345
1346 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) /* TEST1_UTF8 and TEST2_UTF8 are accepted but not used here */ \
1347     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1348
1349
1350 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1351  * be passed in completely with the variable name being tested, which isn't
1352  * such a clean interface, but this is easier to read than it was before.  We
1353  * are looking for the boundary (or non-boundary) between a word and non-word
1354  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1355  * must be different.  Find the "wordness" of the character just prior to this
1356  * one, and compare it with the wordness of this one.  If they differ, we have
1357  * a boundary.  At the beginning of the string, pretend that the previous
1358  * character was a new-line */
1359 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1360     if (utf8_target) {                                                         \
1361                 UTF8_CODE /* pasted as-is; the wrappers above pass a UTF8_LOAD or UTF8_NOLOAD expansion */ \
1362     }                                                                          \
1363     else {  /* Not utf8 */                                                     \
1364         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; /* byte before s, or '\n' at string start */ \
1365         tmp = TEST_NON_UTF8(tmp); /* tmp now holds the test result for the previous position */ \
1366         REXEC_FBC_SCAN(                                                        \
1367             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1368                 tmp = !tmp;   /* results differ: flip tmp to the result for s */ \
1369                 IF_SUCCESS;                                                    \
1370             }                                                                  \
1371             else {                                                             \
1372                 IF_FAIL;                                                       \
1373             }                                                                  \
1374         );                                                                     \
1375     }                                                                          \
1376     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) /* NOTE(review): post-scan try ignores IF_SUCCESS/IF_FAIL, so BOUND and NBOUND share it - confirm intended */ \
1377         goto got_it;
1378
1379 /* We know what class REx starts with.  Try to find this position... */
1380 /* If reginfo is NULL, it's a dry run */
1381 /* annoyingly all the vars in this routine have different names from their counterparts
1382    in regmatch. /grrr */
1383
1384 STATIC char *
1385 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1386     const char *strend, regmatch_info *reginfo)
1387 {
1388         dVAR;
1389         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1390         char *pat_string;   /* The pattern's exactish string */
1391         char *pat_end;      /* ptr to end char of pat_string */
1392         re_fold_t folder;       /* Function for computing non-utf8 folds */
1393         const U8 *fold_array;   /* array for folding ords < 256 */
1394         STRLEN ln;
1395         STRLEN lnc;
1396         register STRLEN uskip;
1397         U8 c1;
1398         U8 c2;
1399         char *e;
1400         register I32 tmp = 1;   /* Scratch variable? */
1401         register const bool utf8_target = PL_reg_match_utf8;
1402         UV utf8_fold_flags = 0;
1403         RXi_GET_DECL(prog,progi);
1404
1405         PERL_ARGS_ASSERT_FIND_BYCLASS;
1406
1407         /* We know what class it must start with. */
1408         switch (OP(c)) {
1409         case ANYOFV:
1410         case ANYOF:
1411             if (utf8_target || OP(c) == ANYOFV) {
1412                 STRLEN inclasslen = strend - s;
1413                 REXEC_FBC_UTF8_CLASS_SCAN(
1414                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1415             }
1416             else {
1417                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1418             }
1419             break;
1420         case CANY:
1421             REXEC_FBC_SCAN(
1422                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1423                     goto got_it;
1424                 else
1425                     tmp = doevery;
1426             );
1427             break;
1428
1429         case EXACTFA:
1430             if (UTF_PATTERN || utf8_target) {
1431                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1432                 goto do_exactf_utf8;
1433             }
1434             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1435             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1436             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1437
1438         case EXACTFU:
1439             if (UTF_PATTERN || utf8_target) {
1440                 utf8_fold_flags = 0;
1441                 goto do_exactf_utf8;
1442             }
1443             fold_array = PL_fold_latin1;
1444             folder = foldEQ_latin1;
1445             /* XXX This uses the full utf8 fold because if the pattern contains
1446              * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
1447              * There could be a new node type, say EXACTFU_SS, which is
1448              * generated by regcomp only if there is an 'ss', and then every
1449              * other case could goto do_exactf_non_utf8;*/
1450             goto do_exactf_utf8;
1451
1452         case EXACTF:
1453             if (UTF_PATTERN || utf8_target) {
1454                 utf8_fold_flags = 0;
1455                 goto do_exactf_utf8;
1456             }
1457             fold_array = PL_fold;
1458             folder = foldEQ;
1459             goto do_exactf_non_utf8;
1460
1461         case EXACTFL:
1462             if (UTF_PATTERN || utf8_target) {
1463                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1464                 goto do_exactf_utf8;
1465             }
1466             fold_array = PL_fold_locale;
1467             folder = foldEQ_locale;
1468
1469             /* FALL THROUGH */
1470
1471         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1472
1473             /* The idea in the non-utf8 EXACTF* cases is to first find the
1474              * first character of the EXACTF* node and then, if necessary,
1475              * case-insensitively compare the full text of the node.  c1 is the
1476              * first character.  c2 is its fold.  This logic will not work for
1477              * Unicode semantics and the german sharp ss, which hence should
1478              * not be compiled into a node that gets here. */
1479             pat_string = STRING(c);
1480             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1481
1482             e = HOP3c(strend, -((I32)ln), s);
1483
1484             if (!reginfo && e < s) {
1485                 e = s;                  /* Due to minlen logic of intuit() */
1486             }
1487
1488             c1 = *pat_string;
1489             c2 = fold_array[c1];
1490             if (c1 == c2) { /* If char and fold are the same */
1491                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1492             }
1493             else {
1494                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1495             }
1496             break;
1497
1498         do_exactf_utf8:
1499
1500             /* If one of the operands is in utf8, we can't use the simpler
1501              * folding above, due to the fact that many different characters
1502              * can have the same fold, or portion of a fold, or different-
1503              * length fold */
1504             pat_string = STRING(c);
1505             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1506             pat_end = pat_string + ln;
1507             lnc = (UTF_PATTERN) /* length to match in characters */
1508                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1509                     : ln;
1510
1511             e = HOP3c(strend, -((I32)lnc), s);
1512
1513             if (!reginfo && e < s) {
1514                 e = s;                  /* Due to minlen logic of intuit() */
1515             }
1516
1517             while (s <= e) {
1518                 char *my_strend= (char *)strend;
1519                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1520                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1521                     && (!reginfo || regtry(reginfo, &s)) )
1522                 {
1523                     goto got_it;
1524                 }
1525                 s += UTF8SKIP(s);
1526             }
1527             break;
1528         case BOUNDL:
1529             PL_reg_flags |= RF_tainted;
1530             FBC_BOUND(isALNUM_LC,
1531                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1532                       isALNUM_LC_utf8((U8*)s));
1533             break;
1534         case NBOUNDL:
1535             PL_reg_flags |= RF_tainted;
1536             FBC_NBOUND(isALNUM_LC,
1537                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1538                        isALNUM_LC_utf8((U8*)s));
1539             break;
1540         case BOUND:
1541             FBC_BOUND(isWORDCHAR,
1542                       isALNUM_uni(tmp),
1543                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1544             break;
1545         case BOUNDA:
1546             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1547                              isWORDCHAR_A(tmp),
1548                              isWORDCHAR_A((U8*)s));
1549             break;
1550         case NBOUND:
1551             FBC_NBOUND(isWORDCHAR,
1552                        isALNUM_uni(tmp),
1553                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1554             break;
1555         case NBOUNDA:
1556             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1557                               isWORDCHAR_A(tmp),
1558                               isWORDCHAR_A((U8*)s));
1559             break;
1560         case BOUNDU:
1561             FBC_BOUND(isWORDCHAR_L1,
1562                       isALNUM_uni(tmp),
1563                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1564             break;
1565         case NBOUNDU:
1566             FBC_NBOUND(isWORDCHAR_L1,
1567                        isALNUM_uni(tmp),
1568                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1569             break;
1570         case ALNUML:
1571             REXEC_FBC_CSCAN_TAINT(
1572                 isALNUM_LC_utf8((U8*)s),
1573                 isALNUM_LC(*s)
1574             );
1575             break;
1576         case ALNUMU:
1577             REXEC_FBC_CSCAN_PRELOAD(
1578                 LOAD_UTF8_CHARCLASS_ALNUM(),
1579                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1580                 isWORDCHAR_L1((U8) *s)
1581             );
1582             break;
1583         case ALNUM:
1584             REXEC_FBC_CSCAN_PRELOAD(
1585                 LOAD_UTF8_CHARCLASS_ALNUM(),
1586                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1587                 isWORDCHAR((U8) *s)
1588             );
1589             break;
1590         case ALNUMA:
1591             /* Don't need to worry about utf8, as it can match only a single
1592              * byte invariant character */
1593             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1594             break;
1595         case NALNUMU:
1596             REXEC_FBC_CSCAN_PRELOAD(
1597                 LOAD_UTF8_CHARCLASS_ALNUM(),
1598                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1599                 ! isWORDCHAR_L1((U8) *s)
1600             );
1601             break;
1602         case NALNUM:
1603             REXEC_FBC_CSCAN_PRELOAD(
1604                 LOAD_UTF8_CHARCLASS_ALNUM(),
1605                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1606                 ! isALNUM(*s)
1607             );
1608             break;
1609         case NALNUMA:
1610             REXEC_FBC_CSCAN(
1611                 !isWORDCHAR_A(*s),
1612                 !isWORDCHAR_A(*s)
1613             );
1614             break;
1615         case NALNUML:
1616             REXEC_FBC_CSCAN_TAINT(
1617                 !isALNUM_LC_utf8((U8*)s),
1618                 !isALNUM_LC(*s)
1619             );
1620             break;
1621         case SPACEU:
1622             REXEC_FBC_CSCAN_PRELOAD(
1623                 LOAD_UTF8_CHARCLASS_SPACE(),
1624                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1625                 isSPACE_L1((U8) *s)
1626             );
1627             break;
1628         case SPACE:
1629             REXEC_FBC_CSCAN_PRELOAD(
1630                 LOAD_UTF8_CHARCLASS_SPACE(),
1631                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1632                 isSPACE((U8) *s)
1633             );
1634             break;
1635         case SPACEA:
1636             /* Don't need to worry about utf8, as it can match only a single
1637              * byte invariant character */
1638             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1639             break;
1640         case SPACEL:
1641             REXEC_FBC_CSCAN_TAINT(
1642                 isSPACE_LC_utf8((U8*)s),
1643                 isSPACE_LC(*s)
1644             );
1645             break;
1646         case NSPACEU:
1647             REXEC_FBC_CSCAN_PRELOAD(
1648                 LOAD_UTF8_CHARCLASS_SPACE(),
1649                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1650                 ! isSPACE_L1((U8) *s)
1651             );
1652             break;
1653         case NSPACE:
1654             REXEC_FBC_CSCAN_PRELOAD(
1655                 LOAD_UTF8_CHARCLASS_SPACE(),
1656                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1657                 ! isSPACE((U8) *s)
1658             );
1659             break;
1660         case NSPACEA:
1661             REXEC_FBC_CSCAN(
1662                 !isSPACE_A(*s),
1663                 !isSPACE_A(*s)
1664             );
1665             break;
1666         case NSPACEL:
1667             REXEC_FBC_CSCAN_TAINT(
1668                 !isSPACE_LC_utf8((U8*)s),
1669                 !isSPACE_LC(*s)
1670             );
1671             break;
1672         case DIGIT:
1673             REXEC_FBC_CSCAN_PRELOAD(
1674                 LOAD_UTF8_CHARCLASS_DIGIT(),
1675                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1676                 isDIGIT(*s)
1677             );
1678             break;
1679         case DIGITA:
1680             /* Don't need to worry about utf8, as it can match only a single
1681              * byte invariant character */
1682             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1683             break;
1684         case DIGITL:
1685             REXEC_FBC_CSCAN_TAINT(
1686                 isDIGIT_LC_utf8((U8*)s),
1687                 isDIGIT_LC(*s)
1688             );
1689             break;
1690         case NDIGIT:
1691             REXEC_FBC_CSCAN_PRELOAD(
1692                 LOAD_UTF8_CHARCLASS_DIGIT(),
1693                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1694                 !isDIGIT(*s)
1695             );
1696             break;
1697         case NDIGITA:
1698             REXEC_FBC_CSCAN(
1699                 !isDIGIT_A(*s),
1700                 !isDIGIT_A(*s)
1701             );
1702             break;
1703         case NDIGITL:
1704             REXEC_FBC_CSCAN_TAINT(
1705                 !isDIGIT_LC_utf8((U8*)s),
1706                 !isDIGIT_LC(*s)
1707             );
1708             break;
1709         case LNBREAK:
1710             REXEC_FBC_CSCAN(
1711                 is_LNBREAK_utf8(s),
1712                 is_LNBREAK_latin1(s)
1713             );
1714             break;
1715         case VERTWS:
1716             REXEC_FBC_CSCAN(
1717                 is_VERTWS_utf8(s),
1718                 is_VERTWS_latin1(s)
1719             );
1720             break;
1721         case NVERTWS:
1722             REXEC_FBC_CSCAN(
1723                 !is_VERTWS_utf8(s),
1724                 !is_VERTWS_latin1(s)
1725             );
1726             break;
1727         case HORIZWS:
1728             REXEC_FBC_CSCAN(
1729                 is_HORIZWS_utf8(s),
1730                 is_HORIZWS_latin1(s)
1731             );
1732             break;
1733         case NHORIZWS:
1734             REXEC_FBC_CSCAN(
1735                 !is_HORIZWS_utf8(s),
1736                 !is_HORIZWS_latin1(s)
1737             );
1738             break;
1739         case AHOCORASICKC:
1740         case AHOCORASICK:
1741             {
1742                 DECL_TRIE_TYPE(c);
1743                 /* what trie are we using right now */
1744                 reg_ac_data *aho
1745                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1746                 reg_trie_data *trie
1747                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1748                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1749
1750                 const char *last_start = strend - trie->minlen;
1751 #ifdef DEBUGGING
1752                 const char *real_start = s;
1753 #endif
1754                 STRLEN maxlen = trie->maxlen;
1755                 SV *sv_points;
1756                 U8 **points; /* map of where we were in the input string
1757                                 when reading a given char. For ASCII this
1758                                 is unnecessary overhead as the relationship
1759                                 is always 1:1, but for Unicode, especially
1760                                 case folded Unicode this is not true. */
1761                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1762                 U8 *bitmap=NULL;
1763
1764
1765                 GET_RE_DEBUG_FLAGS_DECL;
1766
1767                 /* We can't just allocate points here. We need to wrap it in
1768                  * an SV so it gets freed properly if there is a croak while
1769                  * running the match */
1770                 ENTER;
1771                 SAVETMPS;
1772                 sv_points=newSV(maxlen * sizeof(U8 *));
1773                 SvCUR_set(sv_points,
1774                     maxlen * sizeof(U8 *));
1775                 SvPOK_on(sv_points);
1776                 sv_2mortal(sv_points);
1777                 points=(U8**)SvPV_nolen(sv_points );
1778                 if ( trie_type != trie_utf8_fold
1779                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1780                 {
1781                     if (trie->bitmap)
1782                         bitmap=(U8*)trie->bitmap;
1783                     else
1784                         bitmap=(U8*)ANYOF_BITMAP(c);
1785                 }
1786                 /* this is the Aho-Corasick algorithm modified a touch
1787                    to include special handling for long "unknown char"
1788                    sequences. The basic idea being that we use AC as long
1789                    as we are dealing with a possible matching char, when
1790                    we encounter an unknown char (and we have not encountered
1791                    an accepting state) we scan forward until we find a legal
1792                    starting char.
1793                    AC matching is basically that of trie matching, except
1794                    that when we encounter a failing transition, we fall back
1795                    to the current states "fail state", and try the current char
1796                    again, a process we repeat until we reach the root state,
1797                    state 1, or a legal transition. If we fail on the root state
1798                    then we can either terminate if we have reached an accepting
1799                    state previously, or restart the entire process from the beginning
1800                    if we have not.
1801
1802                  */
1803                 while (s <= last_start) {
1804                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1805                     U8 *uc = (U8*)s;
1806                     U16 charid = 0;
1807                     U32 base = 1;
1808                     U32 state = 1;
1809                     UV uvc = 0;
1810                     STRLEN len = 0;
1811                     STRLEN foldlen = 0;
1812                     U8 *uscan = (U8*)NULL;
1813                     U8 *leftmost = NULL;
1814 #ifdef DEBUGGING
1815                     U32 accepted_word= 0;
1816 #endif
1817                     U32 pointpos = 0;
1818
1819                     while ( state && uc <= (U8*)strend ) {
1820                         int failed=0;
1821                         U32 word = aho->states[ state ].wordnum;
1822
1823                         if( state==1 ) {
1824                             if ( bitmap ) {
1825                                 DEBUG_TRIE_EXECUTE_r(
1826                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1827                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1828                                             (char *)uc, utf8_target );
1829                                         PerlIO_printf( Perl_debug_log,
1830                                             " Scanning for legal start char...\n");
1831                                     }
1832                                 );
1833                                 if (utf8_target) {
1834                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1835                                         uc += UTF8SKIP(uc);
1836                                     }
1837                                 } else {
1838                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1839                                         uc++;
1840                                     }
1841                                 }
1842                                 s= (char *)uc;
1843                             }
1844                             if (uc >(U8*)last_start) break;
1845                         }
1846
1847                         if ( word ) {
1848                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1849                             if (!leftmost || lpos < leftmost) {
1850                                 DEBUG_r(accepted_word=word);
1851                                 leftmost= lpos;
1852                             }
1853                             if (base==0) break;
1854
1855                         }
1856                         points[pointpos++ % maxlen]= uc;
1857                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1858                                              uscan, len, uvc, charid, foldlen,
1859                                              foldbuf, uniflags);
1860                         DEBUG_TRIE_EXECUTE_r({
1861                             dump_exec_pos( (char *)uc, c, strend, real_start,
1862                                 s,   utf8_target );
1863                             PerlIO_printf(Perl_debug_log,
1864                                 " Charid:%3u CP:%4"UVxf" ",
1865                                  charid, uvc);
1866                         });
1867
1868                         do {
1869 #ifdef DEBUGGING
1870                             word = aho->states[ state ].wordnum;
1871 #endif
1872                             base = aho->states[ state ].trans.base;
1873
1874                             DEBUG_TRIE_EXECUTE_r({
1875                                 if (failed)
1876                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1877                                         s,   utf8_target );
1878                                 PerlIO_printf( Perl_debug_log,
1879                                     "%sState: %4"UVxf", word=%"UVxf,
1880                                     failed ? " Fail transition to " : "",
1881                                     (UV)state, (UV)word);
1882                             });
1883                             if ( base ) {
1884                                 U32 tmp;
1885                                 I32 offset;
1886                                 if (charid &&
1887                                      ( ((offset = base + charid
1888                                         - 1 - trie->uniquecharcount)) >= 0)
1889                                      && ((U32)offset < trie->lasttrans)
1890                                      && trie->trans[offset].check == state
1891                                      && (tmp=trie->trans[offset].next))
1892                                 {
1893                                     DEBUG_TRIE_EXECUTE_r(
1894                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1895                                     state = tmp;
1896                                     break;
1897                                 }
1898                                 else {
1899                                     DEBUG_TRIE_EXECUTE_r(
1900                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1901                                     failed = 1;
1902                                     state = aho->fail[state];
1903                                 }
1904                             }
1905                             else {
1906                                 /* we must be accepting here */
1907                                 DEBUG_TRIE_EXECUTE_r(
1908                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1909                                 failed = 1;
1910                                 break;
1911                             }
1912                         } while(state);
1913                         uc += len;
1914                         if (failed) {
1915                             if (leftmost)
1916                                 break;
1917                             if (!state) state = 1;
1918                         }
1919                     }
1920                     if ( aho->states[ state ].wordnum ) {
1921                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1922                         if (!leftmost || lpos < leftmost) {
1923                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1924                             leftmost = lpos;
1925                         }
1926                     }
1927                     if (leftmost) {
1928                         s = (char*)leftmost;
1929                         DEBUG_TRIE_EXECUTE_r({
1930                             PerlIO_printf(
1931                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1932                                 (UV)accepted_word, (IV)(s - real_start)
1933                             );
1934                         });
1935                         if (!reginfo || regtry(reginfo, &s)) {
1936                             FREETMPS;
1937                             LEAVE;
1938                             goto got_it;
1939                         }
1940                         s = HOPc(s,1);
1941                         DEBUG_TRIE_EXECUTE_r({
1942                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1943                         });
1944                     } else {
1945                         DEBUG_TRIE_EXECUTE_r(
1946                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1947                         break;
1948                     }
1949                 }
1950                 FREETMPS;
1951                 LEAVE;
1952             }
1953             break;
1954         default:
1955             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1956             break;
1957         }
1958         return 0;
1959       got_it:
1960         return s;
1961 }
1962
1963
1964 /*
1965  - regexec_flags - match a regexp against a string

       Returns 1 on a match (label got_it) and 0 on failure (label phooey).

       Outline of what the body does, in order:
         1. Fails at once when strend - startpos is below minlen (adjusted by
            a negative check_offset_min).
         2. Initialises the per-match globals (PL_reg_flags, PL_reg_eval_set,
            PL_reg_maxiter, PL_bostr, PL_regeol) and the local reginfo.
         3. When RXf_GPOS_SEEN is set, computes reginfo.ganch from
            REXEC_IGNOREPOS, the sv's regex_global magic, data, or strbeg.
         4. When PL_curpm already refers to rx, allocates a fresh prog->offs
            and keeps the old one in swap; swap is freed on success and
            restored on failure.
         5. Unless REXEC_CHECKED is set, calls re_intuit_start() when a check
            substring exists.
         6. Tries the match with regtry() using one of: the anchored cases,
            the GPOS-anchored case, a leading single character (PREGf_SKIP),
            an anchored/floating substring search, the start class via
            find_byclass(), or a plain position-by-position scan.

       On success, unless REXEC_NOT_FIRST is set, prog->subbeg/sublen are set
       (to a copy of the string when REXEC_COPY_STR is set).
1966  */
1967 I32
1968 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1969               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1970 /* strend: pointer to null at end of string */
1971 /* strbeg: real beginning of string */
1972 /* minend: end of match must be >=minend after stringarg. */
1973 /* data: May be used for some additional optimizations.
1974          Currently it's only used (read below via PTR2UV) for transmitting
1975          the ganch offset when doing a /g match. This will change */
1976 /* flags: REXEC_* bits; the body tests REXEC_IGNOREPOS, REXEC_CHECKED, REXEC_SCREAM, REXEC_NOT_FIRST and REXEC_COPY_STR. */
1977 {
1978     dVAR;
1979     struct regexp *const prog = (struct regexp *)SvANY(rx);
1980     /*register*/ char *s;
1981     register regnode *c;
1982     /*register*/ char *startpos = stringarg;
1983     I32 minlen;         /* must match at least this many chars */
1984     I32 dontbother = 0; /* how many characters not to try at end */
1985     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1986     I32 scream_pos = -1;                /* Internal iterator of scream. */
1987     char *scream_olds = NULL;
1988     const bool utf8_target = cBOOL(DO_UTF8(sv));
1989     I32 multiline;
1990     RXi_GET_DECL(prog,progi);
1991     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1992     regexp_paren_pair *swap = NULL;
1993     GET_RE_DEBUG_FLAGS_DECL;
1994
1995     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1996     PERL_UNUSED_ARG(data);
         /* NOTE(review): data is nevertheless read in the RXf_GPOS_SEEN
            branch below, so this marker looks stale -- confirm. */
1997
1998     /* Be paranoid... */
1999     if (prog == NULL || startpos == NULL) {
2000         Perl_croak(aTHX_ "NULL regexp parameter");
2001         return 0;
2002     }
2003
2004     multiline = prog->extflags & RXf_PMf_MULTILINE;
2005     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2006
2007     RX_MATCH_UTF8_set(rx, utf8_target);
2008     DEBUG_EXECUTE_r(
2009         debug_start_match(rx, utf8_target, startpos, strend,
2010         "Matching");
2011     );
2012
2013     minlen = prog->minlen;
2014
         /* Quick reject: what is left of the string is shorter than minlen
            (relaxed by check_offset_min when that value is negative).  swap
            is still NULL here, so phooey has nothing to roll back. */
2015     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2016         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2017                               "String too short [regexec_flags]...\n"));
2018         goto phooey;
2019     }
2020
2021
2022     /* Check validity of program. */
2023     if (UCHARAT(progi->program) != REG_MAGIC) {
2024         Perl_croak(aTHX_ "corrupted regexp program");
2025     }
2026
         /* Reset the per-match globals; regtry() tests PL_reg_eval_set to
            decide whether its one-time (?{}) setup still has to run. */
2027     PL_reg_flags = 0;
2028     PL_reg_eval_set = 0;
2029     PL_reg_maxiter = 0;
2030
2031     if (RX_UTF8(rx))
2032         PL_reg_flags |= RF_utf8;
2033
2034     /* Mark beginning of line for ^ and lookbehind. */
2035     reginfo.bol = startpos; /* XXX not used ??? */
2036     PL_bostr  = strbeg;
2037     reginfo.sv = sv;
2038
2039     /* Mark end of line for $ (and such) */
2040     PL_regeol = strend;
2041
2042     /* see how far we have to get to not match where we matched before */
2043     reginfo.till = startpos+minend;
2044
2045     /* If there is a "must appear" string, look for it. */
2046     s = startpos;
2047
2048     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2049         MAGIC *mg;
2050         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2051             reginfo.ganch = startpos + prog->gofs;
2052             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2053               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2054         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2055                   && SvMAGIC(sv)
2056                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2057                   && mg->mg_len >= 0) {
2058             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2059             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2060                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2061
2062             if (prog->extflags & RXf_ANCH_GPOS) {
2063                 if (s > reginfo.ganch)
2064                     goto phooey;
2065                 s = reginfo.ganch - prog->gofs;
2066                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2067                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2068                 if (s < strbeg)
2069                     goto phooey;
2070             }
2071         }
2072         else if (data) {
2073             reginfo.ganch = strbeg + PTR2UV(data);
2074             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2075                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2076
2077         } else {                                /* pos() not defined */
2078             reginfo.ganch = strbeg;
2079             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2080                  "GPOS: reginfo.ganch = strbeg\n"));
2081         }
2082     }
2083     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2084         /* We have to be careful. If the previous successful match
2085            was from this regex we don't want a subsequent partially
2086            successful match to clobber the old results.
2087            So when we detect this possibility we add a swap buffer
2088            to the re, and switch the buffer each match. If we fail
2089            we switch it back, otherwise we leave it swapped.
2090         */
2091         swap = prog->offs;
2092         /* do we need a save destructor here for eval dies? */
2093         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2094     }
2095     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2096         re_scream_pos_data d;
2097
2098         d.scream_olds = &scream_olds;
2099         d.scream_pos = &scream_pos;
2100         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2101         if (!s) {
2102             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2103             goto phooey;        /* not present */
2104         }
2105     }
2106
2107
2108
2109     /* Simplest case:  anchored match need be tried only once. */
2110     /*  [unless only anchor is BOL and multiline is set] */
2111     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2112         if (s == startpos && regtry(&reginfo, &startpos))
2113             goto got_it;
2114         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2115                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2116         {
2117             char *end;
2118
2119             if (minlen)
2120                 dontbother = minlen - 1;
2121             end = HOP3c(strend, -dontbother, strbeg) - 1;
2122             /* for multiline we only have to try after newlines */
2123             if (prog->check_substr || prog->check_utf8) {
2124                 /* because of the goto we can not easily reuse the macros for bifurcating the
2125                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2126                 if (utf8_target) {
                         /* s == startpos was already tried (and failed) just
                            above, so enter the loop after its regtry(). */
2127                     if (s == startpos)
2128                         goto after_try_utf8;
2129                     while (1) {
2130                         if (regtry(&reginfo, &s)) {
2131                             goto got_it;
2132                         }
2133                       after_try_utf8:
2134                         if (s > end) {
2135                             goto phooey;
2136                         }
2137                         if (prog->extflags & RXf_USE_INTUIT) {
2138                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2139                             if (!s) {
2140                                 goto phooey;
2141                             }
2142                         }
2143                         else {
2144                             s += UTF8SKIP(s);
2145                         }
2146                     }
2147                 } /* end search for check string in unicode */
2148                 else {
                         /* Same loop as the utf8 one, stepping one byte at a
                            time instead of UTF8SKIP(s). */
2149                     if (s == startpos) {
2150                         goto after_try_latin;
2151                     }
2152                     while (1) {
2153                         if (regtry(&reginfo, &s)) {
2154                             goto got_it;
2155                         }
2156                       after_try_latin:
2157                         if (s > end) {
2158                             goto phooey;
2159                         }
2160                         if (prog->extflags & RXf_USE_INTUIT) {
2161                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2162                             if (!s) {
2163                                 goto phooey;
2164                             }
2165                         }
2166                         else {
2167                             s++;
2168                         }
2169                     }
2170                 } /* end search for check string in latin*/
2171             } /* end search for check string */
2172             else { /* search for newline */
2173                 if (s > startpos) {
2174                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2175                     s--;
2176                 }
2177                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2178                 while (s < end) {
2179                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2180                         if (regtry(&reginfo, &s))
2181                             goto got_it;
2182                     }
2183                 }
2184             } /* end search for newline */
2185         } /* end anchored/multiline check string search */
2186         goto phooey;
2187     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2188     {
2189         /* the warning about reginfo.ganch being used without initialization
2190            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2191            and we only enter this block when the same bit is set. */
2192         char *tmp_s = reginfo.ganch - prog->gofs;
2193
2194         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2195             goto got_it;
2196         goto phooey;
2197     }
2198
2199     /* Messy cases:  unanchored match. */
2200     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2201         /* we have /x+whatever/ */
2202         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2203         char ch;
2204 #ifdef DEBUGGING
2205         int did_match = 0;
2206 #endif
2207         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2208             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2209         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2210
             /* After a failed attempt at a position holding ch, the inner
                while loops step over the following positions that also
                hold ch before scanning resumes. */
2211         if (utf8_target) {
2212             REXEC_FBC_SCAN(
2213                 if (*s == ch) {
2214                     DEBUG_EXECUTE_r( did_match = 1 );
2215                     if (regtry(&reginfo, &s)) goto got_it;
2216                     s += UTF8SKIP(s);
2217                     while (s < strend && *s == ch)
2218                         s += UTF8SKIP(s);
2219                 }
2220             );
2221         }
2222         else {
2223             REXEC_FBC_SCAN(
2224                 if (*s == ch) {
2225                     DEBUG_EXECUTE_r( did_match = 1 );
2226                     if (regtry(&reginfo, &s)) goto got_it;
2227                     s++;
2228                     while (s < strend && *s == ch)
2229                         s++;
2230                 }
2231             );
2232         }
2233         DEBUG_EXECUTE_r(if (!did_match)
2234                 PerlIO_printf(Perl_debug_log,
2235                                   "Did not find anchored character...\n")
2236                );
2237     }
2238     else if (prog->anchored_substr != NULL
2239               || prog->anchored_utf8 != NULL
2240               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2241                   && prog->float_max_offset < strend - s)) {
2242         SV *must;
2243         I32 back_max;
2244         I32 back_min;
2245         char *last;
2246         char *last1;            /* Last position checked before */
2247 #ifdef DEBUGGING
2248         int did_match = 0;
2249 #endif
             /* Pick the substring to search for (must) and the range of
                offsets [back_min, back_max] at which it may sit relative to
                the start of a match. */
2250         if (prog->anchored_substr || prog->anchored_utf8) {
2251             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2252                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2253             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2254             back_max = back_min = prog->anchored_offset;
2255         } else {
2256             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2257                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2258             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2259             back_max = prog->float_max_offset;
2260             back_min = prog->float_min_offset;
2261         }
2262
2263
2264         if (must == &PL_sv_undef)
2265             /* could not downgrade utf8 check substring, so must fail */
2266             goto phooey;
2267
2268         if (back_min<0) {
2269             last = strend;
2270         } else {
2271             last = HOP3c(strend,        /* Cannot start after this */
2272                   -(I32)(CHR_SVLEN(must)
2273                          - (SvTAIL(must) != 0) + back_min), strbeg);
2274         }
2275         if (s > PL_bostr)
2276             last1 = HOPc(s, -1);
2277         else
2278             last1 = s - 1;      /* bogus */
2279
2280         /* XXXX check_substr already used to find "s", can optimize if
2281            check_substr==must. */
2282         scream_pos = -1;
2283         dontbother = end_shift;
2284         strend = HOPc(strend, -dontbother);
             /* Each iteration finds the next occurrence of must, then tries
                every start position from which that occurrence is reachable
                and that has not been tried yet (positions up to last1 were
                covered by earlier iterations). */
2285         while ( (s <= last) &&
2286                 ((flags & REXEC_SCREAM)
2287                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2288                                     end_shift, &scream_pos, 0))
2289                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2290                                   (unsigned char*)strend, must,
2291                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2292             /* we may be pointing at the wrong string */
2293             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2294                 s = strbeg + (s - SvPVX_const(sv));
2295             DEBUG_EXECUTE_r( did_match = 1 );
2296             if (HOPc(s, -back_max) > last1) {
2297                 last1 = HOPc(s, -back_min);
2298                 s = HOPc(s, -back_max);
2299             }
2300             else {
2301                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2302
2303                 last1 = HOPc(s, -back_min);
2304                 s = t;
2305             }
2306             if (utf8_target) {
2307                 while (s <= last1) {
2308                     if (regtry(&reginfo, &s))
2309                         goto got_it;
2310                     s += UTF8SKIP(s);
2311                 }
2312             }
2313             else {
2314                 while (s <= last1) {
2315                     if (regtry(&reginfo, &s))
2316                         goto got_it;
2317                     s++;
2318                 }
2319             }
2320         }
2321         DEBUG_EXECUTE_r(if (!did_match) {
2322             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2323                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2324             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2325                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2326                                ? "anchored" : "floating"),
2327                 quoted, RE_SV_TAIL(must));
2328         });
2329         goto phooey;
2330     }
2331     else if ( (c = progi->regstclass) ) {
2332         if (minlen) {
2333             const OPCODE op = OP(progi->regstclass);
2334             /* don't bother with what can't match */
2335             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2336                 strend = HOPc(strend, -(minlen - 1));
2337         }
2338         DEBUG_EXECUTE_r({
2339             SV * const prop = sv_newmortal();
2340             regprop(prog, prop, c);
2341             {
2342                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2343                     s,strend-s,60);
2344                 PerlIO_printf(Perl_debug_log,
2345                     "Matching stclass %.*s against %s (%d bytes)\n",
2346                     (int)SvCUR(prop), SvPVX_const(prop),
2347                      quoted, (int)(strend - s));
2348             }
2349         });
2350         if (find_byclass(prog, c, s, strend, &reginfo))
2351             goto got_it;
2352         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2353     }
2354     else {
2355         dontbother = 0;
2356         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2357             /* Trim the end. */
2358             char *last;
2359             SV* float_real;
2360
2361             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2362                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2363             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2364
2365             if (flags & REXEC_SCREAM) {
2366                 last = screaminstr(sv, float_real, s - strbeg,
2367                                    end_shift, &scream_pos, 1); /* last one */
2368                 if (!last)
2369                     last = scream_olds; /* Only one occurrence. */
2370                 /* we may be pointing at the wrong string */
2371                 else if (RXp_MATCH_COPIED(prog))
2372                     s = strbeg + (s - SvPVX_const(sv));
2373             }
2374             else {
2375                 STRLEN len;
2376                 const char * const little = SvPV_const(float_real, len);
2377
2378                 if (SvTAIL(float_real)) {
2379                     if (memEQ(strend - len + 1, little, len - 1))
2380                         last = strend - len + 1;
2381                     else if (!multiline)
2382                         last = memEQ(strend - len, little, len)
2383                             ? strend - len : NULL;
2384                     else
2385                         goto find_last;
2386                 } else {
2387                   find_last:
2388                     if (len)
2389                         last = rninstr(s, strend, little, little + len);
2390                     else
2391                         last = strend;  /* matching "$" */
2392                 }
2393             }
2394             if (last == NULL) {
2395                 DEBUG_EXECUTE_r(
2396                     PerlIO_printf(Perl_debug_log,
2397                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2398                         PL_colors[4], PL_colors[5]));
2399                 goto phooey; /* Should not happen! */
2400             }
2401             dontbother = strend - last + prog->float_min_offset;
2402         }
2403         if (minlen && (dontbother < minlen))
2404             dontbother = minlen - 1;
2405         strend -= dontbother;              /* this one's always in bytes! */
2406         /* We don't know much -- general case. */
2407         if (utf8_target) {
2408             for (;;) {
2409                 if (regtry(&reginfo, &s))
2410                     goto got_it;
2411                 if (s >= strend)
2412                     break;
2413                 s += UTF8SKIP(s);
2414             };
2415         }
2416         else {
2417             do {
2418                 if (regtry(&reginfo, &s))
2419                     goto got_it;
2420             } while (s++ < strend);
2421         }
2422     }
2423
2424     /* Failure. */
2425     goto phooey;
2426
2427 got_it:
         /* Success: the previous offsets buffer saved in swap (NULL when no
            swap was set up) is no longer needed. */
2428     Safefree(swap);
2429     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2430
2431     if (PL_reg_eval_set)
2432         restore_pos(aTHX_ prog);
2433     if (RXp_PAREN_NAMES(prog))
2434         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2435
2436     /* make sure $`, $&, $', and $digit will work later */
2437     if ( !(flags & REXEC_NOT_FIRST) ) {
2438         RX_MATCH_COPY_FREE(rx);
2439         if (flags & REXEC_COPY_STR) {
                 /* i is the number of bytes from strbeg to PL_regeol as long
                    as startpos still equals stringarg.
                    NOTE(review): regtry() can overwrite *startpos with a
                    cutpoint on a failed attempt, and the anchored branch
                    passes &startpos -- confirm this cannot shorten i. */
2440             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2441 #ifdef PERL_OLD_COPY_ON_WRITE
2442             if ((SvIsCOW(sv)
2443                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2444                 if (DEBUG_C_TEST) {
2445                     PerlIO_printf(Perl_debug_log,
2446                                   "Copy on write: regexp capture, type %d\n",
2447                                   (int) SvTYPE(sv));
2448                 }
2449                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2450                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2451                 assert (SvPOKp(prog->saved_copy));
2452             } else
2453 #endif
2454             {
2455                 RX_MATCH_COPIED_on(rx);
2456                 s = savepvn(strbeg, i);
2457                 prog->subbeg = s;
2458             }
2459             prog->sublen = i;
2460         }
2461         else {
2462             prog->subbeg = strbeg;
2463             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2464         }
2465     }
2466
2467     return 1;
2468
2469 phooey:
2470     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2471                           PL_colors[4], PL_colors[5]));
2472     if (PL_reg_eval_set)
2473         restore_pos(aTHX_ prog);
2474     if (swap) {
2475         /* we failed :-( roll it back */
2476         Safefree(prog->offs);
2477         prog->offs = swap;
2478     }
2479
2480     return 0;
2481 }
2482
2483
2484 /*
2485  - regtry - try match at specific point

       Runs regmatch() once for the regexp in reginfo->prog, starting at
       *startpos.  Returns 1 on success, after storing the end offset of the
       match in PL_regoffs[0].end; returns 0 on failure.

       reginfo->cutpoint is cleared on entry; if it is non-NULL after a
       failed regmatch(), *startpos is overwritten with it, so the caller's
       position pointer may move.

       The first call for a pattern with RXf_EVAL_SEEN (PL_reg_eval_set still
       0) also performs one-time setup: it saves the stack/tmps state, makes
       reginfo->sv the current $_, finds or creates the sv's regex_global
       magic, registers restore_pos as a destructor, and points PL_curpm at
       PL_reg_curpm.
2486  */
2487 STATIC I32                      /* 0 failure, 1 success */
2488 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2489 {
2490     dVAR;
2491     CHECKPOINT lastcp;
2492     REGEXP *const rx = reginfo->prog;
2493     regexp *const prog = (struct regexp *)SvANY(rx);
2494     RXi_GET_DECL(prog,progi);
2495     GET_RE_DEBUG_FLAGS_DECL;
2496
2497     PERL_ARGS_ASSERT_REGTRY;
2498
2499     reginfo->cutpoint=NULL;
2500
         /* One-time setup; skipped on later calls because PL_reg_eval_set is
            set to RS_init just below. */
2501     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2502         MAGIC *mg;
2503
2504         PL_reg_eval_set = RS_init;
2505         DEBUG_EXECUTE_r(DEBUG_s(
2506             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2507                           (IV)(PL_stack_sp - PL_stack_base));
2508             ));
2509         SAVESTACK_CXPOS();
2510         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2511         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2512         SAVETMPS;
2513         /* Apparently this is not needed, judging by wantarray. */
2514         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2515            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2516
2517         if (reginfo->sv) {
2518             /* Make $_ available to executed code. */
2519             if (reginfo->sv != DEFSV) {
2520                 SAVE_DEFSV;
2521                 DEFSV_set(reginfo->sv);
2522             }
2523
                 /* Reuse existing regex_global magic on the sv, otherwise
                    attach new magic with mg_len == -1. */
2524             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2525                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2526                 /* prepare for quick setting of pos */
2527 #ifdef PERL_OLD_COPY_ON_WRITE
2528                 if (SvIsCOW(reginfo->sv))
2529                     sv_force_normal_flags(reginfo->sv, 0);
2530 #endif
2531                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2532                                  &PL_vtbl_mglob, NULL, 0);
2533                 mg->mg_len = -1;
2534             }
                 /* Remember the magic and its current mg_len in globals and
                    register restore_pos as a destructor. */
2535             PL_reg_magic    = mg;
2536             PL_reg_oldpos   = mg->mg_len;
2537             SAVEDESTRUCTOR_X(restore_pos, prog);
2538         }
2539         if (!PL_reg_curpm) {
2540             Newxz(PL_reg_curpm, 1, PMOP);
2541 #ifdef USE_ITHREADS
2542             {
2543                 SV* const repointer = &PL_sv_undef;
2544                 /* this regexp is also owned by the new PL_reg_curpm, which
2545                    will try to free it.  */
2546                 av_push(PL_regex_padav, repointer);
2547                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2548                 PL_regex_pad = AvARRAY(PL_regex_padav);
2549             }
2550 #endif
2551         }
2552 #ifdef USE_ITHREADS
2553         /* It seems that non-ithreads works both with and without this code.
2554            So for efficiency reasons it seems best not to have the code
2555            compiled when it is not needed.  */
2556         /* This is safe against NULLs: */
2557         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2558         /* PM_reg_curpm owns a reference to this regexp.  */
2559         ReREFCNT_inc(rx);
2560 #endif
             /* Make PL_reg_curpm (now holding rx) the current PMOP, keeping
                the previous PL_curpm in PL_reg_oldcurpm. */
2561         PM_SETRE(PL_reg_curpm, rx);
2562         PL_reg_oldcurpm = PL_curpm;
2563         PL_curpm = PL_reg_curpm;
2564         if (RXp_MATCH_COPIED(prog)) {
2565             /*  Here is a serious problem: we cannot rewrite subbeg,
2566                 since it may be needed if this match fails.  Thus
2567                 $` inside (?{}) could fail... */
2568             PL_reg_oldsaved = prog->subbeg;
2569             PL_reg_oldsavedlen = prog->sublen;
2570 #ifdef PERL_OLD_COPY_ON_WRITE
2571             PL_nrs = prog->saved_copy;
2572 #endif
2573             RXp_MATCH_COPIED_off(prog);
2574         }
2575         else
2576             PL_reg_oldsaved = NULL;
2577         prog->subbeg = PL_bostr;
2578         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2579     }
         /* Per-attempt state: record the start offset and reset the paren
            bookkeeping before running the program. */
2580     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2581     prog->offs[0].start = *startpos - PL_bostr;
2582     PL_reginput = *startpos;
2583     PL_reglastparen = &prog->lastparen;
2584     PL_reglastcloseparen = &prog->lastcloseparen;
2585     prog->lastparen = 0;
2586     prog->lastcloseparen = 0;
2587     PL_regsize = 0;
2588     PL_regoffs = prog->offs;
         /* Grow PL_reg_start_tmp so that it has more than nparens slots. */
2589     if (PL_reg_start_tmpl <= prog->nparens) {
2590         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2591         if(PL_reg_start_tmp)
2592             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2593         else
2594             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2595     }
2596
2597     /* XXXX What this code is doing here?!!!  There should be no need
2598        to do this again and again, PL_reglastparen should take care of
2599        this!  --ilya*/
2600
2601     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2602      * Actually, the code in regcppop() (which Ilya may be meaning by
2603      * PL_reglastparen), is not needed at all by the test suite
2604      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2605      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2606      * Meanwhile, this code *is* needed for the
2607      * above-mentioned test suite tests to succeed.  The common theme
2608      * on those tests seems to be returning null fields from matches.
2609      * --jhi updated by dapm */
2610 #if 1
         /* *PL_reglastparen is prog->lastparen, which was set to 0 just
            above, so this loop marks offs[1] .. offs[nparens] as unset (-1);
            offs[0] is skipped by the pre-increment of pp. */
2611     if (prog->nparens) {
2612         regexp_paren_pair *pp = PL_regoffs;
2613         register I32 i;
2614         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2615             ++pp;
2616             pp->start = -1;
2617             pp->end = -1;
2618         }
2619     }
2620 #endif
2621     REGCP_SET(lastcp);
         /* progi->program[0] was checked against REG_MAGIC by the caller;
            matching starts at the node after it. */
2622     if (regmatch(reginfo, progi->program + 1)) {
2623         PL_regoffs[0].end = PL_reginput - PL_bostr;
2624         return 1;
2625     }
         /* Failure: cutpoint was NULL on entry, so a non-NULL value was set
            while regmatch() ran; hand it back as the caller's new position. */
2626     if (reginfo->cutpoint)
2627         *startpos= reginfo->cutpoint;
2628     REGCP_UNWIND(lastcp);
2629     return 0;
2630 }
2631
2632
2633 #define sayYES goto yes
2634 #define sayNO goto no
2635 #define sayNO_SILENT goto no_silent
     /* The three macros above each expand to a bare goto, so the function
        using them must define the labels yes, no and no_silent. */
2636
2637 /* CACHEsayNO: when ST.cache_mask is non-zero, OR it into
          PL_reg_poscache[ST.cache_offset]; then sayNO unconditionally (the
          sayNO is not part of the if).
          We don't use STMT_START/END here because it leads to
2638    "unreachable code" warnings, which are bogus, but distracting. */
2639 #define CACHEsayNO \
2640     if (ST.cache_mask) \
2641        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2642     sayNO
2643
2644 /* this is used to determine how far from the left messages like
2645    'failed...' are printed. It should be set such that messages
2646    are inline with the regop output that created them.
2647 */
2648 #define REPORT_CODE_OFF 32
2649
2650
     /* Sentinel values for the c1/c2 "next char" test. */
2651 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2652 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2653
     /* Address of the first / last regmatch_state slot in slab s; a slab
        holds PERL_REGMATCH_SLAB_SLOTS states. */
2654 #define SLAB_FIRST(s) (&(s)->states[0])
2655 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2656
2657 /* grab a new slab and return the first slot in it.
      *
      * Moves PL_regmatch_slab on to the slab after the current one.  If the
      * current slab has no next slab, one is allocated with Newx() and
      * linked in through its prev/next pointers; a slab that is already
      * linked is reused as-is.  Returns SLAB_FIRST() of the slab that is
      * now current.
      */
2658
2659 STATIC regmatch_state *
2660 S_push_slab(pTHX)
2661 {
2662 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2663     dMY_CXT;
2664 #endif
2665     regmatch_slab *s = PL_regmatch_slab->next;
2666     if (!s) {
             /* End of the chain: allocate a slab and append it. */
2667         Newx(s, 1, regmatch_slab);
2668         s->prev = PL_regmatch_slab;
2669         s->next = NULL;
2670         PL_regmatch_slab->next = s;
2671     }
2672     PL_regmatch_slab = s;
2673     return SLAB_FIRST(s);
2674 }
2675
2676
2677 /* push a new state then goto it
      *
      * Sets 'scan' to node, stores state in st->resume_state, then jumps to
      * the push_state label.  The expansion is three plain statements (it is
      * not wrapped in a block) and refers to 'scan', 'st' and the label by
      * name, so it can only be used as a statement inside a braced block of
      * a function that provides all three.
      */
2678
2679 #define PUSH_STATE_GOTO(state, node) \
2680     scan = node; \
2681     st->resume_state = state; \
2682     goto push_state;
2683
2684 /* push a new state with success backtracking, then goto it
      *
      * Same expansion as PUSH_STATE_GOTO except that the final jump is to
      * the push_yes_state label.
      */
2685
2686 #define PUSH_YES_STATE_GOTO(state, node) \
2687     scan = node; \
2688     st->resume_state = state; \
2689     goto push_yes_state;
2690
2691
2692
2693 /*
2694
2695 regmatch() - main matching routine
2696
2697 This is basically one big switch statement in a loop. We execute an op,
2698 set 'next' to point to the next op, and continue. If we come to a point which
2699 we may need to backtrack to on failure such as (A|B|C), we push a
2700 backtrack state onto the backtrack stack. On failure, we pop the top
2701 state, and re-enter the loop at the state indicated. If there are no more
2702 states to pop, we return failure.
2703
2704 Sometimes we also need to backtrack on success; for example /A+/, where
2705 after successfully matching one A, we need to go back and try to
2706 match another one; similarly for lookahead assertions: if the assertion
2707 completes successfully, we backtrack to the state just before the assertion
2708 and then carry on.  In these cases, the pushed state is marked as
2709 'backtrack on success too'. This marking is in fact done by a chain of
2710 pointers, each pointing to the previous 'yes' state. On success, we pop to
2711 the nearest yes state, discarding any intermediate failure-only states.
2712 Sometimes a yes state is pushed just to force some cleanup code to be
2713 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2714 it to free the inner regex.
2715
2716 Note that failure backtracking rewinds the cursor position, while
2717 success backtracking leaves it alone.
2718
2719 A pattern is complete when the END op is executed, while a subpattern
2720 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2721 ops trigger the "pop to last yes state if any, otherwise return true"
2722 behaviour.
2723
2724 A common convention in this function is to use A and B to refer to the two
2725 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2726 the subpattern to be matched possibly multiple times, while B is the entire
2727 rest of the pattern. Variable and state names reflect this convention.
2728
2729 The states in the main switch are the union of ops and failure/success of
2730 substates associated with that op.  For example, IFMATCH is the op
2731 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2732 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2733 successfully matched A and IFMATCH_A_fail is a state saying that we have
2734 just failed to match A. Resume states always come in pairs. The backtrack
2735 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2736 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2737 on success or failure.
2738
2739 The struct that holds a backtracking state is actually a big union, with
2740 one variant for each major type of op. The variable st points to the
2741 top-most backtrack struct. To make the code clearer, within each
2742 block of code we #define ST to alias the relevant union.
2743
2744 Here's a concrete example of a (vastly oversimplified) IFMATCH
2745 implementation:
2746
2747     switch (state) {
2748     ....
2749
2750 #define ST st->u.ifmatch
2751
2752     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2753         ST.foo = ...; // some state we wish to save
2754         ...
2755         // push a yes backtrack state with a resume value of
2756         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2757         // first node of A:
2758         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2759         // NOTREACHED
2760
2761     case IFMATCH_A: // we have successfully executed A; now continue with B
2762         next = B;
2763         bar = ST.foo; // do something with the preserved value
2764         break;
2765
2766     case IFMATCH_A_fail: // A failed, so the assertion failed
2767         ...;   // do some housekeeping, then ...
2768         sayNO; // propagate the failure
2769
2770 #undef ST
2771
2772     ...
2773     }
2774
2775 For any old-timers reading this who are familiar with the old recursive
2776 approach, the code above is equivalent to:
2777
2778     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2779     {
2780         int foo = ...
2781         ...
2782         if (regmatch(A)) {
2783             next = B;
2784             bar = foo;
2785             break;
2786         }
2787         ...;   // do some housekeeping, then ...
2788         sayNO; // propagate the failure
2789     }
2790
2791 The topmost backtrack state, pointed to by st, is usually free. If you
2792 want to claim it, populate any ST.foo fields in it with values you wish to
2793 save, then do one of
2794
2795         PUSH_STATE_GOTO(resume_state, node);
2796         PUSH_YES_STATE_GOTO(resume_state, node);
2797
2798 which sets that backtrack state's resume value to 'resume_state', pushes a
2799 new free entry to the top of the backtrack stack, then goes to 'node'.
2800 On backtracking, the free slot is popped, and the saved state becomes the
2801 new free state. An ST.foo field in this new top state can be temporarily
2802 accessed to retrieve values, but once the main loop is re-entered, it
2803 becomes available for reuse.
2804
2805 Note that the depth of the backtrack stack constantly increases during the
2806 left-to-right execution of the pattern, rather than going up and down with
2807 the pattern nesting. For example the stack is at its maximum at Z at the
2808 end of the pattern, rather than at X in the following:
2809
2810     /(((X)+)+)+....(Y)+....Z/
2811
2812 The only exceptions to this are lookahead/behind assertions and the cut,
2813 (?>A), which pop all the backtrack states associated with A before
2814 continuing.
2815
2816 Backtrack state structs are allocated in slabs of about 4K in size.
2817 PL_regmatch_state and st always point to the currently active state,
2818 and PL_regmatch_slab points to the slab currently containing
2819 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2820 allocated, and is never freed until interpreter destruction. When the slab
2821 is full, a new one is allocated and chained to the end. At exit from
2822 regmatch(), slabs allocated since entry are freed.
2823
2824 */
2825
2826
     /* DEBUG_STATE_pp(pp): debugging trace for a state push/pop.
      *
      * Inside DEBUG_STATE_r, calls DUMP_EXEC_POS for the current position and
      * then prints one line to Perl_debug_log consisting of: four spaces, a
      * pad of depth*2 columns, the text pp, the PL_reg_name[] entry for
      * st->resume_state, and a marker that is "[Y]" when st is yes_state,
      * "[M]" when st is mark_state, "[YM]" when it is both, and empty
      * otherwise.
      *
      * pp is pasted between two adjacent string literals of the format, so it
      * must itself be a string literal.  The macro refers to locinput, scan,
      * utf8_target, depth, st, yes_state and mark_state by name, so all of
      * them must be in scope where it is used.
      */
2827 #define DEBUG_STATE_pp(pp)                                  \
2828     DEBUG_STATE_r({                                         \
2829         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2830         PerlIO_printf(Perl_debug_log,                       \
2831             "    %*s"pp" %s%s%s%s%s\n",                     \
2832             depth*2, "",                                    \
2833             PL_reg_name[st->resume_state],                     \
2834             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2835             ((st==yes_state) ? "Y" : ""),                   \
2836             ((st==mark_state) ? "M" : ""),                  \
2837             ((st==yes_state||st==mark_state) ? "]" : "")    \
2838         );                                                  \
2839     });
2840
2841
     /* REG_NODE_NUM(x): the offset of node x from 'prog' (which must be in
      * scope), converted to int, or -1 if x is NULL. */
2842 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2843
2844 #ifdef DEBUGGING
2845
     /* S_debug_start_match()
      * Debugging aid: prints a header line of the form
      *     <blurb> REx <pattern> against <string>
      * to Perl_debug_log, followed, if the pattern and/or the target string
      * is UTF-8, by a line "UTF-8 pattern...", "UTF-8 string..." or
      * "UTF-8 pattern and string...".
      *
      *   prog         the compiled pattern; its source text and length are
      *                taken from RX_PRECOMP_const() and RX_PRELEN()
      *   utf8_target  true if the target string is UTF-8
      *   start, end   delimit the target string; end - start is passed as
      *                its length
      *   blurb        text printed before " REx" in the header line
      *
      * Only compiled when DEBUGGING is defined.
      */
2846 STATIC void
2847 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2848     const char *start, const char *end, const char *blurb)
2849 {
2850     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2851
2852     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2853
         /* make sure PL_colors[] has been set up before it is used below */
2854     if (!PL_colorset)
2855             reginitcolors();
2856     {
             /* s0: the pattern, s1: the target string; the trailing 60 is
              * presumably a display-length limit -- see RE_PV_QUOTED_DECL */
2857         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2858             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2859
2860         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2861             start, end - start, 60);
2862
2863         PerlIO_printf(Perl_debug_log,
2864             "%s%s REx%s %s against %s\n",
2865                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2866
2867         if (utf8_target||utf8_pat)
2868             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2869                 utf8_pat ? "pattern" : "",
2870                 utf8_pat && utf8_target ? " and " : "",
2871                 utf8_target ? "string" : ""
2872             );
2873     }
2874 }
2875
     /* S_dump_exec_pos()
      * Debugging aid: prints, without a trailing newline, the offset of
      * locinput from loc_bostr followed by a window of the target string
      * around locinput, in the form
      *     <offset> <BEFORE-STARTTRY SINCE-STARTTRY> <AFTER>   |
      * where the literal "> <" between the second and third parts is only
      * emitted when no colours are configured.
      *
      *   locinput          current position in the target string
      *   scan              current regnode; only its opcode is examined
      *                     (to see whether it is CANY)
      *   loc_regeol        end of the target string
      *   loc_bostr         beginning of the target string
      *   loc_reg_starttry  position at which the current match attempt began
      *   utf8_target       true if the target string is UTF-8
      *
      * Only compiled when DEBUGGING is defined.
      */
2876 STATIC void
2877 S_dump_exec_pos(pTHX_ const char *locinput,
2878                       const regnode *scan,
2879                       const char *loc_regeol,
2880                       const char *loc_bostr,
2881                       const char *loc_reg_starttry,
2882                       const bool utf8_target)
2883 {
2884     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2885     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
         /* l: number of bytes after locinput, capped at taill */
2886     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2887     /* The part of the string before starttry has one color
2888        (pref0_len chars), between starttry and current
2889        position another one (pref_len - pref0_len chars),
2890        after the current position the third one.
2891        We assume that pref0_len <= pref_len, otherwise we
2892        decrease pref0_len.  */
         /* pref_len: number of bytes before locinput, capped so that
          * pref_len + l does not exceed 5 + taill */
2893     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2894         ? (5 + taill) - l : locinput - loc_bostr;
2895     int pref0_len;
2896
2897     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2898
         /* for UTF-8, extend the prefix backwards while its first byte is a
          * continuation byte.
          * NOTE(review): this loop is not itself bounded by loc_bostr;
          * presumably it relies on the string being well-formed UTF-8 --
          * confirm */
2899     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2900         pref_len++;
         /* the part of the prefix that lies before loc_reg_starttry; may come
          * out negative or too large here and is clamped further down */
2901     pref0_len = pref_len  - (locinput - loc_reg_starttry);
         /* if the prefix did not use up its share, let the tail grow */
2902     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2903         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2904               ? (5 + taill) - pref_len : loc_regeol - locinput);
         /* for UTF-8, shrink the tail while the byte just past it is a
          * continuation byte */
2905     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2906         l--;
2907     if (pref0_len < 0)
2908         pref0_len = 0;
2909     if (pref0_len > pref_len)
2910         pref0_len = pref_len;
2911     {
2912         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2913
             /* s0: the pref0_len bytes before starttry (colours 4,5) */
2914         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2915             (locinput - pref_len),pref0_len, 60, 4, 5);
2916
             /* s1: from starttry up to locinput (colours 2,3) */
2917         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2918                     (locinput - pref_len + pref0_len),
2919                     pref_len - pref0_len, 60, 2, 3);
2920
             /* s2: from locinput onwards (colours 0,1).
              * NOTE(review): this is given the whole remaining length and a
              * literal 10 rather than l, and l is not referenced again after
              * the adjustments above; presumably RE_PV_COLOR_DECL does the
              * truncation -- confirm */
2921         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2922                     locinput, loc_regeol - locinput, 10, 0, 1);
2923
2924         const STRLEN tlen=len0+len1+len2;
             /* the final "%*s" pads the three parts out to 19 columns when
              * together they are shorter than that */
2925         PerlIO_printf(Perl_debug_log,
2926                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2927                     (IV)(locinput - loc_bostr),
2928                     len0, s0,
2929                     len1, s1,
2930                     (docolor ? "" : "> <"),
2931                     len2, s2,
2932                     (int)(tlen > 19 ? 0 :  19 - tlen),
2933                     "");
2934     }
2935 }
2936
2937 #endif
2938
2939 /* reg_check_named_buff_matched()
2940  * Checks to see if a named buffer has matched. The data array of
2941  * buffer numbers corresponding to the buffer is expected to reside
2942  * in the regexp->data->data array in the slot stored in the ARG() of
2943  * node involved. Note that this routine doesn't actually care about the
2944  * name, that information is not preserved from compilation to execution.
2945  * Returns the index of the leftmost defined buffer with the given name
2946  * or 0 if none of the buffers matched.
      *
      * The slot holds an SV whose string buffer is read as an array of I32
      * buffer numbers and whose IV is used as the number of entries.  The
      * entries are tested in array order; a buffer counts as matched when
      * its number is not above *PL_reglastparen and its PL_regoffs[].end is
      * not -1.
2947  */
2948 STATIC I32
2949 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2950 {
2951     I32 n;
2952     RXi_GET_DECL(rex,rexi);
2953     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2954     I32 *nums=(I32*)SvPVX(sv_dat); /* the candidate buffer numbers */
2955
2956     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2957
2958     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2959         if ((I32)*PL_reglastparen >= nums[n] &&
2960             PL_regoffs[nums[n]].end != -1)
2961         {
2962             return nums[n];
2963         }
2964     }
2965     return 0; /* no candidate buffer has matched */
2966 }
2967
2968
2969 /* free all slabs above current one  - called during LEAVE_SCOPE
      *
      * Detaches the chain of slabs that follows PL_regmatch_slab (setting
      * PL_regmatch_slab->next to NULL) and releases each slab in that chain
      * with Safefree().  The current slab and any before it are left alone.
      * The argument p is unused; regmatch() registers this function with
      * SAVEDESTRUCTOR_X() and passes NULL.
      */
2970
2971 STATIC void
2972 S_clear_backtrack_stack(pTHX_ void *p)
2973 {
2974     regmatch_slab *s = PL_regmatch_slab->next;
2975     PERL_UNUSED_ARG(p);
2976
2977     if (!s) /* nothing chained after the current slab */
2978         return;
2979     PL_regmatch_slab->next = NULL;
2980     while (s) {
2981         regmatch_slab * const osl = s;
2982         s = s->next; /* fetch the successor before osl is freed */
2983         Safefree(osl);
2984     }
2985 }
2986
2987
     /* SETREX(Re1,Re2): when PL_reg_eval_set is true, first calls
      * PM_SETRE(PL_reg_curpm, Re2); then, in every case, assigns Re2 to Re1.
      *
      * The expansion is an if statement followed by a separate assignment
      * with no trailing semicolon, and is not wrapped in a block, so use it
      * only as a full statement inside a braced block (never as the unbraced
      * body of an if/else or loop).  Re2 appears twice in the expansion, so
      * it should be free of side effects.
      */
2988 #define SETREX(Re1,Re2) \
2989     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2990     Re1 = (Re2)
2991
2992 STATIC I32                      /* 0 failure, 1 success */
2993 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2994 {
2995 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2996     dMY_CXT;
2997 #endif
2998     dVAR;
2999     register const bool utf8_target = PL_reg_match_utf8;
3000     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3001     REGEXP *rex_sv = reginfo->prog;
3002     regexp *rex = (struct regexp *)SvANY(rex_sv);
3003     RXi_GET_DECL(rex,rexi);
3004     I32 oldsave;
3005     /* the current state. This is a cached copy of PL_regmatch_state */
3006     register regmatch_state *st;
3007     /* cache heavy used fields of st in registers */
3008     register regnode *scan;
3009     register regnode *next;
3010     register U32 n = 0; /* general value; init to avoid compiler warning */
3011     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3012     register char *locinput = PL_reginput;
3013     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3014
3015     bool result = 0;        /* return value of S_regmatch */
3016     int depth = 0;          /* depth of backtrack stack */
3017     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3018     const U32 max_nochange_depth =
3019         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3020         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3021     regmatch_state *yes_state = NULL; /* state to pop to on success of
3022                                                             subpattern */
3023     /* mark_state piggy backs on the yes_state logic so that when we unwind
3024        the stack on success we can update the mark_state as we go */
3025     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3026     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3027     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3028     U32 state_num;
3029     bool no_final = 0;      /* prevent failure from backtracking? */
3030     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3031     char *startpoint = PL_reginput;
3032     SV *popmark = NULL;     /* are we looking for a mark? */
3033     SV *sv_commit = NULL;   /* last mark name seen in failure */
3034     SV *sv_yes_mark = NULL; /* last mark name we have seen
3035                                during a successful match */
3036     U32 lastopen = 0;       /* last open we saw */
3037     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3038     SV* const oreplsv = GvSV(PL_replgv);
3039     /* these three flags are set by various ops to signal information to
3040      * the very next op. They have a useful lifetime of exactly one loop
3041      * iteration, and are not preserved or restored by state pushes/pops
3042      */
3043     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3044     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3045     int logical = 0;        /* the following EVAL is:
3046                                 0: (?{...})
3047                                 1: (?(?{...})X|Y)
3048                                 2: (??{...})
3049                                or the following IFMATCH/UNLESSM is:
3050                                 false: plain (?=foo)
3051                                 true:  used as a condition: (?(?=foo))
3052                             */
3053 #ifdef DEBUGGING
3054     GET_RE_DEBUG_FLAGS_DECL;
3055 #endif
3056
3057     PERL_ARGS_ASSERT_REGMATCH;
3058
3059     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3060             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3061     }));
3062     /* on first ever call to regmatch, allocate first slab */
3063     if (!PL_regmatch_slab) {
3064         Newx(PL_regmatch_slab, 1, regmatch_slab);
3065         PL_regmatch_slab->prev = NULL;
3066         PL_regmatch_slab->next = NULL;
3067         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3068     }
3069
3070     oldsave = PL_savestack_ix;
3071     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3072     SAVEVPTR(PL_regmatch_slab);
3073     SAVEVPTR(PL_regmatch_state);
3074
3075     /* grab next free state slot */
3076     st = ++PL_regmatch_state;
3077     if (st >  SLAB_LAST(PL_regmatch_slab))
3078         st = PL_regmatch_state = S_push_slab(aTHX);
3079
3080     /* Note that nextchr is a byte even in UTF */
3081     nextchr = UCHARAT(locinput);
3082     scan = prog;
3083     while (scan != NULL) {
3084
3085         DEBUG_EXECUTE_r( {
3086             SV * const prop = sv_newmortal();
3087             regnode *rnext=regnext(scan);
3088             DUMP_EXEC_POS( locinput, scan, utf8_target );
3089             regprop(rex, prop, scan);
3090
3091             PerlIO_printf(Perl_debug_log,
3092                     "%3"IVdf":%*s%s(%"IVdf")\n",
3093                     (IV)(scan - rexi->program), depth*2, "",
3094                     SvPVX_const(prop),
3095                     (PL_regkind[OP(scan)] == END || !rnext) ?
3096                         0 : (IV)(rnext - rexi->program));
3097         });
3098
3099         next = scan + NEXT_OFF(scan);
3100         if (next == scan)
3101             next = NULL;
3102         state_num = OP(scan);
3103
3104         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3105       reenter_switch:
3106
3107         assert(PL_reglastparen == &rex->lastparen);
3108         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3109         assert(PL_regoffs == rex->offs);
3110
3111         switch (state_num) {
3112         case BOL:
3113             if (locinput == PL_bostr)
3114             {
3115                 /* reginfo->till = reginfo->bol; */
3116                 break;
3117             }
3118             sayNO;
3119         case MBOL:
3120             if (locinput == PL_bostr ||
3121                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3122             {
3123                 break;
3124             }
3125             sayNO;
3126         case SBOL:
3127             if (locinput == PL_bostr)
3128                 break;
3129             sayNO;
3130         case GPOS:
3131             if (locinput == reginfo->ganch)
3132                 break;
3133             sayNO;
3134
3135         case KEEPS:
3136             /* update the startpoint */
3137             st->u.keeper.val = PL_regoffs[0].start;
3138             PL_reginput = locinput;
3139             PL_regoffs[0].start = locinput - PL_bostr;
3140             PUSH_STATE_GOTO(KEEPS_next, next);
3141             /*NOT-REACHED*/
3142         case KEEPS_next_fail:
3143             /* rollback the start point change */
3144             PL_regoffs[0].start = st->u.keeper.val;
3145             sayNO_SILENT;
3146             /*NOT-REACHED*/
3147         case EOL:
3148                 goto seol;
3149         case MEOL:
3150             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3151                 sayNO;
3152             break;
3153         case SEOL:
3154           seol:
3155             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3156                 sayNO;
3157             if (PL_regeol - locinput > 1)
3158                 sayNO;
3159             break;
3160         case EOS:
3161             if (PL_regeol != locinput)
3162                 sayNO;
3163             break;
3164         case SANY:
3165             if (!nextchr && locinput >= PL_regeol)
3166                 sayNO;
3167             if (utf8_target) {
3168                 locinput += PL_utf8skip[nextchr];
3169                 if (locinput > PL_regeol)
3170                     sayNO;
3171                 nextchr = UCHARAT(locinput);
3172             }
3173             else
3174                 nextchr = UCHARAT(++locinput);
3175             break;
3176         case CANY:
3177             if (!nextchr && locinput >= PL_regeol)
3178                 sayNO;
3179             nextchr = UCHARAT(++locinput);
3180             break;
3181         case REG_ANY:
3182             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3183                 sayNO;
3184             if (utf8_target) {
3185                 locinput += PL_utf8skip[nextchr];
3186                 if (locinput > PL_regeol)
3187                     sayNO;
3188                 nextchr = UCHARAT(locinput);
3189             }
3190             else
3191                 nextchr = UCHARAT(++locinput);
3192             break;
3193
3194 #undef  ST
3195 #define ST st->u.trie
3196         case TRIEC:
3197             /* In this case the charclass data is available inline so
3198                we can fail fast without a lot of extra overhead.
3199              */
3200             if (scan->flags == EXACT || !utf8_target) {
3201                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3202                     DEBUG_EXECUTE_r(
3203                         PerlIO_printf(Perl_debug_log,
3204                                   "%*s  %sfailed to match trie start class...%s\n",
3205                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3206                     );
3207                     sayNO_SILENT;
3208                     /* NOTREACHED */
3209                 }
3210             }
3211             /* FALL THROUGH */
3212         case TRIE:
3213             /* the basic plan of execution of the trie is:
3214              * At the beginning, run through all the states, and
3215              * find the longest-matching word. Also remember the position
3216              * of the shortest matching word. For example, this pattern:
3217              *    1  2 3 4    5
3218              *    ab|a|x|abcd|abc
3219              * when matched against the string "abcde", will generate
3220              * accept states for all words except 3, with the longest
3221              * matching word being 4, and the shortest being 1 (with
3222              * the position being after char 1 of the string).
3223              *
3224              * Then for each matching word, in word order (i.e. 1,2,4,5),
3225              * we run the remainder of the pattern; on each try setting
3226              * the current position to the character following the word,
3227              * returning to try the next word on failure.
3228              *
3229              * We avoid having to build a list of words at runtime by
3230              * using a compile-time structure, wordinfo[].prev, which
3231              * gives, for each word, the previous accepting word (if any).
3232              * In the case above it would contain the mappings 1->2, 2->0,
3233              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3234              * the longest word (4 above), a list of all words, by
3235              * following the list of prev pointers; this gives us the
3236              * unordered list 4,5,1,2. Then given the current word we have
3237              * just tried, we can go through the list and find the
3238              * next-biggest word to try (so if we just failed on word 2,
3239              * the next in the list is 4).
3240              *
3241              * Since at runtime we don't record the matching position in
3242              * the string for each word, we have to work that out for
3243              * each word we're about to process. The wordinfo table holds
3244              * the character length of each word; given that we recorded
3245              * at the start: the position of the shortest word and its
3246              * length in chars, we just need to move the pointer the
3247              * difference between the two char lengths. Depending on
3248              * Unicode status and folding, that's cheap or expensive.
3249              *
3250              * This algorithm is optimised for the case where there are only a
3251              * small number of accept states, i.e. 0,1, or maybe 2.
3252              * With lots of accepts states, and having to try all of them,
3253              * it becomes quadratic on number of accept states to find all
3254              * the next words.
3255              */
3256
3257             {
3258                 /* what type of TRIE am I? (utf8 makes this contextual) */
3259                 DECL_TRIE_TYPE(scan);
3260
3261                 /* what trie are we using right now */
3262                 reg_trie_data * const trie
3263                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3264                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3265                 U32 state = trie->startstate;
3266
3267                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3268                     !TRIE_BITMAP_TEST(trie,*locinput)
3269                 ) {
3270                     if (trie->states[ state ].wordnum) {
3271                          DEBUG_EXECUTE_r(
3272                             PerlIO_printf(Perl_debug_log,
3273                                           "%*s  %smatched empty string...%s\n",
3274                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3275                         );
3276                         if (!trie->jump)
3277                             break;
3278                     } else {
3279                         DEBUG_EXECUTE_r(
3280                             PerlIO_printf(Perl_debug_log,
3281                                           "%*s  %sfailed to match trie start class...%s\n",
3282                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3283                         );
3284                         sayNO_SILENT;
3285                    }
3286                 }
3287
3288             {
3289                 U8 *uc = ( U8* )locinput;
3290
3291                 STRLEN len = 0;
3292                 STRLEN foldlen = 0;
3293                 U8 *uscan = (U8*)NULL;
3294                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3295                 U32 charcount = 0; /* how many input chars we have matched */
3296                 U32 accepted = 0; /* have we seen any accepting states? */
3297
3298                 ST.B = next;
3299                 ST.jump = trie->jump;
3300                 ST.me = scan;
3301                 ST.firstpos = NULL;
3302                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3303                 ST.nextword = 0;
3304
3305                 /* fully traverse the TRIE; note the position of the
3306                    shortest accept state and the wordnum of the longest
3307                    accept state */
3308
3309                 while ( state && uc <= (U8*)PL_regeol ) {
3310                     U32 base = trie->states[ state ].trans.base;
3311                     UV uvc = 0;
3312                     U16 charid = 0;
3313                     U16 wordnum;
3314                     wordnum = trie->states[ state ].wordnum;
3315
3316                     if (wordnum) { /* it's an accept state */
3317                         if (!accepted) {
3318                             accepted = 1;
3319                             /* record first match position */
3320                             if (ST.longfold) {
3321                                 ST.firstpos = (U8*)locinput;
3322                                 ST.firstchars = 0;
3323                             }
3324                             else {
3325                                 ST.firstpos = uc;
3326                                 ST.firstchars = charcount;
3327                             }
3328                         }
3329                         if (!ST.nextword || wordnum < ST.nextword)
3330                             ST.nextword = wordnum;
3331                         ST.topword = wordnum;
3332                     }
3333
3334                     DEBUG_TRIE_EXECUTE_r({
3335                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3336                                 PerlIO_printf( Perl_debug_log,
3337                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3338                                     2+depth * 2, "", PL_colors[4],
3339                                     (UV)state, (accepted ? 'Y' : 'N'));
3340                     });
3341
3342                     /* read a char and goto next state */
3343                     if ( base ) {
3344                         I32 offset;
3345                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3346                                              uscan, len, uvc, charid, foldlen,
3347                                              foldbuf, uniflags);
3348                         charcount++;
3349                         if (foldlen>0)
3350                             ST.longfold = TRUE;
3351                         if (charid &&
3352                              ( ((offset =
3353                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3354
3355                              && ((U32)offset < trie->lasttrans)
3356                              && trie->trans[offset].check == state)
3357                         {
3358                             state = trie->trans[offset].next;
3359                         }
3360                         else {
3361                             state = 0;
3362                         }
3363                         uc += len;
3364
3365                     }
3366                     else {
3367                         state = 0;
3368                     }
3369                     DEBUG_TRIE_EXECUTE_r(
3370                         PerlIO_printf( Perl_debug_log,
3371                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3372                             charid, uvc, (UV)state, PL_colors[5] );
3373                     );
3374                 }
3375                 if (!accepted)
3376                    sayNO;
3377
3378                 /* calculate total number of accept states */
3379                 {
3380                     U16 w = ST.topword;
3381                     accepted = 0;
3382                     while (w) {
3383                         w = trie->wordinfo[w].prev;
3384                         accepted++;
3385                     }
3386                     ST.accepted = accepted;
3387                 }
3388
3389                 DEBUG_EXECUTE_r(
3390                     PerlIO_printf( Perl_debug_log,
3391                         "%*s  %sgot %"IVdf" possible matches%s\n",
3392                         REPORT_CODE_OFF + depth * 2, "",
3393                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3394                 );
3395                 goto trie_first_try; /* jump into the fail handler */
3396             }}
3397             /* NOTREACHED */
3398
3399         case TRIE_next_fail: /* we failed - try next alternative */
3400             if ( ST.jump) {
3401                 REGCP_UNWIND(ST.cp);
3402                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3403                     PL_regoffs[n].end = -1;
3404                 *PL_reglastparen = n;
3405             }
3406             if (!--ST.accepted) {
3407                 DEBUG_EXECUTE_r({
3408                     PerlIO_printf( Perl_debug_log,
3409                         "%*s  %sTRIE failed...%s\n",
3410                         REPORT_CODE_OFF+depth*2, "",
3411                         PL_colors[4],
3412                         PL_colors[5] );
3413                 });
3414                 sayNO_SILENT;
3415             }
3416             {
3417                 /* Find next-highest word to process.  Note that this code
3418                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3419                 register U16 min = 0;
3420                 register U16 word;
3421                 register U16 const nextword = ST.nextword;
3422                 register reg_trie_wordinfo * const wordinfo
3423                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3424                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3425                     if (word > nextword && (!min || word < min))
3426                         min = word;
3427                 }
3428                 ST.nextword = min;
3429             }
3430
3431           trie_first_try:
3432             if (do_cutgroup) {
3433                 do_cutgroup = 0;
3434                 no_final = 0;
3435             }
3436
3437             if ( ST.jump) {
3438                 ST.lastparen = *PL_reglastparen;
3439                 REGCP_SET(ST.cp);
3440             }
3441
3442             /* find start char of end of current word */
3443             {
3444                 U32 chars; /* how many chars to skip */
3445                 U8 *uc = ST.firstpos;
3446                 reg_trie_data * const trie
3447                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3448
3449                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3450                             >=  ST.firstchars);
3451                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3452                             - ST.firstchars;
3453
3454                 if (ST.longfold) {
3455                     /* the hard option - fold each char in turn and find
3456                      * its folded length (which may be different) */
3457                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3458                     STRLEN foldlen;
3459                     STRLEN len;
3460                     UV uvc;
3461                     U8 *uscan;
3462
3463                     while (chars) {
3464                         if (utf8_target) {
3465                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3466                                                     uniflags);
3467                             uc += len;
3468                         }
3469                         else {
3470                             uvc = *uc;
3471                             uc++;
3472                         }
3473                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3474                         uscan = foldbuf;
3475                         while (foldlen) {
3476                             if (!--chars)
3477                                 break;
3478                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3479                                             uniflags);
3480                             uscan += len;
3481                             foldlen -= len;
3482                         }
3483                     }
3484                 }
3485                 else {
3486                     if (utf8_target)
3487                         while (chars--)
3488                             uc += UTF8SKIP(uc);
3489                     else
3490                         uc += chars;
3491                 }
3492                 PL_reginput = (char *)uc;
3493             }
3494
3495             scan = (ST.jump && ST.jump[ST.nextword])
3496                         ? ST.me + ST.jump[ST.nextword]
3497                         : ST.B;
3498
3499             DEBUG_EXECUTE_r({
3500                 PerlIO_printf( Perl_debug_log,
3501                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3502                     REPORT_CODE_OFF+depth*2, "",
3503                     PL_colors[4],
3504                     ST.nextword,
3505                     PL_colors[5]
3506                     );
3507             });
3508
3509             if (ST.accepted > 1 || has_cutgroup) {
3510                 PUSH_STATE_GOTO(TRIE_next, scan);
3511                 /* NOTREACHED */
3512             }
3513             /* only one choice left - just continue */
3514             DEBUG_EXECUTE_r({
3515                 AV *const trie_words
3516                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3517                 SV ** const tmp = av_fetch( trie_words,
3518                     ST.nextword-1, 0 );
3519                 SV *sv= tmp ? sv_newmortal() : NULL;
3520
3521                 PerlIO_printf( Perl_debug_log,
3522                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3523                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3524                     ST.nextword,
3525                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3526                             PL_colors[0], PL_colors[1],
3527                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3528                         )
3529                     : "not compiled under -Dr",
3530                     PL_colors[5] );
3531             });
3532
3533             locinput = PL_reginput;
3534             nextchr = UCHARAT(locinput);
3535             continue; /* execute rest of RE */
3536             /* NOTREACHED */
3537 #undef  ST
3538
3539         case EXACT: {
3540             char *s = STRING(scan);
3541             ln = STR_LEN(scan);
3542             if (utf8_target != UTF_PATTERN) {
3543                 /* The target and the pattern have differing utf8ness. */
3544                 char *l = locinput;
3545                 const char * const e = s + ln;
3546
3547                 if (utf8_target) {
3548                     /* The target is utf8, the pattern is not utf8. */
3549                     while (s < e) {
3550                         STRLEN ulen;
3551                         if (l >= PL_regeol)
3552                              sayNO;
3553                         if (NATIVE_TO_UNI(*(U8*)s) !=
3554                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3555                                             uniflags))
3556                              sayNO;
3557                         l += ulen;
3558                         s ++;
3559                     }
3560                 }
3561                 else {
3562                     /* The target is not utf8, the pattern is utf8. */
3563                     while (s < e) {
3564                         STRLEN ulen;
3565                         if (l >= PL_regeol)
3566                             sayNO;
3567                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3568                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3569                                            uniflags))
3570                             sayNO;
3571                         s += ulen;
3572                         l ++;
3573                     }
3574                 }
3575                 locinput = l;
3576                 nextchr = UCHARAT(locinput);
3577                 break;
3578             }
3579             /* The target and the pattern have the same utf8ness. */
3580             /* Inline the first character, for speed. */
3581             if (UCHARAT(s) != nextchr)
3582                 sayNO;
3583             if (PL_regeol - locinput < ln)
3584                 sayNO;
3585             if (ln > 1 && memNE(s, locinput, ln))
3586                 sayNO;
3587             locinput += ln;
3588             nextchr = UCHARAT(locinput);
3589             break;
3590             }
3591         case EXACTFL: {
3592             re_fold_t folder;
3593             const U8 * fold_array;
3594             const char * s;
3595             U32 fold_utf8_flags;
3596
3597             PL_reg_flags |= RF_tainted;
3598             folder = foldEQ_locale;
3599             fold_array = PL_fold_locale;
3600             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3601             goto do_exactf;
3602
3603         case EXACTFU:
3604             folder = foldEQ_latin1;
3605             fold_array = PL_fold_latin1;
3606             fold_utf8_flags = 0;
3607             goto do_exactf;
3608
3609         case EXACTFA:
3610             folder = foldEQ_latin1;
3611             fold_array = PL_fold_latin1;
3612             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3613             goto do_exactf;
3614
3615         case EXACTF:
3616             folder = foldEQ;
3617             fold_array = PL_fold;
3618             fold_utf8_flags = 0;
3619
3620           do_exactf:
3621             s = STRING(scan);
3622             ln = STR_LEN(scan);
3623
3624             if (utf8_target || UTF_PATTERN) {
3625               /* Either the target or the pattern is utf8. */
3626                 const char * const l = locinput;
3627                 char *e = PL_regeol;
3628
3629                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3630                                l, &e, 0,  utf8_target, fold_utf8_flags)) {
3631                      /* One more case for the sharp s:
3632                       * pack("U0U*", 0xDF) =~ /ss/i,
3633                       * the 0xC3 0x9F are the UTF-8
3634                       * byte sequence for the U+00DF. */
3635
3636                      if (!(utf8_target &&
3637                            toLOWER(s[0]) == 's' &&
3638                            ln >= 2 &&
3639                            toLOWER(s[1]) == 's' &&
3640                            (U8)l[0] == 0xC3 &&
3641                            e - l >= 2 &&
3642                            (U8)l[1] == 0x9F))
3643                           sayNO;
3644                 }
3645                 locinput = e;
3646                 nextchr = UCHARAT(locinput);
3647                 break;
3648             }
3649
3650             /* Neither the target nor the pattern are utf8 */
3651             if (UCHARAT(s) != nextchr &&
3652                 UCHARAT(s) != fold_array[nextchr])
3653             {
3654                 sayNO;
3655             }
3656             if (PL_regeol - locinput < ln)
3657                 sayNO;
3658             if (ln > 1 && ! folder(s, locinput, ln))
3659                 sayNO;
3660             locinput += ln;
3661             nextchr = UCHARAT(locinput);
3662             break;
3663         }
3664
3665         /* XXX Could improve efficiency by separating these all out using a
3666          * macro or in-line function.  At that point regcomp.c would no longer
3667          * have to set the FLAGS fields of these */
3668         case BOUNDL:
3669         case NBOUNDL:
3670             PL_reg_flags |= RF_tainted;
3671             /* FALL THROUGH */
3672         case BOUND:
3673         case BOUNDU:
3674         case BOUNDA:
3675         case NBOUND:
3676         case NBOUNDU:
3677         case NBOUNDA:
3678             /* was last char in word? */
3679             if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3680                 if (locinput == PL_bostr)
3681                     ln = '\n';
3682                 else {
3683                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3684
3685                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3686                 }
3687                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3688                     ln = isALNUM_uni(ln);
3689                     LOAD_UTF8_CHARCLASS_ALNUM();
3690                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3691                 }
3692                 else {
3693                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3694                     n = isALNUM_LC_utf8((U8*)locinput);
3695                 }
3696             }
3697             else {
3698
3699                 /* Here the string isn't utf8, or is utf8 and only ascii
3700                  * characters are to match \w.  In the latter case looking at
3701                  * the byte just prior to the current one may be just the final
3702                  * byte of a multi-byte character.  This is ok.  There are two
3703                  * cases:
3704                  * 1) it is a single byte character, and then the test is doing
3705                  *      just what it's supposed to.
3706                  * 2) it is a multi-byte character, in which case the final
3707                  *      byte is never mistakable for ASCII, and so the test
3708                  *      will say it is not a word character, which is the
3709                  *      correct answer. */
3710                 ln = (locinput != PL_bostr) ?
3711                     UCHARAT(locinput - 1) : '\n';
3712                 switch (FLAGS(scan)) {
3713                     case REGEX_UNICODE_CHARSET:
3714                         ln = isWORDCHAR_L1(ln);
3715                         n = isWORDCHAR_L1(nextchr);
3716                         break;
3717                     case REGEX_LOCALE_CHARSET:
3718                         ln = isALNUM_LC(ln);
3719                         n = isALNUM_LC(nextchr);
3720                         break;
3721                     case REGEX_DEPENDS_CHARSET:
3722                         ln = isALNUM(ln);
3723                         n = isALNUM(nextchr);
3724                         break;
3725                     case REGEX_ASCII_RESTRICTED_CHARSET:
3726                         ln = isWORDCHAR_A(ln);
3727                         n = isWORDCHAR_A(nextchr);
3728                         break;
3729                     default:
3730                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3731                         break;
3732                 }
3733             }
3734             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3735              * regcomp.sym */
3736             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3737                     sayNO;
3738             break;
3739         case ANYOFV:
3740         case ANYOF:
3741             if (utf8_target || state_num == ANYOFV) {
3742                 STRLEN inclasslen = PL_regeol - locinput;
3743                 if (locinput >= PL_regeol)
3744                     sayNO;
3745
3746                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3747                     sayNO;
3748                 locinput += inclasslen;
3749                 nextchr = UCHARAT(locinput);
3750                 break;
3751             }
3752             else {
3753                 if (nextchr < 0)
3754                     nextchr = UCHARAT(locinput);
3755                 if (!nextchr && locinput >= PL_regeol)
3756                     sayNO;
3757                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3758                     sayNO;
3759                 nextchr = UCHARAT(++locinput);
3760                 break;
3761             }
3762             break;
3763         /* Special char classes - The defines start on line 129 or so */
3764         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3765                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3766                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3767                   ALNUMA, NALNUMA, isWORDCHAR_A,
3768                   alnum, "a");
3769
3770         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3771                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3772                   SPACEU, NSPACEU, isSPACE_L1,
3773                   SPACEA, NSPACEA, isSPACE_A,
3774                   space, " ");
3775
3776         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3777                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3778                 DIGITA, NDIGITA, isDIGIT_A,
3779                 digit, "0");
3780
3781         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3782                        a Unicode extended Grapheme Cluster */
3783             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3784               extended Grapheme Cluster is:
3785
3786                CR LF
3787                | Prepend* Begin Extend*
3788                | .
3789
3790                Begin is (Hangul-syllable | ! Control)
3791                Extend is (Grapheme_Extend | Spacing_Mark)
3792                Control is [ GCB_Control CR LF ]
3793
3794                The discussion below shows how the code for CLUMP is derived
3795                from this regex.  Note that most of these concepts are from
3796                property values of the Grapheme Cluster Boundary (GCB) property.
3797                No code point can have multiple property values for a given
3798                property.  Thus a code point in Prepend can't be in Control, but
3799                it must be in !Control.  This is why Control above includes
3800                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3801                property separately, and so can't be in GCB_Control, even though
3802                they logically are controls.  Control is not the same as gc=cc,
3803                but includes format and other characters as well.
3804
3805                The Unicode definition of Hangul-syllable is:
3806                   (L+
3807                    | (L* ( ( V | LV ) V* | LVT ) T*)
3808                    | T+
3809                   )
3810                Each of these is a value for the GCB property, and hence must be
3811                disjoint, so the order they are tested is immaterial, so the
3812                above can safely be changed to
3813                    T+
3814                    | L+
3815                    | (L* ( LVT | ( V | LV ) V*) T*)
3816
3817                The last two terms can be combined like this:
3818                    L* ( L
3819                         | (( LVT | ( V | LV ) V*) T*))
3820
3821                And refactored into this:
3822                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3823
3824                That means that if we have seen any L's at all we can quit
3825                there, but if the next character is an LVT, a V or an LV we
3826                should keep going.
3827
3828                There is a subtlety with Prepend* which showed up in testing.
3829                Note that the Begin, and only the Begin is required in:
3830                 | Prepend* Begin Extend*
3831                Also, Begin contains '! Control'.  A Prepend must be a '!
3832                Control', which means it must be a Begin.  What it comes down to
3833                is that if we match Prepend* and then find no suitable Begin
3834                afterwards, that if we backtrack the last Prepend, that one will
3835                be a suitable Begin.
3836             */
3837
3838             if (locinput >= PL_regeol)
3839                 sayNO;
3840             if  (! utf8_target) {
3841
3842                 /* Match either CR LF  or '.', as all the other possibilities
3843                  * require utf8 */
3844                 locinput++;         /* Match the . or CR */
3845                 if (nextchr == '\r'
3846                     && locinput < PL_regeol
3847                     && UCHARAT(locinput) == '\n') locinput++;
3848             }
3849             else {
3850
3851                 /* Utf8: See if is ( CR LF ); already know that locinput <
3852                  * PL_regeol, so locinput+1 is in bounds */
3853                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3854                     locinput += 2;
3855                 }
3856                 else {
3857                     /* In case have to backtrack to beginning, then match '.' */
3858                     char *starting = locinput;
3859
3860                     /* In case have to backtrack the last prepend */
3861                     char *previous_prepend = 0;
3862
3863                     LOAD_UTF8_CHARCLASS_GCB();
3864
3865                     /* Match (prepend)* */
3866                     while (locinput < PL_regeol
3867                            && swash_fetch(PL_utf8_X_prepend,
3868                                           (U8*)locinput, utf8_target))
3869                     {
3870                         previous_prepend = locinput;
3871                         locinput += UTF8SKIP(locinput);
3872                     }
3873
3874                     /* As noted above, if we matched a prepend character, but
3875                      * the next thing won't match, back off the last prepend we
3876                      * matched, as it is guaranteed to match the begin */
3877                     if (previous_prepend
3878                         && (locinput >=  PL_regeol
3879                             || ! swash_fetch(PL_utf8_X_begin,
3880                                              (U8*)locinput, utf8_target)))
3881                     {
3882                         locinput = previous_prepend;
3883                     }
3884
3885                     /* Note that here we know PL_regeol > locinput, as we
3886                      * tested that upon input to this switch case, and if we
3887                      * moved locinput forward, we tested the result just above
3888                      * and it either passed, or we backed off so that it will
3889                      * now pass */
3890                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3891
3892                         /* Here did not match the required 'Begin' in the
3893                          * second term.  So just match the very first
3894                          * character, the '.' of the final term of the regex */
3895                         locinput = starting + UTF8SKIP(starting);
3896                     } else {
3897
3898                         /* Here is the beginning of a character that can have
3899                          * an extender.  It is either a hangul syllable, or a
3900                          * non-control */
3901                         if (swash_fetch(PL_utf8_X_non_hangul,
3902                                         (U8*)locinput, utf8_target))
3903                         {
3904
3905                             /* Here not a Hangul syllable, must be a
3906                               * ('! Control') */
3907                             locinput += UTF8SKIP(locinput);
3908                         } else {
3909
3910                             /* Here is a Hangul syllable.  It can be composed
3911                              * of several individual characters.  One
3912                              * possibility is T+ */
3913                             if (swash_fetch(PL_utf8_X_T,
3914                                             (U8*)locinput, utf8_target))
3915                             {
3916                                 while (locinput < PL_regeol
3917                                         && swash_fetch(PL_utf8_X_T,
3918                                                         (U8*)locinput, utf8_target))
3919                                 {
3920                                     locinput += UTF8SKIP(locinput);
3921                                 }
3922                             } else {
3923
3924                                 /* Here, not T+, but is a Hangul.  That means
3925                                  * it is one of the others: L, LV, LVT or V,
3926                                  * and matches:
3927                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3928
3929                                 /* Match L*           */
3930                                 while (locinput < PL_regeol
3931                                         && swash_fetch(PL_utf8_X_L,
3932                                                         (U8*)locinput, utf8_target))
3933                                 {
3934                                     locinput += UTF8SKIP(locinput);
3935                                 }
3936
3937                                 /* Here, have exhausted L*.  If the next
3938                                  * character is not an LV, LVT nor V, it means
3939                                  * we had to have at least one L, so matches L+
3940                                  * in the original equation, we have a complete
3941                                  * hangul syllable.  Are done. */
3942
3943                                 if (locinput < PL_regeol
3944                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3945                                                     (U8*)locinput, utf8_target))
3946                                 {
3947
3948                                     /* Otherwise keep going.  Must be LV, LVT
3949                                      * or V.  See if LVT */
3950                                     if (swash_fetch(PL_utf8_X_LVT,
3951                                                     (U8*)locinput, utf8_target))
3952                                     {
3953                                         locinput += UTF8SKIP(locinput);
3954                                     } else {
3955
3956                                         /* Must be  V or LV.  Take it, then
3957                                          * match V*     */
3958                                         locinput += UTF8SKIP(locinput);
3959                                         while (locinput < PL_regeol
3960                                                 && swash_fetch(PL_utf8_X_V,
3961                                                          (U8*)locinput, utf8_target))
3962                                         {
3963                                             locinput += UTF8SKIP(locinput);
3964                                         }
3965                                     }
3966
3967                                     /* And any of LV, LVT, or V can be followed
3968                                      * by T*            */
3969                                     while (locinput < PL_regeol
3970                                            && swash_fetch(PL_utf8_X_T,
3971                                                            (U8*)locinput,
3972                                                            utf8_target))
3973                                     {
3974                                         locinput += UTF8SKIP(locinput);
3975                                     }
3976                                 }
3977                             }
3978                         }
3979
3980                         /* Match any extender */
3981                         while (locinput < PL_regeol
3982                                 && swash_fetch(PL_utf8_X_extend,
3983                                                 (U8*)locinput, utf8_target))
3984                         {
3985                             locinput += UTF8SKIP(locinput);
3986                         }
3987                     }
3988                 }
3989                 if (locinput > PL_regeol) sayNO;
3990             }
3991             nextchr = UCHARAT(locinput);
3992             break;
3993
3994         case NREFFL:
3995         {   /* The capture buffer cases.  The ones beginning with N for the
3996                named buffers just convert to the equivalent numbered and
3997                pretend they were called as the corresponding numbered buffer
3998                op.  */
3999             /* don't initialize these in the declaration, it makes C++
4000                unhappy */
4001             char *s;
4002             char type;
4003             re_fold_t folder;
4004             const U8 *fold_array;
4005             UV utf8_fold_flags;
4006
4007             PL_reg_flags |= RF_tainted;
4008             folder = foldEQ_locale;
4009             fold_array = PL_fold_locale;
4010             type = REFFL;
4011             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4012             goto do_nref;
4013
4014         case NREFFA:
4015             folder = foldEQ_latin1;
4016             fold_array = PL_fold_latin1;
4017             type = REFFA;
4018             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4019             goto do_nref;
4020
4021         case NREFFU:
4022             folder = foldEQ_latin1;
4023             fold_array = PL_fold_latin1;
4024             type = REFFU;
4025             utf8_fold_flags = 0;
4026             goto do_nref;
4027
4028         case NREFF:
4029             folder = foldEQ;
4030             fold_array = PL_fold;
4031             type = REFF;
4032             utf8_fold_flags = 0;
4033             goto do_nref;
4034
4035         case NREF:
4036             type = REF;
4037             folder = NULL;
4038             fold_array = NULL;
4039             utf8_fold_flags = 0;
4040           do_nref:
4041
4042             /* For the named back references, find the corresponding buffer
4043              * number */
4044             n = reg_check_named_buff_matched(rex,scan);
4045
4046             if ( ! n ) {
4047                 sayNO;
4048             }
4049             goto do_nref_ref_common;
4050
4051         case REFFL:
4052             PL_reg_flags |= RF_tainted;
4053             folder = foldEQ_locale;
4054             fold_array = PL_fold_locale;
4055             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4056             goto do_ref;
4057
4058         case REFFA:
4059             folder = foldEQ_latin1;
4060             fold_array = PL_fold_latin1;
4061             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4062             goto do_ref;
4063
4064         case REFFU:
4065             folder = foldEQ_latin1;
4066             fold_array = PL_fold_latin1;
4067             utf8_fold_flags = 0;
4068             goto do_ref;
4069
4070         case REFF:
4071             folder = foldEQ;
4072             fold_array = PL_fold;
4073             utf8_fold_flags = 0;
4074             goto do_ref;
4075
4076         case REF:
4077             folder = NULL;
4078             fold_array = NULL;
4079             utf8_fold_flags = 0;
4080
4081           do_ref:
4082             type = OP(scan);
4083             n = ARG(scan);  /* which paren pair */
4084
4085           do_nref_ref_common:
4086             ln = PL_regoffs[n].start;
4087             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4088             if (*PL_reglastparen < n || ln == -1)
4089                 sayNO;                  /* Do not match unless seen CLOSEn. */
4090             if (ln == PL_regoffs[n].end)
4091                 break;
4092
4093             s = PL_bostr + ln;
4094             if (type != REF     /* REF can do byte comparison */
4095                 && (utf8_target || type == REFFU))
4096             { /* XXX handle REFFL better */
4097                 char * limit = PL_regeol;
4098
4099                 /* This call case insensitively compares the entire buffer
4100                     * at s, with the current input starting at locinput, but
4101                     * not going off the end given by PL_regeol, and returns in
4102                     * limit upon success, how much of the current input was
4103                     * matched */
4104                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4105                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4106                 {
4107                     sayNO;
4108                 }
4109                 locinput = limit;
4110                 nextchr = UCHARAT(locinput);
4111                 break;
4112             }
4113
4114             /* Not utf8:  Inline the first character, for speed. */
4115             if (UCHARAT(s) != nextchr &&
4116                 (type == REF ||
4117                  UCHARAT(s) != fold_array[nextchr]))
4118                 sayNO;
4119             ln = PL_regoffs[n].end - ln;
4120             if (locinput + ln > PL_regeol)
4121                 sayNO;
4122             if (ln > 1 && (type == REF
4123                            ? memNE(s, locinput, ln)
4124                            : ! folder(s, locinput, ln)))
4125                 sayNO;
4126             locinput += ln;
4127             nextchr = UCHARAT(locinput);
4128             break;
4129         }
4130         case NOTHING:
4131         case TAIL:
4132             break;
4133         case BACK:
4134             break;
4135
4136 #undef  ST
4137 #define ST st->u.eval
4138         {
4139             SV *ret;
4140             REGEXP *re_sv;
4141             regexp *re;
4142             regexp_internal *rei;
4143             regnode *startpoint;
4144
4145         case GOSTART:
4146         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4147             if (cur_eval && cur_eval->locinput==locinput) {
4148                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4149                     Perl_croak(aTHX_ "Infinite recursion in regex");
4150                 if ( ++nochange_depth > max_nochange_depth )
4151                     Perl_croak(aTHX_
4152                         "Pattern subroutine nesting without pos change"
4153                         " exceeded limit in regex");
4154             } else {
4155                 nochange_depth = 0;
4156             }
4157             re_sv = rex_sv;
4158             re = rex;
4159             rei = rexi;
4160             (void)ReREFCNT_inc(rex_sv);
4161             if (OP(scan)==GOSUB) {
4162                 startpoint = scan + ARG2L(scan);
4163                 ST.close_paren = ARG(scan);
4164             } else {
4165                 startpoint = rei->program+1;
4166                 ST.close_paren = 0;
4167             }
4168             goto eval_recurse_doit;
4169             /* NOTREACHED */
4170         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4171             if (cur_eval && cur_eval->locinput==locinput) {
4172                 if ( ++nochange_depth > max_nochange_depth )
4173                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4174             } else {
4175                 nochange_depth = 0;
4176             }
4177             {
4178                 /* execute the code in the {...} */
4179                 dSP;
4180                 SV ** const before = SP;
4181                 OP_4tree * const oop = PL_op;
4182                 COP * const ocurcop = PL_curcop;
4183                 PAD *old_comppad;
4184                 char *saved_regeol = PL_regeol;
4185                 struct re_save_state saved_state;
4186
4187                 /* To not corrupt the existing regex state while executing the
4188                  * eval we would normally put it on the save stack, like with
4189                  * save_re_context. However, re-evals have a weird scoping so we
4190                  * can't just add ENTER/LEAVE here. With that, things like
4191                  *
4192                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4193                  *
4194                  * would break, as they expect the localisation to be unwound
4195                  * only when the re-engine backtracks through the bit that
4196                  * localised it.
4197                  *
4198                  * What we do instead is just save the state in a local C
4199                  * variable.
4200                  */
4201                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4202
4203                 n = ARG(scan);
4204                 PL_op = (OP_4tree*)rexi->data->data[n];
4205                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4206                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4207                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4208                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4209
4210                 if (sv_yes_mark) {
4211                     SV *sv_mrk = get_sv("REGMARK", 1);
4212                     sv_setsv(sv_mrk, sv_yes_mark);
4213                 }
4214
4215                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4216                 SPAGAIN;
4217                 if (SP == before)
4218                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4219                 else {
4220                     ret = POPs;
4221                     PUTBACK;
4222                 }
4223
4224                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4225
4226                 PL_op = oop;
4227                 PAD_RESTORE_LOCAL(old_comppad);
4228                 PL_curcop = ocurcop;
4229                 PL_regeol = saved_regeol;
4230                 if (!logical) {
4231                     /* /(?{...})/ */
4232                     sv_setsv(save_scalar(PL_replgv), ret);
4233                     break;
4234                 }
4235             }
4236             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4237                 logical = 0;
4238                 {
4239                     /* extract RE object from returned value; compiling if
4240                      * necessary */
4241                     MAGIC *mg = NULL;
4242                     REGEXP *rx = NULL;
4243
4244                     if (SvROK(ret)) {
4245                         SV *const sv = SvRV(ret);
4246
4247                         if (SvTYPE(sv) == SVt_REGEXP) {
4248                             rx = (REGEXP*) sv;
4249                         } else if (SvSMAGICAL(sv)) {
4250                             mg = mg_find(sv, PERL_MAGIC_qr);
4251                             assert(mg);
4252                         }
4253                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4254                         rx = (REGEXP*) ret;
4255                     } else if (SvSMAGICAL(ret)) {
4256                         if (SvGMAGICAL(ret)) {
4257                             /* I don't believe that there is ever qr magic
4258                                here.  */
4259                             assert(!mg_find(ret, PERL_MAGIC_qr));
4260                             sv_unmagic(ret, PERL_MAGIC_qr);
4261                         }
4262                         else {
4263                             mg = mg_find(ret, PERL_MAGIC_qr);
4264                             /* testing suggests mg only ends up non-NULL for
4265                                scalars that were upgraded and compiled in the
4266                                else block below. In turn, this is only
4267                                triggered in the "postponed utf8 string" tests
4268                                in t/op/pat.t  */
4269                         }
4270                     }
4271
4272                     if (mg) {
4273                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4274                         assert(rx);
4275                     }
4276                     if (rx) {
4277                         rx = reg_temp_copy(NULL, rx);
4278                     }
4279                     else {
4280                         U32 pm_flags = 0;
4281                         const I32 osize = PL_regsize;
4282
4283                         if (DO_UTF8(ret)) {
4284                             assert (SvUTF8(ret));
4285                         } else if (SvUTF8(ret)) {
4286                             /* Not doing UTF-8, despite what the SV says. Is
4287                                this only if we're trapped in use 'bytes'?  */
4288                             /* Make a copy of the octet sequence, but without
4289                                the flag on, as the compiler now honours the
4290                                SvUTF8 flag on ret.  */
4291                             STRLEN len;
4292                             const char *const p = SvPV(ret, len);
4293                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4294                         }
4295                         rx = CALLREGCOMP(ret, pm_flags);
4296                         if (!(SvFLAGS(ret)
4297                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4298                                  | SVs_GMG))) {
4299                             /* This isn't a first class regexp. Instead, it's
4300                                caching a regexp onto an existing, Perl visible
4301                                scalar.  */
4302                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4303                         }
4304                         PL_regsize = osize;
4305                     }
4306                     re_sv = rx;
4307                     re = (struct regexp *)SvANY(rx);
4308                 }
4309                 RXp_MATCH_COPIED_off(re);
4310                 re->subbeg = rex->subbeg;
4311                 re->sublen = rex->sublen;
4312                 rei = RXi_GET(re);
4313                 DEBUG_EXECUTE_r(
4314                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4315                         "Matching embedded");
4316                 );
4317                 startpoint = rei->program + 1;
4318                 ST.close_paren = 0; /* only used for GOSUB */
4319                 /* borrowed from regtry */
4320                 if (PL_reg_start_tmpl <= re->nparens) {
4321                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4322                     if(PL_reg_start_tmp)
4323                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4324                     else
4325                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4326                 }
4327
4328         eval_recurse_doit: /* Share code with GOSUB below this line */
4329                 /* run the pattern returned from (??{...}) */
4330                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4331                 REGCP_SET(ST.lastcp);
4332
4333                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4334
4335                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (I don't know why) :dmq */
4336                 PL_reglastparen = &re->lastparen;
4337                 PL_reglastcloseparen = &re->lastcloseparen;
4338                 re->lastparen = 0;
4339                 re->lastcloseparen = 0;
4340
4341                 PL_reginput = locinput;
4342                 PL_regsize = 0;
4343
4344                 /* XXXX This is too dramatic a measure... */
4345                 PL_reg_maxiter = 0;
4346
4347                 ST.toggle_reg_flags = PL_reg_flags;
4348                 if (RX_UTF8(re_sv))
4349                     PL_reg_flags |= RF_utf8;
4350                 else
4351                     PL_reg_flags &= ~RF_utf8;
4352                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4353
4354                 ST.prev_rex = rex_sv;
4355                 ST.prev_curlyx = cur_curlyx;
4356                 SETREX(rex_sv,re_sv);
4357                 rex = re;
4358                 rexi = rei;
4359                 cur_curlyx = NULL;
4360                 ST.B = next;
4361                 ST.prev_eval = cur_eval;
4362                 cur_eval = st;
4363                 /* now continue from first node in postponed RE */
4364                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4365                 /* NOTREACHED */
4366             }
4367             /* logical is 1,   /(?(?{...})X|Y)/ */
4368             sw = cBOOL(SvTRUE(ret));
4369             logical = 0;
4370             break;
4371         }
4372
4373         case EVAL_AB: /* cleanup after a successful (??{A})B */
4374             /* note: this is called twice; first after popping B, then A */
4375             PL_reg_flags ^= ST.toggle_reg_flags;
4376             ReREFCNT_dec(rex_sv);
4377             SETREX(rex_sv,ST.prev_rex);
4378             rex = (struct regexp *)SvANY(rex_sv);
4379             rexi = RXi_GET(rex);
4380             regcpblow(ST.cp);
4381             cur_eval = ST.prev_eval;
4382             cur_curlyx = ST.prev_curlyx;
4383
4384             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4385             PL_reglastparen = &rex->lastparen;
4386             PL_reglastcloseparen = &rex->lastcloseparen;
4387             /* also update PL_regoffs */
4388             PL_regoffs = rex->offs;
4389
4390             /* XXXX This is too dramatic a measure... */
4391             PL_reg_maxiter = 0;
4392             if ( nochange_depth )
4393                 nochange_depth--;
4394             sayYES;
4395
4396
4397         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4398             /* note: this is called twice; first after popping B, then A */
4399             PL_reg_flags ^= ST.toggle_reg_flags;
4400             ReREFCNT_dec(rex_sv);
4401             SETREX(rex_sv,ST.prev_rex);
4402             rex = (struct regexp *)SvANY(rex_sv);
4403             rexi = RXi_GET(rex);
4404             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4405             PL_reglastparen = &rex->lastparen;
4406             PL_reglastcloseparen = &rex->lastcloseparen;
4407
4408             PL_reginput = locinput;
4409             REGCP_UNWIND(ST.lastcp);
4410             regcppop(rex);
4411             cur_eval = ST.prev_eval;
4412             cur_curlyx = ST.prev_curlyx;
4413             /* XXXX This is too dramatic a measure... */
4414             PL_reg_maxiter = 0;
4415             if ( nochange_depth )
4416                 nochange_depth--;
4417             sayNO_SILENT;
4418 #undef ST
4419
4420         case OPEN:
4421             n = ARG(scan);  /* which paren pair */
4422             PL_reg_start_tmp[n] = locinput;
4423             if (n > PL_regsize)
4424                 PL_regsize = n;
4425             lastopen = n;
4426             break;
4427         case CLOSE:
4428             n = ARG(scan);  /* which paren pair */
4429             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4430             PL_regoffs[n].end = locinput - PL_bostr;
4431             /*if (n > PL_regsize)
4432                 PL_regsize = n;*/
4433             if (n > *PL_reglastparen)
4434                 *PL_reglastparen = n;
4435             *PL_reglastcloseparen = n;
4436             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4437                 goto fake_end;
4438             }
4439             break;
4440         case ACCEPT:
4441             if (ARG(scan)){
4442                 regnode *cursor;
4443                 for (cursor=scan;
4444                      cursor && OP(cursor)!=END;
4445                      cursor=regnext(cursor))
4446                 {
4447                     if ( OP(cursor)==CLOSE ){
4448                         n = ARG(cursor);
4449                         if ( n <= lastopen ) {
4450                             PL_regoffs[n].start
4451                                 = PL_reg_start_tmp[n] - PL_bostr;
4452                             PL_regoffs[n].end = locinput - PL_bostr;
4453                             /*if (n > PL_regsize)
4454                             PL_regsize = n;*/
4455                             if (n > *PL_reglastparen)
4456                                 *PL_reglastparen = n;
4457                             *PL_reglastcloseparen = n;
4458                             if ( n == ARG(scan) || (cur_eval &&
4459                                 cur_eval->u.eval.close_paren == n))
4460                                 break;
4461                         }
4462                     }
4463                 }
4464             }
4465             goto fake_end;
4466             /*NOTREACHED*/
4467         case GROUPP:
4468             n = ARG(scan);  /* which paren pair */
4469             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4470             break;
4471         case NGROUPP:
4472             /* reg_check_named_buff_matched returns 0 for no match */
4473             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4474             break;
4475         case INSUBP:
4476             n = ARG(scan);
4477             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4478             break;
4479         case DEFINEP:
4480             sw = 0;
4481             break;
4482         case IFTHEN:
4483             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4484             if (sw)
4485                 next = NEXTOPER(NEXTOPER(scan));
4486             else {
4487                 next = scan + ARG(scan);
4488                 if (OP(next) == IFTHEN) /* Fake one. */
4489                     next = NEXTOPER(NEXTOPER(next));
4490             }
4491             break;
4492         case LOGICAL:
4493             logical = scan->flags;
4494             break;
4495
4496 /*******************************************************************
4497
4498 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4499 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4500 STAR/PLUS/CURLY/CURLYN are used instead.)
4501
4502 A*B is compiled as <CURLYX><A><WHILEM><B>
4503
4504 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4505 state, which contains the current count, initialised to -1. It also sets
4506 cur_curlyx to point to this state, with any previous value saved in the
4507 state block.
4508
4509 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4510 since the pattern may possibly match zero times (i.e. it's a while {} loop
4511 rather than a do {} while loop).
4512
4513 Each entry to WHILEM represents a successful match of A. The count in the
4514 CURLYX block is incremented, another WHILEM state is pushed, and execution
4515 passes to A or B depending on greediness and the current count.
4516
4517 For example, if matching against the string a1a2a3b (where the aN are
4518 substrings that match /A/), then the match progresses as follows: (the
4519 pushed states are interspersed with the bits of strings matched so far):
4520
4521     <CURLYX cnt=-1>
4522     <CURLYX cnt=0><WHILEM>
4523     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4524     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4525     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4526     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4527
4528 (Contrast this with something like CURLYM, which maintains only a single
4529 backtrack state:
4530
4531     <CURLYM cnt=0> a1
4532     a1 <CURLYM cnt=1> a2
4533     a1 a2 <CURLYM cnt=2> a3
4534     a1 a2 a3 <CURLYM cnt=3> b
4535 )
4536
4537 Each WHILEM state block marks a point to backtrack to upon partial failure
4538 of A or B, and also contains some minor state data related to that
4539 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4540 overall state, such as the count, and pointers to the A and B ops.
4541
4542 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4543 must always point to the *current* CURLYX block, the rules are:
4544
4545 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4546 and set cur_curlyx to point to the new block.
4547
4548 When popping the CURLYX block after a successful or unsuccessful match,
4549 restore the previous cur_curlyx.
4550
4551 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4552 to the outer one saved in the CURLYX block.
4553
4554 When popping the WHILEM block after a successful or unsuccessful B match,
4555 restore the previous cur_curlyx.
4556
4557 Here's an example for the pattern (AI* BI)*BO
4558 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4559
4560 cur_
4561 curlyx backtrack stack
4562 ------ ---------------
4563 NULL
4564 CO     <CO prev=NULL> <WO>
4565 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4566 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4567 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4568
4569 At this point the pattern succeeds, and we work back down the stack to
4570 clean up, restoring as we go:
4571
4572 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4573 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4574 CO     <CO prev=NULL> <WO>
4575 NULL
4576
4577 *******************************************************************/
4578
4579 #define ST st->u.curlyx
4580
4581         case CURLYX:    /* start of /A*B/  (for complex A) */
4582         {
4583             /* No need to save/restore up to this paren */
4584             I32 parenfloor = scan->flags;
4585
4586             assert(next); /* keep Coverity happy */
4587             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4588                 next += ARG(next);
4589
4590             /* XXXX Probably it is better to teach regpush to support
4591                parenfloor > PL_regsize... */
4592             if (parenfloor > (I32)*PL_reglastparen)
4593                 parenfloor = *PL_reglastparen; /* Pessimization... */
4594
4595             ST.prev_curlyx= cur_curlyx;
4596             cur_curlyx = st;
4597             ST.cp = PL_savestack_ix;
4598
4599             /* these fields contain the state of the current curly.
4600              * they are accessed by subsequent WHILEMs */
4601             ST.parenfloor = parenfloor;
4602             ST.me = scan;
4603             ST.B = next;
4604             ST.minmod = minmod;
4605             minmod = 0;
4606             ST.count = -1;      /* this will be updated by WHILEM */
4607             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4608
4609             PL_reginput = locinput;
4610             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4611             /* NOTREACHED */
4612         }
4613
4614         case CURLYX_end: /* just finished matching all of A*B */
4615             cur_curlyx = ST.prev_curlyx;
4616             sayYES;
4617             /* NOTREACHED */
4618
4619         case CURLYX_end_fail: /* just failed to match all of A*B */
4620             regcpblow(ST.cp);
4621             cur_curlyx = ST.prev_curlyx;
4622             sayNO;
4623             /* NOTREACHED */
4624
4625
4626 #undef ST
4627 #define ST st->u.whilem
4628
4629         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4630         {
4631             /* see the discussion above about CURLYX/WHILEM */
4632             I32 n;
4633             int min = ARG1(cur_curlyx->u.curlyx.me);
4634             int max = ARG2(cur_curlyx->u.curlyx.me);
4635             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4636
4637             assert(cur_curlyx); /* keep Coverity happy */
4638             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4639             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4640             ST.cache_offset = 0;
4641             ST.cache_mask = 0;
4642
4643             PL_reginput = locinput;
4644
4645             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4646                   "%*s  whilem: matched %ld out of %d..%d\n",
4647                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4648             );
4649
4650             /* First just match a string of min A's. */
4651
4652             if (n < min) {
4653                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4654                 cur_curlyx->u.curlyx.lastloc = locinput;
4655                 REGCP_SET(ST.lastcp);
4656
4657                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4658                 /* NOTREACHED */
4659             }
4660
4661             /* If degenerate A matches "", assume A done. */
4662
4663             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4664                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4665                    "%*s  whilem: empty match detected, trying continuation...\n",
4666                    REPORT_CODE_OFF+depth*2, "")
4667                 );
4668                 goto do_whilem_B_max;
4669             }
4670
4671             /* super-linear cache processing */
4672
4673             if (scan->flags) {
4674
4675                 if (!PL_reg_maxiter) {
4676                     /* start the countdown: postpone detection until we
4677                      * know the match really is worse than linear. */
4678                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4679                     /* possible overflow for long strings and many CURLYX's */
4680                     if (PL_reg_maxiter < 0)
4681                         PL_reg_maxiter = I32_MAX;
4682                     PL_reg_leftiter = PL_reg_maxiter;
4683                 }
4684
4685                 if (PL_reg_leftiter-- == 0) {
4686                     /* initialise cache */
4687                     const I32 size = (PL_reg_maxiter + 7)/8;
4688                     if (PL_reg_poscache) {
4689                         if ((I32)PL_reg_poscache_size < size) {
4690                             Renew(PL_reg_poscache, size, char);
4691                             PL_reg_poscache_size = size;
4692                         }
4693                         Zero(PL_reg_poscache, size, char);
4694                     }
4695                     else {
4696                         PL_reg_poscache_size = size;
4697                         Newxz(PL_reg_poscache, size, char);
4698                     }
4699                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4700       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4701                               PL_colors[4], PL_colors[5])
4702                     );
4703                 }
4704
4705                 if (PL_reg_leftiter < 0) {
4706                     /* have we already failed at this position? */
4707                     I32 offset, mask;
4708                     offset  = (scan->flags & 0xf) - 1
4709                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4710                     mask    = 1 << (offset % 8);
4711                     offset /= 8;
4712                     if (PL_reg_poscache[offset] & mask) {
4713                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4714                             "%*s  whilem: (cache) already tried at this position...\n",
4715                             REPORT_CODE_OFF+depth*2, "")
4716                         );
4717                         sayNO; /* cache records failure */
4718                     }
4719                     ST.cache_offset = offset;
4720                     ST.cache_mask   = mask;
4721                 }
4722             }
4723
4724             /* Prefer B over A for minimal matching. */
4725
4726             if (cur_curlyx->u.curlyx.minmod) {
4727                 ST.save_curlyx = cur_curlyx;
4728                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4729                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4730                 REGCP_SET(ST.lastcp);
4731                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4732                 /* NOTREACHED */
4733             }
4734
4735             /* Prefer A over B for maximal matching. */
4736
4737             if (n < max) { /* More greed allowed? */
4738                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4739                 cur_curlyx->u.curlyx.lastloc = locinput;
4740                 REGCP_SET(ST.lastcp);
4741                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4742                 /* NOTREACHED */
4743             }
4744             goto do_whilem_B_max;
4745         }
4746         /* NOTREACHED */
4747
4748         case WHILEM_B_min: /* just matched B in a minimal match */
4749         case WHILEM_B_max: /* just matched B in a maximal match */
4750             cur_curlyx = ST.save_curlyx;
4751             sayYES;
4752             /* NOTREACHED */
4753
4754         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4755             cur_curlyx = ST.save_curlyx;
4756             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4757             cur_curlyx->u.curlyx.count--;
4758             CACHEsayNO;
4759             /* NOTREACHED */
4760
4761         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4762             /* FALL THROUGH */
4763         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4764             REGCP_UNWIND(ST.lastcp);
4765             regcppop(rex);
4766             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4767             cur_curlyx->u.curlyx.count--;
4768             CACHEsayNO;
4769             /* NOTREACHED */
4770
4771         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4772             REGCP_UNWIND(ST.lastcp);
4773             regcppop(rex);      /* Restore some previous $<digit>s? */
4774             PL_reginput = locinput;
4775             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4776                 "%*s  whilem: failed, trying continuation...\n",
4777                 REPORT_CODE_OFF+depth*2, "")
4778             );
4779           do_whilem_B_max:
4780             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4781                 && ckWARN(WARN_REGEXP)
4782                 && !(PL_reg_flags & RF_warned))
4783             {
4784                 PL_reg_flags |= RF_warned;
4785                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4786                      "Complex regular subexpression recursion",
4787                      REG_INFTY - 1);
4788             }
4789
4790             /* now try B */
4791             ST.save_curlyx = cur_curlyx;
4792             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4793             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4794             /* NOTREACHED */
4795
4796         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4797             cur_curlyx = ST.save_curlyx;
4798             REGCP_UNWIND(ST.lastcp);
4799             regcppop(rex);
4800
4801             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4802                 /* Maximum greed exceeded */
4803                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4804                     && ckWARN(WARN_REGEXP)
4805                     && !(PL_reg_flags & RF_warned))
4806                 {
4807                     PL_reg_flags |= RF_warned;
4808                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4809                         "%s limit (%d) exceeded",
4810                         "Complex regular subexpression recursion",
4811                         REG_INFTY - 1);
4812                 }
4813                 cur_curlyx->u.curlyx.count--;
4814                 CACHEsayNO;
4815             }
4816
4817             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4818                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4819             );
4820             /* Try grabbing another A and see if it helps. */
4821             PL_reginput = locinput;
4822             cur_curlyx->u.curlyx.lastloc = locinput;
4823             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4824             REGCP_SET(ST.lastcp);
4825             PUSH_STATE_GOTO(WHILEM_A_min,
4826                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4827             /* NOTREACHED */
4828
4829 #undef  ST
4830 #define ST st->u.branch
4831
4832         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4833             next = scan + ARG(scan);
4834             if (next == scan)
4835                 next = NULL;
4836             scan = NEXTOPER(scan);
4837             /* FALL THROUGH */
4838
4839         case BRANCH:        /*  /(...|A|...)/ */
4840             scan = NEXTOPER(scan); /* scan now points to inner node */
4841             ST.lastparen = *PL_reglastparen;
4842             ST.next_branch = next;
4843             REGCP_SET(ST.cp);
4844             PL_reginput = locinput;
4845
4846             /* Now go into the branch */
4847             if (has_cutgroup) {
4848                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4849             } else {
4850                 PUSH_STATE_GOTO(BRANCH_next, scan);
4851             }
4852             /* NOTREACHED */
4853         case CUTGROUP:
4854             PL_reginput = locinput;
4855             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4856                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4857             PUSH_STATE_GOTO(CUTGROUP_next,next);
4858             /* NOTREACHED */
4859         case CUTGROUP_next_fail:
4860             do_cutgroup = 1;
4861             no_final = 1;
4862             if (st->u.mark.mark_name)
4863                 sv_commit = st->u.mark.mark_name;
4864             sayNO;
4865             /* NOTREACHED */
4866         case BRANCH_next:
4867             sayYES;
4868             /* NOTREACHED */
4869         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4870             if (do_cutgroup) {
4871                 do_cutgroup = 0;
4872                 no_final = 0;
4873             }
4874             REGCP_UNWIND(ST.cp);
4875             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4876                 PL_regoffs[n].end = -1;
4877             *PL_reglastparen = n;
4878             /*dmq: *PL_reglastcloseparen = n; */
4879             scan = ST.next_branch;
4880             /* no more branches? */
4881             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4882                 DEBUG_EXECUTE_r({
4883                     PerlIO_printf( Perl_debug_log,
4884                         "%*s  %sBRANCH failed...%s\n",
4885                         REPORT_CODE_OFF+depth*2, "",
4886                         PL_colors[4],
4887                         PL_colors[5] );
4888                 });
4889                 sayNO_SILENT;
4890             }
4891             continue; /* execute next BRANCH[J] op */
4892             /* NOTREACHED */
4893
4894         case MINMOD:
4895             minmod = 1;
4896             break;
4897
4898 #undef  ST
4899 #define ST st->u.curlym
4900
4901         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4902
4903             /* This is an optimisation of CURLYX that enables us to push
4904              * only a single backtracking state, no matter how many matches
4905              * there are in {m,n}. It relies on the pattern being constant
4906              * length, with no parens to influence future backrefs
4907              */
4908
4909             ST.me = scan;
4910             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4911
4912             /* if paren positive, emulate an OPEN/CLOSE around A */
4913             if (ST.me->flags) {
4914                 U32 paren = ST.me->flags;
4915                 if (paren > PL_regsize)
4916                     PL_regsize = paren;
4917                 if (paren > *PL_reglastparen)
4918                     *PL_reglastparen = paren;
4919                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4920             }
4921             ST.A = scan;
4922             ST.B = next;
4923             ST.alen = 0;
4924             ST.count = 0;
4925             ST.minmod = minmod;
4926             minmod = 0;
4927             ST.c1 = CHRTEST_UNINIT;
4928             REGCP_SET(ST.cp);
4929
4930             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4931                 goto curlym_do_B;
4932
4933           curlym_do_A: /* execute the A in /A{m,n}B/  */
4934             PL_reginput = locinput;
4935             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4936             /* NOTREACHED */
4937
4938         case CURLYM_A: /* we've just matched an A */
4939             locinput = st->locinput;
4940             nextchr = UCHARAT(locinput);
4941
4942             ST.count++;
4943             /* after first match, determine A's length: u.curlym.alen */
4944             if (ST.count == 1) {
4945                 if (PL_reg_match_utf8) {
4946                     char *s = locinput;
4947                     while (s < PL_reginput) {
4948                         ST.alen++;
4949                         s += UTF8SKIP(s);
4950                     }
4951                 }
4952                 else {
4953                     ST.alen = PL_reginput - locinput;
4954                 }
4955                 if (ST.alen == 0)
4956                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4957             }
4958             DEBUG_EXECUTE_r(
4959                 PerlIO_printf(Perl_debug_log,
4960                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4961                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4962                           (IV) ST.count, (IV)ST.alen)
4963             );
4964
4965             locinput = PL_reginput;
4966
4967             if (cur_eval && cur_eval->u.eval.close_paren &&
4968                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4969                 goto fake_end;
4970
4971             {
4972                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4973                 if ( max == REG_INFTY || ST.count < max )
4974                     goto curlym_do_A; /* try to match another A */
4975             }
4976             goto curlym_do_B; /* try to match B */
4977
4978         case CURLYM_A_fail: /* just failed to match an A */
4979             REGCP_UNWIND(ST.cp);
4980
4981             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4982                 || (cur_eval && cur_eval->u.eval.close_paren &&
4983                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4984                 sayNO;
4985
4986           curlym_do_B: /* execute the B in /A{m,n}B/  */
4987             PL_reginput = locinput;
4988             if (ST.c1 == CHRTEST_UNINIT) {
4989                 /* calculate c1 and c2 for possible match of 1st char
4990                  * following curly */
4991                 ST.c1 = ST.c2 = CHRTEST_VOID;
4992                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4993                     regnode *text_node = ST.B;
4994                     if (! HAS_TEXT(text_node))
4995                         FIND_NEXT_IMPT(text_node);
4996                     /* this used to be
4997
4998                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4999
5000                         But the former is redundant in light of the latter.
5001
5002                         if this changes back then the macro for
5003                         IS_TEXT and friends need to change.
5004                      */
5005                     if (PL_regkind[OP(text_node)] == EXACT)
5006                     {
5007
5008                         ST.c1 = (U8)*STRING(text_node);
5009                         switch (OP(text_node)) {
5010                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5011                             case EXACTFA:
5012                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5013                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5014                             default: ST.c2 = ST.c1;
5015                         }
5016                     }
5017                 }
5018             }
5019
5020             DEBUG_EXECUTE_r(
5021                 PerlIO_printf(Perl_debug_log,
5022                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5023                     (int)(REPORT_CODE_OFF+(depth*2)),
5024                     "", (IV)ST.count)
5025                 );
5026             if (ST.c1 != CHRTEST_VOID
5027                     && UCHARAT(PL_reginput) != ST.c1
5028                     && UCHARAT(PL_reginput) != ST.c2)
5029             {
5030                 /* simulate B failing */
5031                 DEBUG_OPTIMISE_r(
5032                     PerlIO_printf(Perl_debug_log,
5033                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5034                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5035                         (IV)ST.c1,(IV)ST.c2
5036                 ));
5037                 state_num = CURLYM_B_fail;
5038                 goto reenter_switch;
5039             }
5040
5041             if (ST.me->flags) {
5042                 /* mark current A as captured */
5043                 I32 paren = ST.me->flags;
5044                 if (ST.count) {
5045                     PL_regoffs[paren].start
5046                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5047                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5048                     /*dmq: *PL_reglastcloseparen = paren; */
5049                 }
5050                 else
5051                     PL_regoffs[paren].end = -1;
5052                 if (cur_eval && cur_eval->u.eval.close_paren &&
5053                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5054                 {
5055                     if (ST.count)
5056                         goto fake_end;
5057                     else
5058                         sayNO;
5059                 }
5060             }
5061
5062             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5063             /* NOTREACHED */
5064
5065         case CURLYM_B_fail: /* just failed to match a B */
5066             REGCP_UNWIND(ST.cp);
5067             if (ST.minmod) {
5068                 I32 max = ARG2(ST.me);
5069                 if (max != REG_INFTY && ST.count == max)
5070                     sayNO;
5071                 goto curlym_do_A; /* try to match a further A */
5072             }
5073             /* backtrack one A */
5074             if (ST.count == ARG1(ST.me) /* min */)
5075                 sayNO;
5076             ST.count--;
5077             locinput = HOPc(locinput, -ST.alen);
5078             goto curlym_do_B; /* try to match B */
5079
5080 #undef ST
5081 #define ST st->u.curly
5082
5083 #define CURLY_SETPAREN(paren, success) /* set or clear capture group `paren` for the STAR/PLUS/CURLYN/CURLY cases */ \
5084     if (paren) { /* paren == 0 (STAR, PLUS and CURLY set ST.paren = 0): nothing to record */ \
5085         if (success) { /* every caller here passes ST.count, so non-zero means at least one A matched */ \
5086             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; /* HOPc(locinput, -1): presumably one character back -- confirm */ \
5087             PL_regoffs[paren].end = locinput - PL_bostr; /* both offsets are stored relative to PL_bostr */ \
5088             *PL_reglastcloseparen = paren; \
5089         } \
5090         else /* count is zero: end = -1, the same value the CURLY_*_fail cases store; presumably "unset" -- confirm */ \
5091             PL_regoffs[paren].end = -1; \
5092     } /* NOTE(review): expands to a bare if-statement (no do { } while (0)); avoid using it as the unbraced body of an if/else */
5093
5094         case STAR:              /*  /A*B/ where A is width 1 */
5095             ST.paren = 0;
5096             ST.min = 0;
5097             ST.max = REG_INFTY;
5098             scan = NEXTOPER(scan);
5099             goto repeat;
5100         case PLUS:              /*  /A+B/ where A is width 1 */
5101             ST.paren = 0;
5102             ST.min = 1;
5103             ST.max = REG_INFTY;
5104             scan = NEXTOPER(scan);
5105             goto repeat;
5106         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5107             ST.paren = scan->flags;     /* Which paren to set */
5108             if (ST.paren > PL_regsize)
5109                 PL_regsize = ST.paren;
5110             if (ST.paren > *PL_reglastparen)
5111                 *PL_reglastparen = ST.paren;
5112             ST.min = ARG1(scan);  /* min to match */
5113             ST.max = ARG2(scan);  /* max to match */
5114             if (cur_eval && cur_eval->u.eval.close_paren &&
5115                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5116                 ST.min=1;
5117                 ST.max=1;
5118             }
5119             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5120             goto repeat;
5121         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5122             ST.paren = 0;
5123             ST.min = ARG1(scan);  /* min to match */
5124             ST.max = ARG2(scan);  /* max to match */
5125             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5126           repeat:
5127             /*
5128             * Lookahead to avoid useless match attempts
5129             * when we know what character comes next.
5130             *
5131             * Used to only do .*x and .*?x, but now it allows
5132             * for )'s, ('s and (?{ ... })'s to be in the way
5133             * of the quantifier and the EXACT-like node.  -- japhy
5134             */
5135
5136             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5137                 sayNO;
5138             if (HAS_TEXT(next) || JUMPABLE(next)) {
5139                 U8 *s;
5140                 regnode *text_node = next;
5141
5142                 if (! HAS_TEXT(text_node))
5143                     FIND_NEXT_IMPT(text_node);
5144
5145                 if (! HAS_TEXT(text_node))
5146                     ST.c1 = ST.c2 = CHRTEST_VOID;
5147                 else {
5148                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5149                         ST.c1 = ST.c2 = CHRTEST_VOID;
5150                         goto assume_ok_easy;
5151                     }
5152                     else
5153                         s = (U8*)STRING(text_node);
5154
5155                     /*  Currently we only get here when
5156
5157                         PL_regkind[OP(text_node)] == EXACT
5158
5159                         if this changes back then the macro for IS_TEXT and
5160                         friends need to change. */
5161                     if (!UTF_PATTERN) {
5162                         ST.c1 = *s;
5163                         switch (OP(text_node)) {
5164                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5165                             case EXACTFA:
5166                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5167                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5168                             default: ST.c2 = ST.c1; break;
5169                         }
5170                     }
5171                     else { /* UTF_PATTERN */
5172                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5173                              STRLEN ulen1, ulen2;
5174                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5175                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5176
5177                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5178                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5179 #ifdef EBCDIC
5180                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5181                                                     ckWARN(WARN_UTF8) ?
5182                                                     0 : UTF8_ALLOW_ANY);
5183                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5184                                                     ckWARN(WARN_UTF8) ?
5185                                                     0 : UTF8_ALLOW_ANY);
5186 #else
5187                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5188                                                     uniflags);
5189                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5190                                                     uniflags);
5191 #endif
5192                         }
5193                         else {
5194                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5195                                                      uniflags);
5196                         }
5197                     }
5198                 }
5199             }
5200             else
5201                 ST.c1 = ST.c2 = CHRTEST_VOID;
5202         assume_ok_easy:
5203
5204             ST.A = scan;
5205             ST.B = next;
5206             PL_reginput = locinput;
5207             if (minmod) {
5208                 minmod = 0;
5209                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5210                     sayNO;
5211                 ST.count = ST.min;
5212                 locinput = PL_reginput;
5213                 REGCP_SET(ST.cp);
5214                 if (ST.c1 == CHRTEST_VOID)
5215                     goto curly_try_B_min;
5216
5217                 ST.oldloc = locinput;
5218
5219                 /* set ST.maxpos to the furthest point along the
5220                  * string that could possibly match */
5221                 if  (ST.max == REG_INFTY) {
5222                     ST.maxpos = PL_regeol - 1;
5223                     if (utf8_target)
5224                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5225                             ST.maxpos--;
5226                 }
5227                 else if (utf8_target) {
5228                     int m = ST.max - ST.min;
5229                     for (ST.maxpos = locinput;
5230                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5231                         ST.maxpos += UTF8SKIP(ST.maxpos);
5232                 }
5233                 else {
5234                     ST.maxpos = locinput + ST.max - ST.min;
5235                     if (ST.maxpos >= PL_regeol)
5236                         ST.maxpos = PL_regeol - 1;
5237                 }
5238                 goto curly_try_B_min_known;
5239
5240             }
5241             else {
5242                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5243                 locinput = PL_reginput;
5244                 if (ST.count < ST.min)
5245                     sayNO;
5246                 if ((ST.count > ST.min)
5247                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5248                 {
5249                     /* A{m,n} must come at the end of the string, there's
5250                      * no point in backing off ... */
5251                     ST.min = ST.count;
5252                     /* ...except that $ and \Z can match before *and* after
5253                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5254                        We may back off by one in this case. */
5255                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5256                         ST.min--;
5257                 }
5258                 REGCP_SET(ST.cp);
5259                 goto curly_try_B_max;
5260             }
5261             /* NOTREACHED */
5262
5263
5264         case CURLY_B_min_known_fail:
5265             /* failed to find B in a non-greedy match where c1,c2 valid */
5266             if (ST.paren && ST.count)
5267                 PL_regoffs[ST.paren].end = -1;
5268
5269             PL_reginput = locinput;     /* Could be reset... */
5270             REGCP_UNWIND(ST.cp);
5271             /* Couldn't or didn't -- move forward. */
5272             ST.oldloc = locinput;
5273             if (utf8_target)
5274                 locinput += UTF8SKIP(locinput);
5275             else
5276                 locinput++;
5277             ST.count++;
5278           curly_try_B_min_known:
5279              /* find the next place where 'B' could work, then call B */
5280             {
5281                 int n;
5282                 if (utf8_target) {
5283                     n = (ST.oldloc == locinput) ? 0 : 1;
5284                     if (ST.c1 == ST.c2) {
5285                         STRLEN len;
5286                         /* set n to utf8_distance(oldloc, locinput) */
5287                         while (locinput <= ST.maxpos &&
5288                                utf8n_to_uvchr((U8*)locinput,
5289                                               UTF8_MAXBYTES, &len,
5290                                               uniflags) != (UV)ST.c1) {
5291                             locinput += len;
5292                             n++;
5293                         }
5294                     }
5295                     else {
5296                         /* set n to utf8_distance(oldloc, locinput) */
5297                         while (locinput <= ST.maxpos) {
5298                             STRLEN len;
5299                             const UV c = utf8n_to_uvchr((U8*)locinput,
5300                                                   UTF8_MAXBYTES, &len,
5301                                                   uniflags);
5302                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5303                                 break;
5304                             locinput += len;
5305                             n++;
5306                         }
5307                     }
5308                 }
5309                 else {
5310                     if (ST.c1 == ST.c2) {
5311                         while (locinput <= ST.maxpos &&
5312                                UCHARAT(locinput) != ST.c1)
5313                             locinput++;
5314                     }
5315                     else {
5316                         while (locinput <= ST.maxpos
5317                                && UCHARAT(locinput) != ST.c1
5318                                && UCHARAT(locinput) != ST.c2)
5319                             locinput++;
5320                     }
5321                     n = locinput - ST.oldloc;
5322                 }
5323                 if (locinput > ST.maxpos)
5324                     sayNO;
5325                 /* PL_reginput == oldloc now */
5326                 if (n) {
5327                     ST.count += n;
5328                     if (regrepeat(rex, ST.A, n, depth) < n)
5329                         sayNO;
5330                 }
5331                 PL_reginput = locinput;
5332                 CURLY_SETPAREN(ST.paren, ST.count);
5333                 if (cur_eval && cur_eval->u.eval.close_paren &&
5334                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5335                     goto fake_end;
5336                 }
5337                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5338             }
5339             /* NOTREACHED */
5340
5341
5342         case CURLY_B_min_fail:
5343             /* failed to find B in a non-greedy match where c1,c2 invalid */
5344             if (ST.paren && ST.count)
5345                 PL_regoffs[ST.paren].end = -1;
5346
5347             REGCP_UNWIND(ST.cp);
5348             /* failed -- move forward one */
5349             PL_reginput = locinput;
5350             if (regrepeat(rex, ST.A, 1, depth)) {
5351                 ST.count++;
5352                 locinput = PL_reginput;
5353                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5354                         ST.count > 0)) /* count overflow ? */
5355                 {
5356                   curly_try_B_min:
5357                     CURLY_SETPAREN(ST.paren, ST.count);
5358                     if (cur_eval && cur_eval->u.eval.close_paren &&
5359                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5360                         goto fake_end;
5361                     }
5362                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5363                 }
5364             }
5365             sayNO;
5366             /* NOTREACHED */
5367
5368
5369         curly_try_B_max:
5370             /* a successful greedy match: now try to match B */
5371             if (cur_eval && cur_eval->u.eval.close_paren &&
5372                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5373                 goto fake_end;
5374             }
5375             {
5376                 UV c = 0;
5377                 if (ST.c1 != CHRTEST_VOID)
5378                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5379                                            UTF8_MAXBYTES, 0, uniflags)
5380                                 : (UV) UCHARAT(PL_reginput);
5381                 /* If it could work, try it. */
5382                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5383                     CURLY_SETPAREN(ST.paren, ST.count);
5384                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5385                     /* NOTREACHED */
5386                 }
5387             }
5388             /* FALL THROUGH */
5389         case CURLY_B_max_fail:
5390             /* failed to find B in a greedy match */
5391             if (ST.paren && ST.count)
5392                 PL_regoffs[ST.paren].end = -1;
5393
5394             REGCP_UNWIND(ST.cp);
5395             /*  back up. */
5396             if (--ST.count < ST.min)
5397                 sayNO;
5398             PL_reginput = locinput = HOPc(locinput, -1);
5399             goto curly_try_B_max;
5400
5401 #undef ST
5402
5403         case END:
5404             fake_end:
5405             if (cur_eval) {
5406                 /* we've just finished A in /(??{A})B/; now continue with B */
5407                 I32 tmpix;
5408                 st->u.eval.toggle_reg_flags
5409                             = cur_eval->u.eval.toggle_reg_flags;
5410                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5411
5412                 st->u.eval.prev_rex = rex_sv;           /* inner */
5413                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5414                 rex = (struct regexp *)SvANY(rex_sv);
5415                 rexi = RXi_GET(rex);
5416                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5417                 ReREFCNT_inc(rex_sv);
5418                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5419
5420                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5421                 PL_reglastparen = &rex->lastparen;
5422                 PL_reglastcloseparen = &rex->lastcloseparen;
5423
5424                 REGCP_SET(st->u.eval.lastcp);
5425                 PL_reginput = locinput;
5426
5427                 /* Restore parens of the outer rex without popping the
5428                  * savestack */
5429                 tmpix = PL_savestack_ix;
5430                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5431                 regcppop(rex);
5432                 PL_savestack_ix = tmpix;
5433
5434                 st->u.eval.prev_eval = cur_eval;
5435                 cur_eval = cur_eval->u.eval.prev_eval;
5436                 DEBUG_EXECUTE_r(
5437                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5438                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5439                 if ( nochange_depth )
5440                     nochange_depth--;
5441
5442                 PUSH_YES_STATE_GOTO(EVAL_AB,
5443                         st->u.eval.prev_eval->u.eval.B); /* match B */
5444             }
5445
5446             if (locinput < reginfo->till) {
5447                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5448                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5449                                       PL_colors[4],
5450                                       (long)(locinput - PL_reg_starttry),
5451                                       (long)(reginfo->till - PL_reg_starttry),
5452                                       PL_colors[5]));
5453
5454                 sayNO_SILENT;           /* Cannot match: too short. */
5455             }
5456             PL_reginput = locinput;     /* put where regtry can find it */
5457             sayYES;                     /* Success! */
5458
5459         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5460             DEBUG_EXECUTE_r(
5461             PerlIO_printf(Perl_debug_log,
5462                 "%*s  %ssubpattern success...%s\n",
5463                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5464             PL_reginput = locinput;     /* put where regtry can find it */
5465             sayYES;                     /* Success! */
5466
5467 #undef  ST
5468 #define ST st->u.ifmatch
5469
5470         case SUSPEND:   /* (?>A) */
5471             ST.wanted = 1;
5472             PL_reginput = locinput;
5473             goto do_ifmatch;
5474
5475         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5476             ST.wanted = 0;
5477             goto ifmatch_trivial_fail_test;
5478
5479         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5480             ST.wanted = 1;
5481           ifmatch_trivial_fail_test:
5482             if (scan->flags) {
5483                 char * const s = HOPBACKc(locinput, scan->flags);
5484                 if (!s) {
5485                     /* trivial fail */
5486                     if (logical) {
5487                         logical = 0;
5488                         sw = 1 - cBOOL(ST.wanted);
5489                     }
5490                     else if (ST.wanted)
5491                         sayNO;
5492                     next = scan + ARG(scan);
5493                     if (next == scan)
5494                         next = NULL;
5495                     break;
5496                 }
5497                 PL_reginput = s;
5498             }
5499             else
5500                 PL_reginput = locinput;
5501
5502           do_ifmatch:
5503             ST.me = scan;
5504             ST.logical = logical;
5505             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5506
5507             /* execute body of (?...A) */
5508             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5509             /* NOTREACHED */
5510
5511         case IFMATCH_A_fail: /* body of (?...A) failed */
5512             ST.wanted = !ST.wanted;
5513             /* FALL THROUGH */
5514
5515         case IFMATCH_A: /* body of (?...A) succeeded */
5516             if (ST.logical) {
5517                 sw = cBOOL(ST.wanted);
5518             }
5519             else if (!ST.wanted)
5520                 sayNO;
5521
5522             if (OP(ST.me) == SUSPEND)
5523                 locinput = PL_reginput;
5524             else {
5525                 locinput = PL_reginput = st->locinput;
5526                 nextchr = UCHARAT(locinput);
5527             }
5528             scan = ST.me + ARG(ST.me);
5529             if (scan == ST.me)
5530                 scan = NULL;
5531             continue; /* execute B */
5532
5533 #undef ST
5534
5535         case LONGJMP:
5536             next = scan + ARG(scan);
5537             if (next == scan)
5538                 next = NULL;
5539             break;
5540         case COMMIT:
5541             reginfo->cutpoint = PL_regeol;
5542             /* FALLTHROUGH */
5543         case PRUNE:
5544             PL_reginput = locinput;
5545             if (!scan->flags)
5546                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5547             PUSH_STATE_GOTO(COMMIT_next,next);
5548             /* NOTREACHED */
5549         case COMMIT_next_fail:
5550             no_final = 1;
5551             /* FALLTHROUGH */
5552         case OPFAIL:
5553             sayNO;
5554             /* NOTREACHED */
5555
5556 #define ST st->u.mark
5557         case MARKPOINT:
5558             ST.prev_mark = mark_state;
5559             ST.mark_name = sv_commit = sv_yes_mark
5560                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5561             mark_state = st;
5562             ST.mark_loc = PL_reginput = locinput;
5563             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5564             /* NOTREACHED */
5565         case MARKPOINT_next:
5566             mark_state = ST.prev_mark;
5567             sayYES;
5568             /* NOTREACHED */
5569         case MARKPOINT_next_fail:
5570             if (popmark && sv_eq(ST.mark_name,popmark))
5571             {
5572                 if (ST.mark_loc > startpoint)
5573                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5574                 popmark = NULL; /* we found our mark */
5575                 sv_commit = ST.mark_name;
5576
5577                 DEBUG_EXECUTE_r({
5578                         PerlIO_printf(Perl_debug_log,
5579                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5580                             REPORT_CODE_OFF+depth*2, "",
5581                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5582                 });
5583             }
5584             mark_state = ST.prev_mark;
5585             sv_yes_mark = mark_state ?
5586                 mark_state->u.mark.mark_name : NULL;
5587             sayNO;
5588             /* NOTREACHED */
5589         case SKIP:
5590             PL_reginput = locinput;
5591             if (scan->flags) {
5592                 /* (*SKIP) : if we fail we cut here*/
5593                 ST.mark_name = NULL;
5594                 ST.mark_loc = locinput;
5595                 PUSH_STATE_GOTO(SKIP_next,next);
5596             } else {
5597                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5598                    otherwise do nothing.  Meaning we need to scan
5599                  */
5600                 regmatch_state *cur = mark_state;
5601                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5602
5603                 while (cur) {
5604                     if ( sv_eq( cur->u.mark.mark_name,
5605                                 find ) )
5606                     {
5607                         ST.mark_name = find;
5608                         PUSH_STATE_GOTO( SKIP_next, next );
5609                     }
5610                     cur = cur->u.mark.prev_mark;
5611                 }
5612             }
5613             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5614             break;
5615         case SKIP_next_fail:
5616             if (ST.mark_name) {
5617                 /* (*CUT:NAME) - Set up to search for the name as we
5618                    collapse the stack*/
5619                 popmark = ST.mark_name;
5620             } else {
5621                 /* (*CUT) - No name, we cut here.*/
5622                 if (ST.mark_loc > startpoint)
5623                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5624                 /* but we set sv_commit to latest mark_name if there
5625                    is one so they can test to see how things lead to this
5626                    cut */
5627                 if (mark_state)
5628                     sv_commit=mark_state->u.mark.mark_name;
5629             }
5630             no_final = 1;
5631             sayNO;
5632             /* NOTREACHED */
5633 #undef ST
5634         case FOLDCHAR:
5635             n = ARG(scan);
5636             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5637                 locinput += ln;
5638             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5639                 sayNO;
5640             } else  {
5641                 U8 folded[UTF8_MAXBYTES_CASE+1];
5642                 STRLEN foldlen;
5643                 const char * const l = locinput;
5644                 char *e = PL_regeol;
5645                 to_uni_fold(n, folded, &foldlen);
5646
5647                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5648                                l, &e, 0,  utf8_target)) {
5649                         sayNO;
5650                 }
5651                 locinput = e;
5652             }
5653             nextchr = UCHARAT(locinput);
5654             break;
5655         case LNBREAK:
5656             if ((n=is_LNBREAK(locinput,utf8_target))) {
5657                 locinput += n;
5658                 nextchr = UCHARAT(locinput);
5659             } else
5660                 sayNO;
5661             break;
5662
5663 #define CASE_CLASS(nAmE)                              \
5664         case nAmE:                                    \
5665             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5666                 locinput += n;                        \
5667                 nextchr = UCHARAT(locinput);          \
5668             } else                                    \
5669                 sayNO;                                \
5670             break;                                    \
5671         case N##nAmE:                                 \
5672             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5673                 sayNO;                                \
5674             } else {                                  \
5675                 locinput += UTF8SKIP(locinput);       \
5676                 nextchr = UCHARAT(locinput);          \
5677             }                                         \
5678             break
5679
5680         CASE_CLASS(VERTWS);
5681         CASE_CLASS(HORIZWS);
5682 #undef CASE_CLASS
5683
5684         default:
5685             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5686                           PTR2UV(scan), OP(scan));
5687             Perl_croak(aTHX_ "regexp memory corruption");
5688
5689         } /* end switch */
5690
5691         /* switch break jumps here */
5692         scan = next; /* prepare to execute the next op and ... */
5693         continue;    /* ... jump back to the top, reusing st */
5694         /* NOTREACHED */
5695
5696       push_yes_state:
5697         /* push a state that backtracks on success */
5698         st->u.yes.prev_yes_state = yes_state;
5699         yes_state = st;
5700         /* FALL THROUGH */
5701       push_state:
5702         /* push a new regex state, then continue at scan  */
5703         {
5704             regmatch_state *newst;
5705
5706             DEBUG_STACK_r({
5707                 regmatch_state *cur = st;
5708                 regmatch_state *curyes = yes_state;
5709                 int curd = depth;
5710                 regmatch_slab *slab = PL_regmatch_slab;
5711                 for (;curd > -1;cur--,curd--) {
5712                     if (cur < SLAB_FIRST(slab)) {
5713                         slab = slab->prev;
5714                         cur = SLAB_LAST(slab);
5715                     }
5716                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5717                         REPORT_CODE_OFF + 2 + depth * 2,"",
5718                         curd, PL_reg_name[cur->resume_state],
5719                         (curyes == cur) ? "yes" : ""
5720                     );
5721                     if (curyes == cur)
5722                         curyes = cur->u.yes.prev_yes_state;
5723                 }
5724             } else
5725                 DEBUG_STATE_pp("push")
5726             );
5727             depth++;
5728             st->locinput = locinput;
5729             newst = st+1;
5730             if (newst >  SLAB_LAST(PL_regmatch_slab))
5731                 newst = S_push_slab(aTHX);
5732             PL_regmatch_state = newst;
5733
5734             locinput = PL_reginput;
5735             nextchr = UCHARAT(locinput);
5736             st = newst;
5737             continue;
5738             /* NOTREACHED */
5739         }
5740     }
5741
5742     /*
5743     * We get here only if there's trouble -- normally "case END" is
5744     * the terminating point.
5745     */
5746     Perl_croak(aTHX_ "corrupted regexp pointers");
5747     /*NOTREACHED*/
5748     sayNO;
5749
5750 yes:
5751     if (yes_state) {
5752         /* we have successfully completed a subexpression, but we must now
5753          * pop to the state marked by yes_state and continue from there */
5754         assert(st != yes_state);
5755 #ifdef DEBUGGING
5756         while (st != yes_state) {
5757             st--;
5758             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5759                 PL_regmatch_slab = PL_regmatch_slab->prev;
5760                 st = SLAB_LAST(PL_regmatch_slab);
5761             }
5762             DEBUG_STATE_r({
5763                 if (no_final) {
5764                     DEBUG_STATE_pp("pop (no final)");
5765                 } else {
5766                     DEBUG_STATE_pp("pop (yes)");
5767                 }
5768             });
5769             depth--;
5770         }
5771 #else
5772         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5773             || yes_state > SLAB_LAST(PL_regmatch_slab))
5774         {
5775             /* not in this slab, pop slab */
5776             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5777             PL_regmatch_slab = PL_regmatch_slab->prev;
5778             st = SLAB_LAST(PL_regmatch_slab);
5779         }
5780         depth -= (st - yes_state);
5781 #endif
5782         st = yes_state;
5783         yes_state = st->u.yes.prev_yes_state;
5784         PL_regmatch_state = st;
5785
5786         if (no_final) {
5787             locinput= st->locinput;
5788             nextchr = UCHARAT(locinput);
5789         }
5790         state_num = st->resume_state + no_final;
5791         goto reenter_switch;
5792     }
5793
5794     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5795                           PL_colors[4], PL_colors[5]));
5796
5797     if (PL_reg_eval_set) {
5798         /* each successfully executed (?{...}) block does the equivalent of
5799          *   local $^R = do {...}
5800          * When popping the save stack, all these locals would be undone;
5801          * bypass this by setting the outermost saved $^R to the latest
5802          * value */
5803         if (oreplsv != GvSV(PL_replgv))
5804             sv_setsv(oreplsv, GvSV(PL_replgv));
5805     }
5806     result = 1;
5807     goto final_exit;
5808
5809 no:
5810     DEBUG_EXECUTE_r(
5811         PerlIO_printf(Perl_debug_log,
5812             "%*s  %sfailed...%s\n",
5813             REPORT_CODE_OFF+depth*2, "",
5814             PL_colors[4], PL_colors[5])
5815         );
5816
5817 no_silent:
5818     if (no_final) {
5819         if (yes_state) {
5820             goto yes;
5821         } else {
5822             goto final_exit;
5823         }
5824     }
5825     if (depth) {
5826         /* there's a previous state to backtrack to */
5827         st--;
5828         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5829             PL_regmatch_slab = PL_regmatch_slab->prev;
5830             st = SLAB_LAST(PL_regmatch_slab);
5831         }
5832         PL_regmatch_state = st;
5833         locinput= st->locinput;
5834         nextchr = UCHARAT(locinput);
5835
5836         DEBUG_STATE_pp("pop");
5837         depth--;
5838         if (yes_state == st)
5839             yes_state = st->u.yes.prev_yes_state;
5840
5841         state_num = st->resume_state + 1; /* failure = success + 1 */
5842         goto reenter_switch;
5843     }
5844     result = 0;
5845
5846   final_exit:
5847     if (rex->intflags & PREGf_VERBARG_SEEN) {
5848         SV *sv_err = get_sv("REGERROR", 1);
5849         SV *sv_mrk = get_sv("REGMARK", 1);
5850         if (result) {
5851             sv_commit = &PL_sv_no;
5852             if (!sv_yes_mark)
5853                 sv_yes_mark = &PL_sv_yes;
5854         } else {
5855             if (!sv_commit)
5856                 sv_commit = &PL_sv_yes;
5857             sv_yes_mark = &PL_sv_no;
5858         }
5859         sv_setsv(sv_err, sv_commit);
5860         sv_setsv(sv_mrk, sv_yes_mark);
5861     }
5862
5863     /* clean up; in particular, free all slabs above current one */
5864     LEAVE_SCOPE(oldsave);
5865
5866     return result;
5867 }
5868
5869 /*
5870  - regrepeat - repeatedly match something simple, report how many
5871  */
5872 /*
5873  * [This routine now assumes that it will only match on things of length 1.
5874  * That was true before, but now we assume scan - reginput is the count,
5875  * rather than incrementing count on every character.  [Er, except utf8.]]
5876  */
5877 STATIC I32
5878 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5879 {
5880     dVAR;
5881     register char *scan;
5882     register I32 c;
5883     register char *loceol = PL_regeol;
5884     register I32 hardcount = 0;
5885     register bool utf8_target = PL_reg_match_utf8;
5886     UV utf8_flags;
5887 #ifndef DEBUGGING
5888     PERL_UNUSED_ARG(depth);
5889 #endif
5890
5891     PERL_ARGS_ASSERT_REGREPEAT;
5892
5893     scan = PL_reginput;
5894     if (max == REG_INFTY)
5895         max = I32_MAX;
5896     else if (max < loceol - scan)
5897         loceol = scan + max;
5898     switch (OP(p)) {
5899     case REG_ANY:
5900         if (utf8_target) {
5901             loceol = PL_regeol;
5902             while (scan < loceol && hardcount < max && *scan != '\n') {
5903                 scan += UTF8SKIP(scan);
5904                 hardcount++;
5905             }
5906         } else {
5907             while (scan < loceol && *scan != '\n')
5908                 scan++;
5909         }
5910         break;
5911     case SANY:
5912         if (utf8_target) {
5913             loceol = PL_regeol;
5914             while (scan < loceol && hardcount < max) {
5915                 scan += UTF8SKIP(scan);
5916                 hardcount++;
5917             }
5918         }
5919         else
5920             scan = loceol;
5921         break;
5922     case CANY:
5923         scan = loceol;
5924         break;
5925     case EXACT:
5926         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5927          * means they match only characters in the string that can be expressed
5928          * as a single byte.  For non-utf8 strings, that means a simple match.
5929          * For utf8 strings, the character matched must be an invariant, or
5930          * downgradable to a single byte.  The pattern's utf8ness is
5931          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5932          * it is, it's an invariant */
5933
5934         c = (U8)*STRING(p);
5935         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5936
5937         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5938             while (scan < loceol && UCHARAT(scan) == c) {
5939                 scan++;
5940             }
5941         }
5942         else {
5943
5944             /* Here, the string is utf8, and the pattern char is different
5945              * in utf8 than not, so can't compare them directly.  Outside the
5946              * loop, find find the two utf8 bytes that represent c, and then
5947              * look for those in sequence in the utf8 string */
5948             U8 high = UTF8_TWO_BYTE_HI(c);
5949             U8 low = UTF8_TWO_BYTE_LO(c);
5950             loceol = PL_regeol;
5951
5952             while (hardcount < max
5953                     && scan + 1 < loceol
5954                     && UCHARAT(scan) == high
5955                     && UCHARAT(scan + 1) == low)
5956             {
5957                 scan += 2;
5958                 hardcount++;
5959             }
5960         }
5961         break;
5962     case EXACTFA:
5963         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5964         goto do_exactf;
5965
5966     case EXACTFL:
5967         PL_reg_flags |= RF_tainted;
5968         utf8_flags = FOLDEQ_UTF8_LOCALE;
5969         goto do_exactf;
5970
5971     case EXACTF:
5972     case EXACTFU:
5973         utf8_flags = 0;
5974
5975         /* The comments for the EXACT case above apply as well to these fold
5976          * ones */
5977
5978     do_exactf:
5979         c = (U8)*STRING(p);
5980         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5981
5982         if (utf8_target) { /* Use full Unicode fold matching */
5983             char *tmpeol = loceol;
5984             while (hardcount < max
5985                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
5986                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
5987             {
5988                 scan = tmpeol;
5989                 tmpeol = loceol;
5990                 hardcount++;
5991             }
5992
5993             /* XXX Note that the above handles properly the German sharp s in
5994              * the pattern matching ss in the string.  But it doesn't handle
5995              * properly cases where the string contains say 'LIGATURE ff' and
5996              * the pattern is 'f+'.  This would require, say, a new function or
5997              * revised interface to foldEQ_utf8(), in which the maximum number
5998              * of characters to match could be passed and it would return how
5999              * many actually did.  This is just one of many cases where
6000              * multi-char folds don't work properly, and so the fix is being
6001              * deferred */
6002         }
6003         else {
6004             U8 folded;
6005
6006             /* Here, the string isn't utf8 and c is a single byte; and either
6007              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6008              * doesn't affect c.  Can just do simple comparisons for exact or
6009              * fold matching. */
6010             switch (OP(p)) {
6011                 case EXACTF: folded = PL_fold[c]; break;
6012                 case EXACTFA:
6013                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6014                 case EXACTFL: folded = PL_fold_locale[c]; break;
6015                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6016             }
6017             while (scan < loceol &&
6018                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6019             {
6020                 scan++;
6021             }
6022         }
6023         break;
6024     case ANYOFV:
6025     case ANYOF:
6026         if (utf8_target || OP(p) == ANYOFV) {
6027             STRLEN inclasslen;
6028             loceol = PL_regeol;
6029             inclasslen = loceol - scan;
6030             while (hardcount < max
6031                    && ((inclasslen = loceol - scan) > 0)
6032                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6033             {
6034                 scan += inclasslen;
6035                 hardcount++;
6036             }
6037         } else {
6038             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6039                 scan++;
6040         }
6041         break;
6042     case ALNUMU:
6043         if (utf8_target) {
6044     utf8_wordchar:
6045             loceol = PL_regeol;
6046             LOAD_UTF8_CHARCLASS_ALNUM();
6047             while (hardcount < max && scan < loceol &&
6048                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6049             {
6050                 scan += UTF8SKIP(scan);
6051                 hardcount++;
6052             }
6053         } else {
6054             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6055                 scan++;
6056             }
6057         }
6058         break;
6059     case ALNUM:
6060         if (utf8_target)
6061             goto utf8_wordchar;
6062         while (scan < loceol && isALNUM((U8) *scan)) {
6063             scan++;
6064         }
6065         break;
6066     case ALNUMA:
6067         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6068             scan++;
6069         }
6070         break;
6071     case ALNUML:
6072         PL_reg_flags |= RF_tainted;
6073         if (utf8_target) {
6074             loceol = PL_regeol;
6075             while (hardcount < max && scan < loceol &&
6076                    isALNUM_LC_utf8((U8*)scan)) {
6077                 scan += UTF8SKIP(scan);
6078                 hardcount++;
6079             }
6080         } else {
6081             while (scan < loceol && isALNUM_LC(*scan))
6082                 scan++;
6083         }
6084         break;
6085     case NALNUMU:
6086         if (utf8_target) {
6087
6088     utf8_Nwordchar:
6089
6090             loceol = PL_regeol;
6091             LOAD_UTF8_CHARCLASS_ALNUM();
6092             while (hardcount < max && scan < loceol &&
6093                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6094             {
6095                 scan += UTF8SKIP(scan);
6096                 hardcount++;
6097             }
6098         } else {
6099             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6100                 scan++;
6101             }
6102         }
6103         break;
6104     case NALNUM:
6105         if (utf8_target)
6106             goto utf8_Nwordchar;
6107         while (scan < loceol && ! isALNUM((U8) *scan)) {
6108             scan++;
6109         }
6110         break;
6111     case NALNUMA:
6112         if (utf8_target) {
6113             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6114                 scan += UTF8SKIP(scan);
6115             }
6116         }
6117         else {
6118             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6119                 scan++;
6120             }
6121         }
6122         break;
6123     case NALNUML:
6124         PL_reg_flags |= RF_tainted;
6125         if (utf8_target) {
6126             loceol = PL_regeol;
6127             while (hardcount < max && scan < loceol &&
6128                    !isALNUM_LC_utf8((U8*)scan)) {
6129                 scan += UTF8SKIP(scan);
6130                 hardcount++;
6131             }
6132         } else {
6133             while (scan < loceol && !isALNUM_LC(*scan))
6134                 scan++;
6135         }
6136         break;
6137     case SPACEU:
6138         if (utf8_target) {
6139
6140     utf8_space:
6141
6142             loceol = PL_regeol;
6143             LOAD_UTF8_CHARCLASS_SPACE();
6144             while (hardcount < max && scan < loceol &&
6145                    (*scan == ' ' ||
6146                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6147             {
6148                 scan += UTF8SKIP(scan);
6149                 hardcount++;
6150             }
6151             break;
6152         }
6153         else {
6154             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6155                 scan++;
6156             }
6157             break;
6158         }
6159     case SPACE:
6160         if (utf8_target)
6161             goto utf8_space;
6162
6163         while (scan < loceol && isSPACE((U8) *scan)) {
6164             scan++;
6165         }
6166         break;
6167     case SPACEA:
6168         while (scan < loceol && isSPACE_A((U8) *scan)) {
6169             scan++;
6170         }
6171         break;
6172     case SPACEL:
6173         PL_reg_flags |= RF_tainted;
6174         if (utf8_target) {
6175             loceol = PL_regeol;
6176             while (hardcount < max && scan < loceol &&
6177                    isSPACE_LC_utf8((U8*)scan)) {
6178                 scan += UTF8SKIP(scan);
6179                 hardcount++;
6180             }
6181         } else {
6182             while (scan < loceol && isSPACE_LC(*scan))
6183                 scan++;
6184         }
6185         break;
6186     case NSPACEU:
6187         if (utf8_target) {
6188
6189     utf8_Nspace:
6190
6191             loceol = PL_regeol;
6192             LOAD_UTF8_CHARCLASS_SPACE();
6193             while (hardcount < max && scan < loceol &&
6194                    ! (*scan == ' ' ||
6195                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6196             {
6197                 scan += UTF8SKIP(scan);
6198                 hardcount++;
6199             }
6200             break;
6201         }
6202         else {
6203             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6204                 scan++;
6205             }
6206         }
6207         break;
6208     case NSPACE:
6209         if (utf8_target)
6210             goto utf8_Nspace;
6211
6212         while (scan < loceol && ! isSPACE((U8) *scan)) {
6213             scan++;
6214         }
6215         break;
6216     case NSPACEA:
6217         if (utf8_target) {
6218             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6219                 scan += UTF8SKIP(scan);
6220             }
6221         }
6222         else {
6223             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6224                 scan++;
6225             }
6226         }
6227         break;
6228     case NSPACEL:
6229         PL_reg_flags |= RF_tainted;
6230         if (utf8_target) {
6231             loceol = PL_regeol;
6232             while (hardcount < max && scan < loceol &&
6233                    !isSPACE_LC_utf8((U8*)scan)) {
6234                 scan += UTF8SKIP(scan);
6235                 hardcount++;
6236             }
6237         } else {
6238             while (scan < loceol && !isSPACE_LC(*scan))
6239                 scan++;
6240         }
6241         break;
6242     case DIGIT:
6243         if (utf8_target) {
6244             loceol = PL_regeol;
6245             LOAD_UTF8_CHARCLASS_DIGIT();
6246             while (hardcount < max && scan < loceol &&
6247                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6248                 scan += UTF8SKIP(scan);
6249                 hardcount++;
6250             }
6251         } else {
6252             while (scan < loceol && isDIGIT(*scan))
6253                 scan++;
6254         }
6255         break;
6256     case DIGITA:
6257         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6258             scan++;
6259         }
6260         break;
6261     case DIGITL:
6262         PL_reg_flags |= RF_tainted;
6263         if (utf8_target) {
6264             loceol = PL_regeol;
6265             while (hardcount < max && scan < loceol &&
6266                    isDIGIT_LC_utf8((U8*)scan)) {
6267                 scan += UTF8SKIP(scan);
6268                 hardcount++;
6269             }
6270         } else {
6271             while (scan < loceol && isDIGIT_LC(*scan))
6272                 scan++;
6273         }
6274         break;
6275     case NDIGIT:
6276         if (utf8_target) {
6277             loceol = PL_regeol;
6278             LOAD_UTF8_CHARCLASS_DIGIT();
6279             while (hardcount < max && scan < loceol &&
6280                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6281                 scan += UTF8SKIP(scan);
6282                 hardcount++;
6283             }
6284         } else {
6285             while (scan < loceol && !isDIGIT(*scan))
6286                 scan++;
6287         }
6288         break;
6289     case NDIGITA:
6290         if (utf8_target) {
6291             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6292                 scan += UTF8SKIP(scan);
6293             }
6294         }
6295         else {
6296             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6297                 scan++;
6298             }
6299         }
6300         break;
6301     case NDIGITL:
6302         PL_reg_flags |= RF_tainted;
6303         if (utf8_target) {
6304             loceol = PL_regeol;
6305             while (hardcount < max && scan < loceol &&
6306                    !isDIGIT_LC_utf8((U8*)scan)) {
6307                 scan += UTF8SKIP(scan);
6308                 hardcount++;
6309             }
6310         } else {
6311             while (scan < loceol && !isDIGIT_LC(*scan))
6312                 scan++;
6313         }
6314         break;
6315     case LNBREAK:
6316         if (utf8_target) {
6317             loceol = PL_regeol;
6318             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6319                 scan += c;
6320                 hardcount++;
6321             }
6322         } else {
6323             /*
6324               LNBREAK can match two latin chars, which is ok,
6325               because we have a null terminated string, but we
6326               have to use hardcount in this situation
6327             */
6328             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6329                 scan+=c;
6330                 hardcount++;
6331             }
6332         }
6333         break;
6334     case HORIZWS:
6335         if (utf8_target) {
6336             loceol = PL_regeol;
6337             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6338                 scan += c;
6339                 hardcount++;
6340             }
6341         } else {
6342             while (scan < loceol && is_HORIZWS_latin1(scan))
6343                 scan++;
6344         }
6345         break;
6346     case NHORIZWS:
6347         if (utf8_target) {
6348             loceol = PL_regeol;
6349             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6350                 scan += UTF8SKIP(scan);
6351                 hardcount++;
6352             }
6353         } else {
6354             while (scan < loceol && !is_HORIZWS_latin1(scan))
6355                 scan++;
6356
6357         }
6358         break;
6359     case VERTWS:
6360         if (utf8_target) {
6361             loceol = PL_regeol;
6362             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6363                 scan += c;
6364                 hardcount++;
6365             }
6366         } else {
6367             while (scan < loceol && is_VERTWS_latin1(scan))
6368                 scan++;
6369
6370         }
6371         break;
6372     case NVERTWS:
6373         if (utf8_target) {
6374             loceol = PL_regeol;
6375             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6376                 scan += UTF8SKIP(scan);
6377                 hardcount++;
6378             }
6379         } else {
6380             while (scan < loceol && !is_VERTWS_latin1(scan))
6381                 scan++;
6382
6383         }
6384         break;
6385
6386     default:            /* Called on something of 0 width. */
6387         break;          /* So match right here or not at all. */
6388     }
6389
6390     if (hardcount)
6391         c = hardcount;
6392     else
6393         c = scan - PL_reginput;
6394     PL_reginput = scan;
6395
6396     DEBUG_r({
6397         GET_RE_DEBUG_FLAGS_DECL;
6398         DEBUG_EXECUTE_r({
6399             SV * const prop = sv_newmortal();
6400             regprop(prog, prop, p);
6401             PerlIO_printf(Perl_debug_log,
6402                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6403                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6404         });
6405     });
6406
6407     return(c);
6408 }
6409
6410
6411 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6412 /*
6413 - regclass_swash - prepare the utf8 swash
6414 */
6415
6416 SV *
6417 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6418 {
6419     dVAR;
6420     SV *sw  = NULL;
6421     SV *si  = NULL;
6422     SV *alt = NULL;
6423     RXi_GET_DECL(prog,progi);
6424     const struct reg_data * const data = prog ? progi->data : NULL;
6425
6426     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6427
6428     assert(ANYOF_NONBITMAP(node));
6429
6430     if (data && data->count) {
6431         const U32 n = ARG(node);
6432
6433         if (data->what[n] == 's') {
6434             SV * const rv = MUTABLE_SV(data->data[n]);
6435             AV * const av = MUTABLE_AV(SvRV(rv));
6436             SV **const ary = AvARRAY(av);
6437             SV **a, **b;
6438
6439             /* See the end of regcomp.c:S_regclass() for
6440              * documentation of these array elements. */
6441
6442             si = *ary;
6443             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6444             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6445
6446             if (a)
6447                 sw = *a;
6448             else if (si && doinit) {
6449                 sw = swash_init("utf8", "", si, 1, 0);
6450                 (void)av_store(av, 1, sw);
6451             }
6452             if (b)
6453                 alt = *b;
6454         }
6455     }
6456
6457     if (listsvp)
6458         *listsvp = si;
6459     if (altsvp)
6460         *altsvp  = alt;
6461
6462     return sw;
6463 }
6464 #endif
6465
/*
 - reginclass - determine if a character falls into a character class

  prog is the compiled regexp; it is only passed on to regclass_swash()
  n is the ANYOF regnode
  p is the target string
  lenp is pointer to the maximum number of bytes of how far to go in p
    (This is assumed without checking to always be at least the current
    character's size)
  utf8_target tells whether p is in UTF-8.

  Returns true if matched; false otherwise.  (The sense of the result is
  reversed when the node has ANYOF_INVERT set.)  If lenp is not NULL, on
  return from a successful match, the value it points to will be updated to
  how many bytes in p were matched.  If there was no match, the value is
  undefined, possibly changed from the input.

  Croaks with "Malformed UTF-8 character (fatal)" if utf8_target is true and
  the character at p cannot be decoded.

  Note that this can be a synthetic start class, a combination of various
  nodes, so things you think might be mutually exclusive, such as locale,
  aren't.  It can match both locale and non-locale

 */

STATIC bool
S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
{
    dVAR;
    const char flags = ANYOF_FLAGS(n);
    bool match = FALSE;
    UV c = *p;          /* First byte for now; replaced by the whole code
                           point below if the target is UTF-8 and this byte
                           is not invariant */
    STRLEN c_len = 0;   /* Number of bytes in p that make up c */
    STRLEN maxlen;      /* Max number of bytes of p that may be examined */

    PERL_ARGS_ASSERT_REGINCLASS;

    /* If c is not already the code point, get it */
    if (utf8_target && !UTF8_IS_INVARIANT(c)) {
        c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
                (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
                | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
                /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
                 * UTF8_ALLOW_FFFF */
        /* A length of (STRLEN)-1 is how the decoding failure is detected */
        if (c_len == (STRLEN)-1)
            Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
    }
    else {
        c_len = 1;
    }

    /* Use passed in max length, or one character if none passed in or less
     * than one character.  And assume will match just one character.  This is
     * overwritten later if matched more. */
    if (lenp) {
        maxlen = (*lenp > c_len) ? *lenp : c_len;
        *lenp = c_len;

    }
    else {
        maxlen = c_len;
    }

    /* If this character is potentially in the bitmap, check it */
    if (c < 256) {
        if (ANYOF_BITMAP_TEST(n, c))
            match = TRUE;
        else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
                && ! utf8_target
                && ! isASCII(c))
        {
            /* Non-UTF-8 target and a non-ASCII byte: the flag alone decides */
            match = TRUE;
        }

        else if (flags & ANYOF_LOCALE) {

            /* The locale is consulted from here on, so record RF_tainted in
             * PL_reg_flags whether or not a match results */
            PL_reg_flags |= RF_tainted;

            /* First try the locale's fold of c against the bitmap; failing
             * that, test c against each class (and each complemented class)
             * that is set in the node */
            if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
            {
                match = TRUE;
            }
            else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
                     ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
                      (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
                      (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
                      (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
                      (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
                      (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
                      (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
                     ) /* How's that for a conditional? */
            ) {
                match = TRUE;
            }
        }
    }

    /* If the bitmap didn't (or couldn't) match, and something outside the
     * bitmap could match, try that.  Locale nodes specify completely the
     * behavior of code points in the bit map (otherwise, a utf8 target would
     * cause them to be treated as Unicode and not locale), except in
     * the very unlikely event when this node is a synthetic start class, which
     * could be a combination of locale and non-locale nodes.  So allow locale
     * to match for the synthetic start class, which will give a false
     * positive that will be resolved when the match is done again as not part
     * of the synthetic start class */
    if (!match) {
        if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
            match = TRUE;       /* Everything above 255 matches */
        }
        else if (ANYOF_NONBITMAP(n)
                 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
                     || (utf8_target
                         && (c >=256
                             || (! (flags & ANYOF_LOCALE))
                             || (flags & ANYOF_IS_SYNTHETIC)))))
        {
            /* 'av' is filled in by regclass_swash() through its last
             * argument; it is NULL when the node has no such list.  It is
             * used below as the list of multi-char folds. */
            AV *av;
            SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);

            if (sw) {
                /* The swash is looked up with UTF-8, so for a non-UTF-8
                 * target a temporary UTF-8 copy is made here; it is freed
                 * at the end of this block */
                U8 * utf8_p;
                if (utf8_target) {
                    utf8_p = (U8 *) p;
                } else {

                    /* Not utf8.  Convert as much of the string as available up
                     * to the limit of how far the (single) character in the
                     * pattern can possibly match (no need to go further).  If
                     * the node is a straight ANYOF or not folding, it can't
                     * match more than one.  Otherwise, it can match up to how
                     * far a single char can fold to.  Since not utf8, each
                     * character is a single byte, so the max it can be in
                     * bytes is the same as the max it can be in characters */
                    STRLEN len = (OP(n) == ANYOF
                                  || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
                                  ? 1
                                  : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
                                    ? maxlen
                                    : UTF8_MAX_FOLD_CHAR_EXPAND;
                    utf8_p = bytes_to_utf8(p, &len);
                }

                if (swash_fetch(sw, utf8_p, TRUE))
                    match = TRUE;
                else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {

                    /* Here, we need to test if the fold of the target string
                     * matches.  The non-multi char folds have all been moved to
                     * the compilation phase, and the multi-char folds have
                     * been stored by regcomp into 'av'; we linearly check to
                     * see if any match the target string (folded).   We know
                     * that the originals were each one character, but we don't
                     * currently know how many characters/bytes each folded to,
                     * except we do know that there are small limits imposed by
                     * Unicode.  XXX A performance enhancement would be to have
                     * regcomp.c store the max number of chars/bytes that are
                     * in an av entry, as, say the 0th element.  Even better
                     * would be to have a hash of the few characters that can
                     * start a multi-char fold to the max number of chars of
                     * those folds.
                     *
                     * If there is a match, we will need to advance (if lenp is
                     * specified) the match pointer in the target string.  But
                     * what we are comparing here isn't that string directly,
                     * but its fold, whose length may differ from the original.
                     * As we go along in constructing the fold, therefore, we
                     * create a map so that we know how many bytes in the
                     * source to advance given that we have matched a certain
                     * number of bytes in the fold.  This map is stored in
                     * 'map_fold_len_back'.  Let n mean the number of bytes in
                     * the fold of the first character that we are folding.
                     * Then map_fold_len_back[n] is set to the number of bytes
                     * in that first character.  Similarly let m be the
                     * corresponding number for the second character to be
                     * folded.  Then map_fold_len_back[n+m] is set to the
                     * number of bytes occupied by the first two source
                     * characters. ... */
                    U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
                    U8 folded[UTF8_MAXBYTES_CASE+1];
                    STRLEN foldlen = 0; /* num bytes in fold of 1st char */
                    STRLEN total_foldlen = 0; /* num bytes in fold of all
                                                  chars */

                    if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {

                        /* Here, only need to fold the first char of the target
                         * string.  If the source wasn't utf8, it is 1 byte
                         * long */
                        to_utf8_fold(utf8_p, folded, &foldlen);
                        total_foldlen = foldlen;
                        map_fold_len_back[foldlen] = (utf8_target)
                                                     ? UTF8SKIP(utf8_p)
                                                     : 1;
                    }
                    else {

                        /* Here, need to fold more than the first char.  Do so
                         * up to the limits */
                        U8* source_ptr = utf8_p;    /* The source for the fold
                                                       is the regex target
                                                       string */
                        U8* folded_ptr = folded;
                        /* NOTE(review): for a non-UTF-8 target, utf8_p is the
                         * converted copy made above, whereas maxlen was
                         * derived from the byte count of the original p --
                         * confirm that this bound is the intended one */
                        U8* e = utf8_p + maxlen;    /* Can't go beyond last
                                                       available byte in the
                                                       target string */
                        U8 i;
                        for (i = 0;
                             i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
                             i++)
                        {

                            /* Fold the next character */
                            U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
                            STRLEN this_char_foldlen;
                            to_utf8_fold(source_ptr,
                                         this_char_folded,
                                         &this_char_foldlen);

                            /* Bail if it would exceed the byte limit for
                             * folding a single char. */
                            if (this_char_foldlen + folded_ptr - folded >
                                                            UTF8_MAXBYTES_CASE)
                            {
                                break;
                            }

                            /* Add the fold of this character */
                            Copy(this_char_folded,
                                 folded_ptr,
                                 this_char_foldlen,
                                 U8);
                            source_ptr += UTF8SKIP(source_ptr);
                            folded_ptr += this_char_foldlen;
                            total_foldlen = folded_ptr - folded;

                            /* Create map from the number of bytes in the fold
                             * back to the number of bytes in the source.  If
                             * the source isn't utf8, the byte count is just
                             * the number of characters so far */
                            map_fold_len_back[total_foldlen]
                                                      = (utf8_target)
                                                        ? source_ptr - utf8_p
                                                        : i + 1;
                        }
                        /* The check inside the loop keeps the folded text to
                         * at most UTF8_MAXBYTES_CASE bytes, so the one extra
                         * byte of 'folded' is there for this terminator */
                        *folded_ptr = '\0';
                    }


                    /* Do the linear search to see if the fold is in the list
                     * of multi-char folds. */
                    if (av) {
                        I32 i;
                        for (i = 0; i <= av_len(av); i++) {
                            SV* const sv = *av_fetch(av, i, FALSE);
                            STRLEN len;
                            const char * const s = SvPV_const(sv, len);

                            /* A list entry matches when it is a prefix of
                             * the folded target text */
                            if (len <= total_foldlen && memEQ(s,
                                                               (char*)folded,
                                                               len))
                            {

                                /* Advance the target string ptr to account for
                                 * this fold, but have to translate from the
                                 * folded length to the corresponding source
                                 * length. */
                                if (lenp) {
                                    *lenp = map_fold_len_back[len];
                                    assert(*lenp != 0); /* Otherwise will loop */
                                }
                                match = TRUE;
                                break;
                            }
                        }
                    }
                }

                /* If we allocated a string above, free it */
                if (! utf8_target) Safefree(utf8_p);
            }
        }
    }

    /* An inverted class reverses the sense of whatever was found above */
    return (flags & ANYOF_INVERT) ? !match : match;
}
6770
6771 STATIC U8 *
6772 S_reghop3(U8 *s, I32 off, const U8* lim)
6773 {
6774     dVAR;
6775
6776     PERL_ARGS_ASSERT_REGHOP3;
6777
6778     if (off >= 0) {
6779         while (off-- && s < lim) {
6780             /* XXX could check well-formedness here */
6781             s += UTF8SKIP(s);
6782         }
6783     }
6784     else {
6785         while (off++ && s > lim) {
6786             s--;
6787             if (UTF8_IS_CONTINUED(*s)) {
6788                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6789                     s--;
6790             }
6791             /* XXX could check well-formedness here */
6792         }
6793     }
6794     return s;
6795 }
6796
6797 #ifdef XXX_dmq
/* There are a bunch of places where we use two reghop3's that should
   be replaced with this routine, but since that's not done yet
   we ifdef it out - dmq
*/
6802 STATIC U8 *
6803 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6804 {
6805     dVAR;
6806
6807     PERL_ARGS_ASSERT_REGHOP4;
6808
6809     if (off >= 0) {
6810         while (off-- && s < rlim) {
6811             /* XXX could check well-formedness here */
6812             s += UTF8SKIP(s);
6813         }
6814     }
6815     else {
6816         while (off++ && s > llim) {
6817             s--;
6818             if (UTF8_IS_CONTINUED(*s)) {
6819                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6820                     s--;
6821             }
6822             /* XXX could check well-formedness here */
6823         }
6824     }
6825     return s;
6826 }
6827 #endif
6828
6829 STATIC U8 *
6830 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6831 {
6832     dVAR;
6833
6834     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6835
6836     if (off >= 0) {
6837         while (off-- && s < lim) {
6838             /* XXX could check well-formedness here */
6839             s += UTF8SKIP(s);
6840         }
6841         if (off >= 0)
6842             return NULL;
6843     }
6844     else {
6845         while (off++ && s > lim) {
6846             s--;
6847             if (UTF8_IS_CONTINUED(*s)) {
6848                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6849                     s--;
6850             }
6851             /* XXX could check well-formedness here */
6852         }
6853         if (off <= 0)
6854             return NULL;
6855     }
6856     return s;
6857 }
6858
6859 static void
6860 restore_pos(pTHX_ void *arg)
6861 {
6862     dVAR;
6863     regexp * const rex = (regexp *)arg;
6864     if (PL_reg_eval_set) {
6865         if (PL_reg_oldsaved) {
6866             rex->subbeg = PL_reg_oldsaved;
6867             rex->sublen = PL_reg_oldsavedlen;
6868 #ifdef PERL_OLD_COPY_ON_WRITE
6869             rex->saved_copy = PL_nrs;
6870 #endif
6871             RXp_MATCH_COPIED_on(rex);
6872         }
6873         PL_reg_magic->mg_len = PL_reg_oldpos;
6874         PL_reg_eval_set = 0;
6875         PL_curpm = PL_reg_oldcurpm;
6876     }
6877 }
6878
6879 STATIC void
6880 S_to_utf8_substr(pTHX_ register regexp *prog)
6881 {
6882     int i = 1;
6883
6884     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6885
6886     do {
6887         if (prog->substrs->data[i].substr
6888             && !prog->substrs->data[i].utf8_substr) {
6889             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6890             prog->substrs->data[i].utf8_substr = sv;
6891             sv_utf8_upgrade(sv);
6892             if (SvVALID(prog->substrs->data[i].substr)) {
6893                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6894                 if (flags & FBMcf_TAIL) {
6895                     /* Trim the trailing \n that fbm_compile added last
6896                        time.  */
6897                     SvCUR_set(sv, SvCUR(sv) - 1);
6898                     /* Whilst this makes the SV technically "invalid" (as its
6899                        buffer is no longer followed by "\0") when fbm_compile()
6900                        adds the "\n" back, a "\0" is restored.  */
6901                 }
6902                 fbm_compile(sv, flags);
6903             }
6904             if (prog->substrs->data[i].substr == prog->check_substr)
6905                 prog->check_utf8 = sv;
6906         }
6907     } while (i--);
6908 }
6909
6910 STATIC void
6911 S_to_byte_substr(pTHX_ register regexp *prog)
6912 {
6913     dVAR;
6914     int i = 1;
6915
6916     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6917
6918     do {
6919         if (prog->substrs->data[i].utf8_substr
6920             && !prog->substrs->data[i].substr) {
6921             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6922             if (sv_utf8_downgrade(sv, TRUE)) {
6923                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6924                     const U8 flags
6925                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6926                     if (flags & FBMcf_TAIL) {
6927                         /* Trim the trailing \n that fbm_compile added last
6928                            time.  */
6929                         SvCUR_set(sv, SvCUR(sv) - 1);
6930                     }
6931                     fbm_compile(sv, flags);
6932                 }
6933             } else {
6934                 SvREFCNT_dec(sv);
6935                 sv = &PL_sv_undef;
6936             }
6937             prog->substrs->data[i].substr = sv;
6938             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6939                 prog->check_substr = sv;
6940         }
6941     } while (i--);
6942 }
6943
6944 /*
6945  * Local variables:
6946  * c-indentation-style: bsd
6947  * c-basic-offset: 4
6948  * indent-tabs-mode: t
6949  * End:
6950  *
6951  * ex: set ts=8 sts=4 sw=4 noet:
6952  */