src/5014002/orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1       /* tainted information used? e.g. locale */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  98  * call if there are no complications: i.e., if everything matchable is
  99  * straight forward in the bitmap */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 127
 128 /* Doesn't do an assert to verify that is correct */
 129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 130     if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 131
 132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 135
 136 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 137         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 138         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 139         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 140             * assert should likely and hopefully fail on an EBCDIC machine */ \
 141         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 142                                                                             \
 143         /* No asserts are done for these, in case called on an early        \
 144             * Unicode version in which they map to nothing */               \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 152
 153 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 154
 155 /* The actual code for CCC_TRY, which uses several variables from the routine
 156  * it's callable from.  It is designed to be the bulk of a case statement.
 157  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 158  *      nextchr matches the class.
 159  * UTF8_TEST is the whole test string to use for utf8 targets
 160  * LOAD is what to use to test, and if not present to load in the swash for the
 161  *      class
 162  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 163  *      UTF8_TEST test.
 164  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 165  * utf8 and a variant, load the swash if necessary and test using the utf8
 166  * test.  Advance to the next character if test is ok, otherwise fail; If not
 167  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 168  * fails, or advance to the next character */
 169
 170 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 171     if (locinput >= PL_regeol) {                                              \
 172         sayNO;                                                                \
 173     }                                                                         \
 174     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 175         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 176         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 177             sayNO;                                                            \
 178         }                                                                     \
 179         locinput += PL_utf8skip[nextchr];                                     \
 180         nextchr = UCHARAT(locinput);                                          \
 181         break;                                                                \
 182     }                                                                         \
 183     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 184         sayNO;                                                                \
 185     }                                                                         \
 186     nextchr = UCHARAT(++locinput);                                            \
 187     break;
 188
 189 /* Handle the non-locale cases for a character class and its complement.  It
 190  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 191  * This is because that code fails when the test succeeds, so we want to have
 192  * the test fail so that the code succeeds.  The swash is stored in a
 193  * predictable PL_ place */
 194 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 195                            CLASS, STR)                                        \
 196     case NAME:                                                                \
 197         _CCC_TRY_CODE( !, FUNC,                                               \
 198                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 199                                             (U8*)locinput, TRUE)),            \
 200                           CLASS, STR)                                         \
 201     case NNAME:                                                               \
 202         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 203                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 204                                             (U8*)locinput, TRUE)),            \
 205                           CLASS, STR)                                         \
 206
 207 /* Generate the case statements for both locale and non-locale character
 208  * classes in regmatch for classes that don't have special unicode semantics.
 209  * Locales don't use an immediate swash, but an intermediary special locale
 210  * function that is called on the pointer to the current place in the input
 211  * string.  That function will resolve to needing the same swash.  One might
 212  * think that because we don't know what the locale will match, we shouldn't
 213  * check with the swash loading function that it loaded properly; ie, that we
 214  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 215  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 216  * irrelevant here */
 217 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 218                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 219                 NAMEA, NNAMEA, FUNCA,                                         \
 220                 CLASS, STR)                                                   \
 221     case NAMEL:                                                               \
 222         PL_reg_flags |= RF_tainted;                                           \
 223         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 224     case NNAMEL:                                                              \
 225         PL_reg_flags |= RF_tainted;                                           \
 226         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 227                        CLASS, STR)                                            \
 228     case NAMEA:                                                               \
 229         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 230             sayNO;                                                            \
 231         }                                                                     \
 232         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 233         nextchr = UCHARAT(++locinput);                                        \
 234         break;                                                                \
 235     case NNAMEA:                                                              \
 236         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 237             sayNO;                                                            \
 238         }                                                                     \
 239         if (utf8_target) {                                                    \
 240             locinput += PL_utf8skip[nextchr];                                 \
 241             nextchr = UCHARAT(locinput);                                      \
 242         }                                                                     \
 243         else {                                                                \
 244             nextchr = UCHARAT(++locinput);                                    \
 245         }                                                                     \
 246         break;                                                                \
 247     /* Generate the non-locale cases */                                       \
 248     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 249
 250 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 251  * statements to handle separate Unicode semantics nodes */
 252 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 253                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 254                   NAMEU, NNAMEU, FUNCU,                                        \
 255                   NAMEA, NNAMEA, FUNCA,                                        \
 256                   CLASS, STR)                                                  \
 257     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 258             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 259             NAMEA, NNAMEA, FUNCA,                                              \
 260             CLASS, STR)                                                        \
 261     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 262
 263 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 264
 265 /* for use after a quantifier and before an EXACT-like node -- japhy */
 266 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 267  *
 268  * NOTE that *nothing* that affects backtracking should be in here, specifically
 269  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 270  * node that is in between two EXACT like nodes when ascertaining what the required
 271  * "follow" character is. This should probably be moved to regex compile time
 272  * although it may be done at run time beause of the REF possibility - more
 273  * investigation required. -- demerphq
 274 */
 275 #define JUMPABLE(rn) (      \
 276     OP(rn) == OPEN ||       \
 277     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 278     OP(rn) == EVAL ||   \
 279     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 280     OP(rn) == PLUS || OP(rn) == MINMOD || \
 281     OP(rn) == KEEPS || \
 282     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 283 )
 284 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 285
 286 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 287
 288 #if 0
 289 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 290    we don't need this definition. */
 291 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 292 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 293 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 294
 295 #else
 296 /* ... so we use this as its faster. */
 297 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 298 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 299 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 300 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 301
 302 #endif
 303
 304 /*
 305   Search for mandatory following text node; for lookahead, the text must
 306   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 307 */
 308 #define FIND_NEXT_IMPT(rn) STMT_START { \
 309     while (JUMPABLE(rn)) { \
 310         const OPCODE type = OP(rn); \
 311         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 312             rn = NEXTOPER(NEXTOPER(rn)); \
 313         else if (type == PLUS) \
 314             rn = NEXTOPER(rn); \
 315         else if (type == IFMATCH) \
 316             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 317         else rn += NEXT_OFF(rn); \
 318     } \
 319 } STMT_END
 320
 321
 322 static void restore_pos(pTHX_ void *arg);
 323
 324 #define REGCP_PAREN_ELEMS 4
 325 #define REGCP_OTHER_ELEMS 5
 326 #define REGCP_FRAME_ELEMS 1
 327 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 328  * are needed for the regexp context stack bookkeeping. */
 329
 330 STATIC CHECKPOINT
 331 S_regcppush(pTHX_ I32 parenfloor)
 332 {
 333     dVAR;
 334     const int retval = PL_savestack_ix;
 335     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 336     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 337     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 338     int p;
 339     GET_RE_DEBUG_FLAGS_DECL;
 340
 341     if (paren_elems_to_push < 0)
 342         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 343
 344     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 345         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 346                    " out of range (%lu-%ld)",
 347                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 348
 349     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 350
 351     for (p = PL_regsize; p > parenfloor; p--) {
 352 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 353         SSPUSHINT(PL_regoffs[p].end);
 354         SSPUSHINT(PL_regoffs[p].start);
 355         SSPUSHPTR(PL_reg_start_tmp[p]);
 356         SSPUSHINT(p);
 357         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 358           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 359                       (UV)p, (IV)PL_regoffs[p].start,
 360                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 361                       (IV)PL_regoffs[p].end
 362         ));
 363     }
 364 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 365     SSPUSHPTR(PL_regoffs);
 366     SSPUSHINT(PL_regsize);
 367     SSPUSHINT(*PL_reglastparen);
 368     SSPUSHINT(*PL_reglastcloseparen);
 369     SSPUSHPTR(PL_reginput);
 370     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 371
 372     return retval;
 373 }
 374
 375 /* These are needed since we do not localize EVAL nodes: */
 376 #define REGCP_SET(cp)                                           \
 377     DEBUG_STATE_r(                                              \
 378             PerlIO_printf(Perl_debug_log,                       \
 379                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 380                 (IV)PL_savestack_ix));                          \
 381     cp = PL_savestack_ix
 382
 383 #define REGCP_UNWIND(cp)                                        \
 384     DEBUG_STATE_r(                                              \
 385         if (cp != PL_savestack_ix)                              \
 386             PerlIO_printf(Perl_debug_log,                       \
 387                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 388                 (IV)(cp), (IV)PL_savestack_ix));                \
 389     regcpblow(cp)
 390
 391 STATIC char *
 392 S_regcppop(pTHX_ const regexp *rex)
 393 {
 394     dVAR;
 395     UV i;
 396     char *input;
 397     GET_RE_DEBUG_FLAGS_DECL;
 398
 399     PERL_ARGS_ASSERT_REGCPPOP;
 400
 401     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 402     i = SSPOPUV;
 403     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 404     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 405     input = (char *) SSPOPPTR;
 406     *PL_reglastcloseparen = SSPOPINT;
 407     *PL_reglastparen = SSPOPINT;
 408     PL_regsize = SSPOPINT;
 409     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 410
 411     i -= REGCP_OTHER_ELEMS;
 412     /* Now restore the parentheses context. */
 413     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 414         I32 tmps;
 415         U32 paren = (U32)SSPOPINT;
 416         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 417         PL_regoffs[paren].start = SSPOPINT;
 418         tmps = SSPOPINT;
 419         if (paren <= *PL_reglastparen)
 420             PL_regoffs[paren].end = tmps;
 421         DEBUG_BUFFERS_r(
 422             PerlIO_printf(Perl_debug_log,
 423                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 424                           (UV)paren, (IV)PL_regoffs[paren].start,
 425                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 426                           (IV)PL_regoffs[paren].end,
 427                           (paren > *PL_reglastparen ? "(no)" : ""));
 428         );
 429     }
 430     DEBUG_BUFFERS_r(
 431         if (*PL_reglastparen + 1 <= rex->nparens) {
 432             PerlIO_printf(Perl_debug_log,
 433                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 434                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 435         }
 436     );
 437 #if 1
 438     /* It would seem that the similar code in regtry()
 439      * already takes care of this, and in fact it is in
 440      * a better location to since this code can #if 0-ed out
 441      * but the code in regtry() is needed or otherwise tests
 442      * requiring null fields (pat.t#187 and split.t#{13,14}
 443      * (as of patchlevel 7877)  will fail.  Then again,
 444      * this code seems to be necessary or otherwise
 445      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 446      * --jhi updated by dapm */
 447     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 448         if (i > PL_regsize)
 449             PL_regoffs[i].start = -1;
 450         PL_regoffs[i].end = -1;
 451     }
 452 #endif
 453     return input;
 454 }
 455
 456 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 457
 458 /*
 459  * pregexec and friends
 460  */
 461
 462 #ifndef PERL_IN_XSUB_RE
 463 /*
 464  - pregexec - match a regexp against a string
 465  */
 466 I32
 467 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 468          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 469 /* strend: pointer to null at end of string */
 470 /* strbeg: real beginning of string */
 471 /* minend: end of match must be >=minend after stringarg. */
 472 /* nosave: For optimizations. */
 473 {
 474     PERL_ARGS_ASSERT_PREGEXEC;
 475
 476     return
 477         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 478                       nosave ? 0 : REXEC_COPY_STR);
 479 }
 480 #endif
 481
 482 /*
 483  * Need to implement the following flags for reg_anch:
 484  *
 485  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 486  * USE_INTUIT_ML
 487  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 488  * INTUIT_AUTORITATIVE_ML
 489  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 490  * INTUIT_ONCE_ML
 491  *
 492  * Another flag for this function: SECOND_TIME (so that float substrs
 493  * with giant delta may be not rechecked).
 494  */
 495
 496 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 497
 498 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 499    Otherwise, only SvCUR(sv) is used to get strbeg. */
 500
 501 /* XXXX We assume that strpos is strbeg unless sv. */
 502
 503 /* XXXX Some places assume that there is a fixed substring.
 504         An update may be needed if optimizer marks as "INTUITable"
 505         RExen without fixed substrings.  Similarly, it is assumed that
 506         lengths of all the strings are no more than minlen, thus they
 507         cannot come from lookahead.
 508         (Or minlen should take into account lookahead.)
 509   NOTE: Some of this comment is not correct. minlen does now take account
 510   of lookahead/behind. Further research is required. -- demerphq
 511
 512 */
 513
 514 /* A failure to find a constant substring means that there is no need to make
 515    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 516    finding a substring too deep into the string means that less calls to
 517    regtry() should be needed.
 518
 519    REx compiler's optimizer found 4 possible hints:
 520         a) Anchored substring;
 521         b) Fixed substring;
 522         c) Whether we are anchored (beginning-of-line or \G);
 523         d) First node (of those at offset 0) which may distinguish positions;
 524    We use a)b)d) and multiline-part of c), and try to find a position in the
 525    string which does not contradict any of them.
 526  */
 527
 528 /* Most of decisions we do here should have been done at compile time.
 529    The nodes of the REx which we used for the search should have been
 530    deleted from the finite automaton. */
 531
 532 char *
 533 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 534                      char *strend, const U32 flags, re_scream_pos_data *data)
 535 {
 536     dVAR;
 537     struct regexp *const prog = (struct regexp *)SvANY(rx);
 538     register I32 start_shift = 0;
 539     /* Should be nonnegative! */
 540     register I32 end_shift   = 0;
 541     register char *s;
 542     register SV *check;
 543     char *strbeg;
 544     char *t;
 545     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 546     I32 ml_anch;
 547     register char *other_last = NULL;   /* other substr checked before this */
 548     char *check_at = NULL;              /* check substr found at this pos */
 549     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 550     RXi_GET_DECL(prog,progi);
 551 #ifdef DEBUGGING
 552     const char * const i_strpos = strpos;
 553 #endif
 554     GET_RE_DEBUG_FLAGS_DECL;
 555
 556     PERL_ARGS_ASSERT_RE_INTUIT_START;
 557
 558     RX_MATCH_UTF8_set(rx,utf8_target);
 559
 560     if (RX_UTF8(rx)) {
 561         PL_reg_flags |= RF_utf8;
 562     }
 563     DEBUG_EXECUTE_r(
 564         debug_start_match(rx, utf8_target, strpos, strend,
 565             sv ? "Guessing start of match in sv for"
 566                : "Guessing start of match in string for");
 567               );
 568
 569     /* CHR_DIST() would be more correct here but it makes things slow. */
 570     if (prog->minlen > strend - strpos) {
 571         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 572                               "String too short... [re_intuit_start]\n"));
 573         goto fail;
 574     }
 575
 576     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 577     PL_regeol = strend;
 578     if (utf8_target) {
 579         if (!prog->check_utf8 && prog->check_substr)
 580             to_utf8_substr(prog);
 581         check = prog->check_utf8;
 582     } else {
 583         if (!prog->check_substr && prog->check_utf8)
 584             to_byte_substr(prog);
 585         check = prog->check_substr;
 586     }
 587     if (check == &PL_sv_undef) {
 588         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 589                 "Non-utf8 string cannot match utf8 check string\n"));
 590         goto fail;
 591     }
 592     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 593         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 594                      || ( (prog->extflags & RXf_ANCH_BOL)
 595                           && !multiline ) );    /* Check after \n? */
 596
 597         if (!ml_anch) {
 598           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 599                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 600                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 601                && sv && !SvROK(sv)
 602                && (strpos != strbeg)) {
 603               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 604               goto fail;
 605           }
 606           if (prog->check_offset_min == prog->check_offset_max &&
 607               !(prog->extflags & RXf_CANY_SEEN)) {
 608             /* Substring at constant offset from beg-of-str... */
 609             I32 slen;
 610
 611             s = HOP3c(strpos, prog->check_offset_min, strend);
 612
 613             if (SvTAIL(check)) {
 614                 slen = SvCUR(check);    /* >= 1 */
 615
 616                 if ( strend - s > slen || strend - s < slen - 1
 617                      || (strend - s == slen && strend[-1] != '\n')) {
 618                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 619                     goto fail_finish;
 620                 }
 621                 /* Now should match s[0..slen-2] */
 622                 slen--;
 623                 if (slen && (*SvPVX_const(check) != *s
 624                              || (slen > 1
 625                                  && memNE(SvPVX_const(check), s, slen)))) {
 626                   report_neq:
 627                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 628                     goto fail_finish;
 629                 }
 630             }
 631             else if (*SvPVX_const(check) != *s
 632                      || ((slen = SvCUR(check)) > 1
 633                          && memNE(SvPVX_const(check), s, slen)))
 634                 goto report_neq;
 635             check_at = s;
 636             goto success_at_start;
 637           }
 638         }
 639         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 640         s = strpos;
 641         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 642         end_shift = prog->check_end_shift;
 643
 644         if (!ml_anch) {
 645             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 646                                          - (SvTAIL(check) != 0);
 647             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 648
 649             if (end_shift < eshift)
 650                 end_shift = eshift;
 651         }
 652     }
 653     else {                              /* Can match at random position */
 654         ml_anch = 0;
 655         s = strpos;
 656         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 657         end_shift = prog->check_end_shift;
 658
 659         /* end shift should be non negative here */
 660     }
 661
 662 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 663     if (end_shift < 0)
 664         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 665                    (IV)end_shift, RX_PRECOMP(prog));
 666 #endif
 667
 668   restart:
 669     /* Find a possible match in the region s..strend by looking for
 670        the "check" substring in the region corrected by start/end_shift. */
 671
 672     {
 673         I32 srch_start_shift = start_shift;
 674         I32 srch_end_shift = end_shift;
 675         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 676             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 677             srch_start_shift = strbeg - s;
 678         }
 679     DEBUG_OPTIMISE_MORE_r({
 680         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 681             (IV)prog->check_offset_min,
 682             (IV)srch_start_shift,
 683             (IV)srch_end_shift,
 684             (IV)prog->check_end_shift);
 685     });
 686
 687     if (flags & REXEC_SCREAM) {
 688         I32 p = -1;                     /* Internal iterator of scream. */
 689         I32 * const pp = data ? data->scream_pos : &p;
 690
 691         if (PL_screamfirst[BmRARE(check)] >= 0
 692             || ( BmRARE(check) == '\n'
 693                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 694                  && SvTAIL(check) ))
 695             s = screaminstr(sv, check,
 696                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 697         else
 698             goto fail_finish;
 699         /* we may be pointing at the wrong string */
 700         if (s && RXp_MATCH_COPIED(prog))
 701             s = strbeg + (s - SvPVX_const(sv));
 702         if (data)
 703             *data->scream_olds = s;
 704     }
 705     else {
 706         U8* start_point;
 707         U8* end_point;
 708         if (prog->extflags & RXf_CANY_SEEN) {
 709             start_point= (U8*)(s + srch_start_shift);
 710             end_point= (U8*)(strend - srch_end_shift);
 711         } else {
 712             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 713             end_point= HOP3(strend, -srch_end_shift, strbeg);
 714         }
 715         DEBUG_OPTIMISE_MORE_r({
 716             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 717                 (int)(end_point - start_point),
 718                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 719                 start_point);
 720         });
 721
 722         s = fbm_instr( start_point, end_point,
 723                       check, multiline ? FBMrf_MULTILINE : 0);
 724     }
 725     }
 726     /* Update the count-of-usability, remove useless subpatterns,
 727         unshift s.  */
 728
 729     DEBUG_EXECUTE_r({
 730         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 731             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 732         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 733                           (s ? "Found" : "Did not find"),
 734             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 735                 ? "anchored" : "floating"),
 736             quoted,
 737             RE_SV_TAIL(check),
 738             (s ? " at offset " : "...\n") );
 739     });
 740
 741     if (!s)
 742         goto fail_finish;
 743     /* Finish the diagnostic message */
 744     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 745
 746     /* XXX dmq: first branch is for positive lookbehind...
 747        Our check string is offset from the beginning of the pattern.
 748        So we need to do any stclass tests offset forward from that
 749        point. I think. :-(
 750      */
 751
 752
 753
 754     check_at=s;
 755
 756
 757     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 758        Start with the other substr.
 759        XXXX no SCREAM optimization yet - and a very coarse implementation
 760        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 761                 *always* match.  Probably should be marked during compile...
 762        Probably it is right to do no SCREAM here...
 763      */
 764
 765     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 766                 : (prog->float_substr && prog->anchored_substr))
 767     {
 768         /* Take into account the "other" substring. */
 769         /* XXXX May be hopelessly wrong for UTF... */
 770         if (!other_last)
 771             other_last = strpos;
 772         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 773           do_other_anchored:
 774             {
 775                 char * const last = HOP3c(s, -start_shift, strbeg);
 776                 char *last1, *last2;
 777                 char * const saved_s = s;
 778                 SV* must;
 779
 780                 t = s - prog->check_offset_max;
 781                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 782                     && (!utf8_target
 783                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 784                             && t > strpos)))
 785                     NOOP;
 786                 else
 787                     t = strpos;
 788                 t = HOP3c(t, prog->anchored_offset, strend);
 789                 if (t < other_last)     /* These positions already checked */
 790                     t = other_last;
 791                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 792                 if (last < last1)
 793                     last1 = last;
 794                 /* XXXX It is not documented what units *_offsets are in.
 795                    We assume bytes, but this is clearly wrong.
 796                    Meaning this code needs to be carefully reviewed for errors.
 797                    dmq.
 798                   */
 799
 800                 /* On end-of-str: see comment below. */
 801                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 802                 if (must == &PL_sv_undef) {
 803                     s = (char*)NULL;
 804                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 805                 }
 806                 else
 807                     s = fbm_instr(
 808                         (unsigned char*)t,
 809                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 810                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 811                         must,
 812                         multiline ? FBMrf_MULTILINE : 0
 813                     );
 814                 DEBUG_EXECUTE_r({
 815                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 816                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 817                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 818                         (s ? "Found" : "Contradicts"),
 819                         quoted, RE_SV_TAIL(must));
 820                 });
 821
 822
 823                 if (!s) {
 824                     if (last1 >= last2) {
 825                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 826                                                 ", giving up...\n"));
 827                         goto fail_finish;
 828                     }
 829                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 830                         ", trying floating at offset %ld...\n",
 831                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 832                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 833                     s = HOP3c(last, 1, strend);
 834                     goto restart;
 835                 }
 836                 else {
 837                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 838                           (long)(s - i_strpos)));
 839                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 840                     other_last = HOP3c(s, 1, strend);
 841                     s = saved_s;
 842                     if (t == strpos)
 843                         goto try_at_start;
 844                     goto try_at_offset;
 845                 }
 846             }
 847         }
 848         else {          /* Take into account the floating substring. */
 849             char *last, *last1;
 850             char * const saved_s = s;
 851             SV* must;
 852
 853             t = HOP3c(s, -start_shift, strbeg);
 854             last1 = last =
 855                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 856             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 857                 last = HOP3c(t, prog->float_max_offset, strend);
 858             s = HOP3c(t, prog->float_min_offset, strend);
 859             if (s < other_last)
 860                 s = other_last;
 861  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 862             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 863             /* fbm_instr() takes into account exact value of end-of-str
 864                if the check is SvTAIL(ed).  Since false positives are OK,
 865                and end-of-str is not later than strend we are OK. */
 866             if (must == &PL_sv_undef) {
 867                 s = (char*)NULL;
 868                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 869             }
 870             else
 871                 s = fbm_instr((unsigned char*)s,
 872                               (unsigned char*)last + SvCUR(must)
 873                                   - (SvTAIL(must)!=0),
 874                               must, multiline ? FBMrf_MULTILINE : 0);
 875             DEBUG_EXECUTE_r({
 876                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 877                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 878                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 879                     (s ? "Found" : "Contradicts"),
 880                     quoted, RE_SV_TAIL(must));
 881             });
 882             if (!s) {
 883                 if (last1 == last) {
 884                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 885                                             ", giving up...\n"));
 886                     goto fail_finish;
 887                 }
 888                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 889                     ", trying anchored starting at offset %ld...\n",
 890                     (long)(saved_s + 1 - i_strpos)));
 891                 other_last = last;
 892                 s = HOP3c(t, 1, strend);
 893                 goto restart;
 894             }
 895             else {
 896                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 897                       (long)(s - i_strpos)));
 898                 other_last = s; /* Fix this later. --Hugo */
 899                 s = saved_s;
 900                 if (t == strpos)
 901                     goto try_at_start;
 902                 goto try_at_offset;
 903             }
 904         }
 905     }
 906
 907
 908     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 909
 910     DEBUG_OPTIMISE_MORE_r(
 911         PerlIO_printf(Perl_debug_log,
 912             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 913             (IV)prog->check_offset_min,
 914             (IV)prog->check_offset_max,
 915             (IV)(s-strpos),
 916             (IV)(t-strpos),
 917             (IV)(t-s),
 918             (IV)(strend-strpos)
 919         )
 920     );
 921
 922     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 923         && (!utf8_target
 924             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 925                  && t > strpos)))
 926     {
 927         /* Fixed substring is found far enough so that the match
 928            cannot start at strpos. */
 929       try_at_offset:
 930         if (ml_anch && t[-1] != '\n') {
 931             /* Eventually fbm_*() should handle this, but often
 932                anchored_offset is not 0, so this check will not be wasted. */
 933             /* XXXX In the code below we prefer to look for "^" even in
 934                presence of anchored substrings.  And we search even
 935                beyond the found float position.  These pessimizations
 936                are historical artefacts only.  */
 937           find_anchor:
 938             while (t < strend - prog->minlen) {
 939                 if (*t == '\n') {
 940                     if (t < check_at - prog->check_offset_min) {
 941                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 942                             /* Since we moved from the found position,
 943                                we definitely contradict the found anchored
 944                                substr.  Due to the above check we do not
 945                                contradict "check" substr.
 946                                Thus we can arrive here only if check substr
 947                                is float.  Redo checking for "other"=="fixed".
 948                              */
 949                             strpos = t + 1;
 950                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 951                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 952                             goto do_other_anchored;
 953                         }
 954                         /* We don't contradict the found floating substring. */
 955                         /* XXXX Why not check for STCLASS? */
 956                         s = t + 1;
 957                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 958                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 959                         goto set_useful;
 960                     }
 961                     /* Position contradicts check-string */
 962                     /* XXXX probably better to look for check-string
 963                        than for "\n", so one should lower the limit for t? */
 964                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 965                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 966                     other_last = strpos = s = t + 1;
 967                     goto restart;
 968                 }
 969                 t++;
 970             }
 971             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 972                         PL_colors[0], PL_colors[1]));
 973             goto fail_finish;
 974         }
 975         else {
 976             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 977                         PL_colors[0], PL_colors[1]));
 978         }
 979         s = t;
 980       set_useful:
 981         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 982     }
 983     else {
 984         /* The found string does not prohibit matching at strpos,
 985            - no optimization of calling REx engine can be performed,
 986            unless it was an MBOL and we are not after MBOL,
 987            or a future STCLASS check will fail this. */
 988       try_at_start:
 989         /* Even in this situation we may use MBOL flag if strpos is offset
 990            wrt the start of the string. */
 991         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 992             && (strpos != strbeg) && strpos[-1] != '\n'
 993             /* May be due to an implicit anchor of m{.*foo}  */
 994             && !(prog->intflags & PREGf_IMPLICIT))
 995         {
 996             t = strpos;
 997             goto find_anchor;
 998         }
 999         DEBUG_EXECUTE_r( if (ml_anch)
1000             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1001                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1002         );
1003       success_at_start:
1004         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1005             && (utf8_target ? (
1006                 prog->check_utf8                /* Could be deleted already */
1007                 && --BmUSEFUL(prog->check_utf8) < 0
1008                 && (prog->check_utf8 == prog->float_utf8)
1009             ) : (
1010                 prog->check_substr              /* Could be deleted already */
1011                 && --BmUSEFUL(prog->check_substr) < 0
1012                 && (prog->check_substr == prog->float_substr)
1013             )))
1014         {
1015             /* If flags & SOMETHING - do not do it many times on the same match */
1016             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1017             /* XXX Does the destruction order has to change with utf8_target? */
1018             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1019             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1020             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1021             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1022             check = NULL;                       /* abort */
1023             s = strpos;
1024             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1025                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1026             if (prog->intflags & PREGf_IMPLICIT)
1027                 prog->extflags &= ~RXf_ANCH_MBOL;
1028             /* XXXX This is a remnant of the old implementation.  It
1029                     looks wasteful, since now INTUIT can use many
1030                     other heuristics. */
1031             prog->extflags &= ~RXf_USE_INTUIT;
1032             /* XXXX What other flags might need to be cleared in this branch? */
1033         }
1034         else
1035             s = strpos;
1036     }
1037
1038     /* Last resort... */
1039     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1040     /* trie stclasses are too expensive to use here, we are better off to
1041        leave it to regmatch itself */
1042     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1043         /* minlen == 0 is possible if regstclass is \b or \B,
1044            and the fixed substr is ''$.
1045            Since minlen is already taken into account, s+1 is before strend;
1046            accidentally, minlen >= 1 guaranties no false positives at s + 1
1047            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1048            regstclass does not come from lookahead...  */
1049         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1050            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1051         const U8* const str = (U8*)STRING(progi->regstclass);
1052         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1053                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1054                     : 1);
1055         char * endpos;
1056         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1057             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1058         else if (prog->float_substr || prog->float_utf8)
1059             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1060         else
1061             endpos= strend;
1062
1063         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1064                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1065
1066         t = s;
1067         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1068         if (!s) {
1069 #ifdef DEBUGGING
1070             const char *what = NULL;
1071 #endif
1072             if (endpos == strend) {
1073                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1074                                 "Could not match STCLASS...\n") );
1075                 goto fail;
1076             }
1077             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1078                                    "This position contradicts STCLASS...\n") );
1079             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1080                 goto fail;
1081             /* Contradict one of substrings */
1082             if (prog->anchored_substr || prog->anchored_utf8) {
1083                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1084                     DEBUG_EXECUTE_r( what = "anchored" );
1085                   hop_and_restart:
1086                     s = HOP3c(t, 1, strend);
1087                     if (s + start_shift + end_shift > strend) {
1088                         /* XXXX Should be taken into account earlier? */
1089                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1090                                                "Could not match STCLASS...\n") );
1091                         goto fail;
1092                     }
1093                     if (!check)
1094                         goto giveup;
1095                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1096                                 "Looking for %s substr starting at offset %ld...\n",
1097                                  what, (long)(s + start_shift - i_strpos)) );
1098                     goto restart;
1099                 }
1100                 /* Have both, check_string is floating */
1101                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1102                     goto retry_floating_check;
1103                 /* Recheck anchored substring, but not floating... */
1104                 s = check_at;
1105                 if (!check)
1106                     goto giveup;
1107                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1108                           "Looking for anchored substr starting at offset %ld...\n",
1109                           (long)(other_last - i_strpos)) );
1110                 goto do_other_anchored;
1111             }
1112             /* Another way we could have checked stclass at the
1113                current position only: */
1114             if (ml_anch) {
1115                 s = t = t + 1;
1116                 if (!check)
1117                     goto giveup;
1118                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119                           "Looking for /%s^%s/m starting at offset %ld...\n",
1120                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1121                 goto try_at_offset;
1122             }
1123             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1124                 goto fail;
1125             /* Check is floating substring. */
1126           retry_floating_check:
1127             t = check_at - start_shift;
1128             DEBUG_EXECUTE_r( what = "floating" );
1129             goto hop_and_restart;
1130         }
1131         if (t != s) {
1132             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1133                         "By STCLASS: moving %ld --> %ld\n",
1134                                   (long)(t - i_strpos), (long)(s - i_strpos))
1135                    );
1136         }
1137         else {
1138             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1139                                   "Does not contradict STCLASS...\n");
1140                    );
1141         }
1142     }
1143   giveup:
1144     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1145                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1146                           PL_colors[5], (long)(s - i_strpos)) );
1147     return s;
1148
1149   fail_finish:                          /* Substring not found */
1150     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1151         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1152   fail:
1153     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1154                           PL_colors[4], PL_colors[5]));
1155     return NULL;
1156 }
1157
1158 #define DECL_TRIE_TYPE(scan) \
1159     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1160                     trie_type = (scan->flags != EXACT) \
1161                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1162                               : (utf8_target ? trie_utf8 : trie_plain)
1163
1164 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1165 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1166     switch (trie_type) {                                                    \
1167     case trie_utf8_fold:                                                    \
1168         if ( foldlen>0 ) {                                                  \
1169             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1170             foldlen -= len;                                                 \
1171             uscan += len;                                                   \
1172             len=0;                                                          \
1173         } else {                                                            \
1174             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1175             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1176             foldlen -= UNISKIP( uvc );                                      \
1177             uscan = foldbuf + UNISKIP( uvc );                               \
1178         }                                                                   \
1179         break;                                                              \
1180     case trie_latin_utf8_fold:                                              \
1181         if ( foldlen>0 ) {                                                  \
1182             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1183             foldlen -= len;                                                 \
1184             uscan += len;                                                   \
1185             len=0;                                                          \
1186         } else {                                                            \
1187             len = 1;                                                        \
1188             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1189             foldlen -= UNISKIP( uvc );                                      \
1190             uscan = foldbuf + UNISKIP( uvc );                               \
1191         }                                                                   \
1192         break;                                                              \
1193     case trie_utf8:                                                         \
1194         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1195         break;                                                              \
1196     case trie_plain:                                                        \
1197         uvc = (UV)*uc;                                                      \
1198         len = 1;                                                            \
1199     }                                                                       \
1200     if (uvc < 256) {                                                        \
1201         charid = trie->charmap[ uvc ];                                      \
1202     }                                                                       \
1203     else {                                                                  \
1204         charid = 0;                                                         \
1205         if (widecharmap) {                                                  \
1206             SV** const svpp = hv_fetch(widecharmap,                         \
1207                         (char*)&uvc, sizeof(UV), 0);                        \
1208             if (svpp)                                                       \
1209                 charid = (U16)SvIV(*svpp);                                  \
1210         }                                                                   \
1211     }                                                                       \
1212 } STMT_END
1213
1214 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1215 STMT_START {                                              \
1216     while (s <= e) {                                      \
1217         if ( (CoNd)                                       \
1218              && (ln == 1 || folder(s, pat_string, ln))    \
1219              && (!reginfo || regtry(reginfo, &s)) )       \
1220             goto got_it;                                  \
1221         s++;                                              \
1222     }                                                     \
1223 } STMT_END
1224
1225 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1226 STMT_START {                                          \
1227     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1228         CoDe                                          \
1229         s += uskip;                                   \
1230     }                                                 \
1231 } STMT_END
1232
1233 #define REXEC_FBC_SCAN(CoDe)                          \
1234 STMT_START {                                          \
1235     while (s < strend) {                              \
1236         CoDe                                          \
1237         s++;                                          \
1238     }                                                 \
1239 } STMT_END
1240
1241 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1242 REXEC_FBC_UTF8_SCAN(                                  \
1243     if (CoNd) {                                       \
1244         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1245             goto got_it;                              \
1246         else                                          \
1247             tmp = doevery;                            \
1248     }                                                 \
1249     else                                              \
1250         tmp = 1;                                      \
1251 )
1252
1253 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1254 REXEC_FBC_SCAN(                                       \
1255     if (CoNd) {                                       \
1256         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1257             goto got_it;                              \
1258         else                                          \
1259             tmp = doevery;                            \
1260     }                                                 \
1261     else                                              \
1262         tmp = 1;                                      \
1263 )
1264
1265 #define REXEC_FBC_TRYIT               \
1266 if ((!reginfo || regtry(reginfo, &s))) \
1267     goto got_it
1268
1269 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1270     if (utf8_target) {                                             \
1271         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1272     }                                                          \
1273     else {                                                     \
1274         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1275     }
1276
1277 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1278     if (utf8_target) {                                             \
1279         UtFpReLoAd;                                            \
1280         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1281     }                                                          \
1282     else {                                                     \
1283         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1284     }
1285
1286 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1287     PL_reg_flags |= RF_tainted;                                \
1288     if (utf8_target) {                                             \
1289         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1290     }                                                          \
1291     else {                                                     \
1292         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1293     }
1294
1295 #define DUMP_EXEC_POS(li,s,doutf8) \
1296     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1297
1298
1299 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1300         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1301         tmp = TEST_NON_UTF8(tmp);                                              \
1302         REXEC_FBC_UTF8_SCAN(                                                   \
1303             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1304                 tmp = !tmp;                                                    \
1305                 IF_SUCCESS;                                                    \
1306             }                                                                  \
1307             else {                                                             \
1308                 IF_FAIL;                                                       \
1309             }                                                                  \
1310         );                                                                     \
1311
1312 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1313         if (s == PL_bostr) {                                                   \
1314             tmp = '\n';                                                        \
1315         }                                                                      \
1316         else {                                                                 \
1317             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1318             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1319         }                                                                      \
1320         tmp = TeSt1_UtF8;                                                      \
1321         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1322         REXEC_FBC_UTF8_SCAN(                                                   \
1323             if (tmp == ! (TeSt2_UtF8)) { \
1324                 tmp = !tmp;                                                    \
1325                 IF_SUCCESS;                                                    \
1326             }                                                                  \
1327             else {                                                             \
1328                 IF_FAIL;                                                       \
1329             }                                                                  \
1330         );                                                                     \
1331
1332 /* The only difference between the BOUND and NBOUND cases is that
1333  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1334  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1335  * with the other one being empty */
1336 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1337     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1338
1339 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1340     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1341
1342 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1343     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1344
1345 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1346     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1347
1348
1349 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1350  * be passed in completely with the variable name being tested, which isn't
1351  * such a clean interface, but this is easier to read than it was before.  We
1352  * are looking for the boundary (or non-boundary between a word and non-word
1353  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1354  * must be different.  Find the "wordness" of the character just prior to this
1355  * one, and compare it with the wordness of this one.  If they differ, we have
1356  * a boundary.  At the beginning of the string, pretend that the previous
1357  * character was a new-line */
1358 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1359     if (utf8_target) {                                                         \
1360                 UTF8_CODE \
1361     }                                                                          \
1362     else {  /* Not utf8 */                                                     \
1363         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1364         tmp = TEST_NON_UTF8(tmp);                                              \
1365         REXEC_FBC_SCAN(                                                        \
1366             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1367                 tmp = !tmp;                                                    \
1368                 IF_SUCCESS;                                                    \
1369             }                                                                  \
1370             else {                                                             \
1371                 IF_FAIL;                                                       \
1372             }                                                                  \
1373         );                                                                     \
1374     }                                                                          \
1375     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1376         goto got_it;
1377
1378 /* We know what class REx starts with.  Try to find this position... */
1379 /* if reginfo is NULL, its a dryrun */
1380 /* annoyingly all the vars in this routine have different names from their counterparts
1381    in regmatch. /grrr */
1382
1383 STATIC char *
1384 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1385     const char *strend, regmatch_info *reginfo)
1386 {
1387         dVAR;
1388         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1389         char *pat_string;   /* The pattern's exactish string */
1390         char *pat_end;      /* ptr to end char of pat_string */
1391         re_fold_t folder;       /* Function for computing non-utf8 folds */
1392         const U8 *fold_array;   /* array for folding ords < 256 */
1393         STRLEN ln;
1394         STRLEN lnc;
1395         register STRLEN uskip;
1396         U8 c1;
1397         U8 c2;
1398         char *e;
1399         register I32 tmp = 1;   /* Scratch variable? */
1400         register const bool utf8_target = PL_reg_match_utf8;
1401         UV utf8_fold_flags = 0;
1402         RXi_GET_DECL(prog,progi);
1403
1404         PERL_ARGS_ASSERT_FIND_BYCLASS;
1405
1406         /* We know what class it must start with. */
1407         switch (OP(c)) {
1408         case ANYOFV:
1409         case ANYOF:
1410             if (utf8_target || OP(c) == ANYOFV) {
1411                 STRLEN inclasslen = strend - s;
1412                 REXEC_FBC_UTF8_CLASS_SCAN(
1413                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1414             }
1415             else {
1416                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1417             }
1418             break;
1419         case CANY:
1420             REXEC_FBC_SCAN(
1421                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1422                     goto got_it;
1423                 else
1424                     tmp = doevery;
1425             );
1426             break;
1427
1428         case EXACTFA:
1429             if (UTF_PATTERN || utf8_target) {
1430                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1431                 goto do_exactf_utf8;
1432             }
1433             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1434             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1435             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1436
1437         case EXACTFU:
1438             if (UTF_PATTERN || utf8_target) {
1439                 utf8_fold_flags = 0;
1440                 goto do_exactf_utf8;
1441             }
1442             fold_array = PL_fold_latin1;
1443             folder = foldEQ_latin1;
1444             /* XXX This uses the full utf8 fold because if the pattern contains
1445              * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
1446              * There could be a new node type, say EXACTFU_SS, which is
1447              * generated by regcomp only if there is an 'ss', and then every
1448              * other case could goto do_exactf_non_utf8;*/
1449             goto do_exactf_utf8;
1450
1451         case EXACTF:
1452             if (UTF_PATTERN || utf8_target) {
1453                 utf8_fold_flags = 0;
1454                 goto do_exactf_utf8;
1455             }
1456             fold_array = PL_fold;
1457             folder = foldEQ;
1458             goto do_exactf_non_utf8;
1459
1460         case EXACTFL:
1461             if (UTF_PATTERN || utf8_target) {
1462                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1463                 goto do_exactf_utf8;
1464             }
1465             fold_array = PL_fold_locale;
1466             folder = foldEQ_locale;
1467
1468             /* FALL THROUGH */
1469
1470         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1471
1472             /* The idea in the non-utf8 EXACTF* cases is to first find the
1473              * first character of the EXACTF* node and then, if necessary,
1474              * case-insensitively compare the full text of the node.  c1 is the
1475              * first character.  c2 is its fold.  This logic will not work for
1476              * Unicode semantics and the german sharp ss, which hence should
1477              * not be compiled into a node that gets here. */
1478             pat_string = STRING(c);
1479             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1480
1481             e = HOP3c(strend, -((I32)ln), s);
1482
1483             if (!reginfo && e < s) {
1484                 e = s;                  /* Due to minlen logic of intuit() */
1485             }
1486
1487             c1 = *pat_string;
1488             c2 = fold_array[c1];
1489             if (c1 == c2) { /* If char and fold are the same */
1490                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1491             }
1492             else {
1493                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1494             }
1495             break;
1496
1497         do_exactf_utf8:
1498
1499             /* If one of the operands is in utf8, we can't use the simpler
1500              * folding above, due to the fact that many different characters
1501              * can have the same fold, or portion of a fold, or different-
1502              * length fold */
1503             pat_string = STRING(c);
1504             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1505             pat_end = pat_string + ln;
1506             lnc = (UTF_PATTERN) /* length to match in characters */
1507                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1508                     : ln;
1509
1510             e = HOP3c(strend, -((I32)lnc), s);
1511
1512             if (!reginfo && e < s) {
1513                 e = s;                  /* Due to minlen logic of intuit() */
1514             }
1515
1516             while (s <= e) {
1517                 char *my_strend= (char *)strend;
1518                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1519                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1520                     && (!reginfo || regtry(reginfo, &s)) )
1521                 {
1522                     goto got_it;
1523                 }
1524                 s += UTF8SKIP(s);
1525             }
1526             break;
1527         case BOUNDL:
1528             PL_reg_flags |= RF_tainted;
1529             FBC_BOUND(isALNUM_LC,
1530                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1531                       isALNUM_LC_utf8((U8*)s));
1532             break;
1533         case NBOUNDL:
1534             PL_reg_flags |= RF_tainted;
1535             FBC_NBOUND(isALNUM_LC,
1536                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1537                        isALNUM_LC_utf8((U8*)s));
1538             break;
1539         case BOUND:
1540             FBC_BOUND(isWORDCHAR,
1541                       isALNUM_uni(tmp),
1542                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1543             break;
1544         case BOUNDA:
1545             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1546                              isWORDCHAR_A(tmp),
1547                              isWORDCHAR_A((U8*)s));
1548             break;
1549         case NBOUND:
1550             FBC_NBOUND(isWORDCHAR,
1551                        isALNUM_uni(tmp),
1552                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1553             break;
1554         case NBOUNDA:
1555             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1556                               isWORDCHAR_A(tmp),
1557                               isWORDCHAR_A((U8*)s));
1558             break;
1559         case BOUNDU:
1560             FBC_BOUND(isWORDCHAR_L1,
1561                       isALNUM_uni(tmp),
1562                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1563             break;
1564         case NBOUNDU:
1565             FBC_NBOUND(isWORDCHAR_L1,
1566                        isALNUM_uni(tmp),
1567                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1568             break;
1569         case ALNUML:
1570             REXEC_FBC_CSCAN_TAINT(
1571                 isALNUM_LC_utf8((U8*)s),
1572                 isALNUM_LC(*s)
1573             );
1574             break;
1575         case ALNUMU:
1576             REXEC_FBC_CSCAN_PRELOAD(
1577                 LOAD_UTF8_CHARCLASS_ALNUM(),
1578                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1579                 isWORDCHAR_L1((U8) *s)
1580             );
1581             break;
1582         case ALNUM:
1583             REXEC_FBC_CSCAN_PRELOAD(
1584                 LOAD_UTF8_CHARCLASS_ALNUM(),
1585                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1586                 isWORDCHAR((U8) *s)
1587             );
1588             break;
1589         case ALNUMA:
1590             /* Don't need to worry about utf8, as it can match only a single
1591              * byte invariant character */
1592             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1593             break;
1594         case NALNUMU:
1595             REXEC_FBC_CSCAN_PRELOAD(
1596                 LOAD_UTF8_CHARCLASS_ALNUM(),
1597                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1598                 ! isWORDCHAR_L1((U8) *s)
1599             );
1600             break;
1601         case NALNUM:
1602             REXEC_FBC_CSCAN_PRELOAD(
1603                 LOAD_UTF8_CHARCLASS_ALNUM(),
1604                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1605                 ! isALNUM(*s)
1606             );
1607             break;
1608         case NALNUMA:
1609             REXEC_FBC_CSCAN(
1610                 !isWORDCHAR_A(*s),
1611                 !isWORDCHAR_A(*s)
1612             );
1613             break;
1614         case NALNUML:
1615             REXEC_FBC_CSCAN_TAINT(
1616                 !isALNUM_LC_utf8((U8*)s),
1617                 !isALNUM_LC(*s)
1618             );
1619             break;
1620         case SPACEU:
1621             REXEC_FBC_CSCAN_PRELOAD(
1622                 LOAD_UTF8_CHARCLASS_SPACE(),
1623                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1624                 isSPACE_L1((U8) *s)
1625             );
1626             break;
1627         case SPACE:
1628             REXEC_FBC_CSCAN_PRELOAD(
1629                 LOAD_UTF8_CHARCLASS_SPACE(),
1630                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1631                 isSPACE((U8) *s)
1632             );
1633             break;
1634         case SPACEA:
1635             /* Don't need to worry about utf8, as it can match only a single
1636              * byte invariant character */
1637             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1638             break;
1639         case SPACEL:
1640             REXEC_FBC_CSCAN_TAINT(
1641                 isSPACE_LC_utf8((U8*)s),
1642                 isSPACE_LC(*s)
1643             );
1644             break;
1645         case NSPACEU:
1646             REXEC_FBC_CSCAN_PRELOAD(
1647                 LOAD_UTF8_CHARCLASS_SPACE(),
1648                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1649                 ! isSPACE_L1((U8) *s)
1650             );
1651             break;
1652         case NSPACE:
1653             REXEC_FBC_CSCAN_PRELOAD(
1654                 LOAD_UTF8_CHARCLASS_SPACE(),
1655                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1656                 ! isSPACE((U8) *s)
1657             );
1658             break;
1659         case NSPACEA:
1660             REXEC_FBC_CSCAN(
1661                 !isSPACE_A(*s),
1662                 !isSPACE_A(*s)
1663             );
1664             break;
1665         case NSPACEL:
1666             REXEC_FBC_CSCAN_TAINT(
1667                 !isSPACE_LC_utf8((U8*)s),
1668                 !isSPACE_LC(*s)
1669             );
1670             break;
1671         case DIGIT:
1672             REXEC_FBC_CSCAN_PRELOAD(
1673                 LOAD_UTF8_CHARCLASS_DIGIT(),
1674                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1675                 isDIGIT(*s)
1676             );
1677             break;
1678         case DIGITA:
1679             /* Don't need to worry about utf8, as it can match only a single
1680              * byte invariant character */
1681             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1682             break;
1683         case DIGITL:
1684             REXEC_FBC_CSCAN_TAINT(
1685                 isDIGIT_LC_utf8((U8*)s),
1686                 isDIGIT_LC(*s)
1687             );
1688             break;
1689         case NDIGIT:
1690             REXEC_FBC_CSCAN_PRELOAD(
1691                 LOAD_UTF8_CHARCLASS_DIGIT(),
1692                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1693                 !isDIGIT(*s)
1694             );
1695             break;
1696         case NDIGITA:
1697             REXEC_FBC_CSCAN(
1698                 !isDIGIT_A(*s),
1699                 !isDIGIT_A(*s)
1700             );
1701             break;
1702         case NDIGITL:
1703             REXEC_FBC_CSCAN_TAINT(
1704                 !isDIGIT_LC_utf8((U8*)s),
1705                 !isDIGIT_LC(*s)
1706             );
1707             break;
1708         case LNBREAK:
1709             REXEC_FBC_CSCAN(
1710                 is_LNBREAK_utf8(s),
1711                 is_LNBREAK_latin1(s)
1712             );
1713             break;
1714         case VERTWS:
1715             REXEC_FBC_CSCAN(
1716                 is_VERTWS_utf8(s),
1717                 is_VERTWS_latin1(s)
1718             );
1719             break;
1720         case NVERTWS:
1721             REXEC_FBC_CSCAN(
1722                 !is_VERTWS_utf8(s),
1723                 !is_VERTWS_latin1(s)
1724             );
1725             break;
1726         case HORIZWS:
1727             REXEC_FBC_CSCAN(
1728                 is_HORIZWS_utf8(s),
1729                 is_HORIZWS_latin1(s)
1730             );
1731             break;
1732         case NHORIZWS:
1733             REXEC_FBC_CSCAN(
1734                 !is_HORIZWS_utf8(s),
1735                 !is_HORIZWS_latin1(s)
1736             );
1737             break;
1738         case AHOCORASICKC:
1739         case AHOCORASICK:
1740             {
1741                 DECL_TRIE_TYPE(c);
1742                 /* what trie are we using right now */
1743                 reg_ac_data *aho
1744                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1745                 reg_trie_data *trie
1746                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1747                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1748
1749                 const char *last_start = strend - trie->minlen;
1750 #ifdef DEBUGGING
1751                 const char *real_start = s;
1752 #endif
1753                 STRLEN maxlen = trie->maxlen;
1754                 SV *sv_points;
1755                 U8 **points; /* map of where we were in the input string
1756                                 when reading a given char. For ASCII this
1757                                 is unnecessary overhead as the relationship
1758                                 is always 1:1, but for Unicode, especially
1759                                 case folded Unicode this is not true. */
1760                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1761                 U8 *bitmap=NULL;
1762
1763
1764                 GET_RE_DEBUG_FLAGS_DECL;
1765
1766                 /* We can't just allocate points here. We need to wrap it in
1767                  * an SV so it gets freed properly if there is a croak while
1768                  * running the match */
1769                 ENTER;
1770                 SAVETMPS;
1771                 sv_points=newSV(maxlen * sizeof(U8 *));
1772                 SvCUR_set(sv_points,
1773                     maxlen * sizeof(U8 *));
1774                 SvPOK_on(sv_points);
1775                 sv_2mortal(sv_points);
1776                 points=(U8**)SvPV_nolen(sv_points );
1777                 if ( trie_type != trie_utf8_fold
1778                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1779                 {
1780                     if (trie->bitmap)
1781                         bitmap=(U8*)trie->bitmap;
1782                     else
1783                         bitmap=(U8*)ANYOF_BITMAP(c);
1784                 }
1785                 /* this is the Aho-Corasick algorithm modified a touch
1786                    to include special handling for long "unknown char"
1787                    sequences. The basic idea being that we use AC as long
1788                    as we are dealing with a possible matching char, when
1789                    we encounter an unknown char (and we have not encountered
1790                    an accepting state) we scan forward until we find a legal
1791                    starting char.
1792                    AC matching is basically that of trie matching, except
1793                    that when we encounter a failing transition, we fall back
1794                    to the current states "fail state", and try the current char
1795                    again, a process we repeat until we reach the root state,
1796                    state 1, or a legal transition. If we fail on the root state
1797                    then we can either terminate if we have reached an accepting
1798                    state previously, or restart the entire process from the beginning
1799                    if we have not.
1800
1801                  */
1802                 while (s <= last_start) {
1803                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1804                     U8 *uc = (U8*)s;
1805                     U16 charid = 0;
1806                     U32 base = 1;
1807                     U32 state = 1;
1808                     UV uvc = 0;
1809                     STRLEN len = 0;
1810                     STRLEN foldlen = 0;
1811                     U8 *uscan = (U8*)NULL;
1812                     U8 *leftmost = NULL;
1813 #ifdef DEBUGGING
1814                     U32 accepted_word= 0;
1815 #endif
1816                     U32 pointpos = 0;
1817
1818                     while ( state && uc <= (U8*)strend ) {
1819                         int failed=0;
1820                         U32 word = aho->states[ state ].wordnum;
1821
1822                         if( state==1 ) {
1823                             if ( bitmap ) {
1824                                 DEBUG_TRIE_EXECUTE_r(
1825                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1826                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1827                                             (char *)uc, utf8_target );
1828                                         PerlIO_printf( Perl_debug_log,
1829                                             " Scanning for legal start char...\n");
1830                                     }
1831                                 );
1832                                 if (utf8_target) {
1833                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1834                                         uc += UTF8SKIP(uc);
1835                                     }
1836                                 } else {
1837                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1838                                         uc++;
1839                                     }
1840                                 }
1841                                 s= (char *)uc;
1842                             }
1843                             if (uc >(U8*)last_start) break;
1844                         }
1845
1846                         if ( word ) {
1847                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1848                             if (!leftmost || lpos < leftmost) {
1849                                 DEBUG_r(accepted_word=word);
1850                                 leftmost= lpos;
1851                             }
1852                             if (base==0) break;
1853
1854                         }
1855                         points[pointpos++ % maxlen]= uc;
1856                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1857                                              uscan, len, uvc, charid, foldlen,
1858                                              foldbuf, uniflags);
1859                         DEBUG_TRIE_EXECUTE_r({
1860                             dump_exec_pos( (char *)uc, c, strend, real_start,
1861                                 s,   utf8_target );
1862                             PerlIO_printf(Perl_debug_log,
1863                                 " Charid:%3u CP:%4"UVxf" ",
1864                                  charid, uvc);
1865                         });
1866
1867                         do {
1868 #ifdef DEBUGGING
1869                             word = aho->states[ state ].wordnum;
1870 #endif
1871                             base = aho->states[ state ].trans.base;
1872
1873                             DEBUG_TRIE_EXECUTE_r({
1874                                 if (failed)
1875                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1876                                         s,   utf8_target );
1877                                 PerlIO_printf( Perl_debug_log,
1878                                     "%sState: %4"UVxf", word=%"UVxf,
1879                                     failed ? " Fail transition to " : "",
1880                                     (UV)state, (UV)word);
1881                             });
1882                             if ( base ) {
1883                                 U32 tmp;
1884                                 I32 offset;
1885                                 if (charid &&
1886                                      ( ((offset = base + charid
1887                                         - 1 - trie->uniquecharcount)) >= 0)
1888                                      && ((U32)offset < trie->lasttrans)
1889                                      && trie->trans[offset].check == state
1890                                      && (tmp=trie->trans[offset].next))
1891                                 {
1892                                     DEBUG_TRIE_EXECUTE_r(
1893                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1894                                     state = tmp;
1895                                     break;
1896                                 }
1897                                 else {
1898                                     DEBUG_TRIE_EXECUTE_r(
1899                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1900                                     failed = 1;
1901                                     state = aho->fail[state];
1902                                 }
1903                             }
1904                             else {
1905                                 /* we must be accepting here */
1906                                 DEBUG_TRIE_EXECUTE_r(
1907                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1908                                 failed = 1;
1909                                 break;
1910                             }
1911                         } while(state);
1912                         uc += len;
1913                         if (failed) {
1914                             if (leftmost)
1915                                 break;
1916                             if (!state) state = 1;
1917                         }
1918                     }
1919                     if ( aho->states[ state ].wordnum ) {
1920                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1921                         if (!leftmost || lpos < leftmost) {
1922                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1923                             leftmost = lpos;
1924                         }
1925                     }
1926                     if (leftmost) {
1927                         s = (char*)leftmost;
1928                         DEBUG_TRIE_EXECUTE_r({
1929                             PerlIO_printf(
1930                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1931                                 (UV)accepted_word, (IV)(s - real_start)
1932                             );
1933                         });
1934                         if (!reginfo || regtry(reginfo, &s)) {
1935                             FREETMPS;
1936                             LEAVE;
1937                             goto got_it;
1938                         }
1939                         s = HOPc(s,1);
1940                         DEBUG_TRIE_EXECUTE_r({
1941                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1942                         });
1943                     } else {
1944                         DEBUG_TRIE_EXECUTE_r(
1945                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1946                         break;
1947                     }
1948                 }
1949                 FREETMPS;
1950                 LEAVE;
1951             }
1952             break;
1953         default:
1954             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1955             break;
1956         }
1957         return 0;
1958       got_it:
1959         return s;
1960 }
1961
1962
1963 /*
1964  - regexec_flags - match a regexp against a string
1965  */
1966 I32
1967 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1968               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1969 /* strend: pointer to null at end of string */
1970 /* strbeg: real beginning of string */
1971 /* minend: end of match must be >=minend after stringarg. */
1972 /* data: May be used for some additional optimizations.
1973          Currently its only used, with a U32 cast, for transmitting
1974          the ganch offset when doing a /g match. This will change */
1975 /* nosave: For optimizations. */
1976 {
1977     dVAR;
1978     struct regexp *const prog = (struct regexp *)SvANY(rx);
1979     /*register*/ char *s;
1980     register regnode *c;
1981     /*register*/ char *startpos = stringarg;
1982     I32 minlen;         /* must match at least this many chars */
1983     I32 dontbother = 0; /* how many characters not to try at end */
1984     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1985     I32 scream_pos = -1;                /* Internal iterator of scream. */
1986     char *scream_olds = NULL;
1987     const bool utf8_target = cBOOL(DO_UTF8(sv));
1988     I32 multiline;
1989     RXi_GET_DECL(prog,progi);
1990     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1991     regexp_paren_pair *swap = NULL;
1992     GET_RE_DEBUG_FLAGS_DECL;
1993
1994     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1995     PERL_UNUSED_ARG(data);
1996
1997     /* Be paranoid... */
1998     if (prog == NULL || startpos == NULL) {
1999         Perl_croak(aTHX_ "NULL regexp parameter");
2000         return 0;
2001     }
2002
2003     multiline = prog->extflags & RXf_PMf_MULTILINE;
2004     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2005
2006     RX_MATCH_UTF8_set(rx, utf8_target);
2007     DEBUG_EXECUTE_r(
2008         debug_start_match(rx, utf8_target, startpos, strend,
2009         "Matching");
2010     );
2011
2012     minlen = prog->minlen;
2013
2014     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2015         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2016                               "String too short [regexec_flags]...\n"));
2017         goto phooey;
2018     }
2019
2020
2021     /* Check validity of program. */
2022     if (UCHARAT(progi->program) != REG_MAGIC) {
2023         Perl_croak(aTHX_ "corrupted regexp program");
2024     }
2025
2026     PL_reg_flags = 0;
2027     PL_reg_eval_set = 0;
2028     PL_reg_maxiter = 0;
2029
2030     if (RX_UTF8(rx))
2031         PL_reg_flags |= RF_utf8;
2032
2033     /* Mark beginning of line for ^ and lookbehind. */
2034     reginfo.bol = startpos; /* XXX not used ??? */
2035     PL_bostr  = strbeg;
2036     reginfo.sv = sv;
2037
2038     /* Mark end of line for $ (and such) */
2039     PL_regeol = strend;
2040
2041     /* see how far we have to get to not match where we matched before */
2042     reginfo.till = startpos+minend;
2043
2044     /* If there is a "must appear" string, look for it. */
2045     s = startpos;
2046
2047     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2048         MAGIC *mg;
2049         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2050             reginfo.ganch = startpos + prog->gofs;
2051             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2052               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2053         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2054                   && SvMAGIC(sv)
2055                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2056                   && mg->mg_len >= 0) {
2057             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2058             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2059                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2060
2061             if (prog->extflags & RXf_ANCH_GPOS) {
2062                 if (s > reginfo.ganch)
2063                     goto phooey;
2064                 s = reginfo.ganch - prog->gofs;
2065                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2066                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2067                 if (s < strbeg)
2068                     goto phooey;
2069             }
2070         }
2071         else if (data) {
2072             reginfo.ganch = strbeg + PTR2UV(data);
2073             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2074                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2075
2076         } else {                                /* pos() not defined */
2077             reginfo.ganch = strbeg;
2078             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2079                  "GPOS: reginfo.ganch = strbeg\n"));
2080         }
2081     }
2082     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2083         /* We have to be careful. If the previous successful match
2084            was from this regex we don't want a subsequent partially
2085            successful match to clobber the old results.
2086            So when we detect this possibility we add a swap buffer
2087            to the re, and switch the buffer each match. If we fail
2088            we switch it back, otherwise we leave it swapped.
2089         */
2090         swap = prog->offs;
2091         /* do we need a save destructor here for eval dies? */
2092         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2093     }
2094     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2095         re_scream_pos_data d;
2096
2097         d.scream_olds = &scream_olds;
2098         d.scream_pos = &scream_pos;
2099         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2100         if (!s) {
2101             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2102             goto phooey;        /* not present */
2103         }
2104     }
2105
2106
2107
2108     /* Simplest case:  anchored match need be tried only once. */
2109     /*  [unless only anchor is BOL and multiline is set] */
2110     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2111         if (s == startpos && regtry(&reginfo, &startpos))
2112             goto got_it;
2113         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2114                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2115         {
2116             char *end;
2117
2118             if (minlen)
2119                 dontbother = minlen - 1;
2120             end = HOP3c(strend, -dontbother, strbeg) - 1;
2121             /* for multiline we only have to try after newlines */
2122             if (prog->check_substr || prog->check_utf8) {
2123                 /* because of the goto we can not easily reuse the macros for bifurcating the
2124                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2125                 if (utf8_target) {
2126                     if (s == startpos)
2127                         goto after_try_utf8;
2128                     while (1) {
2129                         if (regtry(&reginfo, &s)) {
2130                             goto got_it;
2131                         }
2132                       after_try_utf8:
2133                         if (s > end) {
2134                             goto phooey;
2135                         }
2136                         if (prog->extflags & RXf_USE_INTUIT) {
2137                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2138                             if (!s) {
2139                                 goto phooey;
2140                             }
2141                         }
2142                         else {
2143                             s += UTF8SKIP(s);
2144                         }
2145                     }
2146                 } /* end search for check string in unicode */
2147                 else {
2148                     if (s == startpos) {
2149                         goto after_try_latin;
2150                     }
2151                     while (1) {
2152                         if (regtry(&reginfo, &s)) {
2153                             goto got_it;
2154                         }
2155                       after_try_latin:
2156                         if (s > end) {
2157                             goto phooey;
2158                         }
2159                         if (prog->extflags & RXf_USE_INTUIT) {
2160                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2161                             if (!s) {
2162                                 goto phooey;
2163                             }
2164                         }
2165                         else {
2166                             s++;
2167                         }
2168                     }
2169                 } /* end search for check string in latin*/
2170             } /* end search for check string */
2171             else { /* search for newline */
2172                 if (s > startpos) {
2173                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2174                     s--;
2175                 }
2176                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2177                 while (s < end) {
2178                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2179                         if (regtry(&reginfo, &s))
2180                             goto got_it;
2181                     }
2182                 }
2183             } /* end search for newline */
2184         } /* end anchored/multiline check string search */
2185         goto phooey;
2186     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2187     {
2188         /* the warning about reginfo.ganch being used without initialization
2189            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2190            and we only enter this block when the same bit is set. */
2191         char *tmp_s = reginfo.ganch - prog->gofs;
2192
2193         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2194             goto got_it;
2195         goto phooey;
2196     }
2197
2198     /* Messy cases:  unanchored match. */
2199     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2200         /* we have /x+whatever/ */
2201         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2202         char ch;
2203 #ifdef DEBUGGING
2204         int did_match = 0;
2205 #endif
2206         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2207             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2208         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2209
2210         if (utf8_target) {
2211             REXEC_FBC_SCAN(
2212                 if (*s == ch) {
2213                     DEBUG_EXECUTE_r( did_match = 1 );
2214                     if (regtry(&reginfo, &s)) goto got_it;
2215                     s += UTF8SKIP(s);
2216                     while (s < strend && *s == ch)
2217                         s += UTF8SKIP(s);
2218                 }
2219             );
2220         }
2221         else {
2222             REXEC_FBC_SCAN(
2223                 if (*s == ch) {
2224                     DEBUG_EXECUTE_r( did_match = 1 );
2225                     if (regtry(&reginfo, &s)) goto got_it;
2226                     s++;
2227                     while (s < strend && *s == ch)
2228                         s++;
2229                 }
2230             );
2231         }
2232         DEBUG_EXECUTE_r(if (!did_match)
2233                 PerlIO_printf(Perl_debug_log,
2234                                   "Did not find anchored character...\n")
2235                );
2236     }
2237     else if (prog->anchored_substr != NULL
2238               || prog->anchored_utf8 != NULL
2239               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2240                   && prog->float_max_offset < strend - s)) {
2241         SV *must;
2242         I32 back_max;
2243         I32 back_min;
2244         char *last;
2245         char *last1;            /* Last position checked before */
2246 #ifdef DEBUGGING
2247         int did_match = 0;
2248 #endif
2249         if (prog->anchored_substr || prog->anchored_utf8) {
2250             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2251                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2252             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2253             back_max = back_min = prog->anchored_offset;
2254         } else {
2255             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2256                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2257             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2258             back_max = prog->float_max_offset;
2259             back_min = prog->float_min_offset;
2260         }
2261
2262
2263         if (must == &PL_sv_undef)
2264             /* could not downgrade utf8 check substring, so must fail */
2265             goto phooey;
2266
2267         if (back_min<0) {
2268             last = strend;
2269         } else {
2270             last = HOP3c(strend,        /* Cannot start after this */
2271                   -(I32)(CHR_SVLEN(must)
2272                          - (SvTAIL(must) != 0) + back_min), strbeg);
2273         }
2274         if (s > PL_bostr)
2275             last1 = HOPc(s, -1);
2276         else
2277             last1 = s - 1;      /* bogus */
2278
2279         /* XXXX check_substr already used to find "s", can optimize if
2280            check_substr==must. */
2281         scream_pos = -1;
2282         dontbother = end_shift;
2283         strend = HOPc(strend, -dontbother);
2284         while ( (s <= last) &&
2285                 ((flags & REXEC_SCREAM)
2286                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2287                                     end_shift, &scream_pos, 0))
2288                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2289                                   (unsigned char*)strend, must,
2290                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2291             /* we may be pointing at the wrong string */
2292             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2293                 s = strbeg + (s - SvPVX_const(sv));
2294             DEBUG_EXECUTE_r( did_match = 1 );
2295             if (HOPc(s, -back_max) > last1) {
2296                 last1 = HOPc(s, -back_min);
2297                 s = HOPc(s, -back_max);
2298             }
2299             else {
2300                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2301
2302                 last1 = HOPc(s, -back_min);
2303                 s = t;
2304             }
2305             if (utf8_target) {
2306                 while (s <= last1) {
2307                     if (regtry(&reginfo, &s))
2308                         goto got_it;
2309                     s += UTF8SKIP(s);
2310                 }
2311             }
2312             else {
2313                 while (s <= last1) {
2314                     if (regtry(&reginfo, &s))
2315                         goto got_it;
2316                     s++;
2317                 }
2318             }
2319         }
2320         DEBUG_EXECUTE_r(if (!did_match) {
2321             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2322                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2323             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2324                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2325                                ? "anchored" : "floating"),
2326                 quoted, RE_SV_TAIL(must));
2327         });
2328         goto phooey;
2329     }
2330     else if ( (c = progi->regstclass) ) {
2331         if (minlen) {
2332             const OPCODE op = OP(progi->regstclass);
2333             /* don't bother with what can't match */
2334             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2335                 strend = HOPc(strend, -(minlen - 1));
2336         }
2337         DEBUG_EXECUTE_r({
2338             SV * const prop = sv_newmortal();
2339             regprop(prog, prop, c);
2340             {
2341                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2342                     s,strend-s,60);
2343                 PerlIO_printf(Perl_debug_log,
2344                     "Matching stclass %.*s against %s (%d bytes)\n",
2345                     (int)SvCUR(prop), SvPVX_const(prop),
2346                      quoted, (int)(strend - s));
2347             }
2348         });
2349         if (find_byclass(prog, c, s, strend, &reginfo))
2350             goto got_it;
2351         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2352     }
2353     else {
2354         dontbother = 0;
2355         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2356             /* Trim the end. */
2357             char *last;
2358             SV* float_real;
2359
2360             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2361                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2362             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2363
2364             if (flags & REXEC_SCREAM) {
2365                 last = screaminstr(sv, float_real, s - strbeg,
2366                                    end_shift, &scream_pos, 1); /* last one */
2367                 if (!last)
2368                     last = scream_olds; /* Only one occurrence. */
2369                 /* we may be pointing at the wrong string */
2370                 else if (RXp_MATCH_COPIED(prog))
2371                     s = strbeg + (s - SvPVX_const(sv));
2372             }
2373             else {
2374                 STRLEN len;
2375                 const char * const little = SvPV_const(float_real, len);
2376
2377                 if (SvTAIL(float_real)) {
2378                     if (memEQ(strend - len + 1, little, len - 1))
2379                         last = strend - len + 1;
2380                     else if (!multiline)
2381                         last = memEQ(strend - len, little, len)
2382                             ? strend - len : NULL;
2383                     else
2384                         goto find_last;
2385                 } else {
2386                   find_last:
2387                     if (len)
2388                         last = rninstr(s, strend, little, little + len);
2389                     else
2390                         last = strend;  /* matching "$" */
2391                 }
2392             }
2393             if (last == NULL) {
2394                 DEBUG_EXECUTE_r(
2395                     PerlIO_printf(Perl_debug_log,
2396                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2397                         PL_colors[4], PL_colors[5]));
2398                 goto phooey; /* Should not happen! */
2399             }
2400             dontbother = strend - last + prog->float_min_offset;
2401         }
2402         if (minlen && (dontbother < minlen))
2403             dontbother = minlen - 1;
2404         strend -= dontbother;              /* this one's always in bytes! */
2405         /* We don't know much -- general case. */
2406         if (utf8_target) {
2407             for (;;) {
2408                 if (regtry(&reginfo, &s))
2409                     goto got_it;
2410                 if (s >= strend)
2411                     break;
2412                 s += UTF8SKIP(s);
2413             };
2414         }
2415         else {
2416             do {
2417                 if (regtry(&reginfo, &s))
2418                     goto got_it;
2419             } while (s++ < strend);
2420         }
2421     }
2422
2423     /* Failure. */
2424     goto phooey;
2425
2426 got_it:
2427     Safefree(swap);
2428     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2429
2430     if (PL_reg_eval_set)
2431         restore_pos(aTHX_ prog);
2432     if (RXp_PAREN_NAMES(prog))
2433         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2434
2435     /* make sure $`, $&, $', and $digit will work later */
2436     if ( !(flags & REXEC_NOT_FIRST) ) {
2437         RX_MATCH_COPY_FREE(rx);
2438         if (flags & REXEC_COPY_STR) {
2439             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2440 #ifdef PERL_OLD_COPY_ON_WRITE
2441             if ((SvIsCOW(sv)
2442                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2443                 if (DEBUG_C_TEST) {
2444                     PerlIO_printf(Perl_debug_log,
2445                                   "Copy on write: regexp capture, type %d\n",
2446                                   (int) SvTYPE(sv));
2447                 }
2448                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2449                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2450                 assert (SvPOKp(prog->saved_copy));
2451             } else
2452 #endif
2453             {
2454                 RX_MATCH_COPIED_on(rx);
2455                 s = savepvn(strbeg, i);
2456                 prog->subbeg = s;
2457             }
2458             prog->sublen = i;
2459         }
2460         else {
2461             prog->subbeg = strbeg;
2462             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2463         }
2464     }
2465
2466     return 1;
2467
2468 phooey:
2469     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2470                           PL_colors[4], PL_colors[5]));
2471     if (PL_reg_eval_set)
2472         restore_pos(aTHX_ prog);
2473     if (swap) {
2474         /* we failed :-( roll it back */
2475         Safefree(prog->offs);
2476         prog->offs = swap;
2477     }
2478
2479     return 0;
2480 }
2481
2482
2483 /*
2484  - regtry - try match at specific point
2485  */
2486 STATIC I32                      /* 0 failure, 1 success */
2487 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2488 {
2489     dVAR;
2490     CHECKPOINT lastcp;
2491     REGEXP *const rx = reginfo->prog;
2492     regexp *const prog = (struct regexp *)SvANY(rx);
2493     RXi_GET_DECL(prog,progi);
2494     GET_RE_DEBUG_FLAGS_DECL;
2495
2496     PERL_ARGS_ASSERT_REGTRY;
2497
2498     reginfo->cutpoint=NULL;
2499
2500     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2501         MAGIC *mg;
2502
2503         PL_reg_eval_set = RS_init;
2504         DEBUG_EXECUTE_r(DEBUG_s(
2505             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2506                           (IV)(PL_stack_sp - PL_stack_base));
2507             ));
2508         SAVESTACK_CXPOS();
2509         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2510         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2511         SAVETMPS;
2512         /* Apparently this is not needed, judging by wantarray. */
2513         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2514            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2515
2516         if (reginfo->sv) {
2517             /* Make $_ available to executed code. */
2518             if (reginfo->sv != DEFSV) {
2519                 SAVE_DEFSV;
2520                 DEFSV_set(reginfo->sv);
2521             }
2522
2523             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2524                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2525                 /* prepare for quick setting of pos */
2526 #ifdef PERL_OLD_COPY_ON_WRITE
2527                 if (SvIsCOW(reginfo->sv))
2528                     sv_force_normal_flags(reginfo->sv, 0);
2529 #endif
2530                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2531                                  &PL_vtbl_mglob, NULL, 0);
2532                 mg->mg_len = -1;
2533             }
2534             PL_reg_magic    = mg;
2535             PL_reg_oldpos   = mg->mg_len;
2536             SAVEDESTRUCTOR_X(restore_pos, prog);
2537         }
2538         if (!PL_reg_curpm) {
2539             Newxz(PL_reg_curpm, 1, PMOP);
2540 #ifdef USE_ITHREADS
2541             {
2542                 SV* const repointer = &PL_sv_undef;
2543                 /* this regexp is also owned by the new PL_reg_curpm, which
2544                    will try to free it.  */
2545                 av_push(PL_regex_padav, repointer);
2546                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2547                 PL_regex_pad = AvARRAY(PL_regex_padav);
2548             }
2549 #endif
2550         }
2551 #ifdef USE_ITHREADS
2552         /* It seems that non-ithreads works both with and without this code.
2553            So for efficiency reasons it seems best not to have the code
2554            compiled when it is not needed.  */
2555         /* This is safe against NULLs: */
2556         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2557         /* PM_reg_curpm owns a reference to this regexp.  */
2558         (void)ReREFCNT_inc(rx);
2559 #endif
2560         PM_SETRE(PL_reg_curpm, rx);
2561         PL_reg_oldcurpm = PL_curpm;
2562         PL_curpm = PL_reg_curpm;
2563         if (RXp_MATCH_COPIED(prog)) {
2564             /*  Here is a serious problem: we cannot rewrite subbeg,
2565                 since it may be needed if this match fails.  Thus
2566                 $` inside (?{}) could fail... */
2567             PL_reg_oldsaved = prog->subbeg;
2568             PL_reg_oldsavedlen = prog->sublen;
2569 #ifdef PERL_OLD_COPY_ON_WRITE
2570             PL_nrs = prog->saved_copy;
2571 #endif
2572             RXp_MATCH_COPIED_off(prog);
2573         }
2574         else
2575             PL_reg_oldsaved = NULL;
2576         prog->subbeg = PL_bostr;
2577         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2578     }
2579     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2580     prog->offs[0].start = *startpos - PL_bostr;
2581     PL_reginput = *startpos;
2582     PL_reglastparen = &prog->lastparen;
2583     PL_reglastcloseparen = &prog->lastcloseparen;
2584     prog->lastparen = 0;
2585     prog->lastcloseparen = 0;
2586     PL_regsize = 0;
2587     PL_regoffs = prog->offs;
2588     if (PL_reg_start_tmpl <= prog->nparens) {
2589         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2590         if(PL_reg_start_tmp)
2591             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2592         else
2593             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2594     }
2595
2596     /* XXXX What this code is doing here?!!!  There should be no need
2597        to do this again and again, PL_reglastparen should take care of
2598        this!  --ilya*/
2599
2600     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2601      * Actually, the code in regcppop() (which Ilya may be meaning by
2602      * PL_reglastparen), is not needed at all by the test suite
2603      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2604      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2605      * Meanwhile, this code *is* needed for the
2606      * above-mentioned test suite tests to succeed.  The common theme
2607      * on those tests seems to be returning null fields from matches.
2608      * --jhi updated by dapm */
2609 #if 1
2610     if (prog->nparens) {
2611         regexp_paren_pair *pp = PL_regoffs;
2612         register I32 i;
2613         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2614             ++pp;
2615             pp->start = -1;
2616             pp->end = -1;
2617         }
2618     }
2619 #endif
2620     REGCP_SET(lastcp);
2621     if (regmatch(reginfo, progi->program + 1)) {
2622         PL_regoffs[0].end = PL_reginput - PL_bostr;
2623         return 1;
2624     }
2625     if (reginfo->cutpoint)
2626         *startpos= reginfo->cutpoint;
2627     REGCP_UNWIND(lastcp);
2628     return 0;
2629 }
2630
2631
2632 #define sayYES goto yes
2633 #define sayNO goto no
2634 #define sayNO_SILENT goto no_silent
2635
2636 /* we dont use STMT_START/END here because it leads to
2637    "unreachable code" warnings, which are bogus, but distracting. */
2638 #define CACHEsayNO \
2639     if (ST.cache_mask) \
2640        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2641     sayNO
2642
2643 /* this is used to determine how far from the left messages like
2644    'failed...' are printed. It should be set such that messages
2645    are inline with the regop output that created them.
2646 */
2647 #define REPORT_CODE_OFF 32
2648
2649
2650 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2651 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2652
2653 #define SLAB_FIRST(s) (&(s)->states[0])
2654 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2655
2656 /* grab a new slab and return the first slot in it */
2657
2658 STATIC regmatch_state *
2659 S_push_slab(pTHX)
2660 {
2661 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2662     dMY_CXT;
2663 #endif
2664     regmatch_slab *s = PL_regmatch_slab->next;
2665     if (!s) {
2666         Newx(s, 1, regmatch_slab);
2667         s->prev = PL_regmatch_slab;
2668         s->next = NULL;
2669         PL_regmatch_slab->next = s;
2670     }
2671     PL_regmatch_slab = s;
2672     return SLAB_FIRST(s);
2673 }
2674
2675
2676 /* push a new state then goto it */
2677
2678 #define PUSH_STATE_GOTO(state, node) \
2679     scan = node; \
2680     st->resume_state = state; \
2681     goto push_state;
2682
2683 /* push a new state with success backtracking, then goto it */
2684
2685 #define PUSH_YES_STATE_GOTO(state, node) \
2686     scan = node; \
2687     st->resume_state = state; \
2688     goto push_yes_state;
2689
2690
2691
2692 /*
2693
2694 regmatch() - main matching routine
2695
2696 This is basically one big switch statement in a loop. We execute an op,
2697 set 'next' to point the next op, and continue. If we come to a point which
2698 we may need to backtrack to on failure such as (A|B|C), we push a
2699 backtrack state onto the backtrack stack. On failure, we pop the top
2700 state, and re-enter the loop at the state indicated. If there are no more
2701 states to pop, we return failure.
2702
2703 Sometimes we also need to backtrack on success; for example /A+/, where
2704 after successfully matching one A, we need to go back and try to
2705 match another one; similarly for lookahead assertions: if the assertion
2706 completes successfully, we backtrack to the state just before the assertion
2707 and then carry on.  In these cases, the pushed state is marked as
2708 'backtrack on success too'. This marking is in fact done by a chain of
2709 pointers, each pointing to the previous 'yes' state. On success, we pop to
2710 the nearest yes state, discarding any intermediate failure-only states.
2711 Sometimes a yes state is pushed just to force some cleanup code to be
2712 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2713 it to free the inner regex.
2714
2715 Note that failure backtracking rewinds the cursor position, while
2716 success backtracking leaves it alone.
2717
2718 A pattern is complete when the END op is executed, while a subpattern
2719 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2720 ops trigger the "pop to last yes state if any, otherwise return true"
2721 behaviour.
2722
2723 A common convention in this function is to use A and B to refer to the two
2724 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2725 the subpattern to be matched possibly multiple times, while B is the entire
2726 rest of the pattern. Variable and state names reflect this convention.
2727
2728 The states in the main switch are the union of ops and failure/success of
2729 substates associated with with that op.  For example, IFMATCH is the op
2730 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2731 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2732 successfully matched A and IFMATCH_A_fail is a state saying that we have
2733 just failed to match A. Resume states always come in pairs. The backtrack
2734 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2735 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2736 on success or failure.
2737
2738 The struct that holds a backtracking state is actually a big union, with
2739 one variant for each major type of op. The variable st points to the
2740 top-most backtrack struct. To make the code clearer, within each
2741 block of code we #define ST to alias the relevant union.
2742
2743 Here's a concrete example of a (vastly oversimplified) IFMATCH
2744 implementation:
2745
2746     switch (state) {
2747     ....
2748
2749 #define ST st->u.ifmatch
2750
2751     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2752         ST.foo = ...; // some state we wish to save
2753         ...
2754         // push a yes backtrack state with a resume value of
2755         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2756         // first node of A:
2757         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2758         // NOTREACHED
2759
2760     case IFMATCH_A: // we have successfully executed A; now continue with B
2761         next = B;
2762         bar = ST.foo; // do something with the preserved value
2763         break;
2764
2765     case IFMATCH_A_fail: // A failed, so the assertion failed
2766         ...;   // do some housekeeping, then ...
2767         sayNO; // propagate the failure
2768
2769 #undef ST
2770
2771     ...
2772     }
2773
2774 For any old-timers reading this who are familiar with the old recursive
2775 approach, the code above is equivalent to:
2776
2777     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2778     {
2779         int foo = ...
2780         ...
2781         if (regmatch(A)) {
2782             next = B;
2783             bar = foo;
2784             break;
2785         }
2786         ...;   // do some housekeeping, then ...
2787         sayNO; // propagate the failure
2788     }
2789
2790 The topmost backtrack state, pointed to by st, is usually free. If you
2791 want to claim it, populate any ST.foo fields in it with values you wish to
2792 save, then do one of
2793
2794         PUSH_STATE_GOTO(resume_state, node);
2795         PUSH_YES_STATE_GOTO(resume_state, node);
2796
2797 which sets that backtrack state's resume value to 'resume_state', pushes a
2798 new free entry to the top of the backtrack stack, then goes to 'node'.
2799 On backtracking, the free slot is popped, and the saved state becomes the
2800 new free state. An ST.foo field in this new top state can be temporarily
2801 accessed to retrieve values, but once the main loop is re-entered, it
2802 becomes available for reuse.
2803
2804 Note that the depth of the backtrack stack constantly increases during the
2805 left-to-right execution of the pattern, rather than going up and down with
2806 the pattern nesting. For example the stack is at its maximum at Z at the
2807 end of the pattern, rather than at X in the following:
2808
2809     /(((X)+)+)+....(Y)+....Z/
2810
2811 The only exceptions to this are lookahead/behind assertions and the cut,
2812 (?>A), which pop all the backtrack states associated with A before
2813 continuing.
2814
2815 Backtrack state structs are allocated in slabs of about 4K in size.
2816 PL_regmatch_state and st always point to the currently active state,
2817 and PL_regmatch_slab points to the slab currently containing
2818 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2819 allocated, and is never freed until interpreter destruction. When the slab
2820 is full, a new one is allocated and chained to the end. At exit from
2821 regmatch(), slabs allocated since entry are freed.
2822
2823 */
2824
2825
2826 #define DEBUG_STATE_pp(pp)                                  \
2827     DEBUG_STATE_r({                                         \
2828         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2829         PerlIO_printf(Perl_debug_log,                       \
2830             "    %*s"pp" %s%s%s%s%s\n",                     \
2831             depth*2, "",                                    \
2832             PL_reg_name[st->resume_state],                     \
2833             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2834             ((st==yes_state) ? "Y" : ""),                   \
2835             ((st==mark_state) ? "M" : ""),                  \
2836             ((st==yes_state||st==mark_state) ? "]" : "")    \
2837         );                                                  \
2838     });
2839
2840
2841 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2842
2843 #ifdef DEBUGGING
2844
2845 STATIC void
2846 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2847     const char *start, const char *end, const char *blurb)
2848 {
2849     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2850
2851     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2852
2853     if (!PL_colorset)
2854             reginitcolors();
2855     {
2856         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2857             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2858
2859         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2860             start, end - start, 60);
2861
2862         PerlIO_printf(Perl_debug_log,
2863             "%s%s REx%s %s against %s\n",
2864                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2865
2866         if (utf8_target||utf8_pat)
2867             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2868                 utf8_pat ? "pattern" : "",
2869                 utf8_pat && utf8_target ? " and " : "",
2870                 utf8_target ? "string" : ""
2871             );
2872     }
2873 }
2874
2875 STATIC void
2876 S_dump_exec_pos(pTHX_ const char *locinput,
2877                       const regnode *scan,
2878                       const char *loc_regeol,
2879                       const char *loc_bostr,
2880                       const char *loc_reg_starttry,
2881                       const bool utf8_target)
2882 {
2883     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2884     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2885     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2886     /* The part of the string before starttry has one color
2887        (pref0_len chars), between starttry and current
2888        position another one (pref_len - pref0_len chars),
2889        after the current position the third one.
2890        We assume that pref0_len <= pref_len, otherwise we
2891        decrease pref0_len.  */
2892     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2893         ? (5 + taill) - l : locinput - loc_bostr;
2894     int pref0_len;
2895
2896     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2897
2898     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2899         pref_len++;
2900     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2901     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2902         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2903               ? (5 + taill) - pref_len : loc_regeol - locinput);
2904     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2905         l--;
2906     if (pref0_len < 0)
2907         pref0_len = 0;
2908     if (pref0_len > pref_len)
2909         pref0_len = pref_len;
2910     {
2911         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2912
2913         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2914             (locinput - pref_len),pref0_len, 60, 4, 5);
2915
2916         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2917                     (locinput - pref_len + pref0_len),
2918                     pref_len - pref0_len, 60, 2, 3);
2919
2920         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2921                     locinput, loc_regeol - locinput, 10, 0, 1);
2922
2923         const STRLEN tlen=len0+len1+len2;
2924         PerlIO_printf(Perl_debug_log,
2925                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2926                     (IV)(locinput - loc_bostr),
2927                     len0, s0,
2928                     len1, s1,
2929                     (docolor ? "" : "> <"),
2930                     len2, s2,
2931                     (int)(tlen > 19 ? 0 :  19 - tlen),
2932                     "");
2933     }
2934 }
2935
2936 #endif
2937
2938 /* reg_check_named_buff_matched()
2939  * Checks to see if a named buffer has matched. The data array of
2940  * buffer numbers corresponding to the buffer is expected to reside
2941  * in the regexp->data->data array in the slot stored in the ARG() of
2942  * node involved. Note that this routine doesn't actually care about the
2943  * name, that information is not preserved from compilation to execution.
2944  * Returns the index of the leftmost defined buffer with the given name
2945  * or 0 if non of the buffers matched.
2946  */
2947 STATIC I32
2948 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2949 {
2950     I32 n;
2951     RXi_GET_DECL(rex,rexi);
2952     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2953     I32 *nums=(I32*)SvPVX(sv_dat);
2954
2955     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2956
2957     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2958         if ((I32)*PL_reglastparen >= nums[n] &&
2959             PL_regoffs[nums[n]].end != -1)
2960         {
2961             return nums[n];
2962         }
2963     }
2964     return 0;
2965 }
2966
2967
2968 /* free all slabs above current one  - called during LEAVE_SCOPE */
2969
2970 STATIC void
2971 S_clear_backtrack_stack(pTHX_ void *p)
2972 {
2973     regmatch_slab *s = PL_regmatch_slab->next;
2974     PERL_UNUSED_ARG(p);
2975
2976     if (!s)
2977         return;
2978     PL_regmatch_slab->next = NULL;
2979     while (s) {
2980         regmatch_slab * const osl = s;
2981         s = s->next;
2982         Safefree(osl);
2983     }
2984 }
2985
2986
2987 #define SETREX(Re1,Re2) \
2988     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2989     Re1 = (Re2)
2990
2991 STATIC I32                      /* 0 failure, 1 success */
2992 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2993 {
2994 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2995     dMY_CXT;
2996 #endif
2997     dVAR;
2998     register const bool utf8_target = PL_reg_match_utf8;
2999     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3000     REGEXP *rex_sv = reginfo->prog;
3001     regexp *rex = (struct regexp *)SvANY(rex_sv);
3002     RXi_GET_DECL(rex,rexi);
3003     I32 oldsave;
3004     /* the current state. This is a cached copy of PL_regmatch_state */
3005     register regmatch_state *st;
3006     /* cache heavy used fields of st in registers */
3007     register regnode *scan;
3008     register regnode *next;
3009     register U32 n = 0; /* general value; init to avoid compiler warning */
3010     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3011     register char *locinput = PL_reginput;
3012     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3013
3014     bool result = 0;        /* return value of S_regmatch */
3015     int depth = 0;          /* depth of backtrack stack */
3016     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3017     const U32 max_nochange_depth =
3018         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3019         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3020     regmatch_state *yes_state = NULL; /* state to pop to on success of
3021                                                             subpattern */
3022     /* mark_state piggy backs on the yes_state logic so that when we unwind
3023        the stack on success we can update the mark_state as we go */
3024     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3025     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3026     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3027     U32 state_num;
3028     bool no_final = 0;      /* prevent failure from backtracking? */
3029     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3030     char *startpoint = PL_reginput;
3031     SV *popmark = NULL;     /* are we looking for a mark? */
3032     SV *sv_commit = NULL;   /* last mark name seen in failure */
3033     SV *sv_yes_mark = NULL; /* last mark name we have seen
3034                                during a successful match */
3035     U32 lastopen = 0;       /* last open we saw */
3036     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3037     SV* const oreplsv = GvSV(PL_replgv);
3038     /* these three flags are set by various ops to signal information to
3039      * the very next op. They have a useful lifetime of exactly one loop
3040      * iteration, and are not preserved or restored by state pushes/pops
3041      */
3042     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3043     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3044     int logical = 0;        /* the following EVAL is:
3045                                 0: (?{...})
3046                                 1: (?(?{...})X|Y)
3047                                 2: (??{...})
3048                                or the following IFMATCH/UNLESSM is:
3049                                 false: plain (?=foo)
3050                                 true:  used as a condition: (?(?=foo))
3051                             */
3052 #ifdef DEBUGGING
3053     GET_RE_DEBUG_FLAGS_DECL;
3054 #endif
3055
3056     PERL_ARGS_ASSERT_REGMATCH;
3057
3058     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3059             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3060     }));
3061     /* on first ever call to regmatch, allocate first slab */
3062     if (!PL_regmatch_slab) {
3063         Newx(PL_regmatch_slab, 1, regmatch_slab);
3064         PL_regmatch_slab->prev = NULL;
3065         PL_regmatch_slab->next = NULL;
3066         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3067     }
3068
3069     oldsave = PL_savestack_ix;
3070     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3071     SAVEVPTR(PL_regmatch_slab);
3072     SAVEVPTR(PL_regmatch_state);
3073
3074     /* grab next free state slot */
3075     st = ++PL_regmatch_state;
3076     if (st >  SLAB_LAST(PL_regmatch_slab))
3077         st = PL_regmatch_state = S_push_slab(aTHX);
3078
3079     /* Note that nextchr is a byte even in UTF */
3080     nextchr = UCHARAT(locinput);
3081     scan = prog;
3082     while (scan != NULL) {
3083
3084         DEBUG_EXECUTE_r( {
3085             SV * const prop = sv_newmortal();
3086             regnode *rnext=regnext(scan);
3087             DUMP_EXEC_POS( locinput, scan, utf8_target );
3088             regprop(rex, prop, scan);
3089
3090             PerlIO_printf(Perl_debug_log,
3091                     "%3"IVdf":%*s%s(%"IVdf")\n",
3092                     (IV)(scan - rexi->program), depth*2, "",
3093                     SvPVX_const(prop),
3094                     (PL_regkind[OP(scan)] == END || !rnext) ?
3095                         0 : (IV)(rnext - rexi->program));
3096         });
3097
3098         next = scan + NEXT_OFF(scan);
3099         if (next == scan)
3100             next = NULL;
3101         state_num = OP(scan);
3102
3103       reenter_switch:
3104
3105         assert(PL_reglastparen == &rex->lastparen);
3106         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3107         assert(PL_regoffs == rex->offs);
3108
3109         switch (state_num) {
3110         case BOL:
3111             if (locinput == PL_bostr)
3112             {
3113                 /* reginfo->till = reginfo->bol; */
3114                 break;
3115             }
3116             sayNO;
3117         case MBOL:
3118             if (locinput == PL_bostr ||
3119                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3120             {
3121                 break;
3122             }
3123             sayNO;
3124         case SBOL:
3125             if (locinput == PL_bostr)
3126                 break;
3127             sayNO;
3128         case GPOS:
3129             if (locinput == reginfo->ganch)
3130                 break;
3131             sayNO;
3132
3133         case KEEPS:
3134             /* update the startpoint */
3135             st->u.keeper.val = PL_regoffs[0].start;
3136             PL_reginput = locinput;
3137             PL_regoffs[0].start = locinput - PL_bostr;
3138             PUSH_STATE_GOTO(KEEPS_next, next);
3139             /*NOT-REACHED*/
3140         case KEEPS_next_fail:
3141             /* rollback the start point change */
3142             PL_regoffs[0].start = st->u.keeper.val;
3143             sayNO_SILENT;
3144             /*NOT-REACHED*/
3145         case EOL:
3146                 goto seol;
3147         case MEOL:
3148             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3149                 sayNO;
3150             break;
3151         case SEOL:
3152           seol:
3153             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3154                 sayNO;
3155             if (PL_regeol - locinput > 1)
3156                 sayNO;
3157             break;
3158         case EOS:
3159             if (PL_regeol != locinput)
3160                 sayNO;
3161             break;
3162         case SANY:
3163             if (!nextchr && locinput >= PL_regeol)
3164                 sayNO;
3165             if (utf8_target) {
3166                 locinput += PL_utf8skip[nextchr];
3167                 if (locinput > PL_regeol)
3168                     sayNO;
3169                 nextchr = UCHARAT(locinput);
3170             }
3171             else
3172                 nextchr = UCHARAT(++locinput);
3173             break;
3174         case CANY:
3175             if (!nextchr && locinput >= PL_regeol)
3176                 sayNO;
3177             nextchr = UCHARAT(++locinput);
3178             break;
3179         case REG_ANY:
3180             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3181                 sayNO;
3182             if (utf8_target) {
3183                 locinput += PL_utf8skip[nextchr];
3184                 if (locinput > PL_regeol)
3185                     sayNO;
3186                 nextchr = UCHARAT(locinput);
3187             }
3188             else
3189                 nextchr = UCHARAT(++locinput);
3190             break;
3191
3192 #undef  ST
3193 #define ST st->u.trie
3194         case TRIEC:
3195             /* In this case the charclass data is available inline so
3196                we can fail fast without a lot of extra overhead.
3197              */
3198             if (scan->flags == EXACT || !utf8_target) {
3199                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3200                     DEBUG_EXECUTE_r(
3201                         PerlIO_printf(Perl_debug_log,
3202                                   "%*s  %sfailed to match trie start class...%s\n",
3203                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3204                     );
3205                     sayNO_SILENT;
3206                     /* NOTREACHED */
3207                 }
3208             }
3209             /* FALL THROUGH */
3210         case TRIE:
3211             /* the basic plan of execution of the trie is:
3212              * At the beginning, run though all the states, and
3213              * find the longest-matching word. Also remember the position
3214              * of the shortest matching word. For example, this pattern:
3215              *    1  2 3 4    5
3216              *    ab|a|x|abcd|abc
3217              * when matched against the string "abcde", will generate
3218              * accept states for all words except 3, with the longest
3219              * matching word being 4, and the shortest being 1 (with
3220              * the position being after char 1 of the string).
3221              *
3222              * Then for each matching word, in word order (i.e. 1,2,4,5),
3223              * we run the remainder of the pattern; on each try setting
3224              * the current position to the character following the word,
3225              * returning to try the next word on failure.
3226              *
3227              * We avoid having to build a list of words at runtime by
3228              * using a compile-time structure, wordinfo[].prev, which
3229              * gives, for each word, the previous accepting word (if any).
3230              * In the case above it would contain the mappings 1->2, 2->0,
3231              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3232              * the longest word (4 above), a list of all words, by
3233              * following the list of prev pointers; this gives us the
3234              * unordered list 4,5,1,2. Then given the current word we have
3235              * just tried, we can go through the list and find the
3236              * next-biggest word to try (so if we just failed on word 2,
3237              * the next in the list is 4).
3238              *
3239              * Since at runtime we don't record the matching position in
3240              * the string for each word, we have to work that out for
3241              * each word we're about to process. The wordinfo table holds
3242              * the character length of each word; given that we recorded
3243              * at the start: the position of the shortest word and its
3244              * length in chars, we just need to move the pointer the
3245              * difference between the two char lengths. Depending on
3246              * Unicode status and folding, that's cheap or expensive.
3247              *
3248              * This algorithm is optimised for the case where are only a
3249              * small number of accept states, i.e. 0,1, or maybe 2.
3250              * With lots of accepts states, and having to try all of them,
3251              * it becomes quadratic on number of accept states to find all
3252              * the next words.
3253              */
3254
3255             {
3256                 /* what type of TRIE am I? (utf8 makes this contextual) */
3257                 DECL_TRIE_TYPE(scan);
3258
3259                 /* what trie are we using right now */
3260                 reg_trie_data * const trie
3261                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3262                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3263                 U32 state = trie->startstate;
3264
3265                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3266                     !TRIE_BITMAP_TEST(trie,*locinput)
3267                 ) {
3268                     if (trie->states[ state ].wordnum) {
3269                          DEBUG_EXECUTE_r(
3270                             PerlIO_printf(Perl_debug_log,
3271                                           "%*s  %smatched empty string...%s\n",
3272                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3273                         );
3274                         if (!trie->jump)
3275                             break;
3276                     } else {
3277                         DEBUG_EXECUTE_r(
3278                             PerlIO_printf(Perl_debug_log,
3279                                           "%*s  %sfailed to match trie start class...%s\n",
3280                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3281                         );
3282                         sayNO_SILENT;
3283                    }
3284                 }
3285
3286             {
3287                 U8 *uc = ( U8* )locinput;
3288
3289                 STRLEN len = 0;
3290                 STRLEN foldlen = 0;
3291                 U8 *uscan = (U8*)NULL;
3292                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3293                 U32 charcount = 0; /* how many input chars we have matched */
3294                 U32 accepted = 0; /* have we seen any accepting states? */
3295
3296                 ST.B = next;
3297                 ST.jump = trie->jump;
3298                 ST.me = scan;
3299                 ST.firstpos = NULL;
3300                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3301                 ST.nextword = 0;
3302
3303                 /* fully traverse the TRIE; note the position of the
3304                    shortest accept state and the wordnum of the longest
3305                    accept state */
3306
3307                 while ( state && uc <= (U8*)PL_regeol ) {
3308                     U32 base = trie->states[ state ].trans.base;
3309                     UV uvc = 0;
3310                     U16 charid = 0;
3311                     U16 wordnum;
3312                     wordnum = trie->states[ state ].wordnum;
3313
3314                     if (wordnum) { /* it's an accept state */
3315                         if (!accepted) {
3316                             accepted = 1;
3317                             /* record first match position */
3318                             if (ST.longfold) {
3319                                 ST.firstpos = (U8*)locinput;
3320                                 ST.firstchars = 0;
3321                             }
3322                             else {
3323                                 ST.firstpos = uc;
3324                                 ST.firstchars = charcount;
3325                             }
3326                         }
3327                         if (!ST.nextword || wordnum < ST.nextword)
3328                             ST.nextword = wordnum;
3329                         ST.topword = wordnum;
3330                     }
3331
3332                     DEBUG_TRIE_EXECUTE_r({
3333                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3334                                 PerlIO_printf( Perl_debug_log,
3335                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3336                                     2+depth * 2, "", PL_colors[4],
3337                                     (UV)state, (accepted ? 'Y' : 'N'));
3338                     });
3339
3340                     /* read a char and goto next state */
3341                     if ( base ) {
3342                         I32 offset;
3343                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3344                                              uscan, len, uvc, charid, foldlen,
3345                                              foldbuf, uniflags);
3346                         charcount++;
3347                         if (foldlen>0)
3348                             ST.longfold = TRUE;
3349                         if (charid &&
3350                              ( ((offset =
3351                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3352
3353                              && ((U32)offset < trie->lasttrans)
3354                              && trie->trans[offset].check == state)
3355                         {
3356                             state = trie->trans[offset].next;
3357                         }
3358                         else {
3359                             state = 0;
3360                         }
3361                         uc += len;
3362
3363                     }
3364                     else {
3365                         state = 0;
3366                     }
3367                     DEBUG_TRIE_EXECUTE_r(
3368                         PerlIO_printf( Perl_debug_log,
3369                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3370                             charid, uvc, (UV)state, PL_colors[5] );
3371                     );
3372                 }
3373                 if (!accepted)
3374                    sayNO;
3375
3376                 /* calculate total number of accept states */
3377                 {
3378                     U16 w = ST.topword;
3379                     accepted = 0;
3380                     while (w) {
3381                         w = trie->wordinfo[w].prev;
3382                         accepted++;
3383                     }
3384                     ST.accepted = accepted;
3385                 }
3386
3387                 DEBUG_EXECUTE_r(
3388                     PerlIO_printf( Perl_debug_log,
3389                         "%*s  %sgot %"IVdf" possible matches%s\n",
3390                         REPORT_CODE_OFF + depth * 2, "",
3391                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3392                 );
3393                 goto trie_first_try; /* jump into the fail handler */
3394             }}
3395             /* NOTREACHED */
3396
3397         case TRIE_next_fail: /* we failed - try next alternative */
3398             if ( ST.jump) {
3399                 REGCP_UNWIND(ST.cp);
3400                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3401                     PL_regoffs[n].end = -1;
3402                 *PL_reglastparen = n;
3403             }
3404             if (!--ST.accepted) {
3405                 DEBUG_EXECUTE_r({
3406                     PerlIO_printf( Perl_debug_log,
3407                         "%*s  %sTRIE failed...%s\n",
3408                         REPORT_CODE_OFF+depth*2, "",
3409                         PL_colors[4],
3410                         PL_colors[5] );
3411                 });
3412                 sayNO_SILENT;
3413             }
3414             {
3415                 /* Find next-highest word to process.  Note that this code
3416                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3417                 register U16 min = 0;
3418                 register U16 word;
3419                 register U16 const nextword = ST.nextword;
3420                 register reg_trie_wordinfo * const wordinfo
3421                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3422                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3423                     if (word > nextword && (!min || word < min))
3424                         min = word;
3425                 }
3426                 ST.nextword = min;
3427             }
3428
3429           trie_first_try:
3430             if (do_cutgroup) {
3431                 do_cutgroup = 0;
3432                 no_final = 0;
3433             }
3434
3435             if ( ST.jump) {
3436                 ST.lastparen = *PL_reglastparen;
3437                 REGCP_SET(ST.cp);
3438             }
3439
3440             /* find start char of end of current word */
3441             {
3442                 U32 chars; /* how many chars to skip */
3443                 U8 *uc = ST.firstpos;
3444                 reg_trie_data * const trie
3445                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3446
3447                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3448                             >=  ST.firstchars);
3449                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3450                             - ST.firstchars;
3451
3452                 if (ST.longfold) {
3453                     /* the hard option - fold each char in turn and find
3454                      * its folded length (which may be different */
3455                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3456                     STRLEN foldlen;
3457                     STRLEN len;
3458                     UV uvc;
3459                     U8 *uscan;
3460
3461                     while (chars) {
3462                         if (utf8_target) {
3463                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3464                                                     uniflags);
3465                             uc += len;
3466                         }
3467                         else {
3468                             uvc = *uc;
3469                             uc++;
3470                         }
3471                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3472                         uscan = foldbuf;
3473                         while (foldlen) {
3474                             if (!--chars)
3475                                 break;
3476                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3477                                             uniflags);
3478                             uscan += len;
3479                             foldlen -= len;
3480                         }
3481                     }
3482                 }
3483                 else {
3484                     if (utf8_target)
3485                         while (chars--)
3486                             uc += UTF8SKIP(uc);
3487                     else
3488                         uc += chars;
3489                 }
3490                 PL_reginput = (char *)uc;
3491             }
3492
3493             scan = (ST.jump && ST.jump[ST.nextword])
3494                         ? ST.me + ST.jump[ST.nextword]
3495                         : ST.B;
3496
3497             DEBUG_EXECUTE_r({
3498                 PerlIO_printf( Perl_debug_log,
3499                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3500                     REPORT_CODE_OFF+depth*2, "",
3501                     PL_colors[4],
3502                     ST.nextword,
3503                     PL_colors[5]
3504                     );
3505             });
3506
3507             if (ST.accepted > 1 || has_cutgroup) {
3508                 PUSH_STATE_GOTO(TRIE_next, scan);
3509                 /* NOTREACHED */
3510             }
3511             /* only one choice left - just continue */
3512             DEBUG_EXECUTE_r({
3513                 AV *const trie_words
3514                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3515                 SV ** const tmp = av_fetch( trie_words,
3516                     ST.nextword-1, 0 );
3517                 SV *sv= tmp ? sv_newmortal() : NULL;
3518
3519                 PerlIO_printf( Perl_debug_log,
3520                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3521                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3522                     ST.nextword,
3523                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3524                             PL_colors[0], PL_colors[1],
3525                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3526                         )
3527                     : "not compiled under -Dr",
3528                     PL_colors[5] );
3529             });
3530
3531             locinput = PL_reginput;
3532             nextchr = UCHARAT(locinput);
3533             continue; /* execute rest of RE */
3534             /* NOTREACHED */
3535 #undef  ST
3536
3537         case EXACT: {
3538             char *s = STRING(scan);
3539             ln = STR_LEN(scan);
3540             if (utf8_target != UTF_PATTERN) {
3541                 /* The target and the pattern have differing utf8ness. */
3542                 char *l = locinput;
3543                 const char * const e = s + ln;
3544
3545                 if (utf8_target) {
3546                     /* The target is utf8, the pattern is not utf8. */
3547                     while (s < e) {
3548                         STRLEN ulen;
3549                         if (l >= PL_regeol)
3550                              sayNO;
3551                         if (NATIVE_TO_UNI(*(U8*)s) !=
3552                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3553                                             uniflags))
3554                              sayNO;
3555                         l += ulen;
3556                         s ++;
3557                     }
3558                 }
3559                 else {
3560                     /* The target is not utf8, the pattern is utf8. */
3561                     while (s < e) {
3562                         STRLEN ulen;
3563                         if (l >= PL_regeol)
3564                             sayNO;
3565                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3566                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3567                                            uniflags))
3568                             sayNO;
3569                         s += ulen;
3570                         l ++;
3571                     }
3572                 }
3573                 locinput = l;
3574                 nextchr = UCHARAT(locinput);
3575                 break;
3576             }
3577             /* The target and the pattern have the same utf8ness. */
3578             /* Inline the first character, for speed. */
3579             if (UCHARAT(s) != nextchr)
3580                 sayNO;
3581             if (PL_regeol - locinput < ln)
3582                 sayNO;
3583             if (ln > 1 && memNE(s, locinput, ln))
3584                 sayNO;
3585             locinput += ln;
3586             nextchr = UCHARAT(locinput);
3587             break;
3588             }
3589         case EXACTFL: {
3590             re_fold_t folder;
3591             const U8 * fold_array;
3592             const char * s;
3593             U32 fold_utf8_flags;
3594
3595             PL_reg_flags |= RF_tainted;
3596             folder = foldEQ_locale;
3597             fold_array = PL_fold_locale;
3598             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3599             goto do_exactf;
3600
3601         case EXACTFU:
3602             folder = foldEQ_latin1;
3603             fold_array = PL_fold_latin1;
3604             fold_utf8_flags = 0;
3605             goto do_exactf;
3606
3607         case EXACTFA:
3608             folder = foldEQ_latin1;
3609             fold_array = PL_fold_latin1;
3610             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3611             goto do_exactf;
3612
3613         case EXACTF:
3614             folder = foldEQ;
3615             fold_array = PL_fold;
3616             fold_utf8_flags = 0;
3617
3618           do_exactf:
3619             s = STRING(scan);
3620             ln = STR_LEN(scan);
3621
3622             if (utf8_target || UTF_PATTERN) {
3623               /* Either target or the pattern are utf8. */
3624                 const char * const l = locinput;
3625                 char *e = PL_regeol;
3626
3627                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3628                                l, &e, 0,  utf8_target, fold_utf8_flags))
3629                 {
3630                     sayNO;
3631                 }
3632                 locinput = e;
3633                 nextchr = UCHARAT(locinput);
3634                 break;
3635             }
3636
3637             /* Neither the target nor the pattern are utf8 */
3638             if (UCHARAT(s) != nextchr &&
3639                 UCHARAT(s) != fold_array[nextchr])
3640             {
3641                 sayNO;
3642             }
3643             if (PL_regeol - locinput < ln)
3644                 sayNO;
3645             if (ln > 1 && ! folder(s, locinput, ln))
3646                 sayNO;
3647             locinput += ln;
3648             nextchr = UCHARAT(locinput);
3649             break;
3650         }
3651
3652         /* XXX Could improve efficiency by separating these all out using a
3653          * macro or in-line function.  At that point regcomp.c would no longer
3654          * have to set the FLAGS fields of these */
3655         case BOUNDL:
3656         case NBOUNDL:
3657             PL_reg_flags |= RF_tainted;
3658             /* FALL THROUGH */
3659         case BOUND:
3660         case BOUNDU:
3661         case BOUNDA:
3662         case NBOUND:
3663         case NBOUNDU:
3664         case NBOUNDA:
3665             /* was last char in word? */
3666             if (utf8_target
3667                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3668                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3669             {
3670                 if (locinput == PL_bostr)
3671                     ln = '\n';
3672                 else {
3673                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3674
3675                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3676                 }
3677                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3678                     ln = isALNUM_uni(ln);
3679                     LOAD_UTF8_CHARCLASS_ALNUM();
3680                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3681                 }
3682                 else {
3683                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3684                     n = isALNUM_LC_utf8((U8*)locinput);
3685                 }
3686             }
3687             else {
3688
3689                 /* Here the string isn't utf8, or is utf8 and only ascii
3690                  * characters are to match \w.  In the latter case looking at
3691                  * the byte just prior to the current one may be just the final
3692                  * byte of a multi-byte character.  This is ok.  There are two
3693                  * cases:
3694                  * 1) it is a single byte character, and then the test is doing
3695                  *      just what it's supposed to.
3696                  * 2) it is a multi-byte character, in which case the final
3697                  *      byte is never mistakable for ASCII, and so the test
3698                  *      will say it is not a word character, which is the
3699                  *      correct answer. */
3700                 ln = (locinput != PL_bostr) ?
3701                     UCHARAT(locinput - 1) : '\n';
3702                 switch (FLAGS(scan)) {
3703                     case REGEX_UNICODE_CHARSET:
3704                         ln = isWORDCHAR_L1(ln);
3705                         n = isWORDCHAR_L1(nextchr);
3706                         break;
3707                     case REGEX_LOCALE_CHARSET:
3708                         ln = isALNUM_LC(ln);
3709                         n = isALNUM_LC(nextchr);
3710                         break;
3711                     case REGEX_DEPENDS_CHARSET:
3712                         ln = isALNUM(ln);
3713                         n = isALNUM(nextchr);
3714                         break;
3715                     case REGEX_ASCII_RESTRICTED_CHARSET:
3716                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3717                         ln = isWORDCHAR_A(ln);
3718                         n = isWORDCHAR_A(nextchr);
3719                         break;
3720                     default:
3721                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3722                         break;
3723                 }
3724             }
3725             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3726              * regcomp.sym */
3727             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3728                     sayNO;
3729             break;
3730         case ANYOFV:
3731         case ANYOF:
3732             if (utf8_target || state_num == ANYOFV) {
3733                 STRLEN inclasslen = PL_regeol - locinput;
3734                 if (locinput >= PL_regeol)
3735                     sayNO;
3736
3737                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3738                     sayNO;
3739                 locinput += inclasslen;
3740                 nextchr = UCHARAT(locinput);
3741                 break;
3742             }
3743             else {
3744                 if (nextchr < 0)
3745                     nextchr = UCHARAT(locinput);
3746                 if (!nextchr && locinput >= PL_regeol)
3747                     sayNO;
3748                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3749                     sayNO;
3750                 nextchr = UCHARAT(++locinput);
3751                 break;
3752             }
3753             break;
3754         /* Special char classes - The defines start on line 129 or so */
3755         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3756                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3757                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3758                   ALNUMA, NALNUMA, isWORDCHAR_A,
3759                   alnum, "a");
3760
3761         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3762                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3763                   SPACEU, NSPACEU, isSPACE_L1,
3764                   SPACEA, NSPACEA, isSPACE_A,
3765                   space, " ");
3766
3767         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3768                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3769                 DIGITA, NDIGITA, isDIGIT_A,
3770                 digit, "0");
3771
3772         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3773                        a Unicode extended Grapheme Cluster */
3774             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3775               extended Grapheme Cluster is:
3776
3777                CR LF
3778                | Prepend* Begin Extend*
3779                | .
3780
3781                Begin is (Hangul-syllable | ! Control)
3782                Extend is (Grapheme_Extend | Spacing_Mark)
3783                Control is [ GCB_Control CR LF ]
3784
3785                The discussion below shows how the code for CLUMP is derived
3786                from this regex.  Note that most of these concepts are from
3787                property values of the Grapheme Cluster Boundary (GCB) property.
3788                No code point can have multiple property values for a given
3789                property.  Thus a code point in Prepend can't be in Control, but
3790                it must be in !Control.  This is why Control above includes
3791                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3792                property separately, and so can't be in GCB_Control, even though
3793                they logically are controls.  Control is not the same as gc=cc,
3794                but includes format and other characters as well.
3795
3796                The Unicode definition of Hangul-syllable is:
3797                    L+
3798                    | (L* ( ( V | LV ) V* | LVT ) T*)
3799                    | T+
3800                   )
3801                Each of these is a value for the GCB property, and hence must be
3802                disjoint, so the order they are tested is immaterial, so the
3803                above can safely be changed to
3804                    T+
3805                    | L+
3806                    | (L* ( LVT | ( V | LV ) V*) T*)
3807
3808                The last two terms can be combined like this:
3809                    L* ( L
3810                         | (( LVT | ( V | LV ) V*) T*))
3811
3812                And refactored into this:
3813                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3814
3815                That means that if we have seen any L's at all we can quit
3816                there, but if the next character is a LVT, a V or and LV we
3817                should keep going.
3818
3819                There is a subtlety with Prepend* which showed up in testing.
3820                Note that the Begin, and only the Begin is required in:
3821                 | Prepend* Begin Extend*
3822                Also, Begin contains '! Control'.  A Prepend must be a '!
3823                Control', which means it must be a Begin.  What it comes down to
3824                is that if we match Prepend* and then find no suitable Begin
3825                afterwards, that if we backtrack the last Prepend, that one will
3826                be a suitable Begin.
3827             */
3828
3829             if (locinput >= PL_regeol)
3830                 sayNO;
3831             if  (! utf8_target) {
3832
3833                 /* Match either CR LF  or '.', as all the other possibilities
3834                  * require utf8 */
3835                 locinput++;         /* Match the . or CR */
3836                 if (nextchr == '\r'
3837                     && locinput < PL_regeol
3838                     && UCHARAT(locinput) == '\n') locinput++;
3839             }
3840             else {
3841
3842                 /* Utf8: See if is ( CR LF ); already know that locinput <
3843                  * PL_regeol, so locinput+1 is in bounds */
3844                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3845                     locinput += 2;
3846                 }
3847                 else {
3848                     /* In case have to backtrack to beginning, then match '.' */
3849                     char *starting = locinput;
3850
3851                     /* In case have to backtrack the last prepend */
3852                     char *previous_prepend = 0;
3853
3854                     LOAD_UTF8_CHARCLASS_GCB();
3855
3856                     /* Match (prepend)* */
3857                     while (locinput < PL_regeol
3858                            && swash_fetch(PL_utf8_X_prepend,
3859                                           (U8*)locinput, utf8_target))
3860                     {
3861                         previous_prepend = locinput;
3862                         locinput += UTF8SKIP(locinput);
3863                     }
3864
3865                     /* As noted above, if we matched a prepend character, but
3866                      * the next thing won't match, back off the last prepend we
3867                      * matched, as it is guaranteed to match the begin */
3868                     if (previous_prepend
3869                         && (locinput >=  PL_regeol
3870                             || ! swash_fetch(PL_utf8_X_begin,
3871                                              (U8*)locinput, utf8_target)))
3872                     {
3873                         locinput = previous_prepend;
3874                     }
3875
3876                     /* Note that here we know PL_regeol > locinput, as we
3877                      * tested that upon input to this switch case, and if we
3878                      * moved locinput forward, we tested the result just above
3879                      * and it either passed, or we backed off so that it will
3880                      * now pass */
3881                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3882
3883                         /* Here did not match the required 'Begin' in the
3884                          * second term.  So just match the very first
3885                          * character, the '.' of the final term of the regex */
3886                         locinput = starting + UTF8SKIP(starting);
3887                     } else {
3888
3889                         /* Here is the beginning of a character that can have
3890                          * an extender.  It is either a hangul syllable, or a
3891                          * non-control */
3892                         if (swash_fetch(PL_utf8_X_non_hangul,
3893                                         (U8*)locinput, utf8_target))
3894                         {
3895
3896                             /* Here not a Hangul syllable, must be a
3897                              * ('!  * Control') */
3898                             locinput += UTF8SKIP(locinput);
3899                         } else {
3900
3901                             /* Here is a Hangul syllable.  It can be composed
3902                              * of several individual characters.  One
3903                              * possibility is T+ */
3904                             if (swash_fetch(PL_utf8_X_T,
3905                                             (U8*)locinput, utf8_target))
3906                             {
3907                                 while (locinput < PL_regeol
3908                                         && swash_fetch(PL_utf8_X_T,
3909                                                         (U8*)locinput, utf8_target))
3910                                 {
3911                                     locinput += UTF8SKIP(locinput);
3912                                 }
3913                             } else {
3914
3915                                 /* Here, not T+, but is a Hangul.  That means
3916                                  * it is one of the others: L, LV, LVT or V,
3917                                  * and matches:
3918                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3919
3920                                 /* Match L*           */
3921                                 while (locinput < PL_regeol
3922                                         && swash_fetch(PL_utf8_X_L,
3923                                                         (U8*)locinput, utf8_target))
3924                                 {
3925                                     locinput += UTF8SKIP(locinput);
3926                                 }
3927
3928                                 /* Here, have exhausted L*.  If the next
3929                                  * character is not an LV, LVT nor V, it means
3930                                  * we had to have at least one L, so matches L+
3931                                  * in the original equation, we have a complete
3932                                  * hangul syllable.  Are done. */
3933
3934                                 if (locinput < PL_regeol
3935                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3936                                                     (U8*)locinput, utf8_target))
3937                                 {
3938
3939                                     /* Otherwise keep going.  Must be LV, LVT
3940                                      * or V.  See if LVT */
3941                                     if (swash_fetch(PL_utf8_X_LVT,
3942                                                     (U8*)locinput, utf8_target))
3943                                     {
3944                                         locinput += UTF8SKIP(locinput);
3945                                     } else {
3946
3947                                         /* Must be  V or LV.  Take it, then
3948                                          * match V*     */
3949                                         locinput += UTF8SKIP(locinput);
3950                                         while (locinput < PL_regeol
3951                                                 && swash_fetch(PL_utf8_X_V,
3952                                                          (U8*)locinput, utf8_target))
3953                                         {
3954                                             locinput += UTF8SKIP(locinput);
3955                                         }
3956                                     }
3957
3958                                     /* And any of LV, LVT, or V can be followed
3959                                      * by T*            */
3960                                     while (locinput < PL_regeol
3961                                            && swash_fetch(PL_utf8_X_T,
3962                                                            (U8*)locinput,
3963                                                            utf8_target))
3964                                     {
3965                                         locinput += UTF8SKIP(locinput);
3966                                     }
3967                                 }
3968                             }
3969                         }
3970
3971                         /* Match any extender */
3972                         while (locinput < PL_regeol
3973                                 && swash_fetch(PL_utf8_X_extend,
3974                                                 (U8*)locinput, utf8_target))
3975                         {
3976                             locinput += UTF8SKIP(locinput);
3977                         }
3978                     }
3979                 }
3980                 if (locinput > PL_regeol) sayNO;
3981             }
3982             nextchr = UCHARAT(locinput);
3983             break;
3984
3985         case NREFFL:
3986         {   /* The capture buffer cases.  The ones beginning with N for the
3987                named buffers just convert to the equivalent numbered and
3988                pretend they were called as the corresponding numbered buffer
3989                op.  */
3990             /* don't initialize these in the declaration, it makes C++
3991                unhappy */
3992             char *s;
3993             char type;
3994             re_fold_t folder;
3995             const U8 *fold_array;
3996             UV utf8_fold_flags;
3997
3998             PL_reg_flags |= RF_tainted;
3999             folder = foldEQ_locale;
4000             fold_array = PL_fold_locale;
4001             type = REFFL;
4002             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4003             goto do_nref;
4004
4005         case NREFFA:
4006             folder = foldEQ_latin1;
4007             fold_array = PL_fold_latin1;
4008             type = REFFA;
4009             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4010             goto do_nref;
4011
4012         case NREFFU:
4013             folder = foldEQ_latin1;
4014             fold_array = PL_fold_latin1;
4015             type = REFFU;
4016             utf8_fold_flags = 0;
4017             goto do_nref;
4018
4019         case NREFF:
4020             folder = foldEQ;
4021             fold_array = PL_fold;
4022             type = REFF;
4023             utf8_fold_flags = 0;
4024             goto do_nref;
4025
4026         case NREF:
4027             type = REF;
4028             folder = NULL;
4029             fold_array = NULL;
4030             utf8_fold_flags = 0;
4031           do_nref:
4032
4033             /* For the named back references, find the corresponding buffer
4034              * number */
4035             n = reg_check_named_buff_matched(rex,scan);
4036
4037             if ( ! n ) {
4038                 sayNO;
4039             }
4040             goto do_nref_ref_common;
4041
4042         case REFFL:
4043             PL_reg_flags |= RF_tainted;
4044             folder = foldEQ_locale;
4045             fold_array = PL_fold_locale;
4046             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4047             goto do_ref;
4048
4049         case REFFA:
4050             folder = foldEQ_latin1;
4051             fold_array = PL_fold_latin1;
4052             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4053             goto do_ref;
4054
4055         case REFFU:
4056             folder = foldEQ_latin1;
4057             fold_array = PL_fold_latin1;
4058             utf8_fold_flags = 0;
4059             goto do_ref;
4060
4061         case REFF:
4062             folder = foldEQ;
4063             fold_array = PL_fold;
4064             utf8_fold_flags = 0;
4065             goto do_ref;
4066
4067         case REF:
4068             folder = NULL;
4069             fold_array = NULL;
4070             utf8_fold_flags = 0;
4071
4072           do_ref:
4073             type = OP(scan);
4074             n = ARG(scan);  /* which paren pair */
4075
4076           do_nref_ref_common:
4077             ln = PL_regoffs[n].start;
4078             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4079             if (*PL_reglastparen < n || ln == -1)
4080                 sayNO;                  /* Do not match unless seen CLOSEn. */
4081             if (ln == PL_regoffs[n].end)
4082                 break;
4083
4084             s = PL_bostr + ln;
4085             if (type != REF     /* REF can do byte comparison */
4086                 && (utf8_target || type == REFFU))
4087             { /* XXX handle REFFL better */
4088                 char * limit = PL_regeol;
4089
4090                 /* This call case insensitively compares the entire buffer
4091                     * at s, with the current input starting at locinput, but
4092                     * not going off the end given by PL_regeol, and returns in
4093                     * limit upon success, how much of the current input was
4094                     * matched */
4095                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4096                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4097                 {
4098                     sayNO;
4099                 }
4100                 locinput = limit;
4101                 nextchr = UCHARAT(locinput);
4102                 break;
4103             }
4104
4105             /* Not utf8:  Inline the first character, for speed. */
4106             if (UCHARAT(s) != nextchr &&
4107                 (type == REF ||
4108                  UCHARAT(s) != fold_array[nextchr]))
4109                 sayNO;
4110             ln = PL_regoffs[n].end - ln;
4111             if (locinput + ln > PL_regeol)
4112                 sayNO;
4113             if (ln > 1 && (type == REF
4114                            ? memNE(s, locinput, ln)
4115                            : ! folder(s, locinput, ln)))
4116                 sayNO;
4117             locinput += ln;
4118             nextchr = UCHARAT(locinput);
4119             break;
4120         }
4121         case NOTHING:
4122         case TAIL:
4123             break;
4124         case BACK:
4125             break;
4126
4127 #undef  ST
4128 #define ST st->u.eval
4129         {
4130             SV *ret;
4131             REGEXP *re_sv;
4132             regexp *re;
4133             regexp_internal *rei;
4134             regnode *startpoint;
4135
4136         case GOSTART:
4137         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4138             if (cur_eval && cur_eval->locinput==locinput) {
4139                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4140                     Perl_croak(aTHX_ "Infinite recursion in regex");
4141                 if ( ++nochange_depth > max_nochange_depth )
4142                     Perl_croak(aTHX_
4143                         "Pattern subroutine nesting without pos change"
4144                         " exceeded limit in regex");
4145             } else {
4146                 nochange_depth = 0;
4147             }
4148             re_sv = rex_sv;
4149             re = rex;
4150             rei = rexi;
4151             (void)ReREFCNT_inc(rex_sv);
4152             if (OP(scan)==GOSUB) {
4153                 startpoint = scan + ARG2L(scan);
4154                 ST.close_paren = ARG(scan);
4155             } else {
4156                 startpoint = rei->program+1;
4157                 ST.close_paren = 0;
4158             }
4159             goto eval_recurse_doit;
4160             /* NOTREACHED */
4161         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4162             if (cur_eval && cur_eval->locinput==locinput) {
4163                 if ( ++nochange_depth > max_nochange_depth )
4164                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4165             } else {
4166                 nochange_depth = 0;
4167             }
4168             {
4169                 /* execute the code in the {...} */
4170                 dSP;
4171                 SV ** const before = SP;
4172                 OP_4tree * const oop = PL_op;
4173                 COP * const ocurcop = PL_curcop;
4174                 PAD *old_comppad;
4175                 char *saved_regeol = PL_regeol;
4176                 struct re_save_state saved_state;
4177
4178                 /* To not corrupt the existing regex state while executing the
4179                  * eval we would normally put it on the save stack, like with
4180                  * save_re_context. However, re-evals have a weird scoping so we
4181                  * can't just add ENTER/LEAVE here. With that, things like
4182                  *
4183                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4184                  *
4185                  * would break, as they expect the localisation to be unwound
4186                  * only when the re-engine backtracks through the bit that
4187                  * localised it.
4188                  *
4189                  * What we do instead is just saving the state in a local c
4190                  * variable.
4191                  */
4192                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4193
4194                 n = ARG(scan);
4195                 PL_op = (OP_4tree*)rexi->data->data[n];
4196                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4197                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4198                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4199                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4200
4201                 if (sv_yes_mark) {
4202                     SV *sv_mrk = get_sv("REGMARK", 1);
4203                     sv_setsv(sv_mrk, sv_yes_mark);
4204                 }
4205
4206                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4207                 SPAGAIN;
4208                 if (SP == before)
4209                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4210                 else {
4211                     ret = POPs;
4212                     PUTBACK;
4213                 }
4214
4215                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4216
4217                 PL_op = oop;
4218                 PAD_RESTORE_LOCAL(old_comppad);
4219                 PL_curcop = ocurcop;
4220                 PL_regeol = saved_regeol;
4221                 if (!logical) {
4222                     /* /(?{...})/ */
4223                     sv_setsv(save_scalar(PL_replgv), ret);
4224                     break;
4225                 }
4226             }
4227             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4228                 logical = 0;
4229                 {
4230                     /* extract RE object from returned value; compiling if
4231                      * necessary */
4232                     MAGIC *mg = NULL;
4233                     REGEXP *rx = NULL;
4234
4235                     if (SvROK(ret)) {
4236                         SV *const sv = SvRV(ret);
4237
4238                         if (SvTYPE(sv) == SVt_REGEXP) {
4239                             rx = (REGEXP*) sv;
4240                         } else if (SvSMAGICAL(sv)) {
4241                             mg = mg_find(sv, PERL_MAGIC_qr);
4242                             assert(mg);
4243                         }
4244                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4245                         rx = (REGEXP*) ret;
4246                     } else if (SvSMAGICAL(ret)) {
4247                         if (SvGMAGICAL(ret)) {
4248                             /* I don't believe that there is ever qr magic
4249                                here.  */
4250                             assert(!mg_find(ret, PERL_MAGIC_qr));
4251                             sv_unmagic(ret, PERL_MAGIC_qr);
4252                         }
4253                         else {
4254                             mg = mg_find(ret, PERL_MAGIC_qr);
4255                             /* testing suggests mg only ends up non-NULL for
4256                                scalars who were upgraded and compiled in the
4257                                else block below. In turn, this is only
4258                                triggered in the "postponed utf8 string" tests
4259                                in t/op/pat.t  */
4260                         }
4261                     }
4262
4263                     if (mg) {
4264                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4265                         assert(rx);
4266                     }
4267                     if (rx) {
4268                         rx = reg_temp_copy(NULL, rx);
4269                     }
4270                     else {
4271                         U32 pm_flags = 0;
4272                         const I32 osize = PL_regsize;
4273
4274                         if (DO_UTF8(ret)) {
4275                             assert (SvUTF8(ret));
4276                         } else if (SvUTF8(ret)) {
4277                             /* Not doing UTF-8, despite what the SV says. Is
4278                                this only if we're trapped in use 'bytes'?  */
4279                             /* Make a copy of the octet sequence, but without
4280                                the flag on, as the compiler now honours the
4281                                SvUTF8 flag on ret.  */
4282                             STRLEN len;
4283                             const char *const p = SvPV(ret, len);
4284                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4285                         }
4286                         rx = CALLREGCOMP(ret, pm_flags);
4287                         if (!(SvFLAGS(ret)
4288                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4289                                  | SVs_GMG))) {
4290                             /* This isn't a first class regexp. Instead, it's
4291                                caching a regexp onto an existing, Perl visible
4292                                scalar.  */
4293                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4294                         }
4295                         PL_regsize = osize;
4296                     }
4297                     re_sv = rx;
4298                     re = (struct regexp *)SvANY(rx);
4299                 }
4300                 RXp_MATCH_COPIED_off(re);
4301                 re->subbeg = rex->subbeg;
4302                 re->sublen = rex->sublen;
4303                 rei = RXi_GET(re);
4304                 DEBUG_EXECUTE_r(
4305                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4306                         "Matching embedded");
4307                 );
4308                 startpoint = rei->program + 1;
4309                 ST.close_paren = 0; /* only used for GOSUB */
4310                 /* borrowed from regtry */
4311                 if (PL_reg_start_tmpl <= re->nparens) {
4312                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4313                     if(PL_reg_start_tmp)
4314                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4315                     else
4316                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4317                 }
4318
4319         eval_recurse_doit: /* Share code with GOSUB below this line */
4320                 /* run the pattern returned from (??{...}) */
4321                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4322                 REGCP_SET(ST.lastcp);
4323
4324                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4325
4326                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4327                 PL_reglastparen = &re->lastparen;
4328                 PL_reglastcloseparen = &re->lastcloseparen;
4329                 re->lastparen = 0;
4330                 re->lastcloseparen = 0;
4331
4332                 PL_reginput = locinput;
4333                 PL_regsize = 0;
4334
4335                 /* XXXX This is too dramatic a measure... */
4336                 PL_reg_maxiter = 0;
4337
4338                 ST.toggle_reg_flags = PL_reg_flags;
4339                 if (RX_UTF8(re_sv))
4340                     PL_reg_flags |= RF_utf8;
4341                 else
4342                     PL_reg_flags &= ~RF_utf8;
4343                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4344
4345                 ST.prev_rex = rex_sv;
4346                 ST.prev_curlyx = cur_curlyx;
4347                 SETREX(rex_sv,re_sv);
4348                 rex = re;
4349                 rexi = rei;
4350                 cur_curlyx = NULL;
4351                 ST.B = next;
4352                 ST.prev_eval = cur_eval;
4353                 cur_eval = st;
4354                 /* now continue from first node in postoned RE */
4355                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4356                 /* NOTREACHED */
4357             }
4358             /* logical is 1,   /(?(?{...})X|Y)/ */
4359             sw = cBOOL(SvTRUE(ret));
4360             logical = 0;
4361             break;
4362         }
4363
4364         case EVAL_AB: /* cleanup after a successful (??{A})B */
4365             /* note: this is called twice; first after popping B, then A */
4366             PL_reg_flags ^= ST.toggle_reg_flags;
4367             ReREFCNT_dec(rex_sv);
4368             SETREX(rex_sv,ST.prev_rex);
4369             rex = (struct regexp *)SvANY(rex_sv);
4370             rexi = RXi_GET(rex);
4371             regcpblow(ST.cp);
4372             cur_eval = ST.prev_eval;
4373             cur_curlyx = ST.prev_curlyx;
4374
4375             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4376             PL_reglastparen = &rex->lastparen;
4377             PL_reglastcloseparen = &rex->lastcloseparen;
4378             /* also update PL_regoffs */
4379             PL_regoffs = rex->offs;
4380
4381             /* XXXX This is too dramatic a measure... */
4382             PL_reg_maxiter = 0;
4383             if ( nochange_depth )
4384                 nochange_depth--;
4385             sayYES;
4386
4387
4388         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4389             /* note: this is called twice; first after popping B, then A */
4390             PL_reg_flags ^= ST.toggle_reg_flags;
4391             ReREFCNT_dec(rex_sv);
4392             SETREX(rex_sv,ST.prev_rex);
4393             rex = (struct regexp *)SvANY(rex_sv);
4394             rexi = RXi_GET(rex);
4395             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4396             PL_reglastparen = &rex->lastparen;
4397             PL_reglastcloseparen = &rex->lastcloseparen;
4398
4399             PL_reginput = locinput;
4400             REGCP_UNWIND(ST.lastcp);
4401             regcppop(rex);
4402             cur_eval = ST.prev_eval;
4403             cur_curlyx = ST.prev_curlyx;
4404             /* XXXX This is too dramatic a measure... */
4405             PL_reg_maxiter = 0;
4406             if ( nochange_depth )
4407                 nochange_depth--;
4408             sayNO_SILENT;
4409 #undef ST
4410
4411         case OPEN:
4412             n = ARG(scan);  /* which paren pair */
4413             PL_reg_start_tmp[n] = locinput;
4414             if (n > PL_regsize)
4415                 PL_regsize = n;
4416             lastopen = n;
4417             break;
4418         case CLOSE:
4419             n = ARG(scan);  /* which paren pair */
4420             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4421             PL_regoffs[n].end = locinput - PL_bostr;
4422             /*if (n > PL_regsize)
4423                 PL_regsize = n;*/
4424             if (n > *PL_reglastparen)
4425                 *PL_reglastparen = n;
4426             *PL_reglastcloseparen = n;
4427             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4428                 goto fake_end;
4429             }
4430             break;
4431         case ACCEPT:
4432             if (ARG(scan)){
4433                 regnode *cursor;
4434                 for (cursor=scan;
4435                      cursor && OP(cursor)!=END;
4436                      cursor=regnext(cursor))
4437                 {
4438                     if ( OP(cursor)==CLOSE ){
4439                         n = ARG(cursor);
4440                         if ( n <= lastopen ) {
4441                             PL_regoffs[n].start
4442                                 = PL_reg_start_tmp[n] - PL_bostr;
4443                             PL_regoffs[n].end = locinput - PL_bostr;
4444                             /*if (n > PL_regsize)
4445                             PL_regsize = n;*/
4446                             if (n > *PL_reglastparen)
4447                                 *PL_reglastparen = n;
4448                             *PL_reglastcloseparen = n;
4449                             if ( n == ARG(scan) || (cur_eval &&
4450                                 cur_eval->u.eval.close_paren == n))
4451                                 break;
4452                         }
4453                     }
4454                 }
4455             }
4456             goto fake_end;
4457             /*NOTREACHED*/
4458         case GROUPP:
4459             n = ARG(scan);  /* which paren pair */
4460             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4461             break;
4462         case NGROUPP:
4463             /* reg_check_named_buff_matched returns 0 for no match */
4464             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4465             break;
4466         case INSUBP:
4467             n = ARG(scan);
4468             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4469             break;
4470         case DEFINEP:
4471             sw = 0;
4472             break;
4473         case IFTHEN:
4474             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4475             if (sw)
4476                 next = NEXTOPER(NEXTOPER(scan));
4477             else {
4478                 next = scan + ARG(scan);
4479                 if (OP(next) == IFTHEN) /* Fake one. */
4480                     next = NEXTOPER(NEXTOPER(next));
4481             }
4482             break;
4483         case LOGICAL:
4484             logical = scan->flags;
4485             break;
4486
4487 /*******************************************************************
4488
4489 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4490 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4491 STAR/PLUS/CURLY/CURLYN are used instead.)
4492
4493 A*B is compiled as <CURLYX><A><WHILEM><B>
4494
4495 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4496 state, which contains the current count, initialised to -1. It also sets
4497 cur_curlyx to point to this state, with any previous value saved in the
4498 state block.
4499
4500 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4501 since the pattern may possibly match zero times (i.e. it's a while {} loop
4502 rather than a do {} while loop).
4503
4504 Each entry to WHILEM represents a successful match of A. The count in the
4505 CURLYX block is incremented, another WHILEM state is pushed, and execution
4506 passes to A or B depending on greediness and the current count.
4507
4508 For example, if matching against the string a1a2a3b (where the aN are
4509 substrings that match /A/), then the match progresses as follows: (the
4510 pushed states are interspersed with the bits of strings matched so far):
4511
4512     <CURLYX cnt=-1>
4513     <CURLYX cnt=0><WHILEM>
4514     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4515     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4516     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4517     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4518
4519 (Contrast this with something like CURLYM, which maintains only a single
4520 backtrack state:
4521
4522     <CURLYM cnt=0> a1
4523     a1 <CURLYM cnt=1> a2
4524     a1 a2 <CURLYM cnt=2> a3
4525     a1 a2 a3 <CURLYM cnt=3> b
4526 )
4527
4528 Each WHILEM state block marks a point to backtrack to upon partial failure
4529 of A or B, and also contains some minor state data related to that
4530 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4531 overall state, such as the count, and pointers to the A and B ops.
4532
4533 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4534 must always point to the *current* CURLYX block, the rules are:
4535
4536 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4537 and set cur_curlyx to point the new block.
4538
4539 When popping the CURLYX block after a successful or unsuccessful match,
4540 restore the previous cur_curlyx.
4541
4542 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4543 to the outer one saved in the CURLYX block.
4544
4545 When popping the WHILEM block after a successful or unsuccessful B match,
4546 restore the previous cur_curlyx.
4547
4548 Here's an example for the pattern (AI* BI)*BO
4549 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4550
4551 cur_
4552 curlyx backtrack stack
4553 ------ ---------------
4554 NULL
4555 CO     <CO prev=NULL> <WO>
4556 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4557 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4558 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4559
4560 At this point the pattern succeeds, and we work back down the stack to
4561 clean up, restoring as we go:
4562
4563 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4564 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4565 CO     <CO prev=NULL> <WO>
4566 NULL
4567
4568 *******************************************************************/
4569
4570 #define ST st->u.curlyx
4571
4572         case CURLYX:    /* start of /A*B/  (for complex A) */
4573         {
4574             /* No need to save/restore up to this paren */
4575             I32 parenfloor = scan->flags;
4576
4577             assert(next); /* keep Coverity happy */
4578             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4579                 next += ARG(next);
4580
4581             /* XXXX Probably it is better to teach regpush to support
4582                parenfloor > PL_regsize... */
4583             if (parenfloor > (I32)*PL_reglastparen)
4584                 parenfloor = *PL_reglastparen; /* Pessimization... */
4585
4586             ST.prev_curlyx= cur_curlyx;
4587             cur_curlyx = st;
4588             ST.cp = PL_savestack_ix;
4589
4590             /* these fields contain the state of the current curly.
4591              * they are accessed by subsequent WHILEMs */
4592             ST.parenfloor = parenfloor;
4593             ST.me = scan;
4594             ST.B = next;
4595             ST.minmod = minmod;
4596             minmod = 0;
4597             ST.count = -1;      /* this will be updated by WHILEM */
4598             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4599
4600             PL_reginput = locinput;
4601             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4602             /* NOTREACHED */
4603         }
4604
4605         case CURLYX_end: /* just finished matching all of A*B */
4606             cur_curlyx = ST.prev_curlyx;
4607             sayYES;
4608             /* NOTREACHED */
4609
4610         case CURLYX_end_fail: /* just failed to match all of A*B */
4611             regcpblow(ST.cp);
4612             cur_curlyx = ST.prev_curlyx;
4613             sayNO;
4614             /* NOTREACHED */
4615
4616
4617 #undef ST
4618 #define ST st->u.whilem
4619
4620         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4621         {
4622             /* see the discussion above about CURLYX/WHILEM */
4623             I32 n;
4624             int min = ARG1(cur_curlyx->u.curlyx.me);
4625             int max = ARG2(cur_curlyx->u.curlyx.me);
4626             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4627
4628             assert(cur_curlyx); /* keep Coverity happy */
4629             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4630             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4631             ST.cache_offset = 0;
4632             ST.cache_mask = 0;
4633
4634             PL_reginput = locinput;
4635
4636             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4637                   "%*s  whilem: matched %ld out of %d..%d\n",
4638                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4639             );
4640
4641             /* First just match a string of min A's. */
4642
4643             if (n < min) {
4644                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4645                 cur_curlyx->u.curlyx.lastloc = locinput;
4646                 REGCP_SET(ST.lastcp);
4647
4648                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4649                 /* NOTREACHED */
4650             }
4651
4652             /* If degenerate A matches "", assume A done. */
4653
4654             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4655                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4656                    "%*s  whilem: empty match detected, trying continuation...\n",
4657                    REPORT_CODE_OFF+depth*2, "")
4658                 );
4659                 goto do_whilem_B_max;
4660             }
4661
4662             /* super-linear cache processing */
4663
4664             if (scan->flags) {
4665
4666                 if (!PL_reg_maxiter) {
4667                     /* start the countdown: Postpone detection until we
4668                      * know the match is not *that* much linear. */
4669                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4670                     /* possible overflow for long strings and many CURLYX's */
4671                     if (PL_reg_maxiter < 0)
4672                         PL_reg_maxiter = I32_MAX;
4673                     PL_reg_leftiter = PL_reg_maxiter;
4674                 }
4675
4676                 if (PL_reg_leftiter-- == 0) {
4677                     /* initialise cache */
4678                     const I32 size = (PL_reg_maxiter + 7)/8;
4679                     if (PL_reg_poscache) {
4680                         if ((I32)PL_reg_poscache_size < size) {
4681                             Renew(PL_reg_poscache, size, char);
4682                             PL_reg_poscache_size = size;
4683                         }
4684                         Zero(PL_reg_poscache, size, char);
4685                     }
4686                     else {
4687                         PL_reg_poscache_size = size;
4688                         Newxz(PL_reg_poscache, size, char);
4689                     }
4690                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4691       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4692                               PL_colors[4], PL_colors[5])
4693                     );
4694                 }
4695
4696                 if (PL_reg_leftiter < 0) {
4697                     /* have we already failed at this position? */
4698                     I32 offset, mask;
4699                     offset  = (scan->flags & 0xf) - 1
4700                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4701                     mask    = 1 << (offset % 8);
4702                     offset /= 8;
4703                     if (PL_reg_poscache[offset] & mask) {
4704                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4705                             "%*s  whilem: (cache) already tried at this position...\n",
4706                             REPORT_CODE_OFF+depth*2, "")
4707                         );
4708                         sayNO; /* cache records failure */
4709                     }
4710                     ST.cache_offset = offset;
4711                     ST.cache_mask   = mask;
4712                 }
4713             }
4714
4715             /* Prefer B over A for minimal matching. */
4716
4717             if (cur_curlyx->u.curlyx.minmod) {
4718                 ST.save_curlyx = cur_curlyx;
4719                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4720                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4721                 REGCP_SET(ST.lastcp);
4722                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4723                 /* NOTREACHED */
4724             }
4725
4726             /* Prefer A over B for maximal matching. */
4727
4728             if (n < max) { /* More greed allowed? */
4729                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4730                 cur_curlyx->u.curlyx.lastloc = locinput;
4731                 REGCP_SET(ST.lastcp);
4732                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4733                 /* NOTREACHED */
4734             }
4735             goto do_whilem_B_max;
4736         }
4737         /* NOTREACHED */
4738
4739         case WHILEM_B_min: /* just matched B in a minimal match */
4740         case WHILEM_B_max: /* just matched B in a maximal match */
4741             cur_curlyx = ST.save_curlyx;
4742             sayYES;
4743             /* NOTREACHED */
4744
4745         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4746             cur_curlyx = ST.save_curlyx;
4747             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4748             cur_curlyx->u.curlyx.count--;
4749             CACHEsayNO;
4750             /* NOTREACHED */
4751
4752         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4753             /* FALL THROUGH */
4754         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4755             REGCP_UNWIND(ST.lastcp);
4756             regcppop(rex);
4757             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4758             cur_curlyx->u.curlyx.count--;
4759             CACHEsayNO;
4760             /* NOTREACHED */
4761
4762         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4763             REGCP_UNWIND(ST.lastcp);
4764             regcppop(rex);      /* Restore some previous $<digit>s? */
4765             PL_reginput = locinput;
4766             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4767                 "%*s  whilem: failed, trying continuation...\n",
4768                 REPORT_CODE_OFF+depth*2, "")
4769             );
4770           do_whilem_B_max:
4771             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4772                 && ckWARN(WARN_REGEXP)
4773                 && !(PL_reg_flags & RF_warned))
4774             {
4775                 PL_reg_flags |= RF_warned;
4776                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4777                      "Complex regular subexpression recursion",
4778                      REG_INFTY - 1);
4779             }
4780
4781             /* now try B */
4782             ST.save_curlyx = cur_curlyx;
4783             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4784             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4785             /* NOTREACHED */
4786
4787         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4788             cur_curlyx = ST.save_curlyx;
4789             REGCP_UNWIND(ST.lastcp);
4790             regcppop(rex);
4791
4792             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4793                 /* Maximum greed exceeded */
4794                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4795                     && ckWARN(WARN_REGEXP)
4796                     && !(PL_reg_flags & RF_warned))
4797                 {
4798                     PL_reg_flags |= RF_warned;
4799                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4800                         "%s limit (%d) exceeded",
4801                         "Complex regular subexpression recursion",
4802                         REG_INFTY - 1);
4803                 }
4804                 cur_curlyx->u.curlyx.count--;
4805                 CACHEsayNO;
4806             }
4807
4808             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4809                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4810             );
4811             /* Try grabbing another A and see if it helps. */
4812             PL_reginput = locinput;
4813             cur_curlyx->u.curlyx.lastloc = locinput;
4814             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4815             REGCP_SET(ST.lastcp);
4816             PUSH_STATE_GOTO(WHILEM_A_min,
4817                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4818             /* NOTREACHED */
4819
4820 #undef  ST
4821 #define ST st->u.branch
4822
4823         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4824             next = scan + ARG(scan);
4825             if (next == scan)
4826                 next = NULL;
4827             scan = NEXTOPER(scan);
4828             /* FALL THROUGH */
4829
4830         case BRANCH:        /*  /(...|A|...)/ */
4831             scan = NEXTOPER(scan); /* scan now points to inner node */
4832             ST.lastparen = *PL_reglastparen;
4833             ST.next_branch = next;
4834             REGCP_SET(ST.cp);
4835             PL_reginput = locinput;
4836
4837             /* Now go into the branch */
4838             if (has_cutgroup) {
4839                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4840             } else {
4841                 PUSH_STATE_GOTO(BRANCH_next, scan);
4842             }
4843             /* NOTREACHED */
4844         case CUTGROUP:
4845             PL_reginput = locinput;
4846             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4847                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4848             PUSH_STATE_GOTO(CUTGROUP_next,next);
4849             /* NOTREACHED */
4850         case CUTGROUP_next_fail:
4851             do_cutgroup = 1;
4852             no_final = 1;
4853             if (st->u.mark.mark_name)
4854                 sv_commit = st->u.mark.mark_name;
4855             sayNO;
4856             /* NOTREACHED */
4857         case BRANCH_next:
4858             sayYES;
4859             /* NOTREACHED */
4860         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4861             if (do_cutgroup) {
4862                 do_cutgroup = 0;
4863                 no_final = 0;
4864             }
4865             REGCP_UNWIND(ST.cp);
4866             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4867                 PL_regoffs[n].end = -1;
4868             *PL_reglastparen = n;
4869             /*dmq: *PL_reglastcloseparen = n; */
4870             scan = ST.next_branch;
4871             /* no more branches? */
4872             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4873                 DEBUG_EXECUTE_r({
4874                     PerlIO_printf( Perl_debug_log,
4875                         "%*s  %sBRANCH failed...%s\n",
4876                         REPORT_CODE_OFF+depth*2, "",
4877                         PL_colors[4],
4878                         PL_colors[5] );
4879                 });
4880                 sayNO_SILENT;
4881             }
4882             continue; /* execute next BRANCH[J] op */
4883             /* NOTREACHED */
4884
4885         case MINMOD:
4886             minmod = 1;
4887             break;
4888
4889 #undef  ST
4890 #define ST st->u.curlym
4891
4892         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4893
4894             /* This is an optimisation of CURLYX that enables us to push
4895              * only a single backtracking state, no matter how many matches
4896              * there are in {m,n}. It relies on the pattern being constant
4897              * length, with no parens to influence future backrefs
4898              */
4899
4900             ST.me = scan;
4901             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4902
4903             /* if paren positive, emulate an OPEN/CLOSE around A */
4904             if (ST.me->flags) {
4905                 U32 paren = ST.me->flags;
4906                 if (paren > PL_regsize)
4907                     PL_regsize = paren;
4908                 if (paren > *PL_reglastparen)
4909                     *PL_reglastparen = paren;
4910                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4911             }
4912             ST.A = scan;
4913             ST.B = next;
4914             ST.alen = 0;
4915             ST.count = 0;
4916             ST.minmod = minmod;
4917             minmod = 0;
4918             ST.c1 = CHRTEST_UNINIT;
4919             REGCP_SET(ST.cp);
4920
4921             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4922                 goto curlym_do_B;
4923
4924           curlym_do_A: /* execute the A in /A{m,n}B/  */
4925             PL_reginput = locinput;
4926             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4927             /* NOTREACHED */
4928
4929         case CURLYM_A: /* we've just matched an A */
4930             locinput = st->locinput;
4931             nextchr = UCHARAT(locinput);
4932
4933             ST.count++;
4934             /* after first match, determine A's length: u.curlym.alen */
4935             if (ST.count == 1) {
4936                 if (PL_reg_match_utf8) {
4937                     char *s = locinput;
4938                     while (s < PL_reginput) {
4939                         ST.alen++;
4940                         s += UTF8SKIP(s);
4941                     }
4942                 }
4943                 else {
4944                     ST.alen = PL_reginput - locinput;
4945                 }
4946                 if (ST.alen == 0)
4947                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4948             }
4949             DEBUG_EXECUTE_r(
4950                 PerlIO_printf(Perl_debug_log,
4951                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4952                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4953                           (IV) ST.count, (IV)ST.alen)
4954             );
4955
4956             locinput = PL_reginput;
4957
4958             if (cur_eval && cur_eval->u.eval.close_paren &&
4959                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4960                 goto fake_end;
4961
4962             {
4963                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4964                 if ( max == REG_INFTY || ST.count < max )
4965                     goto curlym_do_A; /* try to match another A */
4966             }
4967             goto curlym_do_B; /* try to match B */
4968
4969         case CURLYM_A_fail: /* just failed to match an A */
4970             REGCP_UNWIND(ST.cp);
4971
4972             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4973                 || (cur_eval && cur_eval->u.eval.close_paren &&
4974                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4975                 sayNO;
4976
4977           curlym_do_B: /* execute the B in /A{m,n}B/  */
4978             PL_reginput = locinput;
4979             if (ST.c1 == CHRTEST_UNINIT) {
4980                 /* calculate c1 and c2 for possible match of 1st char
4981                  * following curly */
4982                 ST.c1 = ST.c2 = CHRTEST_VOID;
4983                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4984                     regnode *text_node = ST.B;
4985                     if (! HAS_TEXT(text_node))
4986                         FIND_NEXT_IMPT(text_node);
4987                     /* this used to be
4988
4989                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4990
4991                         But the former is redundant in light of the latter.
4992
4993                         if this changes back then the macro for
4994                         IS_TEXT and friends need to change.
4995                      */
4996                     if (PL_regkind[OP(text_node)] == EXACT)
4997                     {
4998
4999                         ST.c1 = (U8)*STRING(text_node);
5000                         switch (OP(text_node)) {
5001                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5002                             case EXACTFA:
5003                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5004                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5005                             default: ST.c2 = ST.c1;
5006                         }
5007                     }
5008                 }
5009             }
5010
5011             DEBUG_EXECUTE_r(
5012                 PerlIO_printf(Perl_debug_log,
5013                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5014                     (int)(REPORT_CODE_OFF+(depth*2)),
5015                     "", (IV)ST.count)
5016                 );
5017             if (ST.c1 != CHRTEST_VOID
5018                     && UCHARAT(PL_reginput) != ST.c1
5019                     && UCHARAT(PL_reginput) != ST.c2)
5020             {
5021                 /* simulate B failing */
5022                 DEBUG_OPTIMISE_r(
5023                     PerlIO_printf(Perl_debug_log,
5024                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5025                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5026                         (IV)ST.c1,(IV)ST.c2
5027                 ));
5028                 state_num = CURLYM_B_fail;
5029                 goto reenter_switch;
5030             }
5031
5032             if (ST.me->flags) {
5033                 /* mark current A as captured */
5034                 I32 paren = ST.me->flags;
5035                 if (ST.count) {
5036                     PL_regoffs[paren].start
5037                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5038                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5039                     /*dmq: *PL_reglastcloseparen = paren; */
5040                 }
5041                 else
5042                     PL_regoffs[paren].end = -1;
5043                 if (cur_eval && cur_eval->u.eval.close_paren &&
5044                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5045                 {
5046                     if (ST.count)
5047                         goto fake_end;
5048                     else
5049                         sayNO;
5050                 }
5051             }
5052
5053             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5054             /* NOTREACHED */
5055
5056         case CURLYM_B_fail: /* just failed to match a B */
5057             REGCP_UNWIND(ST.cp);
5058             if (ST.minmod) {
5059                 I32 max = ARG2(ST.me);
5060                 if (max != REG_INFTY && ST.count == max)
5061                     sayNO;
5062                 goto curlym_do_A; /* try to match a further A */
5063             }
5064             /* backtrack one A */
5065             if (ST.count == ARG1(ST.me) /* min */)
5066                 sayNO;
5067             ST.count--;
5068             locinput = HOPc(locinput, -ST.alen);
5069             goto curlym_do_B; /* try to match B */
5070
5071 #undef ST
5072 #define ST st->u.curly
5073
5074 #define CURLY_SETPAREN(paren, success) \
5075     if (paren) { \
5076         if (success) { \
5077             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5078             PL_regoffs[paren].end = locinput - PL_bostr; \
5079             *PL_reglastcloseparen = paren; \
5080         } \
5081         else \
5082             PL_regoffs[paren].end = -1; \
5083     }
5084
5085         case STAR:              /*  /A*B/ where A is width 1 */
5086             ST.paren = 0;
5087             ST.min = 0;
5088             ST.max = REG_INFTY;
5089             scan = NEXTOPER(scan);
5090             goto repeat;
5091         case PLUS:              /*  /A+B/ where A is width 1 */
5092             ST.paren = 0;
5093             ST.min = 1;
5094             ST.max = REG_INFTY;
5095             scan = NEXTOPER(scan);
5096             goto repeat;
5097         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5098             ST.paren = scan->flags;     /* Which paren to set */
5099             if (ST.paren > PL_regsize)
5100                 PL_regsize = ST.paren;
5101             if (ST.paren > *PL_reglastparen)
5102                 *PL_reglastparen = ST.paren;
5103             ST.min = ARG1(scan);  /* min to match */
5104             ST.max = ARG2(scan);  /* max to match */
5105             if (cur_eval && cur_eval->u.eval.close_paren &&
5106                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5107                 ST.min=1;
5108                 ST.max=1;
5109             }
5110             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5111             goto repeat;
5112         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5113             ST.paren = 0;
5114             ST.min = ARG1(scan);  /* min to match */
5115             ST.max = ARG2(scan);  /* max to match */
5116             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5117           repeat:
5118             /*
5119             * Lookahead to avoid useless match attempts
5120             * when we know what character comes next.
5121             *
5122             * Used to only do .*x and .*?x, but now it allows
5123             * for )'s, ('s and (?{ ... })'s to be in the way
5124             * of the quantifier and the EXACT-like node.  -- japhy
5125             */
5126
5127             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5128                 sayNO;
5129             if (HAS_TEXT(next) || JUMPABLE(next)) {
5130                 U8 *s;
5131                 regnode *text_node = next;
5132
5133                 if (! HAS_TEXT(text_node))
5134                     FIND_NEXT_IMPT(text_node);
5135
5136                 if (! HAS_TEXT(text_node))
5137                     ST.c1 = ST.c2 = CHRTEST_VOID;
5138                 else {
5139                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5140                         ST.c1 = ST.c2 = CHRTEST_VOID;
5141                         goto assume_ok_easy;
5142                     }
5143                     else
5144                         s = (U8*)STRING(text_node);
5145
5146                     /*  Currently we only get here when
5147
5148                         PL_rekind[OP(text_node)] == EXACT
5149
5150                         if this changes back then the macro for IS_TEXT and
5151                         friends need to change. */
5152                     if (!UTF_PATTERN) {
5153                         ST.c1 = *s;
5154                         switch (OP(text_node)) {
5155                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5156                             case EXACTFA:
5157                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5158                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5159                             default: ST.c2 = ST.c1; break;
5160                         }
5161                     }
5162                     else { /* UTF_PATTERN */
5163                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5164                              STRLEN ulen1, ulen2;
5165                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5166                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5167
5168                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5169                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5170 #ifdef EBCDIC
5171                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5172                                                     ckWARN(WARN_UTF8) ?
5173                                                     0 : UTF8_ALLOW_ANY);
5174                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5175                                                     ckWARN(WARN_UTF8) ?
5176                                                     0 : UTF8_ALLOW_ANY);
5177 #else
5178                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5179                                                     uniflags);
5180                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5181                                                     uniflags);
5182 #endif
5183                         }
5184                         else {
5185                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5186                                                      uniflags);
5187                         }
5188                     }
5189                 }
5190             }
5191             else
5192                 ST.c1 = ST.c2 = CHRTEST_VOID;
5193         assume_ok_easy:
5194
5195             ST.A = scan;
5196             ST.B = next;
5197             PL_reginput = locinput;
5198             if (minmod) {
5199                 minmod = 0;
5200                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5201                     sayNO;
5202                 ST.count = ST.min;
5203                 locinput = PL_reginput;
5204                 REGCP_SET(ST.cp);
5205                 if (ST.c1 == CHRTEST_VOID)
5206                     goto curly_try_B_min;
5207
5208                 ST.oldloc = locinput;
5209
5210                 /* set ST.maxpos to the furthest point along the
5211                  * string that could possibly match */
5212                 if  (ST.max == REG_INFTY) {
5213                     ST.maxpos = PL_regeol - 1;
5214                     if (utf8_target)
5215                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5216                             ST.maxpos--;
5217                 }
5218                 else if (utf8_target) {
5219                     int m = ST.max - ST.min;
5220                     for (ST.maxpos = locinput;
5221                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5222                         ST.maxpos += UTF8SKIP(ST.maxpos);
5223                 }
5224                 else {
5225                     ST.maxpos = locinput + ST.max - ST.min;
5226                     if (ST.maxpos >= PL_regeol)
5227                         ST.maxpos = PL_regeol - 1;
5228                 }
5229                 goto curly_try_B_min_known;
5230
5231             }
5232             else {
5233                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5234                 locinput = PL_reginput;
5235                 if (ST.count < ST.min)
5236                     sayNO;
5237                 if ((ST.count > ST.min)
5238                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5239                 {
5240                     /* A{m,n} must come at the end of the string, there's
5241                      * no point in backing off ... */
5242                     ST.min = ST.count;
5243                     /* ...except that $ and \Z can match before *and* after
5244                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5245                        We may back off by one in this case. */
5246                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5247                         ST.min--;
5248                 }
5249                 REGCP_SET(ST.cp);
5250                 goto curly_try_B_max;
5251             }
5252             /* NOTREACHED */
5253
5254
5255         case CURLY_B_min_known_fail:
5256             /* failed to find B in a non-greedy match where c1,c2 valid */
5257             if (ST.paren && ST.count)
5258                 PL_regoffs[ST.paren].end = -1;
5259
5260             PL_reginput = locinput;     /* Could be reset... */
5261             REGCP_UNWIND(ST.cp);
5262             /* Couldn't or didn't -- move forward. */
5263             ST.oldloc = locinput;
5264             if (utf8_target)
5265                 locinput += UTF8SKIP(locinput);
5266             else
5267                 locinput++;
5268             ST.count++;
5269           curly_try_B_min_known:
5270              /* find the next place where 'B' could work, then call B */
5271             {
5272                 int n;
5273                 if (utf8_target) {
5274                     n = (ST.oldloc == locinput) ? 0 : 1;
5275                     if (ST.c1 == ST.c2) {
5276                         STRLEN len;
5277                         /* set n to utf8_distance(oldloc, locinput) */
5278                         while (locinput <= ST.maxpos &&
5279                                utf8n_to_uvchr((U8*)locinput,
5280                                               UTF8_MAXBYTES, &len,
5281                                               uniflags) != (UV)ST.c1) {
5282                             locinput += len;
5283                             n++;
5284                         }
5285                     }
5286                     else {
5287                         /* set n to utf8_distance(oldloc, locinput) */
5288                         while (locinput <= ST.maxpos) {
5289                             STRLEN len;
5290                             const UV c = utf8n_to_uvchr((U8*)locinput,
5291                                                   UTF8_MAXBYTES, &len,
5292                                                   uniflags);
5293                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5294                                 break;
5295                             locinput += len;
5296                             n++;
5297                         }
5298                     }
5299                 }
5300                 else {
5301                     if (ST.c1 == ST.c2) {
5302                         while (locinput <= ST.maxpos &&
5303                                UCHARAT(locinput) != ST.c1)
5304                             locinput++;
5305                     }
5306                     else {
5307                         while (locinput <= ST.maxpos
5308                                && UCHARAT(locinput) != ST.c1
5309                                && UCHARAT(locinput) != ST.c2)
5310                             locinput++;
5311                     }
5312                     n = locinput - ST.oldloc;
5313                 }
5314                 if (locinput > ST.maxpos)
5315                     sayNO;
5316                 /* PL_reginput == oldloc now */
5317                 if (n) {
5318                     ST.count += n;
5319                     if (regrepeat(rex, ST.A, n, depth) < n)
5320                         sayNO;
5321                 }
5322                 PL_reginput = locinput;
5323                 CURLY_SETPAREN(ST.paren, ST.count);
5324                 if (cur_eval && cur_eval->u.eval.close_paren &&
5325                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5326                     goto fake_end;
5327                 }
5328                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5329             }
5330             /* NOTREACHED */
5331
5332
5333         case CURLY_B_min_fail:
5334             /* failed to find B in a non-greedy match where c1,c2 invalid */
5335             if (ST.paren && ST.count)
5336                 PL_regoffs[ST.paren].end = -1;
5337
5338             REGCP_UNWIND(ST.cp);
5339             /* failed -- move forward one */
5340             PL_reginput = locinput;
5341             if (regrepeat(rex, ST.A, 1, depth)) {
5342                 ST.count++;
5343                 locinput = PL_reginput;
5344                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5345                         ST.count > 0)) /* count overflow ? */
5346                 {
5347                   curly_try_B_min:
5348                     CURLY_SETPAREN(ST.paren, ST.count);
5349                     if (cur_eval && cur_eval->u.eval.close_paren &&
5350                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5351                         goto fake_end;
5352                     }
5353                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5354                 }
5355             }
5356             sayNO;
5357             /* NOTREACHED */
5358
5359
5360         curly_try_B_max:
5361             /* a successful greedy match: now try to match B */
5362             if (cur_eval && cur_eval->u.eval.close_paren &&
5363                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5364                 goto fake_end;
5365             }
5366             {
5367                 UV c = 0;
5368                 if (ST.c1 != CHRTEST_VOID)
5369                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5370                                            UTF8_MAXBYTES, 0, uniflags)
5371                                 : (UV) UCHARAT(PL_reginput);
5372                 /* If it could work, try it. */
5373                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5374                     CURLY_SETPAREN(ST.paren, ST.count);
5375                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5376                     /* NOTREACHED */
5377                 }
5378             }
5379             /* FALL THROUGH */
5380         case CURLY_B_max_fail:
5381             /* failed to find B in a greedy match */
5382             if (ST.paren && ST.count)
5383                 PL_regoffs[ST.paren].end = -1;
5384
5385             REGCP_UNWIND(ST.cp);
5386             /*  back up. */
5387             if (--ST.count < ST.min)
5388                 sayNO;
5389             PL_reginput = locinput = HOPc(locinput, -1);
5390             goto curly_try_B_max;
5391
5392 #undef ST
5393
5394         case END:
5395             fake_end:
5396             if (cur_eval) {
5397                 /* we've just finished A in /(??{A})B/; now continue with B */
5398                 I32 tmpix;
5399                 st->u.eval.toggle_reg_flags
5400                             = cur_eval->u.eval.toggle_reg_flags;
5401                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5402
5403                 st->u.eval.prev_rex = rex_sv;           /* inner */
5404                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5405                 rex = (struct regexp *)SvANY(rex_sv);
5406                 rexi = RXi_GET(rex);
5407                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5408                 (void)ReREFCNT_inc(rex_sv);
5409                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5410
5411                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5412                 PL_reglastparen = &rex->lastparen;
5413                 PL_reglastcloseparen = &rex->lastcloseparen;
5414
5415                 REGCP_SET(st->u.eval.lastcp);
5416                 PL_reginput = locinput;
5417
5418                 /* Restore parens of the outer rex without popping the
5419                  * savestack */
5420                 tmpix = PL_savestack_ix;
5421                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5422                 regcppop(rex);
5423                 PL_savestack_ix = tmpix;
5424
5425                 st->u.eval.prev_eval = cur_eval;
5426                 cur_eval = cur_eval->u.eval.prev_eval;
5427                 DEBUG_EXECUTE_r(
5428                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5429                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5430                 if ( nochange_depth )
5431                     nochange_depth--;
5432
5433                 PUSH_YES_STATE_GOTO(EVAL_AB,
5434                         st->u.eval.prev_eval->u.eval.B); /* match B */
5435             }
5436
5437             if (locinput < reginfo->till) {
5438                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5439                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5440                                       PL_colors[4],
5441                                       (long)(locinput - PL_reg_starttry),
5442                                       (long)(reginfo->till - PL_reg_starttry),
5443                                       PL_colors[5]));
5444
5445                 sayNO_SILENT;           /* Cannot match: too short. */
5446             }
5447             PL_reginput = locinput;     /* put where regtry can find it */
5448             sayYES;                     /* Success! */
5449
5450         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5451             DEBUG_EXECUTE_r(
5452             PerlIO_printf(Perl_debug_log,
5453                 "%*s  %ssubpattern success...%s\n",
5454                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5455             PL_reginput = locinput;     /* put where regtry can find it */
5456             sayYES;                     /* Success! */
5457
5458 #undef  ST
5459 #define ST st->u.ifmatch
5460
5461         case SUSPEND:   /* (?>A) */
5462             ST.wanted = 1;
5463             PL_reginput = locinput;
5464             goto do_ifmatch;
5465
5466         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5467             ST.wanted = 0;
5468             goto ifmatch_trivial_fail_test;
5469
5470         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5471             ST.wanted = 1;
5472           ifmatch_trivial_fail_test:
5473             if (scan->flags) {
5474                 char * const s = HOPBACKc(locinput, scan->flags);
5475                 if (!s) {
5476                     /* trivial fail */
5477                     if (logical) {
5478                         logical = 0;
5479                         sw = 1 - cBOOL(ST.wanted);
5480                     }
5481                     else if (ST.wanted)
5482                         sayNO;
5483                     next = scan + ARG(scan);
5484                     if (next == scan)
5485                         next = NULL;
5486                     break;
5487                 }
5488                 PL_reginput = s;
5489             }
5490             else
5491                 PL_reginput = locinput;
5492
5493           do_ifmatch:
5494             ST.me = scan;
5495             ST.logical = logical;
5496             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5497
5498             /* execute body of (?...A) */
5499             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5500             /* NOTREACHED */
5501
5502         case IFMATCH_A_fail: /* body of (?...A) failed */
5503             ST.wanted = !ST.wanted;
5504             /* FALL THROUGH */
5505
5506         case IFMATCH_A: /* body of (?...A) succeeded */
5507             if (ST.logical) {
5508                 sw = cBOOL(ST.wanted);
5509             }
5510             else if (!ST.wanted)
5511                 sayNO;
5512
5513             if (OP(ST.me) == SUSPEND)
5514                 locinput = PL_reginput;
5515             else {
5516                 locinput = PL_reginput = st->locinput;
5517                 nextchr = UCHARAT(locinput);
5518             }
5519             scan = ST.me + ARG(ST.me);
5520             if (scan == ST.me)
5521                 scan = NULL;
5522             continue; /* execute B */
5523
5524 #undef ST
5525
5526         case LONGJMP:
5527             next = scan + ARG(scan);
5528             if (next == scan)
5529                 next = NULL;
5530             break;
5531         case COMMIT:
5532             reginfo->cutpoint = PL_regeol;
5533             /* FALLTHROUGH */
5534         case PRUNE:
5535             PL_reginput = locinput;
5536             if (!scan->flags)
5537                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5538             PUSH_STATE_GOTO(COMMIT_next,next);
5539             /* NOTREACHED */
5540         case COMMIT_next_fail:
5541             no_final = 1;
5542             /* FALLTHROUGH */
5543         case OPFAIL:
5544             sayNO;
5545             /* NOTREACHED */
5546
5547 #define ST st->u.mark
5548         case MARKPOINT:
5549             ST.prev_mark = mark_state;
5550             ST.mark_name = sv_commit = sv_yes_mark
5551                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5552             mark_state = st;
5553             ST.mark_loc = PL_reginput = locinput;
5554             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5555             /* NOTREACHED */
5556         case MARKPOINT_next:
5557             mark_state = ST.prev_mark;
5558             sayYES;
5559             /* NOTREACHED */
5560         case MARKPOINT_next_fail:
5561             if (popmark && sv_eq(ST.mark_name,popmark))
5562             {
5563                 if (ST.mark_loc > startpoint)
5564                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5565                 popmark = NULL; /* we found our mark */
5566                 sv_commit = ST.mark_name;
5567
5568                 DEBUG_EXECUTE_r({
5569                         PerlIO_printf(Perl_debug_log,
5570                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5571                             REPORT_CODE_OFF+depth*2, "",
5572                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5573                 });
5574             }
5575             mark_state = ST.prev_mark;
5576             sv_yes_mark = mark_state ?
5577                 mark_state->u.mark.mark_name : NULL;
5578             sayNO;
5579             /* NOTREACHED */
5580         case SKIP:
5581             PL_reginput = locinput;
5582             if (scan->flags) {
5583                 /* (*SKIP) : if we fail we cut here*/
5584                 ST.mark_name = NULL;
5585                 ST.mark_loc = locinput;
5586                 PUSH_STATE_GOTO(SKIP_next,next);
5587             } else {
5588                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5589                    otherwise do nothing.  Meaning we need to scan
5590                  */
5591                 regmatch_state *cur = mark_state;
5592                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5593
5594                 while (cur) {
5595                     if ( sv_eq( cur->u.mark.mark_name,
5596                                 find ) )
5597                     {
5598                         ST.mark_name = find;
5599                         PUSH_STATE_GOTO( SKIP_next, next );
5600                     }
5601                     cur = cur->u.mark.prev_mark;
5602                 }
5603             }
5604             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5605             break;
5606         case SKIP_next_fail:
5607             if (ST.mark_name) {
5608                 /* (*CUT:NAME) - Set up to search for the name as we
5609                    collapse the stack*/
5610                 popmark = ST.mark_name;
5611             } else {
5612                 /* (*CUT) - No name, we cut here.*/
5613                 if (ST.mark_loc > startpoint)
5614                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5615                 /* but we set sv_commit to latest mark_name if there
5616                    is one so they can test to see how things lead to this
5617                    cut */
5618                 if (mark_state)
5619                     sv_commit=mark_state->u.mark.mark_name;
5620             }
5621             no_final = 1;
5622             sayNO;
5623             /* NOTREACHED */
5624 #undef ST
5625         case FOLDCHAR:
5626             n = ARG(scan);
5627             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5628                 locinput += ln;
5629             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5630                 sayNO;
5631             } else  {
5632                 U8 folded[UTF8_MAXBYTES_CASE+1];
5633                 STRLEN foldlen;
5634                 const char * const l = locinput;
5635                 char *e = PL_regeol;
5636                 to_uni_fold(n, folded, &foldlen);
5637
5638                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5639                                l, &e, 0,  utf8_target)) {
5640                         sayNO;
5641                 }
5642                 locinput = e;
5643             }
5644             nextchr = UCHARAT(locinput);
5645             break;
5646         case LNBREAK:
5647             if ((n=is_LNBREAK(locinput,utf8_target))) {
5648                 locinput += n;
5649                 nextchr = UCHARAT(locinput);
5650             } else
5651                 sayNO;
5652             break;
5653
5654 #define CASE_CLASS(nAmE)                              \
5655         case nAmE:                                    \
5656             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5657                 locinput += n;                        \
5658                 nextchr = UCHARAT(locinput);          \
5659             } else                                    \
5660                 sayNO;                                \
5661             break;                                    \
5662         case N##nAmE:                                 \
5663             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5664                 sayNO;                                \
5665             } else {                                  \
5666                 locinput += UTF8SKIP(locinput);       \
5667                 nextchr = UCHARAT(locinput);          \
5668             }                                         \
5669             break
5670
5671         CASE_CLASS(VERTWS);
5672         CASE_CLASS(HORIZWS);
5673 #undef CASE_CLASS
5674
5675         default:
5676             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5677                           PTR2UV(scan), OP(scan));
5678             Perl_croak(aTHX_ "regexp memory corruption");
5679
5680         } /* end switch */
5681
5682         /* switch break jumps here */
5683         scan = next; /* prepare to execute the next op and ... */
5684         continue;    /* ... jump back to the top, reusing st */
5685         /* NOTREACHED */
5686
5687       push_yes_state:
5688         /* push a state that backtracks on success */
5689         st->u.yes.prev_yes_state = yes_state;
5690         yes_state = st;
5691         /* FALL THROUGH */
5692       push_state:
5693         /* push a new regex state, then continue at scan  */
5694         {
5695             regmatch_state *newst;
5696
5697             DEBUG_STACK_r({
5698                 regmatch_state *cur = st;
5699                 regmatch_state *curyes = yes_state;
5700                 int curd = depth;
5701                 regmatch_slab *slab = PL_regmatch_slab;
5702                 for (;curd > -1;cur--,curd--) {
5703                     if (cur < SLAB_FIRST(slab)) {
5704                         slab = slab->prev;
5705                         cur = SLAB_LAST(slab);
5706                     }
5707                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5708                         REPORT_CODE_OFF + 2 + depth * 2,"",
5709                         curd, PL_reg_name[cur->resume_state],
5710                         (curyes == cur) ? "yes" : ""
5711                     );
5712                     if (curyes == cur)
5713                         curyes = cur->u.yes.prev_yes_state;
5714                 }
5715             } else
5716                 DEBUG_STATE_pp("push")
5717             );
5718             depth++;
5719             st->locinput = locinput;
5720             newst = st+1;
5721             if (newst >  SLAB_LAST(PL_regmatch_slab))
5722                 newst = S_push_slab(aTHX);
5723             PL_regmatch_state = newst;
5724
5725             locinput = PL_reginput;
5726             nextchr = UCHARAT(locinput);
5727             st = newst;
5728             continue;
5729             /* NOTREACHED */
5730         }
5731     }
5732
5733     /*
5734     * We get here only if there's trouble -- normally "case END" is
5735     * the terminating point.
5736     */
5737     Perl_croak(aTHX_ "corrupted regexp pointers");
5738     /*NOTREACHED*/
5739     sayNO;
5740
5741 yes:
5742     if (yes_state) {
5743         /* we have successfully completed a subexpression, but we must now
5744          * pop to the state marked by yes_state and continue from there */
5745         assert(st != yes_state);
5746 #ifdef DEBUGGING
5747         while (st != yes_state) {
5748             st--;
5749             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5750                 PL_regmatch_slab = PL_regmatch_slab->prev;
5751                 st = SLAB_LAST(PL_regmatch_slab);
5752             }
5753             DEBUG_STATE_r({
5754                 if (no_final) {
5755                     DEBUG_STATE_pp("pop (no final)");
5756                 } else {
5757                     DEBUG_STATE_pp("pop (yes)");
5758                 }
5759             });
5760             depth--;
5761         }
5762 #else
5763         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5764             || yes_state > SLAB_LAST(PL_regmatch_slab))
5765         {
5766             /* not in this slab, pop slab */
5767             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5768             PL_regmatch_slab = PL_regmatch_slab->prev;
5769             st = SLAB_LAST(PL_regmatch_slab);
5770         }
5771         depth -= (st - yes_state);
5772 #endif
5773         st = yes_state;
5774         yes_state = st->u.yes.prev_yes_state;
5775         PL_regmatch_state = st;
5776
5777         if (no_final) {
5778             locinput= st->locinput;
5779             nextchr = UCHARAT(locinput);
5780         }
5781         state_num = st->resume_state + no_final;
5782         goto reenter_switch;
5783     }
5784
5785     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5786                           PL_colors[4], PL_colors[5]));
5787
5788     if (PL_reg_eval_set) {
5789         /* each successfully executed (?{...}) block does the equivalent of
5790          *   local $^R = do {...}
5791          * When popping the save stack, all these locals would be undone;
5792          * bypass this by setting the outermost saved $^R to the latest
5793          * value */
5794         if (oreplsv != GvSV(PL_replgv))
5795             sv_setsv(oreplsv, GvSV(PL_replgv));
5796     }
5797     result = 1;
5798     goto final_exit;
5799
5800 no:
5801     DEBUG_EXECUTE_r(
5802         PerlIO_printf(Perl_debug_log,
5803             "%*s  %sfailed...%s\n",
5804             REPORT_CODE_OFF+depth*2, "",
5805             PL_colors[4], PL_colors[5])
5806         );
5807
5808 no_silent:
5809     if (no_final) {
5810         if (yes_state) {
5811             goto yes;
5812         } else {
5813             goto final_exit;
5814         }
5815     }
5816     if (depth) {
5817         /* there's a previous state to backtrack to */
5818         st--;
5819         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5820             PL_regmatch_slab = PL_regmatch_slab->prev;
5821             st = SLAB_LAST(PL_regmatch_slab);
5822         }
5823         PL_regmatch_state = st;
5824         locinput= st->locinput;
5825         nextchr = UCHARAT(locinput);
5826
5827         DEBUG_STATE_pp("pop");
5828         depth--;
5829         if (yes_state == st)
5830             yes_state = st->u.yes.prev_yes_state;
5831
5832         state_num = st->resume_state + 1; /* failure = success + 1 */
5833         goto reenter_switch;
5834     }
5835     result = 0;
5836
5837   final_exit:
5838     if (rex->intflags & PREGf_VERBARG_SEEN) {
5839         SV *sv_err = get_sv("REGERROR", 1);
5840         SV *sv_mrk = get_sv("REGMARK", 1);
5841         if (result) {
5842             sv_commit = &PL_sv_no;
5843             if (!sv_yes_mark)
5844                 sv_yes_mark = &PL_sv_yes;
5845         } else {
5846             if (!sv_commit)
5847                 sv_commit = &PL_sv_yes;
5848             sv_yes_mark = &PL_sv_no;
5849         }
5850         sv_setsv(sv_err, sv_commit);
5851         sv_setsv(sv_mrk, sv_yes_mark);
5852     }
5853
5854     /* clean up; in particular, free all slabs above current one */
5855     LEAVE_SCOPE(oldsave);
5856
5857     return result;
5858 }
5859
5860 /*
5861  - regrepeat - repeatedly match something simple, report how many
5862  */
5863 /*
5864  * [This routine now assumes that it will only match on things of length 1.
5865  * That was true before, but now we assume scan - reginput is the count,
5866  * rather than incrementing count on every character.  [Er, except utf8.]]
5867  */
5868 STATIC I32
5869 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5870 {
5871     dVAR;
5872     register char *scan;
5873     register I32 c;
5874     register char *loceol = PL_regeol;
5875     register I32 hardcount = 0;
5876     register bool utf8_target = PL_reg_match_utf8;
5877     UV utf8_flags;
5878 #ifndef DEBUGGING
5879     PERL_UNUSED_ARG(depth);
5880 #endif
5881
5882     PERL_ARGS_ASSERT_REGREPEAT;
5883
5884     scan = PL_reginput;
5885     if (max == REG_INFTY)
5886         max = I32_MAX;
5887     else if (max < loceol - scan)
5888         loceol = scan + max;
5889     switch (OP(p)) {
5890     case REG_ANY:
5891         if (utf8_target) {
5892             loceol = PL_regeol;
5893             while (scan < loceol && hardcount < max && *scan != '\n') {
5894                 scan += UTF8SKIP(scan);
5895                 hardcount++;
5896             }
5897         } else {
5898             while (scan < loceol && *scan != '\n')
5899                 scan++;
5900         }
5901         break;
5902     case SANY:
5903         if (utf8_target) {
5904             loceol = PL_regeol;
5905             while (scan < loceol && hardcount < max) {
5906                 scan += UTF8SKIP(scan);
5907                 hardcount++;
5908             }
5909         }
5910         else
5911             scan = loceol;
5912         break;
5913     case CANY:
5914         scan = loceol;
5915         break;
5916     case EXACT:
5917         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5918          * means they match only characters in the string that can be expressed
5919          * as a single byte.  For non-utf8 strings, that means a simple match.
5920          * For utf8 strings, the character matched must be an invariant, or
5921          * downgradable to a single byte.  The pattern's utf8ness is
5922          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5923          * it is, it's an invariant */
5924
5925         c = (U8)*STRING(p);
5926         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5927
5928         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5929             while (scan < loceol && UCHARAT(scan) == c) {
5930                 scan++;
5931             }
5932         }
5933         else {
5934
5935             /* Here, the string is utf8, and the pattern char is different
5936              * in utf8 than not, so can't compare them directly.  Outside the
5937              * loop, find find the two utf8 bytes that represent c, and then
5938              * look for those in sequence in the utf8 string */
5939             U8 high = UTF8_TWO_BYTE_HI(c);
5940             U8 low = UTF8_TWO_BYTE_LO(c);
5941             loceol = PL_regeol;
5942
5943             while (hardcount < max
5944                     && scan + 1 < loceol
5945                     && UCHARAT(scan) == high
5946                     && UCHARAT(scan + 1) == low)
5947             {
5948                 scan += 2;
5949                 hardcount++;
5950             }
5951         }
5952         break;
5953     case EXACTFA:
5954         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5955         goto do_exactf;
5956
5957     case EXACTFL:
5958         PL_reg_flags |= RF_tainted;
5959         utf8_flags = FOLDEQ_UTF8_LOCALE;
5960         goto do_exactf;
5961
5962     case EXACTF:
5963     case EXACTFU:
5964         utf8_flags = 0;
5965
5966         /* The comments for the EXACT case above apply as well to these fold
5967          * ones */
5968
5969     do_exactf:
5970         c = (U8)*STRING(p);
5971         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5972
5973         if (utf8_target) { /* Use full Unicode fold matching */
5974             char *tmpeol = loceol;
5975             while (hardcount < max
5976                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
5977                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
5978             {
5979                 scan = tmpeol;
5980                 tmpeol = loceol;
5981                 hardcount++;
5982             }
5983
5984             /* XXX Note that the above handles properly the German sharp s in
5985              * the pattern matching ss in the string.  But it doesn't handle
5986              * properly cases where the string contains say 'LIGATURE ff' and
5987              * the pattern is 'f+'.  This would require, say, a new function or
5988              * revised interface to foldEQ_utf8(), in which the maximum number
5989              * of characters to match could be passed and it would return how
5990              * many actually did.  This is just one of many cases where
5991              * multi-char folds don't work properly, and so the fix is being
5992              * deferred */
5993         }
5994         else {
5995             U8 folded;
5996
5997             /* Here, the string isn't utf8 and c is a single byte; and either
5998              * the pattern isn't utf8 or c is an invariant, so its utf8ness
5999              * doesn't affect c.  Can just do simple comparisons for exact or
6000              * fold matching. */
6001             switch (OP(p)) {
6002                 case EXACTF: folded = PL_fold[c]; break;
6003                 case EXACTFA:
6004                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6005                 case EXACTFL: folded = PL_fold_locale[c]; break;
6006                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6007             }
6008             while (scan < loceol &&
6009                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6010             {
6011                 scan++;
6012             }
6013         }
6014         break;
6015     case ANYOFV:
6016     case ANYOF:
6017         if (utf8_target || OP(p) == ANYOFV) {
6018             STRLEN inclasslen;
6019             loceol = PL_regeol;
6020             inclasslen = loceol - scan;
6021             while (hardcount < max
6022                    && ((inclasslen = loceol - scan) > 0)
6023                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6024             {
6025                 scan += inclasslen;
6026                 hardcount++;
6027             }
6028         } else {
6029             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6030                 scan++;
6031         }
6032         break;
6033     case ALNUMU:
6034         if (utf8_target) {
6035     utf8_wordchar:
6036             loceol = PL_regeol;
6037             LOAD_UTF8_CHARCLASS_ALNUM();
6038             while (hardcount < max && scan < loceol &&
6039                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6040             {
6041                 scan += UTF8SKIP(scan);
6042                 hardcount++;
6043             }
6044         } else {
6045             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6046                 scan++;
6047             }
6048         }
6049         break;
6050     case ALNUM:
6051         if (utf8_target)
6052             goto utf8_wordchar;
6053         while (scan < loceol && isALNUM((U8) *scan)) {
6054             scan++;
6055         }
6056         break;
6057     case ALNUMA:
6058         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6059             scan++;
6060         }
6061         break;
6062     case ALNUML:
6063         PL_reg_flags |= RF_tainted;
6064         if (utf8_target) {
6065             loceol = PL_regeol;
6066             while (hardcount < max && scan < loceol &&
6067                    isALNUM_LC_utf8((U8*)scan)) {
6068                 scan += UTF8SKIP(scan);
6069                 hardcount++;
6070             }
6071         } else {
6072             while (scan < loceol && isALNUM_LC(*scan))
6073                 scan++;
6074         }
6075         break;
6076     case NALNUMU:
6077         if (utf8_target) {
6078
6079     utf8_Nwordchar:
6080
6081             loceol = PL_regeol;
6082             LOAD_UTF8_CHARCLASS_ALNUM();
6083             while (hardcount < max && scan < loceol &&
6084                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6085             {
6086                 scan += UTF8SKIP(scan);
6087                 hardcount++;
6088             }
6089         } else {
6090             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6091                 scan++;
6092             }
6093         }
6094         break;
6095     case NALNUM:
6096         if (utf8_target)
6097             goto utf8_Nwordchar;
6098         while (scan < loceol && ! isALNUM((U8) *scan)) {
6099             scan++;
6100         }
6101         break;
6102     case NALNUMA:
6103         if (utf8_target) {
6104             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6105                 scan += UTF8SKIP(scan);
6106             }
6107         }
6108         else {
6109             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6110                 scan++;
6111             }
6112         }
6113         break;
6114     case NALNUML:
6115         PL_reg_flags |= RF_tainted;
6116         if (utf8_target) {
6117             loceol = PL_regeol;
6118             while (hardcount < max && scan < loceol &&
6119                    !isALNUM_LC_utf8((U8*)scan)) {
6120                 scan += UTF8SKIP(scan);
6121                 hardcount++;
6122             }
6123         } else {
6124             while (scan < loceol && !isALNUM_LC(*scan))
6125                 scan++;
6126         }
6127         break;
6128     case SPACEU:
6129         if (utf8_target) {
6130
6131     utf8_space:
6132
6133             loceol = PL_regeol;
6134             LOAD_UTF8_CHARCLASS_SPACE();
6135             while (hardcount < max && scan < loceol &&
6136                    (*scan == ' ' ||
6137                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6138             {
6139                 scan += UTF8SKIP(scan);
6140                 hardcount++;
6141             }
6142             break;
6143         }
6144         else {
6145             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6146                 scan++;
6147             }
6148             break;
6149         }
6150     case SPACE:
6151         if (utf8_target)
6152             goto utf8_space;
6153
6154         while (scan < loceol && isSPACE((U8) *scan)) {
6155             scan++;
6156         }
6157         break;
6158     case SPACEA:
6159         while (scan < loceol && isSPACE_A((U8) *scan)) {
6160             scan++;
6161         }
6162         break;
6163     case SPACEL:
6164         PL_reg_flags |= RF_tainted;
6165         if (utf8_target) {
6166             loceol = PL_regeol;
6167             while (hardcount < max && scan < loceol &&
6168                    isSPACE_LC_utf8((U8*)scan)) {
6169                 scan += UTF8SKIP(scan);
6170                 hardcount++;
6171             }
6172         } else {
6173             while (scan < loceol && isSPACE_LC(*scan))
6174                 scan++;
6175         }
6176         break;
6177     case NSPACEU:
6178         if (utf8_target) {
6179
6180     utf8_Nspace:
6181
6182             loceol = PL_regeol;
6183             LOAD_UTF8_CHARCLASS_SPACE();
6184             while (hardcount < max && scan < loceol &&
6185                    ! (*scan == ' ' ||
6186                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6187             {
6188                 scan += UTF8SKIP(scan);
6189                 hardcount++;
6190             }
6191             break;
6192         }
6193         else {
6194             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6195                 scan++;
6196             }
6197         }
6198         break;
6199     case NSPACE:
6200         if (utf8_target)
6201             goto utf8_Nspace;
6202
6203         while (scan < loceol && ! isSPACE((U8) *scan)) {
6204             scan++;
6205         }
6206         break;
6207     case NSPACEA:
6208         if (utf8_target) {
6209             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6210                 scan += UTF8SKIP(scan);
6211             }
6212         }
6213         else {
6214             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6215                 scan++;
6216             }
6217         }
6218         break;
6219     case NSPACEL:
6220         PL_reg_flags |= RF_tainted;
6221         if (utf8_target) {
6222             loceol = PL_regeol;
6223             while (hardcount < max && scan < loceol &&
6224                    !isSPACE_LC_utf8((U8*)scan)) {
6225                 scan += UTF8SKIP(scan);
6226                 hardcount++;
6227             }
6228         } else {
6229             while (scan < loceol && !isSPACE_LC(*scan))
6230                 scan++;
6231         }
6232         break;
6233     case DIGIT:
6234         if (utf8_target) {
6235             loceol = PL_regeol;
6236             LOAD_UTF8_CHARCLASS_DIGIT();
6237             while (hardcount < max && scan < loceol &&
6238                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6239                 scan += UTF8SKIP(scan);
6240                 hardcount++;
6241             }
6242         } else {
6243             while (scan < loceol && isDIGIT(*scan))
6244                 scan++;
6245         }
6246         break;
6247     case DIGITA:
6248         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6249             scan++;
6250         }
6251         break;
6252     case DIGITL:
6253         PL_reg_flags |= RF_tainted;
6254         if (utf8_target) {
6255             loceol = PL_regeol;
6256             while (hardcount < max && scan < loceol &&
6257                    isDIGIT_LC_utf8((U8*)scan)) {
6258                 scan += UTF8SKIP(scan);
6259                 hardcount++;
6260             }
6261         } else {
6262             while (scan < loceol && isDIGIT_LC(*scan))
6263                 scan++;
6264         }
6265         break;
6266     case NDIGIT:
6267         if (utf8_target) {
6268             loceol = PL_regeol;
6269             LOAD_UTF8_CHARCLASS_DIGIT();
6270             while (hardcount < max && scan < loceol &&
6271                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6272                 scan += UTF8SKIP(scan);
6273                 hardcount++;
6274             }
6275         } else {
6276             while (scan < loceol && !isDIGIT(*scan))
6277                 scan++;
6278         }
6279         break;
6280     case NDIGITA:
6281         if (utf8_target) {
6282             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6283                 scan += UTF8SKIP(scan);
6284             }
6285         }
6286         else {
6287             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6288                 scan++;
6289             }
6290         }
6291         break;
6292     case NDIGITL:
6293         PL_reg_flags |= RF_tainted;
6294         if (utf8_target) {
6295             loceol = PL_regeol;
6296             while (hardcount < max && scan < loceol &&
6297                    !isDIGIT_LC_utf8((U8*)scan)) {
6298                 scan += UTF8SKIP(scan);
6299                 hardcount++;
6300             }
6301         } else {
6302             while (scan < loceol && !isDIGIT_LC(*scan))
6303                 scan++;
6304         }
6305         break;
6306     case LNBREAK:
6307         if (utf8_target) {
6308             loceol = PL_regeol;
6309             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6310                 scan += c;
6311                 hardcount++;
6312             }
6313         } else {
6314             /*
6315               LNBREAK can match two latin chars, which is ok,
6316               because we have a null terminated string, but we
6317               have to use hardcount in this situation
6318             */
6319             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6320                 scan+=c;
6321                 hardcount++;
6322             }
6323         }
6324         break;
6325     case HORIZWS:
6326         if (utf8_target) {
6327             loceol = PL_regeol;
6328             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6329                 scan += c;
6330                 hardcount++;
6331             }
6332         } else {
6333             while (scan < loceol && is_HORIZWS_latin1(scan))
6334                 scan++;
6335         }
6336         break;
6337     case NHORIZWS:
6338         if (utf8_target) {
6339             loceol = PL_regeol;
6340             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6341                 scan += UTF8SKIP(scan);
6342                 hardcount++;
6343             }
6344         } else {
6345             while (scan < loceol && !is_HORIZWS_latin1(scan))
6346                 scan++;
6347
6348         }
6349         break;
6350     case VERTWS:
6351         if (utf8_target) {
6352             loceol = PL_regeol;
6353             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6354                 scan += c;
6355                 hardcount++;
6356             }
6357         } else {
6358             while (scan < loceol && is_VERTWS_latin1(scan))
6359                 scan++;
6360
6361         }
6362         break;
6363     case NVERTWS:
6364         if (utf8_target) {
6365             loceol = PL_regeol;
6366             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6367                 scan += UTF8SKIP(scan);
6368                 hardcount++;
6369             }
6370         } else {
6371             while (scan < loceol && !is_VERTWS_latin1(scan))
6372                 scan++;
6373
6374         }
6375         break;
6376
6377     default:            /* Called on something of 0 width. */
6378         break;          /* So match right here or not at all. */
6379     }
6380
6381     if (hardcount)
6382         c = hardcount;
6383     else
6384         c = scan - PL_reginput;
6385     PL_reginput = scan;
6386
6387     DEBUG_r({
6388         GET_RE_DEBUG_FLAGS_DECL;
6389         DEBUG_EXECUTE_r({
6390             SV * const prop = sv_newmortal();
6391             regprop(prog, prop, p);
6392             PerlIO_printf(Perl_debug_log,
6393                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6394                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6395         });
6396     });
6397
6398     return(c);
6399 }
6400
6401
6402 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6403 /*
6404 - regclass_swash - prepare the utf8 swash
6405 */
6406
6407 SV *
6408 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6409 {
6410     dVAR;
6411     SV *sw  = NULL;
6412     SV *si  = NULL;
6413     SV *alt = NULL;
6414     RXi_GET_DECL(prog,progi);
6415     const struct reg_data * const data = prog ? progi->data : NULL;
6416
6417     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6418
6419     assert(ANYOF_NONBITMAP(node));
6420
6421     if (data && data->count) {
6422         const U32 n = ARG(node);
6423
6424         if (data->what[n] == 's') {
6425             SV * const rv = MUTABLE_SV(data->data[n]);
6426             AV * const av = MUTABLE_AV(SvRV(rv));
6427             SV **const ary = AvARRAY(av);
6428             SV **a, **b;
6429
6430             /* See the end of regcomp.c:S_regclass() for
6431              * documentation of these array elements. */
6432
6433             si = *ary;
6434             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6435             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6436
6437             if (a)
6438                 sw = *a;
6439             else if (si && doinit) {
6440                 sw = swash_init("utf8", "", si, 1, 0);
6441                 (void)av_store(av, 1, sw);
6442             }
6443             if (b)
6444                 alt = *b;
6445         }
6446     }
6447
6448     if (listsvp)
6449         *listsvp = si;
6450     if (altsvp)
6451         *altsvp  = alt;
6452
6453     return sw;
6454 }
6455 #endif
6456
6457 /*
6458  - reginclass - determine if a character falls into a character class
6459
6460   n is the ANYOF regnode
6461   p is the target string
6462   lenp is pointer to the maximum number of bytes of how far to go in p
6463     (This is assumed wthout checking to always be at least the current
6464     character's size)
6465   utf8_target tells whether p is in UTF-8.
6466
6467   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6468   from a successful match, the value it points to will be updated to how many
6469   bytes in p were matched.  If there was no match, the value is undefined,
6470   possibly changed from the input.
6471
6472   Note that this can be a synthetic start class, a combination of various
6473   nodes, so things you think might be mutually exclusive, such as locale,
6474   aren't.  It can match both locale and non-locale
6475
6476  */
6477
6478 STATIC bool
6479 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6480 {
6481     dVAR;
6482     const char flags = ANYOF_FLAGS(n);
6483     bool match = FALSE;
6484     UV c = *p;
6485     STRLEN c_len = 0;
6486     STRLEN maxlen;
6487
6488     PERL_ARGS_ASSERT_REGINCLASS;
6489
6490     /* If c is not already the code point, get it */
6491     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6492         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6493                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6494                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6495                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6496                  * UTF8_ALLOW_FFFF */
6497         if (c_len == (STRLEN)-1)
6498             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6499     }
6500     else {
6501         c_len = 1;
6502     }
6503
6504     /* Use passed in max length, or one character if none passed in or less
6505      * than one character.  And assume will match just one character.  This is
6506      * overwritten later if matched more. */
6507     if (lenp) {
6508         maxlen = (*lenp > c_len) ? *lenp : c_len;
6509         *lenp = c_len;
6510
6511     }
6512     else {
6513         maxlen = c_len;
6514     }
6515
6516     /* If this character is potentially in the bitmap, check it */
6517     if (c < 256) {
6518         if (ANYOF_BITMAP_TEST(n, c))
6519             match = TRUE;
6520         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6521                 && ! utf8_target
6522                 && ! isASCII(c))
6523         {
6524             match = TRUE;
6525         }
6526
6527         else if (flags & ANYOF_LOCALE) {
6528             PL_reg_flags |= RF_tainted;
6529
6530             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6531                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6532             {
6533                 match = TRUE;
6534             }
6535             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6536                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6537                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6538                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6539                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6540                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6541                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6542                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6543                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6544                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6545                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6546                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6547                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6548                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6549                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6550                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6551                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6552                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6553                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6554                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6555                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6556                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6557                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6558                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6559                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6560                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6561                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6562                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6563                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6564                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6565                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6566                      ) /* How's that for a conditional? */
6567             ) {
6568                 match = TRUE;
6569             }
6570         }
6571     }
6572
6573     /* If the bitmap didn't (or couldn't) match, and something outside the
6574      * bitmap could match, try that.  Locale nodes specifiy completely the
6575      * behavior of code points in the bit map (otherwise, a utf8 target would
6576      * cause them to be treated as Unicode and not locale), except in
6577      * the very unlikely event when this node is a synthetic start class, which
6578      * could be a combination of locale and non-locale nodes.  So allow locale
6579      * to match for the synthetic start class, which will give a false
6580      * positive that will be resolved when the match is done again as not part
6581      * of the synthetic start class */
6582     if (!match) {
6583         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6584             match = TRUE;       /* Everything above 255 matches */
6585         }
6586         else if (ANYOF_NONBITMAP(n)
6587                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6588                      || (utf8_target
6589                          && (c >=256
6590                              || (! (flags & ANYOF_LOCALE))
6591                              || (flags & ANYOF_IS_SYNTHETIC)))))
6592         {
6593             AV *av;
6594             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6595
6596             if (sw) {
6597                 U8 * utf8_p;
6598                 if (utf8_target) {
6599                     utf8_p = (U8 *) p;
6600                 } else {
6601
6602                     /* Not utf8.  Convert as much of the string as available up
6603                      * to the limit of how far the (single) character in the
6604                      * pattern can possibly match (no need to go further).  If
6605                      * the node is a straight ANYOF or not folding, it can't
6606                      * match more than one.  Otherwise, It can match up to how
6607                      * far a single char can fold to.  Since not utf8, each
6608                      * character is a single byte, so the max it can be in
6609                      * bytes is the same as the max it can be in characters */
6610                     STRLEN len = (OP(n) == ANYOF
6611                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6612                                   ? 1
6613                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6614                                     ? maxlen
6615                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6616                     utf8_p = bytes_to_utf8(p, &len);
6617                 }
6618
6619                 if (swash_fetch(sw, utf8_p, TRUE))
6620                     match = TRUE;
6621                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6622
6623                     /* Here, we need to test if the fold of the target string
6624                      * matches.  The non-multi char folds have all been moved to
6625                      * the compilation phase, and the multi-char folds have
6626                      * been stored by regcomp into 'av'; we linearly check to
6627                      * see if any match the target string (folded).   We know
6628                      * that the originals were each one character, but we don't
6629                      * currently know how many characters/bytes each folded to,
6630                      * except we do know that there are small limits imposed by
6631                      * Unicode.  XXX A performance enhancement would be to have
6632                      * regcomp.c store the max number of chars/bytes that are
6633                      * in an av entry, as, say the 0th element.  Even better
6634                      * would be to have a hash of the few characters that can
6635                      * start a multi-char fold to the max number of chars of
6636                      * those folds.
6637                      *
6638                      * If there is a match, we will need to advance (if lenp is
6639                      * specified) the match pointer in the target string.  But
6640                      * what we are comparing here isn't that string directly,
6641                      * but its fold, whose length may differ from the original.
6642                      * As we go along in constructing the fold, therefore, we
6643                      * create a map so that we know how many bytes in the
6644                      * source to advance given that we have matched a certain
6645                      * number of bytes in the fold.  This map is stored in
6646                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6647                      * the fold of the first character that we are folding.
6648                      * Then map_fold_len_back[n] is set to the number of bytes
6649                      * in that first character.  Similarly let m be the
6650                      * corresponding number for the second character to be
6651                      * folded.  Then map_fold_len_back[n+m] is set to the
6652                      * number of bytes occupied by the first two source
6653                      * characters. ... */
6654                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6655                     U8 folded[UTF8_MAXBYTES_CASE+1];
6656                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6657                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6658                                                   chars */
6659
6660                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6661
6662                         /* Here, only need to fold the first char of the target
6663                          * string.  It the source wasn't utf8, is 1 byte long */
6664                         to_utf8_fold(utf8_p, folded, &foldlen);
6665                         total_foldlen = foldlen;
6666                         map_fold_len_back[foldlen] = (utf8_target)
6667                                                      ? UTF8SKIP(utf8_p)
6668                                                      : 1;
6669                     }
6670                     else {
6671
6672                         /* Here, need to fold more than the first char.  Do so
6673                          * up to the limits */
6674                         U8* source_ptr = utf8_p;    /* The source for the fold
6675                                                        is the regex target
6676                                                        string */
6677                         U8* folded_ptr = folded;
6678                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6679                                                        available byte in the
6680                                                        target string */
6681                         U8 i;
6682                         for (i = 0;
6683                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6684                              i++)
6685                         {
6686
6687                             /* Fold the next character */
6688                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6689                             STRLEN this_char_foldlen;
6690                             to_utf8_fold(source_ptr,
6691                                          this_char_folded,
6692                                          &this_char_foldlen);
6693
6694                             /* Bail if it would exceed the byte limit for
6695                              * folding a single char. */
6696                             if (this_char_foldlen + folded_ptr - folded >
6697                                                             UTF8_MAXBYTES_CASE)
6698                             {
6699                                 break;
6700                             }
6701
6702                             /* Add the fold of this character */
6703                             Copy(this_char_folded,
6704                                  folded_ptr,
6705                                  this_char_foldlen,
6706                                  U8);
6707                             source_ptr += UTF8SKIP(source_ptr);
6708                             folded_ptr += this_char_foldlen;
6709                             total_foldlen = folded_ptr - folded;
6710
6711                             /* Create map from the number of bytes in the fold
6712                              * back to the number of bytes in the source.  If
6713                              * the source isn't utf8, the byte count is just
6714                              * the number of characters so far */
6715                             map_fold_len_back[total_foldlen]
6716                                                       = (utf8_target)
6717                                                         ? source_ptr - utf8_p
6718                                                         : i + 1;
6719                         }
6720                         *folded_ptr = '\0';
6721                     }
6722
6723
6724                     /* Do the linear search to see if the fold is in the list
6725                      * of multi-char folds. */
6726                     if (av) {
6727                         I32 i;
6728                         for (i = 0; i <= av_len(av); i++) {
6729                             SV* const sv = *av_fetch(av, i, FALSE);
6730                             STRLEN len;
6731                             const char * const s = SvPV_const(sv, len);
6732
6733                             if (len <= total_foldlen
6734                                 && memEQ(s, (char*)folded, len)
6735
6736                                    /* If 0, means matched a partial char. See
6737                                     * [perl #90536] */
6738                                 && map_fold_len_back[len])
6739                             {
6740
6741                                 /* Advance the target string ptr to account for
6742                                  * this fold, but have to translate from the
6743                                  * folded length to the corresponding source
6744                                  * length. */
6745                                 if (lenp) {
6746                                     *lenp = map_fold_len_back[len];
6747                                 }
6748                                 match = TRUE;
6749                                 break;
6750                             }
6751                         }
6752                     }
6753                 }
6754
6755                 /* If we allocated a string above, free it */
6756                 if (! utf8_target) Safefree(utf8_p);
6757             }
6758         }
6759     }
6760
6761     return (flags & ANYOF_INVERT) ? !match : match;
6762 }
6763
6764 STATIC U8 *
6765 S_reghop3(U8 *s, I32 off, const U8* lim)
6766 {
6767     dVAR;
6768
6769     PERL_ARGS_ASSERT_REGHOP3;
6770
6771     if (off >= 0) {
6772         while (off-- && s < lim) {
6773             /* XXX could check well-formedness here */
6774             s += UTF8SKIP(s);
6775         }
6776     }
6777     else {
6778         while (off++ && s > lim) {
6779             s--;
6780             if (UTF8_IS_CONTINUED(*s)) {
6781                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6782                     s--;
6783             }
6784             /* XXX could check well-formedness here */
6785         }
6786     }
6787     return s;
6788 }
6789
6790 #ifdef XXX_dmq
6791 /* there are a bunch of places where we use two reghop3's that should
6792    be replaced with this routine. but since thats not done yet
6793    we ifdef it out - dmq
6794 */
6795 STATIC U8 *
6796 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6797 {
6798     dVAR;
6799
6800     PERL_ARGS_ASSERT_REGHOP4;
6801
6802     if (off >= 0) {
6803         while (off-- && s < rlim) {
6804             /* XXX could check well-formedness here */
6805             s += UTF8SKIP(s);
6806         }
6807     }
6808     else {
6809         while (off++ && s > llim) {
6810             s--;
6811             if (UTF8_IS_CONTINUED(*s)) {
6812                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6813                     s--;
6814             }
6815             /* XXX could check well-formedness here */
6816         }
6817     }
6818     return s;
6819 }
6820 #endif
6821
6822 STATIC U8 *
6823 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6824 {
6825     dVAR;
6826
6827     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6828
6829     if (off >= 0) {
6830         while (off-- && s < lim) {
6831             /* XXX could check well-formedness here */
6832             s += UTF8SKIP(s);
6833         }
6834         if (off >= 0)
6835             return NULL;
6836     }
6837     else {
6838         while (off++ && s > lim) {
6839             s--;
6840             if (UTF8_IS_CONTINUED(*s)) {
6841                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6842                     s--;
6843             }
6844             /* XXX could check well-formedness here */
6845         }
6846         if (off <= 0)
6847             return NULL;
6848     }
6849     return s;
6850 }
6851
6852 static void
6853 restore_pos(pTHX_ void *arg)
6854 {
6855     dVAR;
6856     regexp * const rex = (regexp *)arg;
6857     if (PL_reg_eval_set) {
6858         if (PL_reg_oldsaved) {
6859             rex->subbeg = PL_reg_oldsaved;
6860             rex->sublen = PL_reg_oldsavedlen;
6861 #ifdef PERL_OLD_COPY_ON_WRITE
6862             rex->saved_copy = PL_nrs;
6863 #endif
6864             RXp_MATCH_COPIED_on(rex);
6865         }
6866         PL_reg_magic->mg_len = PL_reg_oldpos;
6867         PL_reg_eval_set = 0;
6868         PL_curpm = PL_reg_oldcurpm;
6869     }
6870 }
6871
6872 STATIC void
6873 S_to_utf8_substr(pTHX_ register regexp *prog)
6874 {
6875     int i = 1;
6876
6877     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6878
6879     do {
6880         if (prog->substrs->data[i].substr
6881             && !prog->substrs->data[i].utf8_substr) {
6882             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6883             prog->substrs->data[i].utf8_substr = sv;
6884             sv_utf8_upgrade(sv);
6885             if (SvVALID(prog->substrs->data[i].substr)) {
6886                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6887                 if (flags & FBMcf_TAIL) {
6888                     /* Trim the trailing \n that fbm_compile added last
6889                        time.  */
6890                     SvCUR_set(sv, SvCUR(sv) - 1);
6891                     /* Whilst this makes the SV technically "invalid" (as its
6892                        buffer is no longer followed by "\0") when fbm_compile()
6893                        adds the "\n" back, a "\0" is restored.  */
6894                 }
6895                 fbm_compile(sv, flags);
6896             }
6897             if (prog->substrs->data[i].substr == prog->check_substr)
6898                 prog->check_utf8 = sv;
6899         }
6900     } while (i--);
6901 }
6902
6903 STATIC void
6904 S_to_byte_substr(pTHX_ register regexp *prog)
6905 {
6906     dVAR;
6907     int i = 1;
6908
6909     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6910
6911     do {
6912         if (prog->substrs->data[i].utf8_substr
6913             && !prog->substrs->data[i].substr) {
6914             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6915             if (sv_utf8_downgrade(sv, TRUE)) {
6916                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6917                     const U8 flags
6918                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6919                     if (flags & FBMcf_TAIL) {
6920                         /* Trim the trailing \n that fbm_compile added last
6921                            time.  */
6922                         SvCUR_set(sv, SvCUR(sv) - 1);
6923                     }
6924                     fbm_compile(sv, flags);
6925                 }
6926             } else {
6927                 SvREFCNT_dec(sv);
6928                 sv = &PL_sv_undef;
6929             }
6930             prog->substrs->data[i].substr = sv;
6931             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6932                 prog->check_substr = sv;
6933         }
6934     } while (i--);
6935 }
6936
6937 /*
6938  * Local variables:
6939  * c-indentation-style: bsd
6940  * c-basic-offset: 4
6941  * indent-tabs-mode: t
6942  * End:
6943  *
6944  * ex: set ts=8 sts=4 sw=4 noet:
6945  */