5013011/orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1       /* tainted information used? e.g. locale */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  98  * call if there are no complications: i.e., if everything matchable is
  99  * straight forward in the bitmap */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 127
 128 /* Doesn't do an assert to verify that is correct */
 129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 130     if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 131
 132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 135
 136 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 137         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 138         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 139         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 140             * assert should likely and hopefully fail on an EBCDIC machine */ \
 141         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 142                                                                             \
 143         /* No asserts are done for these, in case called on an early        \
 144             * Unicode version in which they map to nothing */               \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 152
 153 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 154
 155 /* The actual code for CCC_TRY, which uses several variables from the routine
 156  * it's callable from.  It is designed to be the bulk of a case statement.
 157  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 158  *      nextchr matches the class.
 159  * UTF8_TEST is the whole test string to use for utf8 targets
 160  * LOAD is what to use to test, and if not present to load in the swash for the
 161  *      class
 162  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 163  *      UTF8_TEST test.
 164  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 165  * utf8 and a variant, load the swash if necessary and test using the utf8
 166  * test.  Advance to the next character if test is ok, otherwise fail; If not
 167  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 168  * fails, or advance to the next character */
 169
 170 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 171     if (locinput >= PL_regeol) {                                              \
 172         sayNO;                                                                \
 173     }                                                                         \
 174     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 175         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 176         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 177             sayNO;                                                            \
 178         }                                                                     \
 179         locinput += PL_utf8skip[nextchr];                                     \
 180         nextchr = UCHARAT(locinput);                                          \
 181         break;                                                                \
 182     }                                                                         \
 183     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 184         sayNO;                                                                \
 185     }                                                                         \
 186     nextchr = UCHARAT(++locinput);                                            \
 187     break;
 188
 189 /* Handle the non-locale cases for a character class and its complement.  It
 190  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 191  * This is because that code fails when the test succeeds, so we want to have
 192  * the test fail so that the code succeeds.  The swash is stored in a
 193  * predictable PL_ place */
 194 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 195                            CLASS, STR)                                        \
 196     case NAME:                                                                \
 197         _CCC_TRY_CODE( !, FUNC,                                               \
 198                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 199                                             (U8*)locinput, TRUE)),            \
 200                           CLASS, STR)                                         \
 201     case NNAME:                                                               \
 202         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 203                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 204                                             (U8*)locinput, TRUE)),            \
 205                           CLASS, STR)                                         \
 206
 207 /* Generate the case statements for both locale and non-locale character
 208  * classes in regmatch for classes that don't have special unicode semantics.
 209  * Locales don't use an immediate swash, but an intermediary special locale
 210  * function that is called on the pointer to the current place in the input
 211  * string.  That function will resolve to needing the same swash.  One might
 212  * think that because we don't know what the locale will match, we shouldn't
 213  * check with the swash loading function that it loaded properly; ie, that we
 214  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 215  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 216  * irrelevant here */
 217 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 218                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 219                 NAMEA, NNAMEA, FUNCA,                                         \
 220                 CLASS, STR)                                                   \
 221     case NAMEL:                                                               \
 222         PL_reg_flags |= RF_tainted;                                           \
 223         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 224     case NNAMEL:                                                              \
 225         PL_reg_flags |= RF_tainted;                                           \
 226         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 227                        CLASS, STR)                                            \
 228     case NAMEA:                                                               \
 229         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 230             sayNO;                                                            \
 231         }                                                                     \
 232         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 233         nextchr = UCHARAT(++locinput);                                        \
 234         break;                                                                \
 235     case NNAMEA:                                                              \
 236         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 237             sayNO;                                                            \
 238         }                                                                     \
 239         if (utf8_target) {                                                    \
 240             locinput += PL_utf8skip[nextchr];                                 \
 241             nextchr = UCHARAT(locinput);                                      \
 242         }                                                                     \
 243         else {                                                                \
 244             nextchr = UCHARAT(++locinput);                                    \
 245         }                                                                     \
 246         break;                                                                \
 247     /* Generate the non-locale cases */                                       \
 248     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 249
 250 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 251  * statements to handle separate Unicode semantics nodes */
 252 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 253                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 254                   NAMEU, NNAMEU, FUNCU,                                        \
 255                   NAMEA, NNAMEA, FUNCA,                                        \
 256                   CLASS, STR)                                                  \
 257     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 258             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 259             NAMEA, NNAMEA, FUNCA,                                              \
 260             CLASS, STR)                                                        \
 261     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 262
 263 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 264
 265 /* for use after a quantifier and before an EXACT-like node -- japhy */
 266 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 267  *
 268  * NOTE that *nothing* that affects backtracking should be in here, specifically
 269  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 270  * node that is in between two EXACT like nodes when ascertaining what the required
 271  * "follow" character is. This should probably be moved to regex compile time
 272  * although it may be done at run time beause of the REF possibility - more
 273  * investigation required. -- demerphq
 274 */
 275 #define JUMPABLE(rn) (      \
 276     OP(rn) == OPEN ||       \
 277     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 278     OP(rn) == EVAL ||   \
 279     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 280     OP(rn) == PLUS || OP(rn) == MINMOD || \
 281     OP(rn) == KEEPS || \
 282     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 283 )
 284 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 285
 286 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 287
 288 #if 0
 289 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 290    we don't need this definition. */
 291 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 292 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 293 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 294
 295 #else
 296 /* ... so we use this as its faster. */
 297 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 298 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 299 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 300 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 301
 302 #endif
 303
 304 /*
 305   Search for mandatory following text node; for lookahead, the text must
 306   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 307 */
 308 #define FIND_NEXT_IMPT(rn) STMT_START { \
 309     while (JUMPABLE(rn)) { \
 310         const OPCODE type = OP(rn); \
 311         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 312             rn = NEXTOPER(NEXTOPER(rn)); \
 313         else if (type == PLUS) \
 314             rn = NEXTOPER(rn); \
 315         else if (type == IFMATCH) \
 316             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 317         else rn += NEXT_OFF(rn); \
 318     } \
 319 } STMT_END
 320
 321
 322 static void restore_pos(pTHX_ void *arg);
 323
 324 #define REGCP_PAREN_ELEMS 4
 325 #define REGCP_OTHER_ELEMS 5
 326 #define REGCP_FRAME_ELEMS 1
 327 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 328  * are needed for the regexp context stack bookkeeping. */
 329
 330 STATIC CHECKPOINT
 331 S_regcppush(pTHX_ I32 parenfloor)
 332 {
 333     dVAR;
 334     const int retval = PL_savestack_ix;
 335     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 336     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 337     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 338     int p;
 339     GET_RE_DEBUG_FLAGS_DECL;
 340
 341     if (paren_elems_to_push < 0)
 342         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 343
 344     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 345         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 346                    " out of range (%lu-%ld)",
 347                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 348
 349     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 350
 351     for (p = PL_regsize; p > parenfloor; p--) {
 352 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 353         SSPUSHINT(PL_regoffs[p].end);
 354         SSPUSHINT(PL_regoffs[p].start);
 355         SSPUSHPTR(PL_reg_start_tmp[p]);
 356         SSPUSHINT(p);
 357         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 358           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 359                       (UV)p, (IV)PL_regoffs[p].start,
 360                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 361                       (IV)PL_regoffs[p].end
 362         ));
 363     }
 364 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 365     SSPUSHPTR(PL_regoffs);
 366     SSPUSHINT(PL_regsize);
 367     SSPUSHINT(*PL_reglastparen);
 368     SSPUSHINT(*PL_reglastcloseparen);
 369     SSPUSHPTR(PL_reginput);
 370     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 371
 372     return retval;
 373 }
 374
 375 /* These are needed since we do not localize EVAL nodes: */
 376 #define REGCP_SET(cp)                                           \
 377     DEBUG_STATE_r(                                              \
 378             PerlIO_printf(Perl_debug_log,                       \
 379                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 380                 (IV)PL_savestack_ix));                          \
 381     cp = PL_savestack_ix
 382
 383 #define REGCP_UNWIND(cp)                                        \
 384     DEBUG_STATE_r(                                              \
 385         if (cp != PL_savestack_ix)                              \
 386             PerlIO_printf(Perl_debug_log,                       \
 387                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 388                 (IV)(cp), (IV)PL_savestack_ix));                \
 389     regcpblow(cp)
 390
 391 STATIC char *
 392 S_regcppop(pTHX_ const regexp *rex)
 393 {
 394     dVAR;
 395     UV i;
 396     char *input;
 397     GET_RE_DEBUG_FLAGS_DECL;
 398
 399     PERL_ARGS_ASSERT_REGCPPOP;
 400
 401     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 402     i = SSPOPUV;
 403     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 404     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 405     input = (char *) SSPOPPTR;
 406     *PL_reglastcloseparen = SSPOPINT;
 407     *PL_reglastparen = SSPOPINT;
 408     PL_regsize = SSPOPINT;
 409     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 410
 411     i -= REGCP_OTHER_ELEMS;
 412     /* Now restore the parentheses context. */
 413     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 414         I32 tmps;
 415         U32 paren = (U32)SSPOPINT;
 416         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 417         PL_regoffs[paren].start = SSPOPINT;
 418         tmps = SSPOPINT;
 419         if (paren <= *PL_reglastparen)
 420             PL_regoffs[paren].end = tmps;
 421         DEBUG_BUFFERS_r(
 422             PerlIO_printf(Perl_debug_log,
 423                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 424                           (UV)paren, (IV)PL_regoffs[paren].start,
 425                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 426                           (IV)PL_regoffs[paren].end,
 427                           (paren > *PL_reglastparen ? "(no)" : ""));
 428         );
 429     }
 430     DEBUG_BUFFERS_r(
 431         if (*PL_reglastparen + 1 <= rex->nparens) {
 432             PerlIO_printf(Perl_debug_log,
 433                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 434                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 435         }
 436     );
 437 #if 1
 438     /* It would seem that the similar code in regtry()
 439      * already takes care of this, and in fact it is in
 440      * a better location to since this code can #if 0-ed out
 441      * but the code in regtry() is needed or otherwise tests
 442      * requiring null fields (pat.t#187 and split.t#{13,14}
 443      * (as of patchlevel 7877)  will fail.  Then again,
 444      * this code seems to be necessary or otherwise
 445      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 446      * --jhi updated by dapm */
 447     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 448         if (i > PL_regsize)
 449             PL_regoffs[i].start = -1;
 450         PL_regoffs[i].end = -1;
 451     }
 452 #endif
 453     return input;
 454 }
 455
 456 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 457
 458 /*
 459  * pregexec and friends
 460  */
 461
 462 #ifndef PERL_IN_XSUB_RE
 463 /*
 464  - pregexec - match a regexp against a string
 465  */
 466 I32
 467 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 468          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 469 /* strend: pointer to null at end of string */
 470 /* strbeg: real beginning of string */
 471 /* minend: end of match must be >=minend after stringarg. */
 472 /* nosave: For optimizations. */
 473 {
 474     PERL_ARGS_ASSERT_PREGEXEC;
 475
 476     return
 477         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 478                       nosave ? 0 : REXEC_COPY_STR);
 479 }
 480 #endif
 481
 482 /*
 483  * Need to implement the following flags for reg_anch:
 484  *
 485  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 486  * USE_INTUIT_ML
 487  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 488  * INTUIT_AUTORITATIVE_ML
 489  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 490  * INTUIT_ONCE_ML
 491  *
 492  * Another flag for this function: SECOND_TIME (so that float substrs
 493  * with giant delta may be not rechecked).
 494  */
 495
 496 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 497
 498 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 499    Otherwise, only SvCUR(sv) is used to get strbeg. */
 500
 501 /* XXXX We assume that strpos is strbeg unless sv. */
 502
 503 /* XXXX Some places assume that there is a fixed substring.
 504         An update may be needed if optimizer marks as "INTUITable"
 505         RExen without fixed substrings.  Similarly, it is assumed that
 506         lengths of all the strings are no more than minlen, thus they
 507         cannot come from lookahead.
 508         (Or minlen should take into account lookahead.)
 509   NOTE: Some of this comment is not correct. minlen does now take account
 510   of lookahead/behind. Further research is required. -- demerphq
 511
 512 */
 513
 514 /* A failure to find a constant substring means that there is no need to make
 515    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 516    finding a substring too deep into the string means that less calls to
 517    regtry() should be needed.
 518
 519    REx compiler's optimizer found 4 possible hints:
 520         a) Anchored substring;
 521         b) Fixed substring;
 522         c) Whether we are anchored (beginning-of-line or \G);
 523         d) First node (of those at offset 0) which may distinguish positions;
 524    We use a)b)d) and multiline-part of c), and try to find a position in the
 525    string which does not contradict any of them.
 526  */
 527
 528 /* Most of decisions we do here should have been done at compile time.
 529    The nodes of the REx which we used for the search should have been
 530    deleted from the finite automaton. */
 531
 532 char *
 533 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 534                      char *strend, const U32 flags, re_scream_pos_data *data)
 535 {
 536     dVAR;
 537     struct regexp *const prog = (struct regexp *)SvANY(rx);
 538     register I32 start_shift = 0;
 539     /* Should be nonnegative! */
 540     register I32 end_shift   = 0;
 541     register char *s;
 542     register SV *check;
 543     char *strbeg;
 544     char *t;
 545     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 546     I32 ml_anch;
 547     register char *other_last = NULL;   /* other substr checked before this */
 548     char *check_at = NULL;              /* check substr found at this pos */
 549     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 550     RXi_GET_DECL(prog,progi);
 551 #ifdef DEBUGGING
 552     const char * const i_strpos = strpos;
 553 #endif
 554     GET_RE_DEBUG_FLAGS_DECL;
 555
 556     PERL_ARGS_ASSERT_RE_INTUIT_START;
 557
 558     RX_MATCH_UTF8_set(rx,utf8_target);
 559
 560     if (RX_UTF8(rx)) {
 561         PL_reg_flags |= RF_utf8;
 562     }
 563     DEBUG_EXECUTE_r(
 564         debug_start_match(rx, utf8_target, strpos, strend,
 565             sv ? "Guessing start of match in sv for"
 566                : "Guessing start of match in string for");
 567               );
 568
 569     /* CHR_DIST() would be more correct here but it makes things slow. */
 570     if (prog->minlen > strend - strpos) {
 571         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 572                               "String too short... [re_intuit_start]\n"));
 573         goto fail;
 574     }
 575
 576     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 577     PL_regeol = strend;
 578     if (utf8_target) {
 579         if (!prog->check_utf8 && prog->check_substr)
 580             to_utf8_substr(prog);
 581         check = prog->check_utf8;
 582     } else {
 583         if (!prog->check_substr && prog->check_utf8)
 584             to_byte_substr(prog);
 585         check = prog->check_substr;
 586     }
 587     if (check == &PL_sv_undef) {
 588         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 589                 "Non-utf8 string cannot match utf8 check string\n"));
 590         goto fail;
 591     }
 592     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 593         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 594                      || ( (prog->extflags & RXf_ANCH_BOL)
 595                           && !multiline ) );    /* Check after \n? */
 596
 597         if (!ml_anch) {
 598           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 599                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 600                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 601                && sv && !SvROK(sv)
 602                && (strpos != strbeg)) {
 603               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 604               goto fail;
 605           }
 606           if (prog->check_offset_min == prog->check_offset_max &&
 607               !(prog->extflags & RXf_CANY_SEEN)) {
 608             /* Substring at constant offset from beg-of-str... */
 609             I32 slen;
 610
 611             s = HOP3c(strpos, prog->check_offset_min, strend);
 612
 613             if (SvTAIL(check)) {
 614                 slen = SvCUR(check);    /* >= 1 */
 615
 616                 if ( strend - s > slen || strend - s < slen - 1
 617                      || (strend - s == slen && strend[-1] != '\n')) {
 618                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 619                     goto fail_finish;
 620                 }
 621                 /* Now should match s[0..slen-2] */
 622                 slen--;
 623                 if (slen && (*SvPVX_const(check) != *s
 624                              || (slen > 1
 625                                  && memNE(SvPVX_const(check), s, slen)))) {
 626                   report_neq:
 627                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 628                     goto fail_finish;
 629                 }
 630             }
 631             else if (*SvPVX_const(check) != *s
 632                      || ((slen = SvCUR(check)) > 1
 633                          && memNE(SvPVX_const(check), s, slen)))
 634                 goto report_neq;
 635             check_at = s;
 636             goto success_at_start;
 637           }
 638         }
 639         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 640         s = strpos;
 641         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 642         end_shift = prog->check_end_shift;
 643
 644         if (!ml_anch) {
 645             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 646                                          - (SvTAIL(check) != 0);
 647             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 648
 649             if (end_shift < eshift)
 650                 end_shift = eshift;
 651         }
 652     }
 653     else {                              /* Can match at random position */
 654         ml_anch = 0;
 655         s = strpos;
 656         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 657         end_shift = prog->check_end_shift;
 658
 659         /* end shift should be non negative here */
 660     }
 661
 662 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 663     if (end_shift < 0)
 664         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 665                    (IV)end_shift, RX_PRECOMP(prog));
 666 #endif
 667
 668   restart:
 669     /* Find a possible match in the region s..strend by looking for
 670        the "check" substring in the region corrected by start/end_shift. */
 671
 672     {
 673         I32 srch_start_shift = start_shift;
 674         I32 srch_end_shift = end_shift;
 675         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 676             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 677             srch_start_shift = strbeg - s;
 678         }
 679     DEBUG_OPTIMISE_MORE_r({
 680         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 681             (IV)prog->check_offset_min,
 682             (IV)srch_start_shift,
 683             (IV)srch_end_shift,
 684             (IV)prog->check_end_shift);
 685     });
 686
 687     if (flags & REXEC_SCREAM) {
 688         I32 p = -1;                     /* Internal iterator of scream. */
 689         I32 * const pp = data ? data->scream_pos : &p;
 690
 691         if (PL_screamfirst[BmRARE(check)] >= 0
 692             || ( BmRARE(check) == '\n'
 693                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 694                  && SvTAIL(check) ))
 695             s = screaminstr(sv, check,
 696                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 697         else
 698             goto fail_finish;
 699         /* we may be pointing at the wrong string */
 700         if (s && RXp_MATCH_COPIED(prog))
 701             s = strbeg + (s - SvPVX_const(sv));
 702         if (data)
 703             *data->scream_olds = s;
 704     }
 705     else {
 706         U8* start_point;
 707         U8* end_point;
 708         if (prog->extflags & RXf_CANY_SEEN) {
 709             start_point= (U8*)(s + srch_start_shift);
 710             end_point= (U8*)(strend - srch_end_shift);
 711         } else {
 712             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 713             end_point= HOP3(strend, -srch_end_shift, strbeg);
 714         }
 715         DEBUG_OPTIMISE_MORE_r({
 716             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 717                 (int)(end_point - start_point),
 718                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 719                 start_point);
 720         });
 721
 722         s = fbm_instr( start_point, end_point,
 723                       check, multiline ? FBMrf_MULTILINE : 0);
 724     }
 725     }
 726     /* Update the count-of-usability, remove useless subpatterns,
 727         unshift s.  */
 728
 729     DEBUG_EXECUTE_r({
 730         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 731             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 732         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 733                           (s ? "Found" : "Did not find"),
 734             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 735                 ? "anchored" : "floating"),
 736             quoted,
 737             RE_SV_TAIL(check),
 738             (s ? " at offset " : "...\n") );
 739     });
 740
 741     if (!s)
 742         goto fail_finish;
 743     /* Finish the diagnostic message */
 744     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 745
 746     /* XXX dmq: first branch is for positive lookbehind...
 747        Our check string is offset from the beginning of the pattern.
 748        So we need to do any stclass tests offset forward from that
 749        point. I think. :-(
 750      */
 751
 752
 753
 754     check_at=s;
 755
 756
 757     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 758        Start with the other substr.
 759        XXXX no SCREAM optimization yet - and a very coarse implementation
 760        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 761                 *always* match.  Probably should be marked during compile...
 762        Probably it is right to do no SCREAM here...
 763      */
 764
 765     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 766                 : (prog->float_substr && prog->anchored_substr))
 767     {
 768         /* Take into account the "other" substring. */
 769         /* XXXX May be hopelessly wrong for UTF... */
 770         if (!other_last)
 771             other_last = strpos;
 772         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 773           do_other_anchored:
 774             {
 775                 char * const last = HOP3c(s, -start_shift, strbeg);
 776                 char *last1, *last2;
 777                 char * const saved_s = s;
 778                 SV* must;
 779
 780                 t = s - prog->check_offset_max;
 781                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 782                     && (!utf8_target
 783                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 784                             && t > strpos)))
 785                     NOOP;
 786                 else
 787                     t = strpos;
 788                 t = HOP3c(t, prog->anchored_offset, strend);
 789                 if (t < other_last)     /* These positions already checked */
 790                     t = other_last;
 791                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 792                 if (last < last1)
 793                     last1 = last;
 794                 /* XXXX It is not documented what units *_offsets are in.
 795                    We assume bytes, but this is clearly wrong.
 796                    Meaning this code needs to be carefully reviewed for errors.
 797                    dmq.
 798                   */
 799
 800                 /* On end-of-str: see comment below. */
 801                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 802                 if (must == &PL_sv_undef) {
 803                     s = (char*)NULL;
 804                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 805                 }
 806                 else
 807                     s = fbm_instr(
 808                         (unsigned char*)t,
 809                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 810                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 811                         must,
 812                         multiline ? FBMrf_MULTILINE : 0
 813                     );
 814                 DEBUG_EXECUTE_r({
 815                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 816                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 817                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 818                         (s ? "Found" : "Contradicts"),
 819                         quoted, RE_SV_TAIL(must));
 820                 });
 821
 822
 823                 if (!s) {
 824                     if (last1 >= last2) {
 825                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 826                                                 ", giving up...\n"));
 827                         goto fail_finish;
 828                     }
 829                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 830                         ", trying floating at offset %ld...\n",
 831                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 832                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 833                     s = HOP3c(last, 1, strend);
 834                     goto restart;
 835                 }
 836                 else {
 837                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 838                           (long)(s - i_strpos)));
 839                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 840                     other_last = HOP3c(s, 1, strend);
 841                     s = saved_s;
 842                     if (t == strpos)
 843                         goto try_at_start;
 844                     goto try_at_offset;
 845                 }
 846             }
 847         }
 848         else {          /* Take into account the floating substring. */
 849             char *last, *last1;
 850             char * const saved_s = s;
 851             SV* must;
 852
 853             t = HOP3c(s, -start_shift, strbeg);
 854             last1 = last =
 855                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 856             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 857                 last = HOP3c(t, prog->float_max_offset, strend);
 858             s = HOP3c(t, prog->float_min_offset, strend);
 859             if (s < other_last)
 860                 s = other_last;
 861  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 862             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 863             /* fbm_instr() takes into account exact value of end-of-str
 864                if the check is SvTAIL(ed).  Since false positives are OK,
 865                and end-of-str is not later than strend we are OK. */
 866             if (must == &PL_sv_undef) {
 867                 s = (char*)NULL;
 868                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 869             }
 870             else
 871                 s = fbm_instr((unsigned char*)s,
 872                               (unsigned char*)last + SvCUR(must)
 873                                   - (SvTAIL(must)!=0),
 874                               must, multiline ? FBMrf_MULTILINE : 0);
 875             DEBUG_EXECUTE_r({
 876                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 877                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 878                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 879                     (s ? "Found" : "Contradicts"),
 880                     quoted, RE_SV_TAIL(must));
 881             });
 882             if (!s) {
 883                 if (last1 == last) {
 884                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 885                                             ", giving up...\n"));
 886                     goto fail_finish;
 887                 }
 888                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 889                     ", trying anchored starting at offset %ld...\n",
 890                     (long)(saved_s + 1 - i_strpos)));
 891                 other_last = last;
 892                 s = HOP3c(t, 1, strend);
 893                 goto restart;
 894             }
 895             else {
 896                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 897                       (long)(s - i_strpos)));
 898                 other_last = s; /* Fix this later. --Hugo */
 899                 s = saved_s;
 900                 if (t == strpos)
 901                     goto try_at_start;
 902                 goto try_at_offset;
 903             }
 904         }
 905     }
 906
 907
 908     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 909
 910     DEBUG_OPTIMISE_MORE_r(
 911         PerlIO_printf(Perl_debug_log,
 912             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 913             (IV)prog->check_offset_min,
 914             (IV)prog->check_offset_max,
 915             (IV)(s-strpos),
 916             (IV)(t-strpos),
 917             (IV)(t-s),
 918             (IV)(strend-strpos)
 919         )
 920     );
 921
 922     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 923         && (!utf8_target
 924             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 925                  && t > strpos)))
 926     {
 927         /* Fixed substring is found far enough so that the match
 928            cannot start at strpos. */
 929       try_at_offset:
 930         if (ml_anch && t[-1] != '\n') {
 931             /* Eventually fbm_*() should handle this, but often
 932                anchored_offset is not 0, so this check will not be wasted. */
 933             /* XXXX In the code below we prefer to look for "^" even in
 934                presence of anchored substrings.  And we search even
 935                beyond the found float position.  These pessimizations
 936                are historical artefacts only.  */
 937           find_anchor:
 938             while (t < strend - prog->minlen) {
 939                 if (*t == '\n') {
 940                     if (t < check_at - prog->check_offset_min) {
 941                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 942                             /* Since we moved from the found position,
 943                                we definitely contradict the found anchored
 944                                substr.  Due to the above check we do not
 945                                contradict "check" substr.
 946                                Thus we can arrive here only if check substr
 947                                is float.  Redo checking for "other"=="fixed".
 948                              */
 949                             strpos = t + 1;
 950                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 951                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 952                             goto do_other_anchored;
 953                         }
 954                         /* We don't contradict the found floating substring. */
 955                         /* XXXX Why not check for STCLASS? */
 956                         s = t + 1;
 957                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 958                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 959                         goto set_useful;
 960                     }
 961                     /* Position contradicts check-string */
 962                     /* XXXX probably better to look for check-string
 963                        than for "\n", so one should lower the limit for t? */
 964                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 965                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 966                     other_last = strpos = s = t + 1;
 967                     goto restart;
 968                 }
 969                 t++;
 970             }
 971             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 972                         PL_colors[0], PL_colors[1]));
 973             goto fail_finish;
 974         }
 975         else {
 976             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 977                         PL_colors[0], PL_colors[1]));
 978         }
 979         s = t;
 980       set_useful:
 981         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 982     }
 983     else {
 984         /* The found string does not prohibit matching at strpos,
 985            - no optimization of calling REx engine can be performed,
 986            unless it was an MBOL and we are not after MBOL,
 987            or a future STCLASS check will fail this. */
 988       try_at_start:
 989         /* Even in this situation we may use MBOL flag if strpos is offset
 990            wrt the start of the string. */
 991         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 992             && (strpos != strbeg) && strpos[-1] != '\n'
 993             /* May be due to an implicit anchor of m{.*foo}  */
 994             && !(prog->intflags & PREGf_IMPLICIT))
 995         {
 996             t = strpos;
 997             goto find_anchor;
 998         }
 999         DEBUG_EXECUTE_r( if (ml_anch)
1000             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1001                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1002         );
1003       success_at_start:
1004         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1005             && (utf8_target ? (
1006                 prog->check_utf8                /* Could be deleted already */
1007                 && --BmUSEFUL(prog->check_utf8) < 0
1008                 && (prog->check_utf8 == prog->float_utf8)
1009             ) : (
1010                 prog->check_substr              /* Could be deleted already */
1011                 && --BmUSEFUL(prog->check_substr) < 0
1012                 && (prog->check_substr == prog->float_substr)
1013             )))
1014         {
1015             /* If flags & SOMETHING - do not do it many times on the same match */
1016             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1017             /* XXX Does the destruction order has to change with utf8_target? */
1018             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1019             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1020             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1021             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1022             check = NULL;                       /* abort */
1023             s = strpos;
1024             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1025                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1026             if (prog->intflags & PREGf_IMPLICIT)
1027                 prog->extflags &= ~RXf_ANCH_MBOL;
1028             /* XXXX This is a remnant of the old implementation.  It
1029                     looks wasteful, since now INTUIT can use many
1030                     other heuristics. */
1031             prog->extflags &= ~RXf_USE_INTUIT;
1032             /* XXXX What other flags might need to be cleared in this branch? */
1033         }
1034         else
1035             s = strpos;
1036     }
1037
1038     /* Last resort... */
1039     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1040     /* trie stclasses are too expensive to use here, we are better off to
1041        leave it to regmatch itself */
1042     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1043         /* minlen == 0 is possible if regstclass is \b or \B,
1044            and the fixed substr is ''$.
1045            Since minlen is already taken into account, s+1 is before strend;
1046            accidentally, minlen >= 1 guaranties no false positives at s + 1
1047            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1048            regstclass does not come from lookahead...  */
1049         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1050            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1051         const U8* const str = (U8*)STRING(progi->regstclass);
1052         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1053                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1054                     : 1);
1055         char * endpos;
1056         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1057             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1058         else if (prog->float_substr || prog->float_utf8)
1059             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1060         else
1061             endpos= strend;
1062
1063         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1064                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1065
1066         t = s;
1067         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1068         if (!s) {
1069 #ifdef DEBUGGING
1070             const char *what = NULL;
1071 #endif
1072             if (endpos == strend) {
1073                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1074                                 "Could not match STCLASS...\n") );
1075                 goto fail;
1076             }
1077             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1078                                    "This position contradicts STCLASS...\n") );
1079             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1080                 goto fail;
1081             /* Contradict one of substrings */
1082             if (prog->anchored_substr || prog->anchored_utf8) {
1083                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1084                     DEBUG_EXECUTE_r( what = "anchored" );
1085                   hop_and_restart:
1086                     s = HOP3c(t, 1, strend);
1087                     if (s + start_shift + end_shift > strend) {
1088                         /* XXXX Should be taken into account earlier? */
1089                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1090                                                "Could not match STCLASS...\n") );
1091                         goto fail;
1092                     }
1093                     if (!check)
1094                         goto giveup;
1095                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1096                                 "Looking for %s substr starting at offset %ld...\n",
1097                                  what, (long)(s + start_shift - i_strpos)) );
1098                     goto restart;
1099                 }
1100                 /* Have both, check_string is floating */
1101                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1102                     goto retry_floating_check;
1103                 /* Recheck anchored substring, but not floating... */
1104                 s = check_at;
1105                 if (!check)
1106                     goto giveup;
1107                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1108                           "Looking for anchored substr starting at offset %ld...\n",
1109                           (long)(other_last - i_strpos)) );
1110                 goto do_other_anchored;
1111             }
1112             /* Another way we could have checked stclass at the
1113                current position only: */
1114             if (ml_anch) {
1115                 s = t = t + 1;
1116                 if (!check)
1117                     goto giveup;
1118                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119                           "Looking for /%s^%s/m starting at offset %ld...\n",
1120                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1121                 goto try_at_offset;
1122             }
1123             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1124                 goto fail;
1125             /* Check is floating substring. */
1126           retry_floating_check:
1127             t = check_at - start_shift;
1128             DEBUG_EXECUTE_r( what = "floating" );
1129             goto hop_and_restart;
1130         }
1131         if (t != s) {
1132             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1133                         "By STCLASS: moving %ld --> %ld\n",
1134                                   (long)(t - i_strpos), (long)(s - i_strpos))
1135                    );
1136         }
1137         else {
1138             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1139                                   "Does not contradict STCLASS...\n");
1140                    );
1141         }
1142     }
1143   giveup:
1144     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1145                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1146                           PL_colors[5], (long)(s - i_strpos)) );
1147     return s;
1148
1149   fail_finish:                          /* Substring not found */
1150     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1151         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1152   fail:
1153     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1154                           PL_colors[4], PL_colors[5]));
1155     return NULL;
1156 }
1157
1158 #define DECL_TRIE_TYPE(scan) \
1159     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1160                     trie_type = (scan->flags != EXACT) \
1161                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1162                               : (utf8_target ? trie_utf8 : trie_plain)
1163
1164 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1165 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1166     switch (trie_type) {                                                    \
1167     case trie_utf8_fold:                                                    \
1168         if ( foldlen>0 ) {                                                  \
1169             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1170             foldlen -= len;                                                 \
1171             uscan += len;                                                   \
1172             len=0;                                                          \
1173         } else {                                                            \
1174             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1175             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1176             foldlen -= UNISKIP( uvc );                                      \
1177             uscan = foldbuf + UNISKIP( uvc );                               \
1178         }                                                                   \
1179         break;                                                              \
1180     case trie_latin_utf8_fold:                                              \
1181         if ( foldlen>0 ) {                                                  \
1182             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1183             foldlen -= len;                                                 \
1184             uscan += len;                                                   \
1185             len=0;                                                          \
1186         } else {                                                            \
1187             len = 1;                                                        \
1188             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1189             foldlen -= UNISKIP( uvc );                                      \
1190             uscan = foldbuf + UNISKIP( uvc );                               \
1191         }                                                                   \
1192         break;                                                              \
1193     case trie_utf8:                                                         \
1194         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1195         break;                                                              \
1196     case trie_plain:                                                        \
1197         uvc = (UV)*uc;                                                      \
1198         len = 1;                                                            \
1199     }                                                                       \
1200     if (uvc < 256) {                                                        \
1201         charid = trie->charmap[ uvc ];                                      \
1202     }                                                                       \
1203     else {                                                                  \
1204         charid = 0;                                                         \
1205         if (widecharmap) {                                                  \
1206             SV** const svpp = hv_fetch(widecharmap,                         \
1207                         (char*)&uvc, sizeof(UV), 0);                        \
1208             if (svpp)                                                       \
1209                 charid = (U16)SvIV(*svpp);                                  \
1210         }                                                                   \
1211     }                                                                       \
1212 } STMT_END
1213
1214 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1215 STMT_START {                                              \
1216     while (s <= e) {                                      \
1217         if ( (CoNd)                                       \
1218              && (ln == 1 || folder(s, pat_string, ln))    \
1219              && (!reginfo || regtry(reginfo, &s)) )       \
1220             goto got_it;                                  \
1221         s++;                                              \
1222     }                                                     \
1223 } STMT_END
1224
1225 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1226 STMT_START {                                          \
1227     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1228         CoDe                                          \
1229         s += uskip;                                   \
1230     }                                                 \
1231 } STMT_END
1232
1233 #define REXEC_FBC_SCAN(CoDe)                          \
1234 STMT_START {                                          \
1235     while (s < strend) {                              \
1236         CoDe                                          \
1237         s++;                                          \
1238     }                                                 \
1239 } STMT_END
1240
1241 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1242 REXEC_FBC_UTF8_SCAN(                                  \
1243     if (CoNd) {                                       \
1244         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1245             goto got_it;                              \
1246         else                                          \
1247             tmp = doevery;                            \
1248     }                                                 \
1249     else                                              \
1250         tmp = 1;                                      \
1251 )
1252
1253 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1254 REXEC_FBC_SCAN(                                       \
1255     if (CoNd) {                                       \
1256         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1257             goto got_it;                              \
1258         else                                          \
1259             tmp = doevery;                            \
1260     }                                                 \
1261     else                                              \
1262         tmp = 1;                                      \
1263 )
1264
1265 #define REXEC_FBC_TRYIT               \
1266 if ((!reginfo || regtry(reginfo, &s))) \
1267     goto got_it
1268
1269 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1270     if (utf8_target) {                                             \
1271         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1272     }                                                          \
1273     else {                                                     \
1274         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1275     }
1276
1277 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1278     if (utf8_target) {                                             \
1279         UtFpReLoAd;                                            \
1280         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1281     }                                                          \
1282     else {                                                     \
1283         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1284     }
1285
1286 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1287     PL_reg_flags |= RF_tainted;                                \
1288     if (utf8_target) {                                             \
1289         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1290     }                                                          \
1291     else {                                                     \
1292         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1293     }
1294
1295 #define DUMP_EXEC_POS(li,s,doutf8) \
1296     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1297
1298
1299 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1300         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1301         tmp = TEST_NON_UTF8(tmp);                                              \
1302         REXEC_FBC_UTF8_SCAN(                                                   \
1303             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1304                 tmp = !tmp;                                                    \
1305                 IF_SUCCESS;                                                    \
1306             }                                                                  \
1307             else {                                                             \
1308                 IF_FAIL;                                                       \
1309             }                                                                  \
1310         );                                                                     \
1311
1312 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1313         if (s == PL_bostr) {                                                   \
1314             tmp = '\n';                                                        \
1315         }                                                                      \
1316         else {                                                                 \
1317             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1318             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1319         }                                                                      \
1320         tmp = TeSt1_UtF8;                                                      \
1321         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1322         REXEC_FBC_UTF8_SCAN(                                                   \
1323             if (tmp == ! (TeSt2_UtF8)) { \
1324                 tmp = !tmp;                                                    \
1325                 IF_SUCCESS;                                                    \
1326             }                                                                  \
1327             else {                                                             \
1328                 IF_FAIL;                                                       \
1329             }                                                                  \
1330         );                                                                     \
1331
1332 /* The only difference between the BOUND and NBOUND cases is that
1333  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1334  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1335  * with the other one being empty */
1336 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1337     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1338
1339 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1340     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1341
1342 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1343     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1344
1345 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1346     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1347
1348
1349 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1350  * be passed in completely with the variable name being tested, which isn't
1351  * such a clean interface, but this is easier to read than it was before.  We
1352  * are looking for the boundary (or non-boundary between a word and non-word
1353  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1354  * must be different.  Find the "wordness" of the character just prior to this
1355  * one, and compare it with the wordness of this one.  If they differ, we have
1356  * a boundary.  At the beginning of the string, pretend that the previous
1357  * character was a new-line */
1358 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1359     if (utf8_target) {                                                         \
1360                 UTF8_CODE \
1361     }                                                                          \
1362     else {  /* Not utf8 */                                                     \
1363         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1364         tmp = TEST_NON_UTF8(tmp);                                              \
1365         REXEC_FBC_SCAN(                                                        \
1366             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1367                 tmp = !tmp;                                                    \
1368                 IF_SUCCESS;                                                    \
1369             }                                                                  \
1370             else {                                                             \
1371                 IF_FAIL;                                                       \
1372             }                                                                  \
1373         );                                                                     \
1374     }                                                                          \
1375     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1376         goto got_it;
1377
1378 /* We know what class REx starts with.  Try to find this position... */
1379 /* if reginfo is NULL, its a dryrun */
1380 /* annoyingly all the vars in this routine have different names from their counterparts
1381    in regmatch. /grrr */
1382
1383 STATIC char *
1384 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1385     const char *strend, regmatch_info *reginfo)
1386 {
1387         dVAR;
1388         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1389         char *pat_string;   /* The pattern's exactish string */
1390         char *pat_end;      /* ptr to end char of pat_string */
1391         re_fold_t folder;       /* Function for computing non-utf8 folds */
1392         const U8 *fold_array;   /* array for folding ords < 256 */
1393         STRLEN ln;
1394         STRLEN lnc;
1395         register STRLEN uskip;
1396         U8 c1;
1397         U8 c2;
1398         char *e;
1399         register I32 tmp = 1;   /* Scratch variable? */
1400         register const bool utf8_target = PL_reg_match_utf8;
1401         UV utf8_fold_flags = 0;
1402         RXi_GET_DECL(prog,progi);
1403
1404         PERL_ARGS_ASSERT_FIND_BYCLASS;
1405
1406         /* We know what class it must start with. */
1407         switch (OP(c)) {
1408         case ANYOFV:
1409         case ANYOF:
1410             if (utf8_target || OP(c) == ANYOFV) {
1411                 STRLEN inclasslen = strend - s;
1412                 REXEC_FBC_UTF8_CLASS_SCAN(
1413                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1414             }
1415             else {
1416                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1417             }
1418             break;
1419         case CANY:
1420             REXEC_FBC_SCAN(
1421                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1422                     goto got_it;
1423                 else
1424                     tmp = doevery;
1425             );
1426             break;
1427
1428         case EXACTFA:
1429             if (UTF_PATTERN || utf8_target) {
1430                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1431                 goto do_exactf_utf8;
1432             }
1433             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1434             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1435             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1436
1437         case EXACTFU:
1438             if (UTF_PATTERN || utf8_target) {
1439                 utf8_fold_flags = 0;
1440                 goto do_exactf_utf8;
1441             }
1442             fold_array = PL_fold_latin1;
1443             folder = foldEQ_latin1;
1444             /* XXX This uses the full utf8 fold because if the pattern contains
1445              * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
1446              * There could be a new node type, say EXACTFU_SS, which is
1447              * generated by regcomp only if there is an 'ss', and then every
1448              * other case could goto do_exactf_non_utf8;*/
1449             goto do_exactf_utf8;
1450
1451         case EXACTF:
1452             if (UTF_PATTERN || utf8_target) {
1453                 utf8_fold_flags = 0;
1454                 goto do_exactf_utf8;
1455             }
1456             fold_array = PL_fold;
1457             folder = foldEQ;
1458             goto do_exactf_non_utf8;
1459
1460         case EXACTFL:
1461             if (UTF_PATTERN || utf8_target) {
1462                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1463                 goto do_exactf_utf8;
1464             }
1465             fold_array = PL_fold_locale;
1466             folder = foldEQ_locale;
1467
1468             /* FALL THROUGH */
1469
1470         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1471
1472             /* The idea in the non-utf8 EXACTF* cases is to first find the
1473              * first character of the EXACTF* node and then, if necessary,
1474              * case-insensitively compare the full text of the node.  c1 is the
1475              * first character.  c2 is its fold.  This logic will not work for
1476              * Unicode semantics and the german sharp ss, which hence should
1477              * not be compiled into a node that gets here. */
1478             pat_string = STRING(c);
1479             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1480
1481             e = HOP3c(strend, -((I32)ln), s);
1482
1483             if (!reginfo && e < s) {
1484                 e = s;                  /* Due to minlen logic of intuit() */
1485             }
1486
1487             c1 = *pat_string;
1488             c2 = fold_array[c1];
1489             if (c1 == c2) { /* If char and fold are the same */
1490                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1491             }
1492             else {
1493                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1494             }
1495             break;
1496
1497         do_exactf_utf8:
1498
1499             /* If one of the operands is in utf8, we can't use the simpler
1500              * folding above, due to the fact that many different characters
1501              * can have the same fold, or portion of a fold, or different-
1502              * length fold */
1503             pat_string = STRING(c);
1504             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1505             pat_end = pat_string + ln;
1506             lnc = (UTF_PATTERN) /* length to match in characters */
1507                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1508                     : ln;
1509
1510             e = HOP3c(strend, -((I32)lnc), s);
1511
1512             if (!reginfo && e < s) {
1513                 e = s;                  /* Due to minlen logic of intuit() */
1514             }
1515
1516             while (s <= e) {
1517                 char *my_strend= (char *)strend;
1518                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1519                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1520                     && (!reginfo || regtry(reginfo, &s)) )
1521                 {
1522                     goto got_it;
1523                 }
1524                 s += UTF8SKIP(s);
1525             }
1526             break;
1527         case BOUNDL:
1528             PL_reg_flags |= RF_tainted;
1529             FBC_BOUND(isALNUM_LC,
1530                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1531                       isALNUM_LC_utf8((U8*)s));
1532             break;
1533         case NBOUNDL:
1534             PL_reg_flags |= RF_tainted;
1535             FBC_NBOUND(isALNUM_LC,
1536                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1537                        isALNUM_LC_utf8((U8*)s));
1538             break;
1539         case BOUND:
1540             FBC_BOUND(isWORDCHAR,
1541                       isALNUM_uni(tmp),
1542                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1543             break;
1544         case BOUNDA:
1545             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1546                              isWORDCHAR_A(tmp),
1547                              isWORDCHAR_A((U8*)s));
1548             break;
1549         case NBOUND:
1550             FBC_NBOUND(isWORDCHAR,
1551                        isALNUM_uni(tmp),
1552                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1553             break;
1554         case NBOUNDA:
1555             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1556                               isWORDCHAR_A(tmp),
1557                               isWORDCHAR_A((U8*)s));
1558             break;
1559         case BOUNDU:
1560             FBC_BOUND(isWORDCHAR_L1,
1561                       isALNUM_uni(tmp),
1562                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1563             break;
1564         case NBOUNDU:
1565             FBC_NBOUND(isWORDCHAR_L1,
1566                        isALNUM_uni(tmp),
1567                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1568             break;
1569         case ALNUML:
1570             REXEC_FBC_CSCAN_TAINT(
1571                 isALNUM_LC_utf8((U8*)s),
1572                 isALNUM_LC(*s)
1573             );
1574             break;
1575         case ALNUMU:
1576             REXEC_FBC_CSCAN_PRELOAD(
1577                 LOAD_UTF8_CHARCLASS_ALNUM(),
1578                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1579                 isWORDCHAR_L1((U8) *s)
1580             );
1581             break;
1582         case ALNUM:
1583             REXEC_FBC_CSCAN_PRELOAD(
1584                 LOAD_UTF8_CHARCLASS_ALNUM(),
1585                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1586                 isWORDCHAR((U8) *s)
1587             );
1588             break;
1589         case ALNUMA:
1590             /* Don't need to worry about utf8, as it can match only a single
1591              * byte invariant character */
1592             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1593             break;
1594         case NALNUMU:
1595             REXEC_FBC_CSCAN_PRELOAD(
1596                 LOAD_UTF8_CHARCLASS_ALNUM(),
1597                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1598                 ! isWORDCHAR_L1((U8) *s)
1599             );
1600             break;
1601         case NALNUM:
1602             REXEC_FBC_CSCAN_PRELOAD(
1603                 LOAD_UTF8_CHARCLASS_ALNUM(),
1604                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1605                 ! isALNUM(*s)
1606             );
1607             break;
1608         case NALNUMA:
1609             REXEC_FBC_CSCAN(
1610                 !isWORDCHAR_A(*s),
1611                 !isWORDCHAR_A(*s)
1612             );
1613             break;
1614         case NALNUML:
1615             REXEC_FBC_CSCAN_TAINT(
1616                 !isALNUM_LC_utf8((U8*)s),
1617                 !isALNUM_LC(*s)
1618             );
1619             break;
1620         case SPACEU:
1621             REXEC_FBC_CSCAN_PRELOAD(
1622                 LOAD_UTF8_CHARCLASS_SPACE(),
1623                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1624                 isSPACE_L1((U8) *s)
1625             );
1626             break;
1627         case SPACE:
1628             REXEC_FBC_CSCAN_PRELOAD(
1629                 LOAD_UTF8_CHARCLASS_SPACE(),
1630                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1631                 isSPACE((U8) *s)
1632             );
1633             break;
1634         case SPACEA:
1635             /* Don't need to worry about utf8, as it can match only a single
1636              * byte invariant character */
1637             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1638             break;
1639         case SPACEL:
1640             REXEC_FBC_CSCAN_TAINT(
1641                 isSPACE_LC_utf8((U8*)s),
1642                 isSPACE_LC(*s)
1643             );
1644             break;
1645         case NSPACEU:
1646             REXEC_FBC_CSCAN_PRELOAD(
1647                 LOAD_UTF8_CHARCLASS_SPACE(),
1648                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1649                 ! isSPACE_L1((U8) *s)
1650             );
1651             break;
1652         case NSPACE:
1653             REXEC_FBC_CSCAN_PRELOAD(
1654                 LOAD_UTF8_CHARCLASS_SPACE(),
1655                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1656                 ! isSPACE((U8) *s)
1657             );
1658             break;
1659         case NSPACEA:
1660             REXEC_FBC_CSCAN(
1661                 !isSPACE_A(*s),
1662                 !isSPACE_A(*s)
1663             );
1664             break;
1665         case NSPACEL:
1666             REXEC_FBC_CSCAN_TAINT(
1667                 !isSPACE_LC_utf8((U8*)s),
1668                 !isSPACE_LC(*s)
1669             );
1670             break;
1671         case DIGIT:
1672             REXEC_FBC_CSCAN_PRELOAD(
1673                 LOAD_UTF8_CHARCLASS_DIGIT(),
1674                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1675                 isDIGIT(*s)
1676             );
1677             break;
1678         case DIGITA:
1679             /* Don't need to worry about utf8, as it can match only a single
1680              * byte invariant character */
1681             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1682             break;
1683         case DIGITL:
1684             REXEC_FBC_CSCAN_TAINT(
1685                 isDIGIT_LC_utf8((U8*)s),
1686                 isDIGIT_LC(*s)
1687             );
1688             break;
1689         case NDIGIT:
1690             REXEC_FBC_CSCAN_PRELOAD(
1691                 LOAD_UTF8_CHARCLASS_DIGIT(),
1692                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1693                 !isDIGIT(*s)
1694             );
1695             break;
1696         case NDIGITA:
1697             REXEC_FBC_CSCAN(
1698                 !isDIGIT_A(*s),
1699                 !isDIGIT_A(*s)
1700             );
1701             break;
1702         case NDIGITL:
1703             REXEC_FBC_CSCAN_TAINT(
1704                 !isDIGIT_LC_utf8((U8*)s),
1705                 !isDIGIT_LC(*s)
1706             );
1707             break;
1708         case LNBREAK:
1709             REXEC_FBC_CSCAN(
1710                 is_LNBREAK_utf8(s),
1711                 is_LNBREAK_latin1(s)
1712             );
1713             break;
1714         case VERTWS:
1715             REXEC_FBC_CSCAN(
1716                 is_VERTWS_utf8(s),
1717                 is_VERTWS_latin1(s)
1718             );
1719             break;
1720         case NVERTWS:
1721             REXEC_FBC_CSCAN(
1722                 !is_VERTWS_utf8(s),
1723                 !is_VERTWS_latin1(s)
1724             );
1725             break;
1726         case HORIZWS:
1727             REXEC_FBC_CSCAN(
1728                 is_HORIZWS_utf8(s),
1729                 is_HORIZWS_latin1(s)
1730             );
1731             break;
1732         case NHORIZWS:
1733             REXEC_FBC_CSCAN(
1734                 !is_HORIZWS_utf8(s),
1735                 !is_HORIZWS_latin1(s)
1736             );
1737             break;
1738         case AHOCORASICKC:
1739         case AHOCORASICK:
1740             {
1741                 DECL_TRIE_TYPE(c);
1742                 /* what trie are we using right now */
1743                 reg_ac_data *aho
1744                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1745                 reg_trie_data *trie
1746                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1747                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1748
1749                 const char *last_start = strend - trie->minlen;
1750 #ifdef DEBUGGING
1751                 const char *real_start = s;
1752 #endif
1753                 STRLEN maxlen = trie->maxlen;
1754                 SV *sv_points;
1755                 U8 **points; /* map of where we were in the input string
1756                                 when reading a given char. For ASCII this
1757                                 is unnecessary overhead as the relationship
1758                                 is always 1:1, but for Unicode, especially
1759                                 case folded Unicode this is not true. */
1760                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1761                 U8 *bitmap=NULL;
1762
1763
1764                 GET_RE_DEBUG_FLAGS_DECL;
1765
1766                 /* We can't just allocate points here. We need to wrap it in
1767                  * an SV so it gets freed properly if there is a croak while
1768                  * running the match */
1769                 ENTER;
1770                 SAVETMPS;
1771                 sv_points=newSV(maxlen * sizeof(U8 *));
1772                 SvCUR_set(sv_points,
1773                     maxlen * sizeof(U8 *));
1774                 SvPOK_on(sv_points);
1775                 sv_2mortal(sv_points);
1776                 points=(U8**)SvPV_nolen(sv_points );
1777                 if ( trie_type != trie_utf8_fold
1778                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1779                 {
1780                     if (trie->bitmap)
1781                         bitmap=(U8*)trie->bitmap;
1782                     else
1783                         bitmap=(U8*)ANYOF_BITMAP(c);
1784                 }
1785                 /* this is the Aho-Corasick algorithm modified a touch
1786                    to include special handling for long "unknown char"
1787                    sequences. The basic idea being that we use AC as long
1788                    as we are dealing with a possible matching char, when
1789                    we encounter an unknown char (and we have not encountered
1790                    an accepting state) we scan forward until we find a legal
1791                    starting char.
1792                    AC matching is basically that of trie matching, except
1793                    that when we encounter a failing transition, we fall back
1794                    to the current states "fail state", and try the current char
1795                    again, a process we repeat until we reach the root state,
1796                    state 1, or a legal transition. If we fail on the root state
1797                    then we can either terminate if we have reached an accepting
1798                    state previously, or restart the entire process from the beginning
1799                    if we have not.
1800
1801                  */
1802                 while (s <= last_start) {
1803                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1804                     U8 *uc = (U8*)s;
1805                     U16 charid = 0;
1806                     U32 base = 1;
1807                     U32 state = 1;
1808                     UV uvc = 0;
1809                     STRLEN len = 0;
1810                     STRLEN foldlen = 0;
1811                     U8 *uscan = (U8*)NULL;
1812                     U8 *leftmost = NULL;
1813 #ifdef DEBUGGING
1814                     U32 accepted_word= 0;
1815 #endif
1816                     U32 pointpos = 0;
1817
1818                     while ( state && uc <= (U8*)strend ) {
1819                         int failed=0;
1820                         U32 word = aho->states[ state ].wordnum;
1821
1822                         if( state==1 ) {
1823                             if ( bitmap ) {
1824                                 DEBUG_TRIE_EXECUTE_r(
1825                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1826                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1827                                             (char *)uc, utf8_target );
1828                                         PerlIO_printf( Perl_debug_log,
1829                                             " Scanning for legal start char...\n");
1830                                     }
1831                                 );
1832                                 if (utf8_target) {
1833                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1834                                         uc += UTF8SKIP(uc);
1835                                     }
1836                                 } else {
1837                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1838                                         uc++;
1839                                     }
1840                                 }
1841                                 s= (char *)uc;
1842                             }
1843                             if (uc >(U8*)last_start) break;
1844                         }
1845
1846                         if ( word ) {
1847                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1848                             if (!leftmost || lpos < leftmost) {
1849                                 DEBUG_r(accepted_word=word);
1850                                 leftmost= lpos;
1851                             }
1852                             if (base==0) break;
1853
1854                         }
1855                         points[pointpos++ % maxlen]= uc;
1856                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1857                                              uscan, len, uvc, charid, foldlen,
1858                                              foldbuf, uniflags);
1859                         DEBUG_TRIE_EXECUTE_r({
1860                             dump_exec_pos( (char *)uc, c, strend, real_start,
1861                                 s,   utf8_target );
1862                             PerlIO_printf(Perl_debug_log,
1863                                 " Charid:%3u CP:%4"UVxf" ",
1864                                  charid, uvc);
1865                         });
1866
1867                         do {
1868 #ifdef DEBUGGING
1869                             word = aho->states[ state ].wordnum;
1870 #endif
1871                             base = aho->states[ state ].trans.base;
1872
1873                             DEBUG_TRIE_EXECUTE_r({
1874                                 if (failed)
1875                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1876                                         s,   utf8_target );
1877                                 PerlIO_printf( Perl_debug_log,
1878                                     "%sState: %4"UVxf", word=%"UVxf,
1879                                     failed ? " Fail transition to " : "",
1880                                     (UV)state, (UV)word);
1881                             });
1882                             if ( base ) {
1883                                 U32 tmp;
1884                                 I32 offset;
1885                                 if (charid &&
1886                                      ( ((offset = base + charid
1887                                         - 1 - trie->uniquecharcount)) >= 0)
1888                                      && ((U32)offset < trie->lasttrans)
1889                                      && trie->trans[offset].check == state
1890                                      && (tmp=trie->trans[offset].next))
1891                                 {
1892                                     DEBUG_TRIE_EXECUTE_r(
1893                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1894                                     state = tmp;
1895                                     break;
1896                                 }
1897                                 else {
1898                                     DEBUG_TRIE_EXECUTE_r(
1899                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1900                                     failed = 1;
1901                                     state = aho->fail[state];
1902                                 }
1903                             }
1904                             else {
1905                                 /* we must be accepting here */
1906                                 DEBUG_TRIE_EXECUTE_r(
1907                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1908                                 failed = 1;
1909                                 break;
1910                             }
1911                         } while(state);
1912                         uc += len;
1913                         if (failed) {
1914                             if (leftmost)
1915                                 break;
1916                             if (!state) state = 1;
1917                         }
1918                     }
1919                     if ( aho->states[ state ].wordnum ) {
1920                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1921                         if (!leftmost || lpos < leftmost) {
1922                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1923                             leftmost = lpos;
1924                         }
1925                     }
1926                     if (leftmost) {
1927                         s = (char*)leftmost;
1928                         DEBUG_TRIE_EXECUTE_r({
1929                             PerlIO_printf(
1930                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1931                                 (UV)accepted_word, (IV)(s - real_start)
1932                             );
1933                         });
1934                         if (!reginfo || regtry(reginfo, &s)) {
1935                             FREETMPS;
1936                             LEAVE;
1937                             goto got_it;
1938                         }
1939                         s = HOPc(s,1);
1940                         DEBUG_TRIE_EXECUTE_r({
1941                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1942                         });
1943                     } else {
1944                         DEBUG_TRIE_EXECUTE_r(
1945                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1946                         break;
1947                     }
1948                 }
1949                 FREETMPS;
1950                 LEAVE;
1951             }
1952             break;
1953         default:
1954             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1955             break;
1956         }
1957         return 0;
1958       got_it:
1959         return s;
1960 }
1961
1962
1963 /*
1964  - regexec_flags - match a regexp against a string
1965  */
1966 I32
1967 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1968               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1969 /* strend: pointer to null at end of string */
1970 /* strbeg: real beginning of string */
1971 /* minend: end of match must be >=minend after stringarg. */
1972 /* data: May be used for some additional optimizations.
1973          Currently its only used, with a U32 cast, for transmitting
1974          the ganch offset when doing a /g match. This will change */
1975 /* nosave: For optimizations. */
1976 {
1977     dVAR;
1978     struct regexp *const prog = (struct regexp *)SvANY(rx);
1979     /*register*/ char *s;
1980     register regnode *c;
1981     /*register*/ char *startpos = stringarg;
1982     I32 minlen;         /* must match at least this many chars */
1983     I32 dontbother = 0; /* how many characters not to try at end */
1984     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1985     I32 scream_pos = -1;                /* Internal iterator of scream. */
1986     char *scream_olds = NULL;
1987     const bool utf8_target = cBOOL(DO_UTF8(sv));
1988     I32 multiline;
1989     RXi_GET_DECL(prog,progi);
1990     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1991     regexp_paren_pair *swap = NULL;
1992     GET_RE_DEBUG_FLAGS_DECL;
1993
1994     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1995     PERL_UNUSED_ARG(data);
1996
1997     /* Be paranoid... */
1998     if (prog == NULL || startpos == NULL) {
1999         Perl_croak(aTHX_ "NULL regexp parameter");
2000         return 0;
2001     }
2002
2003     multiline = prog->extflags & RXf_PMf_MULTILINE;
2004     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2005
2006     RX_MATCH_UTF8_set(rx, utf8_target);
2007     DEBUG_EXECUTE_r(
2008         debug_start_match(rx, utf8_target, startpos, strend,
2009         "Matching");
2010     );
2011
2012     minlen = prog->minlen;
2013
2014     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2015         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2016                               "String too short [regexec_flags]...\n"));
2017         goto phooey;
2018     }
2019
2020
2021     /* Check validity of program. */
2022     if (UCHARAT(progi->program) != REG_MAGIC) {
2023         Perl_croak(aTHX_ "corrupted regexp program");
2024     }
2025
2026     PL_reg_flags = 0;
2027     PL_reg_eval_set = 0;
2028     PL_reg_maxiter = 0;
2029
2030     if (RX_UTF8(rx))
2031         PL_reg_flags |= RF_utf8;
2032
2033     /* Mark beginning of line for ^ and lookbehind. */
2034     reginfo.bol = startpos; /* XXX not used ??? */
2035     PL_bostr  = strbeg;
2036     reginfo.sv = sv;
2037
2038     /* Mark end of line for $ (and such) */
2039     PL_regeol = strend;
2040
2041     /* see how far we have to get to not match where we matched before */
2042     reginfo.till = startpos+minend;
2043
2044     /* If there is a "must appear" string, look for it. */
2045     s = startpos;
2046
2047     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2048         MAGIC *mg;
2049         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2050             reginfo.ganch = startpos + prog->gofs;
2051             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2052               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2053         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2054                   && SvMAGIC(sv)
2055                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2056                   && mg->mg_len >= 0) {
2057             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2058             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2059                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2060
2061             if (prog->extflags & RXf_ANCH_GPOS) {
2062                 if (s > reginfo.ganch)
2063                     goto phooey;
2064                 s = reginfo.ganch - prog->gofs;
2065                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2066                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2067                 if (s < strbeg)
2068                     goto phooey;
2069             }
2070         }
2071         else if (data) {
2072             reginfo.ganch = strbeg + PTR2UV(data);
2073             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2074                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2075
2076         } else {                                /* pos() not defined */
2077             reginfo.ganch = strbeg;
2078             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2079                  "GPOS: reginfo.ganch = strbeg\n"));
2080         }
2081     }
2082     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2083         /* We have to be careful. If the previous successful match
2084            was from this regex we don't want a subsequent partially
2085            successful match to clobber the old results.
2086            So when we detect this possibility we add a swap buffer
2087            to the re, and switch the buffer each match. If we fail
2088            we switch it back, otherwise we leave it swapped.
2089         */
2090         swap = prog->offs;
2091         /* do we need a save destructor here for eval dies? */
2092         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2093     }
2094     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2095         re_scream_pos_data d;
2096
2097         d.scream_olds = &scream_olds;
2098         d.scream_pos = &scream_pos;
2099         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2100         if (!s) {
2101             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2102             goto phooey;        /* not present */
2103         }
2104     }
2105
2106
2107
2108     /* Simplest case:  anchored match need be tried only once. */
2109     /*  [unless only anchor is BOL and multiline is set] */
2110     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2111         if (s == startpos && regtry(&reginfo, &startpos))
2112             goto got_it;
2113         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2114                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2115         {
2116             char *end;
2117
2118             if (minlen)
2119                 dontbother = minlen - 1;
2120             end = HOP3c(strend, -dontbother, strbeg) - 1;
2121             /* for multiline we only have to try after newlines */
2122             if (prog->check_substr || prog->check_utf8) {
2123                 /* because of the goto we can not easily reuse the macros for bifurcating the
2124                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2125                 if (utf8_target) {
2126                     if (s == startpos)
2127                         goto after_try_utf8;
2128                     while (1) {
2129                         if (regtry(&reginfo, &s)) {
2130                             goto got_it;
2131                         }
2132                       after_try_utf8:
2133                         if (s > end) {
2134                             goto phooey;
2135                         }
2136                         if (prog->extflags & RXf_USE_INTUIT) {
2137                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2138                             if (!s) {
2139                                 goto phooey;
2140                             }
2141                         }
2142                         else {
2143                             s += UTF8SKIP(s);
2144                         }
2145                     }
2146                 } /* end search for check string in unicode */
2147                 else {
2148                     if (s == startpos) {
2149                         goto after_try_latin;
2150                     }
2151                     while (1) {
2152                         if (regtry(&reginfo, &s)) {
2153                             goto got_it;
2154                         }
2155                       after_try_latin:
2156                         if (s > end) {
2157                             goto phooey;
2158                         }
2159                         if (prog->extflags & RXf_USE_INTUIT) {
2160                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2161                             if (!s) {
2162                                 goto phooey;
2163                             }
2164                         }
2165                         else {
2166                             s++;
2167                         }
2168                     }
2169                 } /* end search for check string in latin*/
2170             } /* end search for check string */
2171             else { /* search for newline */
2172                 if (s > startpos) {
2173                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2174                     s--;
2175                 }
2176                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2177                 while (s < end) {
2178                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2179                         if (regtry(&reginfo, &s))
2180                             goto got_it;
2181                     }
2182                 }
2183             } /* end search for newline */
2184         } /* end anchored/multiline check string search */
2185         goto phooey;
2186     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2187     {
2188         /* the warning about reginfo.ganch being used without initialization
2189            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2190            and we only enter this block when the same bit is set. */
2191         char *tmp_s = reginfo.ganch - prog->gofs;
2192
2193         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2194             goto got_it;
2195         goto phooey;
2196     }
2197
2198     /* Messy cases:  unanchored match. */
2199     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2200         /* we have /x+whatever/ */
2201         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2202         char ch;
2203 #ifdef DEBUGGING
2204         int did_match = 0;
2205 #endif
2206         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2207             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2208         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2209
2210         if (utf8_target) {
2211             REXEC_FBC_SCAN(
2212                 if (*s == ch) {
2213                     DEBUG_EXECUTE_r( did_match = 1 );
2214                     if (regtry(&reginfo, &s)) goto got_it;
2215                     s += UTF8SKIP(s);
2216                     while (s < strend && *s == ch)
2217                         s += UTF8SKIP(s);
2218                 }
2219             );
2220         }
2221         else {
2222             REXEC_FBC_SCAN(
2223                 if (*s == ch) {
2224                     DEBUG_EXECUTE_r( did_match = 1 );
2225                     if (regtry(&reginfo, &s)) goto got_it;
2226                     s++;
2227                     while (s < strend && *s == ch)
2228                         s++;
2229                 }
2230             );
2231         }
2232         DEBUG_EXECUTE_r(if (!did_match)
2233                 PerlIO_printf(Perl_debug_log,
2234                                   "Did not find anchored character...\n")
2235                );
2236     }
2237     else if (prog->anchored_substr != NULL
2238               || prog->anchored_utf8 != NULL
2239               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2240                   && prog->float_max_offset < strend - s)) {
2241         SV *must;
2242         I32 back_max;
2243         I32 back_min;
2244         char *last;
2245         char *last1;            /* Last position checked before */
2246 #ifdef DEBUGGING
2247         int did_match = 0;
2248 #endif
2249         if (prog->anchored_substr || prog->anchored_utf8) {
2250             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2251                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2252             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2253             back_max = back_min = prog->anchored_offset;
2254         } else {
2255             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2256                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2257             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2258             back_max = prog->float_max_offset;
2259             back_min = prog->float_min_offset;
2260         }
2261
2262
2263         if (must == &PL_sv_undef)
2264             /* could not downgrade utf8 check substring, so must fail */
2265             goto phooey;
2266
2267         if (back_min<0) {
2268             last = strend;
2269         } else {
2270             last = HOP3c(strend,        /* Cannot start after this */
2271                   -(I32)(CHR_SVLEN(must)
2272                          - (SvTAIL(must) != 0) + back_min), strbeg);
2273         }
2274         if (s > PL_bostr)
2275             last1 = HOPc(s, -1);
2276         else
2277             last1 = s - 1;      /* bogus */
2278
2279         /* XXXX check_substr already used to find "s", can optimize if
2280            check_substr==must. */
2281         scream_pos = -1;
2282         dontbother = end_shift;
2283         strend = HOPc(strend, -dontbother);
2284         while ( (s <= last) &&
2285                 ((flags & REXEC_SCREAM)
2286                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2287                                     end_shift, &scream_pos, 0))
2288                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2289                                   (unsigned char*)strend, must,
2290                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2291             /* we may be pointing at the wrong string */
2292             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2293                 s = strbeg + (s - SvPVX_const(sv));
2294             DEBUG_EXECUTE_r( did_match = 1 );
2295             if (HOPc(s, -back_max) > last1) {
2296                 last1 = HOPc(s, -back_min);
2297                 s = HOPc(s, -back_max);
2298             }
2299             else {
2300                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2301
2302                 last1 = HOPc(s, -back_min);
2303                 s = t;
2304             }
2305             if (utf8_target) {
2306                 while (s <= last1) {
2307                     if (regtry(&reginfo, &s))
2308                         goto got_it;
2309                     s += UTF8SKIP(s);
2310                 }
2311             }
2312             else {
2313                 while (s <= last1) {
2314                     if (regtry(&reginfo, &s))
2315                         goto got_it;
2316                     s++;
2317                 }
2318             }
2319         }
2320         DEBUG_EXECUTE_r(if (!did_match) {
2321             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2322                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2323             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2324                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2325                                ? "anchored" : "floating"),
2326                 quoted, RE_SV_TAIL(must));
2327         });
2328         goto phooey;
2329     }
2330     else if ( (c = progi->regstclass) ) {
2331         if (minlen) {
2332             const OPCODE op = OP(progi->regstclass);
2333             /* don't bother with what can't match */
2334             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2335                 strend = HOPc(strend, -(minlen - 1));
2336         }
2337         DEBUG_EXECUTE_r({
2338             SV * const prop = sv_newmortal();
2339             regprop(prog, prop, c);
2340             {
2341                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2342                     s,strend-s,60);
2343                 PerlIO_printf(Perl_debug_log,
2344                     "Matching stclass %.*s against %s (%d bytes)\n",
2345                     (int)SvCUR(prop), SvPVX_const(prop),
2346                      quoted, (int)(strend - s));
2347             }
2348         });
2349         if (find_byclass(prog, c, s, strend, &reginfo))
2350             goto got_it;
2351         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2352     }
2353     else {
2354         dontbother = 0;
2355         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2356             /* Trim the end. */
2357             char *last;
2358             SV* float_real;
2359
2360             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2361                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2362             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2363
2364             if (flags & REXEC_SCREAM) {
2365                 last = screaminstr(sv, float_real, s - strbeg,
2366                                    end_shift, &scream_pos, 1); /* last one */
2367                 if (!last)
2368                     last = scream_olds; /* Only one occurrence. */
2369                 /* we may be pointing at the wrong string */
2370                 else if (RXp_MATCH_COPIED(prog))
2371                     s = strbeg + (s - SvPVX_const(sv));
2372             }
2373             else {
2374                 STRLEN len;
2375                 const char * const little = SvPV_const(float_real, len);
2376
2377                 if (SvTAIL(float_real)) {
2378                     if (memEQ(strend - len + 1, little, len - 1))
2379                         last = strend - len + 1;
2380                     else if (!multiline)
2381                         last = memEQ(strend - len, little, len)
2382                             ? strend - len : NULL;
2383                     else
2384                         goto find_last;
2385                 } else {
2386                   find_last:
2387                     if (len)
2388                         last = rninstr(s, strend, little, little + len);
2389                     else
2390                         last = strend;  /* matching "$" */
2391                 }
2392             }
2393             if (last == NULL) {
2394                 DEBUG_EXECUTE_r(
2395                     PerlIO_printf(Perl_debug_log,
2396                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2397                         PL_colors[4], PL_colors[5]));
2398                 goto phooey; /* Should not happen! */
2399             }
2400             dontbother = strend - last + prog->float_min_offset;
2401         }
2402         if (minlen && (dontbother < minlen))
2403             dontbother = minlen - 1;
2404         strend -= dontbother;              /* this one's always in bytes! */
2405         /* We don't know much -- general case. */
2406         if (utf8_target) {
2407             for (;;) {
2408                 if (regtry(&reginfo, &s))
2409                     goto got_it;
2410                 if (s >= strend)
2411                     break;
2412                 s += UTF8SKIP(s);
2413             };
2414         }
2415         else {
2416             do {
2417                 if (regtry(&reginfo, &s))
2418                     goto got_it;
2419             } while (s++ < strend);
2420         }
2421     }
2422
2423     /* Failure. */
2424     goto phooey;
2425
2426 got_it:
2427     Safefree(swap);
2428     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2429
2430     if (PL_reg_eval_set)
2431         restore_pos(aTHX_ prog);
2432     if (RXp_PAREN_NAMES(prog))
2433         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2434
2435     /* make sure $`, $&, $', and $digit will work later */
2436     if ( !(flags & REXEC_NOT_FIRST) ) {
2437         RX_MATCH_COPY_FREE(rx);
2438         if (flags & REXEC_COPY_STR) {
2439             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2440 #ifdef PERL_OLD_COPY_ON_WRITE
2441             if ((SvIsCOW(sv)
2442                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2443                 if (DEBUG_C_TEST) {
2444                     PerlIO_printf(Perl_debug_log,
2445                                   "Copy on write: regexp capture, type %d\n",
2446                                   (int) SvTYPE(sv));
2447                 }
2448                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2449                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2450                 assert (SvPOKp(prog->saved_copy));
2451             } else
2452 #endif
2453             {
2454                 RX_MATCH_COPIED_on(rx);
2455                 s = savepvn(strbeg, i);
2456                 prog->subbeg = s;
2457             }
2458             prog->sublen = i;
2459         }
2460         else {
2461             prog->subbeg = strbeg;
2462             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2463         }
2464     }
2465
2466     return 1;
2467
2468 phooey:
2469     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2470                           PL_colors[4], PL_colors[5]));
2471     if (PL_reg_eval_set)
2472         restore_pos(aTHX_ prog);
2473     if (swap) {
2474         /* we failed :-( roll it back */
2475         Safefree(prog->offs);
2476         prog->offs = swap;
2477     }
2478
2479     return 0;
2480 }
2481
2482
2483 /*
2484  - regtry - try match at specific point
2485  */
2486 STATIC I32                      /* 0 failure, 1 success */
2487 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2488 {
2489     dVAR;
2490     CHECKPOINT lastcp;
2491     REGEXP *const rx = reginfo->prog;
2492     regexp *const prog = (struct regexp *)SvANY(rx);
2493     RXi_GET_DECL(prog,progi);
2494     GET_RE_DEBUG_FLAGS_DECL;
2495
2496     PERL_ARGS_ASSERT_REGTRY;
2497
2498     reginfo->cutpoint=NULL;
2499
2500     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2501         MAGIC *mg;
2502
2503         PL_reg_eval_set = RS_init;
2504         DEBUG_EXECUTE_r(DEBUG_s(
2505             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2506                           (IV)(PL_stack_sp - PL_stack_base));
2507             ));
2508         SAVESTACK_CXPOS();
2509         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2510         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2511         SAVETMPS;
2512         /* Apparently this is not needed, judging by wantarray. */
2513         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2514            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2515
2516         if (reginfo->sv) {
2517             /* Make $_ available to executed code. */
2518             if (reginfo->sv != DEFSV) {
2519                 SAVE_DEFSV;
2520                 DEFSV_set(reginfo->sv);
2521             }
2522
2523             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2524                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2525                 /* prepare for quick setting of pos */
2526 #ifdef PERL_OLD_COPY_ON_WRITE
2527                 if (SvIsCOW(reginfo->sv))
2528                     sv_force_normal_flags(reginfo->sv, 0);
2529 #endif
2530                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2531                                  &PL_vtbl_mglob, NULL, 0);
2532                 mg->mg_len = -1;
2533             }
2534             PL_reg_magic    = mg;
2535             PL_reg_oldpos   = mg->mg_len;
2536             SAVEDESTRUCTOR_X(restore_pos, prog);
2537         }
2538         if (!PL_reg_curpm) {
2539             Newxz(PL_reg_curpm, 1, PMOP);
2540 #ifdef USE_ITHREADS
2541             {
2542                 SV* const repointer = &PL_sv_undef;
2543                 /* this regexp is also owned by the new PL_reg_curpm, which
2544                    will try to free it.  */
2545                 av_push(PL_regex_padav, repointer);
2546                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2547                 PL_regex_pad = AvARRAY(PL_regex_padav);
2548             }
2549 #endif
2550         }
2551 #ifdef USE_ITHREADS
2552         /* It seems that non-ithreads works both with and without this code.
2553            So for efficiency reasons it seems best not to have the code
2554            compiled when it is not needed.  */
2555         /* This is safe against NULLs: */
2556         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2557         /* PM_reg_curpm owns a reference to this regexp.  */
2558         ReREFCNT_inc(rx);
2559 #endif
2560         PM_SETRE(PL_reg_curpm, rx);
2561         PL_reg_oldcurpm = PL_curpm;
2562         PL_curpm = PL_reg_curpm;
2563         if (RXp_MATCH_COPIED(prog)) {
2564             /*  Here is a serious problem: we cannot rewrite subbeg,
2565                 since it may be needed if this match fails.  Thus
2566                 $` inside (?{}) could fail... */
2567             PL_reg_oldsaved = prog->subbeg;
2568             PL_reg_oldsavedlen = prog->sublen;
2569 #ifdef PERL_OLD_COPY_ON_WRITE
2570             PL_nrs = prog->saved_copy;
2571 #endif
2572             RXp_MATCH_COPIED_off(prog);
2573         }
2574         else
2575             PL_reg_oldsaved = NULL;
2576         prog->subbeg = PL_bostr;
2577         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2578     }
2579     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2580     prog->offs[0].start = *startpos - PL_bostr;
2581     PL_reginput = *startpos;
2582     PL_reglastparen = &prog->lastparen;
2583     PL_reglastcloseparen = &prog->lastcloseparen;
2584     prog->lastparen = 0;
2585     prog->lastcloseparen = 0;
2586     PL_regsize = 0;
2587     PL_regoffs = prog->offs;
2588     if (PL_reg_start_tmpl <= prog->nparens) {
2589         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2590         if(PL_reg_start_tmp)
2591             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2592         else
2593             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2594     }
2595
2596     /* XXXX What this code is doing here?!!!  There should be no need
2597        to do this again and again, PL_reglastparen should take care of
2598        this!  --ilya*/
2599
2600     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2601      * Actually, the code in regcppop() (which Ilya may be meaning by
2602      * PL_reglastparen), is not needed at all by the test suite
2603      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2604      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2605      * Meanwhile, this code *is* needed for the
2606      * above-mentioned test suite tests to succeed.  The common theme
2607      * on those tests seems to be returning null fields from matches.
2608      * --jhi updated by dapm */
2609 #if 1
2610     if (prog->nparens) {
2611         regexp_paren_pair *pp = PL_regoffs;
2612         register I32 i;
2613         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2614             ++pp;
2615             pp->start = -1;
2616             pp->end = -1;
2617         }
2618     }
2619 #endif
2620     REGCP_SET(lastcp);
2621     if (regmatch(reginfo, progi->program + 1)) {
2622         PL_regoffs[0].end = PL_reginput - PL_bostr;
2623         return 1;
2624     }
2625     if (reginfo->cutpoint)
2626         *startpos= reginfo->cutpoint;
2627     REGCP_UNWIND(lastcp);
2628     return 0;
2629 }
2630
2631
2632 #define sayYES goto yes
2633 #define sayNO goto no
2634 #define sayNO_SILENT goto no_silent
2635
2636 /* we dont use STMT_START/END here because it leads to
2637    "unreachable code" warnings, which are bogus, but distracting. */
2638 #define CACHEsayNO \
2639     if (ST.cache_mask) \
2640        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2641     sayNO
2642
2643 /* this is used to determine how far from the left messages like
2644    'failed...' are printed. It should be set such that messages
2645    are inline with the regop output that created them.
2646 */
2647 #define REPORT_CODE_OFF 32
2648
2649
2650 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2651 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2652
2653 #define SLAB_FIRST(s) (&(s)->states[0])
2654 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2655
2656 /* grab a new slab and return the first slot in it */
2657
2658 STATIC regmatch_state *
2659 S_push_slab(pTHX)
2660 {
2661 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2662     dMY_CXT;
2663 #endif
2664     regmatch_slab *s = PL_regmatch_slab->next;
2665     if (!s) {
2666         Newx(s, 1, regmatch_slab);
2667         s->prev = PL_regmatch_slab;
2668         s->next = NULL;
2669         PL_regmatch_slab->next = s;
2670     }
2671     PL_regmatch_slab = s;
2672     return SLAB_FIRST(s);
2673 }
2674
2675
2676 /* push a new state then goto it */
2677
2678 #define PUSH_STATE_GOTO(state, node) \
2679     scan = node; \
2680     st->resume_state = state; \
2681     goto push_state;
2682
2683 /* push a new state with success backtracking, then goto it */
2684
2685 #define PUSH_YES_STATE_GOTO(state, node) \
2686     scan = node; \
2687     st->resume_state = state; \
2688     goto push_yes_state;
2689
2690
2691
2692 /*
2693
2694 regmatch() - main matching routine
2695
2696 This is basically one big switch statement in a loop. We execute an op,
2697 set 'next' to point the next op, and continue. If we come to a point which
2698 we may need to backtrack to on failure such as (A|B|C), we push a
2699 backtrack state onto the backtrack stack. On failure, we pop the top
2700 state, and re-enter the loop at the state indicated. If there are no more
2701 states to pop, we return failure.
2702
2703 Sometimes we also need to backtrack on success; for example /A+/, where
2704 after successfully matching one A, we need to go back and try to
2705 match another one; similarly for lookahead assertions: if the assertion
2706 completes successfully, we backtrack to the state just before the assertion
2707 and then carry on.  In these cases, the pushed state is marked as
2708 'backtrack on success too'. This marking is in fact done by a chain of
2709 pointers, each pointing to the previous 'yes' state. On success, we pop to
2710 the nearest yes state, discarding any intermediate failure-only states.
2711 Sometimes a yes state is pushed just to force some cleanup code to be
2712 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2713 it to free the inner regex.
2714
2715 Note that failure backtracking rewinds the cursor position, while
2716 success backtracking leaves it alone.
2717
2718 A pattern is complete when the END op is executed, while a subpattern
2719 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2720 ops trigger the "pop to last yes state if any, otherwise return true"
2721 behaviour.
2722
2723 A common convention in this function is to use A and B to refer to the two
2724 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2725 the subpattern to be matched possibly multiple times, while B is the entire
2726 rest of the pattern. Variable and state names reflect this convention.
2727
2728 The states in the main switch are the union of ops and failure/success of
2729 substates associated with with that op.  For example, IFMATCH is the op
2730 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2731 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2732 successfully matched A and IFMATCH_A_fail is a state saying that we have
2733 just failed to match A. Resume states always come in pairs. The backtrack
2734 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2735 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2736 on success or failure.
2737
2738 The struct that holds a backtracking state is actually a big union, with
2739 one variant for each major type of op. The variable st points to the
2740 top-most backtrack struct. To make the code clearer, within each
2741 block of code we #define ST to alias the relevant union.
2742
2743 Here's a concrete example of a (vastly oversimplified) IFMATCH
2744 implementation:
2745
2746     switch (state) {
2747     ....
2748
2749 #define ST st->u.ifmatch
2750
2751     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2752         ST.foo = ...; // some state we wish to save
2753         ...
2754         // push a yes backtrack state with a resume value of
2755         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2756         // first node of A:
2757         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2758         // NOTREACHED
2759
2760     case IFMATCH_A: // we have successfully executed A; now continue with B
2761         next = B;
2762         bar = ST.foo; // do something with the preserved value
2763         break;
2764
2765     case IFMATCH_A_fail: // A failed, so the assertion failed
2766         ...;   // do some housekeeping, then ...
2767         sayNO; // propagate the failure
2768
2769 #undef ST
2770
2771     ...
2772     }
2773
2774 For any old-timers reading this who are familiar with the old recursive
2775 approach, the code above is equivalent to:
2776
2777     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2778     {
2779         int foo = ...
2780         ...
2781         if (regmatch(A)) {
2782             next = B;
2783             bar = foo;
2784             break;
2785         }
2786         ...;   // do some housekeeping, then ...
2787         sayNO; // propagate the failure
2788     }
2789
2790 The topmost backtrack state, pointed to by st, is usually free. If you
2791 want to claim it, populate any ST.foo fields in it with values you wish to
2792 save, then do one of
2793
2794         PUSH_STATE_GOTO(resume_state, node);
2795         PUSH_YES_STATE_GOTO(resume_state, node);
2796
2797 which sets that backtrack state's resume value to 'resume_state', pushes a
2798 new free entry to the top of the backtrack stack, then goes to 'node'.
2799 On backtracking, the free slot is popped, and the saved state becomes the
2800 new free state. An ST.foo field in this new top state can be temporarily
2801 accessed to retrieve values, but once the main loop is re-entered, it
2802 becomes available for reuse.
2803
2804 Note that the depth of the backtrack stack constantly increases during the
2805 left-to-right execution of the pattern, rather than going up and down with
2806 the pattern nesting. For example the stack is at its maximum at Z at the
2807 end of the pattern, rather than at X in the following:
2808
2809     /(((X)+)+)+....(Y)+....Z/
2810
2811 The only exceptions to this are lookahead/behind assertions and the cut,
2812 (?>A), which pop all the backtrack states associated with A before
2813 continuing.
2814
2815 Backtrack state structs are allocated in slabs of about 4K in size.
2816 PL_regmatch_state and st always point to the currently active state,
2817 and PL_regmatch_slab points to the slab currently containing
2818 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2819 allocated, and is never freed until interpreter destruction. When the slab
2820 is full, a new one is allocated and chained to the end. At exit from
2821 regmatch(), slabs allocated since entry are freed.
2822
2823 */
2824
2825
2826 #define DEBUG_STATE_pp(pp)                                  \
2827     DEBUG_STATE_r({                                         \
2828         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2829         PerlIO_printf(Perl_debug_log,                       \
2830             "    %*s"pp" %s%s%s%s%s\n",                     \
2831             depth*2, "",                                    \
2832             PL_reg_name[st->resume_state],                     \
2833             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2834             ((st==yes_state) ? "Y" : ""),                   \
2835             ((st==mark_state) ? "M" : ""),                  \
2836             ((st==yes_state||st==mark_state) ? "]" : "")    \
2837         );                                                  \
2838     });
2839
2840
2841 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2842
2843 #ifdef DEBUGGING
2844
2845 STATIC void
2846 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2847     const char *start, const char *end, const char *blurb)
2848 {
2849     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2850
2851     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2852
2853     if (!PL_colorset)
2854             reginitcolors();
2855     {
2856         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2857             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2858
2859         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2860             start, end - start, 60);
2861
2862         PerlIO_printf(Perl_debug_log,
2863             "%s%s REx%s %s against %s\n",
2864                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2865
2866         if (utf8_target||utf8_pat)
2867             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2868                 utf8_pat ? "pattern" : "",
2869                 utf8_pat && utf8_target ? " and " : "",
2870                 utf8_target ? "string" : ""
2871             );
2872     }
2873 }
2874
2875 STATIC void
2876 S_dump_exec_pos(pTHX_ const char *locinput,
2877                       const regnode *scan,
2878                       const char *loc_regeol,
2879                       const char *loc_bostr,
2880                       const char *loc_reg_starttry,
2881                       const bool utf8_target)
2882 {
2883     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2884     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2885     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2886     /* The part of the string before starttry has one color
2887        (pref0_len chars), between starttry and current
2888        position another one (pref_len - pref0_len chars),
2889        after the current position the third one.
2890        We assume that pref0_len <= pref_len, otherwise we
2891        decrease pref0_len.  */
2892     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2893         ? (5 + taill) - l : locinput - loc_bostr;
2894     int pref0_len;
2895
2896     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2897
2898     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2899         pref_len++;
2900     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2901     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2902         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2903               ? (5 + taill) - pref_len : loc_regeol - locinput);
2904     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2905         l--;
2906     if (pref0_len < 0)
2907         pref0_len = 0;
2908     if (pref0_len > pref_len)
2909         pref0_len = pref_len;
2910     {
2911         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2912
2913         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2914             (locinput - pref_len),pref0_len, 60, 4, 5);
2915
2916         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2917                     (locinput - pref_len + pref0_len),
2918                     pref_len - pref0_len, 60, 2, 3);
2919
2920         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2921                     locinput, loc_regeol - locinput, 10, 0, 1);
2922
2923         const STRLEN tlen=len0+len1+len2;
2924         PerlIO_printf(Perl_debug_log,
2925                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2926                     (IV)(locinput - loc_bostr),
2927                     len0, s0,
2928                     len1, s1,
2929                     (docolor ? "" : "> <"),
2930                     len2, s2,
2931                     (int)(tlen > 19 ? 0 :  19 - tlen),
2932                     "");
2933     }
2934 }
2935
2936 #endif
2937
2938 /* reg_check_named_buff_matched()
2939  * Checks to see if a named buffer has matched. The data array of
2940  * buffer numbers corresponding to the buffer is expected to reside
2941  * in the regexp->data->data array in the slot stored in the ARG() of
2942  * node involved. Note that this routine doesn't actually care about the
2943  * name, that information is not preserved from compilation to execution.
2944  * Returns the index of the leftmost defined buffer with the given name
2945  * or 0 if non of the buffers matched.
2946  */
2947 STATIC I32
2948 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2949 {
2950     I32 n;
2951     RXi_GET_DECL(rex,rexi);
2952     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2953     I32 *nums=(I32*)SvPVX(sv_dat);
2954
2955     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2956
2957     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2958         if ((I32)*PL_reglastparen >= nums[n] &&
2959             PL_regoffs[nums[n]].end != -1)
2960         {
2961             return nums[n];
2962         }
2963     }
2964     return 0;
2965 }
2966
2967
2968 /* free all slabs above current one  - called during LEAVE_SCOPE */
2969
2970 STATIC void
2971 S_clear_backtrack_stack(pTHX_ void *p)
2972 {
2973     regmatch_slab *s = PL_regmatch_slab->next;
2974     PERL_UNUSED_ARG(p);
2975
2976     if (!s)
2977         return;
2978     PL_regmatch_slab->next = NULL;
2979     while (s) {
2980         regmatch_slab * const osl = s;
2981         s = s->next;
2982         Safefree(osl);
2983     }
2984 }
2985
2986
2987 #define SETREX(Re1,Re2) \
2988     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2989     Re1 = (Re2)
2990
2991 STATIC I32                      /* 0 failure, 1 success */
2992 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2993 {
2994 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2995     dMY_CXT;
2996 #endif
2997     dVAR;
2998     register const bool utf8_target = PL_reg_match_utf8;
2999     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3000     REGEXP *rex_sv = reginfo->prog;
3001     regexp *rex = (struct regexp *)SvANY(rex_sv);
3002     RXi_GET_DECL(rex,rexi);
3003     I32 oldsave;
3004     /* the current state. This is a cached copy of PL_regmatch_state */
3005     register regmatch_state *st;
3006     /* cache heavy used fields of st in registers */
3007     register regnode *scan;
3008     register regnode *next;
3009     register U32 n = 0; /* general value; init to avoid compiler warning */
3010     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3011     register char *locinput = PL_reginput;
3012     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3013
3014     bool result = 0;        /* return value of S_regmatch */
3015     int depth = 0;          /* depth of backtrack stack */
3016     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3017     const U32 max_nochange_depth =
3018         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3019         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3020     regmatch_state *yes_state = NULL; /* state to pop to on success of
3021                                                             subpattern */
3022     /* mark_state piggy backs on the yes_state logic so that when we unwind
3023        the stack on success we can update the mark_state as we go */
3024     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3025     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3026     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3027     U32 state_num;
3028     bool no_final = 0;      /* prevent failure from backtracking? */
3029     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3030     char *startpoint = PL_reginput;
3031     SV *popmark = NULL;     /* are we looking for a mark? */
3032     SV *sv_commit = NULL;   /* last mark name seen in failure */
3033     SV *sv_yes_mark = NULL; /* last mark name we have seen
3034                                during a successful match */
3035     U32 lastopen = 0;       /* last open we saw */
3036     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3037     SV* const oreplsv = GvSV(PL_replgv);
3038     /* these three flags are set by various ops to signal information to
3039      * the very next op. They have a useful lifetime of exactly one loop
3040      * iteration, and are not preserved or restored by state pushes/pops
3041      */
3042     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3043     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3044     int logical = 0;        /* the following EVAL is:
3045                                 0: (?{...})
3046                                 1: (?(?{...})X|Y)
3047                                 2: (??{...})
3048                                or the following IFMATCH/UNLESSM is:
3049                                 false: plain (?=foo)
3050                                 true:  used as a condition: (?(?=foo))
3051                             */
3052 #ifdef DEBUGGING
3053     GET_RE_DEBUG_FLAGS_DECL;
3054 #endif
3055
3056     PERL_ARGS_ASSERT_REGMATCH;
3057
3058     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3059             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3060     }));
3061     /* on first ever call to regmatch, allocate first slab */
3062     if (!PL_regmatch_slab) {
3063         Newx(PL_regmatch_slab, 1, regmatch_slab);
3064         PL_regmatch_slab->prev = NULL;
3065         PL_regmatch_slab->next = NULL;
3066         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3067     }
3068
3069     oldsave = PL_savestack_ix;
3070     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3071     SAVEVPTR(PL_regmatch_slab);
3072     SAVEVPTR(PL_regmatch_state);
3073
3074     /* grab next free state slot */
3075     st = ++PL_regmatch_state;
3076     if (st >  SLAB_LAST(PL_regmatch_slab))
3077         st = PL_regmatch_state = S_push_slab(aTHX);
3078
3079     /* Note that nextchr is a byte even in UTF */
3080     nextchr = UCHARAT(locinput);
3081     scan = prog;
3082     while (scan != NULL) {
3083
3084         DEBUG_EXECUTE_r( {
3085             SV * const prop = sv_newmortal();
3086             regnode *rnext=regnext(scan);
3087             DUMP_EXEC_POS( locinput, scan, utf8_target );
3088             regprop(rex, prop, scan);
3089
3090             PerlIO_printf(Perl_debug_log,
3091                     "%3"IVdf":%*s%s(%"IVdf")\n",
3092                     (IV)(scan - rexi->program), depth*2, "",
3093                     SvPVX_const(prop),
3094                     (PL_regkind[OP(scan)] == END || !rnext) ?
3095                         0 : (IV)(rnext - rexi->program));
3096         });
3097
3098         next = scan + NEXT_OFF(scan);
3099         if (next == scan)
3100             next = NULL;
3101         state_num = OP(scan);
3102
3103       reenter_switch:
3104
3105         assert(PL_reglastparen == &rex->lastparen);
3106         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3107         assert(PL_regoffs == rex->offs);
3108
3109         switch (state_num) {
3110         case BOL:
3111             if (locinput == PL_bostr)
3112             {
3113                 /* reginfo->till = reginfo->bol; */
3114                 break;
3115             }
3116             sayNO;
3117         case MBOL:
3118             if (locinput == PL_bostr ||
3119                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3120             {
3121                 break;
3122             }
3123             sayNO;
3124         case SBOL:
3125             if (locinput == PL_bostr)
3126                 break;
3127             sayNO;
3128         case GPOS:
3129             if (locinput == reginfo->ganch)
3130                 break;
3131             sayNO;
3132
3133         case KEEPS:
3134             /* update the startpoint */
3135             st->u.keeper.val = PL_regoffs[0].start;
3136             PL_reginput = locinput;
3137             PL_regoffs[0].start = locinput - PL_bostr;
3138             PUSH_STATE_GOTO(KEEPS_next, next);
3139             /*NOT-REACHED*/
3140         case KEEPS_next_fail:
3141             /* rollback the start point change */
3142             PL_regoffs[0].start = st->u.keeper.val;
3143             sayNO_SILENT;
3144             /*NOT-REACHED*/
3145         case EOL:
3146                 goto seol;
3147         case MEOL:
3148             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3149                 sayNO;
3150             break;
3151         case SEOL:
3152           seol:
3153             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3154                 sayNO;
3155             if (PL_regeol - locinput > 1)
3156                 sayNO;
3157             break;
3158         case EOS:
3159             if (PL_regeol != locinput)
3160                 sayNO;
3161             break;
3162         case SANY:
3163             if (!nextchr && locinput >= PL_regeol)
3164                 sayNO;
3165             if (utf8_target) {
3166                 locinput += PL_utf8skip[nextchr];
3167                 if (locinput > PL_regeol)
3168                     sayNO;
3169                 nextchr = UCHARAT(locinput);
3170             }
3171             else
3172                 nextchr = UCHARAT(++locinput);
3173             break;
3174         case CANY:
3175             if (!nextchr && locinput >= PL_regeol)
3176                 sayNO;
3177             nextchr = UCHARAT(++locinput);
3178             break;
3179         case REG_ANY:
3180             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3181                 sayNO;
3182             if (utf8_target) {
3183                 locinput += PL_utf8skip[nextchr];
3184                 if (locinput > PL_regeol)
3185                     sayNO;
3186                 nextchr = UCHARAT(locinput);
3187             }
3188             else
3189                 nextchr = UCHARAT(++locinput);
3190             break;
3191
3192 #undef  ST
3193 #define ST st->u.trie
3194         case TRIEC:
3195             /* In this case the charclass data is available inline so
3196                we can fail fast without a lot of extra overhead.
3197              */
3198             if (scan->flags == EXACT || !utf8_target) {
3199                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3200                     DEBUG_EXECUTE_r(
3201                         PerlIO_printf(Perl_debug_log,
3202                                   "%*s  %sfailed to match trie start class...%s\n",
3203                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3204                     );
3205                     sayNO_SILENT;
3206                     /* NOTREACHED */
3207                 }
3208             }
3209             /* FALL THROUGH */
3210         case TRIE:
3211             /* the basic plan of execution of the trie is:
3212              * At the beginning, run though all the states, and
3213              * find the longest-matching word. Also remember the position
3214              * of the shortest matching word. For example, this pattern:
3215              *    1  2 3 4    5
3216              *    ab|a|x|abcd|abc
3217              * when matched against the string "abcde", will generate
3218              * accept states for all words except 3, with the longest
3219              * matching word being 4, and the shortest being 1 (with
3220              * the position being after char 1 of the string).
3221              *
3222              * Then for each matching word, in word order (i.e. 1,2,4,5),
3223              * we run the remainder of the pattern; on each try setting
3224              * the current position to the character following the word,
3225              * returning to try the next word on failure.
3226              *
3227              * We avoid having to build a list of words at runtime by
3228              * using a compile-time structure, wordinfo[].prev, which
3229              * gives, for each word, the previous accepting word (if any).
3230              * In the case above it would contain the mappings 1->2, 2->0,
3231              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3232              * the longest word (4 above), a list of all words, by
3233              * following the list of prev pointers; this gives us the
3234              * unordered list 4,5,1,2. Then given the current word we have
3235              * just tried, we can go through the list and find the
3236              * next-biggest word to try (so if we just failed on word 2,
3237              * the next in the list is 4).
3238              *
3239              * Since at runtime we don't record the matching position in
3240              * the string for each word, we have to work that out for
3241              * each word we're about to process. The wordinfo table holds
3242              * the character length of each word; given that we recorded
3243              * at the start: the position of the shortest word and its
3244              * length in chars, we just need to move the pointer the
3245              * difference between the two char lengths. Depending on
3246              * Unicode status and folding, that's cheap or expensive.
3247              *
3248              * This algorithm is optimised for the case where are only a
3249              * small number of accept states, i.e. 0,1, or maybe 2.
3250              * With lots of accepts states, and having to try all of them,
3251              * it becomes quadratic on number of accept states to find all
3252              * the next words.
3253              */
3254
3255             {
3256                 /* what type of TRIE am I? (utf8 makes this contextual) */
3257                 DECL_TRIE_TYPE(scan);
3258
3259                 /* what trie are we using right now */
3260                 reg_trie_data * const trie
3261                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3262                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3263                 U32 state = trie->startstate;
3264
3265                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3266                     !TRIE_BITMAP_TEST(trie,*locinput)
3267                 ) {
3268                     if (trie->states[ state ].wordnum) {
3269                          DEBUG_EXECUTE_r(
3270                             PerlIO_printf(Perl_debug_log,
3271                                           "%*s  %smatched empty string...%s\n",
3272                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3273                         );
3274                         if (!trie->jump)
3275                             break;
3276                     } else {
3277                         DEBUG_EXECUTE_r(
3278                             PerlIO_printf(Perl_debug_log,
3279                                           "%*s  %sfailed to match trie start class...%s\n",
3280                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3281                         );
3282                         sayNO_SILENT;
3283                    }
3284                 }
3285
3286             {
3287                 U8 *uc = ( U8* )locinput;
3288
3289                 STRLEN len = 0;
3290                 STRLEN foldlen = 0;
3291                 U8 *uscan = (U8*)NULL;
3292                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3293                 U32 charcount = 0; /* how many input chars we have matched */
3294                 U32 accepted = 0; /* have we seen any accepting states? */
3295
3296                 ST.B = next;
3297                 ST.jump = trie->jump;
3298                 ST.me = scan;
3299                 ST.firstpos = NULL;
3300                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3301                 ST.nextword = 0;
3302
3303                 /* fully traverse the TRIE; note the position of the
3304                    shortest accept state and the wordnum of the longest
3305                    accept state */
3306
3307                 while ( state && uc <= (U8*)PL_regeol ) {
3308                     U32 base = trie->states[ state ].trans.base;
3309                     UV uvc = 0;
3310                     U16 charid = 0;
3311                     U16 wordnum;
3312                     wordnum = trie->states[ state ].wordnum;
3313
3314                     if (wordnum) { /* it's an accept state */
3315                         if (!accepted) {
3316                             accepted = 1;
3317                             /* record first match position */
3318                             if (ST.longfold) {
3319                                 ST.firstpos = (U8*)locinput;
3320                                 ST.firstchars = 0;
3321                             }
3322                             else {
3323                                 ST.firstpos = uc;
3324                                 ST.firstchars = charcount;
3325                             }
3326                         }
3327                         if (!ST.nextword || wordnum < ST.nextword)
3328                             ST.nextword = wordnum;
3329                         ST.topword = wordnum;
3330                     }
3331
3332                     DEBUG_TRIE_EXECUTE_r({
3333                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3334                                 PerlIO_printf( Perl_debug_log,
3335                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3336                                     2+depth * 2, "", PL_colors[4],
3337                                     (UV)state, (accepted ? 'Y' : 'N'));
3338                     });
3339
3340                     /* read a char and goto next state */
3341                     if ( base ) {
3342                         I32 offset;
3343                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3344                                              uscan, len, uvc, charid, foldlen,
3345                                              foldbuf, uniflags);
3346                         charcount++;
3347                         if (foldlen>0)
3348                             ST.longfold = TRUE;
3349                         if (charid &&
3350                              ( ((offset =
3351                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3352
3353                              && ((U32)offset < trie->lasttrans)
3354                              && trie->trans[offset].check == state)
3355                         {
3356                             state = trie->trans[offset].next;
3357                         }
3358                         else {
3359                             state = 0;
3360                         }
3361                         uc += len;
3362
3363                     }
3364                     else {
3365                         state = 0;
3366                     }
3367                     DEBUG_TRIE_EXECUTE_r(
3368                         PerlIO_printf( Perl_debug_log,
3369                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3370                             charid, uvc, (UV)state, PL_colors[5] );
3371                     );
3372                 }
3373                 if (!accepted)
3374                    sayNO;
3375
3376                 /* calculate total number of accept states */
3377                 {
3378                     U16 w = ST.topword;
3379                     accepted = 0;
3380                     while (w) {
3381                         w = trie->wordinfo[w].prev;
3382                         accepted++;
3383                     }
3384                     ST.accepted = accepted;
3385                 }
3386
3387                 DEBUG_EXECUTE_r(
3388                     PerlIO_printf( Perl_debug_log,
3389                         "%*s  %sgot %"IVdf" possible matches%s\n",
3390                         REPORT_CODE_OFF + depth * 2, "",
3391                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3392                 );
3393                 goto trie_first_try; /* jump into the fail handler */
3394             }}
3395             /* NOTREACHED */
3396
3397         case TRIE_next_fail: /* we failed - try next alternative */
3398             if ( ST.jump) {
3399                 REGCP_UNWIND(ST.cp);
3400                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3401                     PL_regoffs[n].end = -1;
3402                 *PL_reglastparen = n;
3403             }
3404             if (!--ST.accepted) {
3405                 DEBUG_EXECUTE_r({
3406                     PerlIO_printf( Perl_debug_log,
3407                         "%*s  %sTRIE failed...%s\n",
3408                         REPORT_CODE_OFF+depth*2, "",
3409                         PL_colors[4],
3410                         PL_colors[5] );
3411                 });
3412                 sayNO_SILENT;
3413             }
3414             {
3415                 /* Find next-highest word to process.  Note that this code
3416                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3417                 register U16 min = 0;
3418                 register U16 word;
3419                 register U16 const nextword = ST.nextword;
3420                 register reg_trie_wordinfo * const wordinfo
3421                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3422                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3423                     if (word > nextword && (!min || word < min))
3424                         min = word;
3425                 }
3426                 ST.nextword = min;
3427             }
3428
3429           trie_first_try:
3430             if (do_cutgroup) {
3431                 do_cutgroup = 0;
3432                 no_final = 0;
3433             }
3434
3435             if ( ST.jump) {
3436                 ST.lastparen = *PL_reglastparen;
3437                 REGCP_SET(ST.cp);
3438             }
3439
3440             /* find start char of end of current word */
3441             {
3442                 U32 chars; /* how many chars to skip */
3443                 U8 *uc = ST.firstpos;
3444                 reg_trie_data * const trie
3445                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3446
3447                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3448                             >=  ST.firstchars);
3449                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3450                             - ST.firstchars;
3451
3452                 if (ST.longfold) {
3453                     /* the hard option - fold each char in turn and find
3454                      * its folded length (which may be different */
3455                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3456                     STRLEN foldlen;
3457                     STRLEN len;
3458                     UV uvc;
3459                     U8 *uscan;
3460
3461                     while (chars) {
3462                         if (utf8_target) {
3463                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3464                                                     uniflags);
3465                             uc += len;
3466                         }
3467                         else {
3468                             uvc = *uc;
3469                             uc++;
3470                         }
3471                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3472                         uscan = foldbuf;
3473                         while (foldlen) {
3474                             if (!--chars)
3475                                 break;
3476                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3477                                             uniflags);
3478                             uscan += len;
3479                             foldlen -= len;
3480                         }
3481                     }
3482                 }
3483                 else {
3484                     if (utf8_target)
3485                         while (chars--)
3486                             uc += UTF8SKIP(uc);
3487                     else
3488                         uc += chars;
3489                 }
3490                 PL_reginput = (char *)uc;
3491             }
3492
3493             scan = (ST.jump && ST.jump[ST.nextword])
3494                         ? ST.me + ST.jump[ST.nextword]
3495                         : ST.B;
3496
3497             DEBUG_EXECUTE_r({
3498                 PerlIO_printf( Perl_debug_log,
3499                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3500                     REPORT_CODE_OFF+depth*2, "",
3501                     PL_colors[4],
3502                     ST.nextword,
3503                     PL_colors[5]
3504                     );
3505             });
3506
3507             if (ST.accepted > 1 || has_cutgroup) {
3508                 PUSH_STATE_GOTO(TRIE_next, scan);
3509                 /* NOTREACHED */
3510             }
3511             /* only one choice left - just continue */
3512             DEBUG_EXECUTE_r({
3513                 AV *const trie_words
3514                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3515                 SV ** const tmp = av_fetch( trie_words,
3516                     ST.nextword-1, 0 );
3517                 SV *sv= tmp ? sv_newmortal() : NULL;
3518
3519                 PerlIO_printf( Perl_debug_log,
3520                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3521                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3522                     ST.nextword,
3523                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3524                             PL_colors[0], PL_colors[1],
3525                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3526                         )
3527                     : "not compiled under -Dr",
3528                     PL_colors[5] );
3529             });
3530
3531             locinput = PL_reginput;
3532             nextchr = UCHARAT(locinput);
3533             continue; /* execute rest of RE */
3534             /* NOTREACHED */
3535 #undef  ST
3536
3537         case EXACT: {
3538             char *s = STRING(scan);
3539             ln = STR_LEN(scan);
3540             if (utf8_target != UTF_PATTERN) {
3541                 /* The target and the pattern have differing utf8ness. */
3542                 char *l = locinput;
3543                 const char * const e = s + ln;
3544
3545                 if (utf8_target) {
3546                     /* The target is utf8, the pattern is not utf8. */
3547                     while (s < e) {
3548                         STRLEN ulen;
3549                         if (l >= PL_regeol)
3550                              sayNO;
3551                         if (NATIVE_TO_UNI(*(U8*)s) !=
3552                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3553                                             uniflags))
3554                              sayNO;
3555                         l += ulen;
3556                         s ++;
3557                     }
3558                 }
3559                 else {
3560                     /* The target is not utf8, the pattern is utf8. */
3561                     while (s < e) {
3562                         STRLEN ulen;
3563                         if (l >= PL_regeol)
3564                             sayNO;
3565                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3566                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3567                                            uniflags))
3568                             sayNO;
3569                         s += ulen;
3570                         l ++;
3571                     }
3572                 }
3573                 locinput = l;
3574                 nextchr = UCHARAT(locinput);
3575                 break;
3576             }
3577             /* The target and the pattern have the same utf8ness. */
3578             /* Inline the first character, for speed. */
3579             if (UCHARAT(s) != nextchr)
3580                 sayNO;
3581             if (PL_regeol - locinput < ln)
3582                 sayNO;
3583             if (ln > 1 && memNE(s, locinput, ln))
3584                 sayNO;
3585             locinput += ln;
3586             nextchr = UCHARAT(locinput);
3587             break;
3588             }
3589         case EXACTFL: {
3590             re_fold_t folder;
3591             const U8 * fold_array;
3592             const char * s;
3593             U32 fold_utf8_flags;
3594
3595             PL_reg_flags |= RF_tainted;
3596             folder = foldEQ_locale;
3597             fold_array = PL_fold_locale;
3598             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3599             goto do_exactf;
3600
3601         case EXACTFU:
3602             folder = foldEQ_latin1;
3603             fold_array = PL_fold_latin1;
3604             fold_utf8_flags = 0;
3605             goto do_exactf;
3606
3607         case EXACTFA:
3608             folder = foldEQ_latin1;
3609             fold_array = PL_fold_latin1;
3610             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3611             goto do_exactf;
3612
3613         case EXACTF:
3614             folder = foldEQ;
3615             fold_array = PL_fold;
3616             fold_utf8_flags = 0;
3617
3618           do_exactf:
3619             s = STRING(scan);
3620             ln = STR_LEN(scan);
3621
3622             if (utf8_target || UTF_PATTERN) {
3623               /* Either target or the pattern are utf8. */
3624                 const char * const l = locinput;
3625                 char *e = PL_regeol;
3626
3627                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3628                                l, &e, 0,  utf8_target, fold_utf8_flags)) {
3629                      /* One more case for the sharp s:
3630                       * pack("U0U*", 0xDF) =~ /ss/i,
3631                       * the 0xC3 0x9F are the UTF-8
3632                       * byte sequence for the U+00DF. */
3633
3634                      if (!(utf8_target &&
3635                            toLOWER(s[0]) == 's' &&
3636                            ln >= 2 &&
3637                            toLOWER(s[1]) == 's' &&
3638                            (U8)l[0] == 0xC3 &&
3639                            e - l >= 2 &&
3640                            (U8)l[1] == 0x9F))
3641                           sayNO;
3642                 }
3643                 locinput = e;
3644                 nextchr = UCHARAT(locinput);
3645                 break;
3646             }
3647
3648             /* Neither the target nor the pattern are utf8 */
3649             if (UCHARAT(s) != nextchr &&
3650                 UCHARAT(s) != fold_array[nextchr])
3651             {
3652                 sayNO;
3653             }
3654             if (PL_regeol - locinput < ln)
3655                 sayNO;
3656             if (ln > 1 && ! folder(s, locinput, ln))
3657                 sayNO;
3658             locinput += ln;
3659             nextchr = UCHARAT(locinput);
3660             break;
3661         }
3662
3663         /* XXX Could improve efficiency by separating these all out using a
3664          * macro or in-line function.  At that point regcomp.c would no longer
3665          * have to set the FLAGS fields of these */
3666         case BOUNDL:
3667         case NBOUNDL:
3668             PL_reg_flags |= RF_tainted;
3669             /* FALL THROUGH */
3670         case BOUND:
3671         case BOUNDU:
3672         case BOUNDA:
3673         case NBOUND:
3674         case NBOUNDU:
3675         case NBOUNDA:
3676             /* was last char in word? */
3677             if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3678                 if (locinput == PL_bostr)
3679                     ln = '\n';
3680                 else {
3681                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3682
3683                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3684                 }
3685                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3686                     ln = isALNUM_uni(ln);
3687                     LOAD_UTF8_CHARCLASS_ALNUM();
3688                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3689                 }
3690                 else {
3691                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3692                     n = isALNUM_LC_utf8((U8*)locinput);
3693                 }
3694             }
3695             else {
3696
3697                 /* Here the string isn't utf8, or is utf8 and only ascii
3698                  * characters are to match \w.  In the latter case looking at
3699                  * the byte just prior to the current one may be just the final
3700                  * byte of a multi-byte character.  This is ok.  There are two
3701                  * cases:
3702                  * 1) it is a single byte character, and then the test is doing
3703                  *      just what it's supposed to.
3704                  * 2) it is a multi-byte character, in which case the final
3705                  *      byte is never mistakable for ASCII, and so the test
3706                  *      will say it is not a word character, which is the
3707                  *      correct answer. */
3708                 ln = (locinput != PL_bostr) ?
3709                     UCHARAT(locinput - 1) : '\n';
3710                 switch (FLAGS(scan)) {
3711                     case REGEX_UNICODE_CHARSET:
3712                         ln = isWORDCHAR_L1(ln);
3713                         n = isWORDCHAR_L1(nextchr);
3714                         break;
3715                     case REGEX_LOCALE_CHARSET:
3716                         ln = isALNUM_LC(ln);
3717                         n = isALNUM_LC(nextchr);
3718                         break;
3719                     case REGEX_DEPENDS_CHARSET:
3720                         ln = isALNUM(ln);
3721                         n = isALNUM(nextchr);
3722                         break;
3723                     case REGEX_ASCII_RESTRICTED_CHARSET:
3724                         ln = isWORDCHAR_A(ln);
3725                         n = isWORDCHAR_A(nextchr);
3726                         break;
3727                     default:
3728                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3729                         break;
3730                 }
3731             }
3732             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3733              * regcomp.sym */
3734             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3735                     sayNO;
3736             break;
3737         case ANYOFV:
3738         case ANYOF:
3739             if (utf8_target || state_num == ANYOFV) {
3740                 STRLEN inclasslen = PL_regeol - locinput;
3741                 if (locinput >= PL_regeol)
3742                     sayNO;
3743
3744                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3745                     sayNO;
3746                 locinput += inclasslen;
3747                 nextchr = UCHARAT(locinput);
3748                 break;
3749             }
3750             else {
3751                 if (nextchr < 0)
3752                     nextchr = UCHARAT(locinput);
3753                 if (!nextchr && locinput >= PL_regeol)
3754                     sayNO;
3755                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3756                     sayNO;
3757                 nextchr = UCHARAT(++locinput);
3758                 break;
3759             }
3760             break;
3761         /* Special char classes - The defines start on line 129 or so */
3762         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3763                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3764                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3765                   ALNUMA, NALNUMA, isWORDCHAR_A,
3766                   alnum, "a");
3767
3768         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3769                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3770                   SPACEU, NSPACEU, isSPACE_L1,
3771                   SPACEA, NSPACEA, isSPACE_A,
3772                   space, " ");
3773
3774         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3775                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3776                 DIGITA, NDIGITA, isDIGIT_A,
3777                 digit, "0");
3778
3779         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3780                        a Unicode extended Grapheme Cluster */
3781             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3782               extended Grapheme Cluster is:
3783
3784                CR LF
3785                | Prepend* Begin Extend*
3786                | .
3787
3788                Begin is (Hangul-syllable | ! Control)
3789                Extend is (Grapheme_Extend | Spacing_Mark)
3790                Control is [ GCB_Control CR LF ]
3791
3792                The discussion below shows how the code for CLUMP is derived
3793                from this regex.  Note that most of these concepts are from
3794                property values of the Grapheme Cluster Boundary (GCB) property.
3795                No code point can have multiple property values for a given
3796                property.  Thus a code point in Prepend can't be in Control, but
3797                it must be in !Control.  This is why Control above includes
3798                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3799                property separately, and so can't be in GCB_Control, even though
3800                they logically are controls.  Control is not the same as gc=cc,
3801                but includes format and other characters as well.
3802
3803                The Unicode definition of Hangul-syllable is:
3804                    L+
3805                    | (L* ( ( V | LV ) V* | LVT ) T*)
3806                    | T+
3807                   )
3808                Each of these is a value for the GCB property, and hence must be
3809                disjoint, so the order they are tested is immaterial, so the
3810                above can safely be changed to
3811                    T+
3812                    | L+
3813                    | (L* ( LVT | ( V | LV ) V*) T*)
3814
3815                The last two terms can be combined like this:
3816                    L* ( L
3817                         | (( LVT | ( V | LV ) V*) T*))
3818
3819                And refactored into this:
3820                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3821
3822                That means that if we have seen any L's at all we can quit
3823                there, but if the next character is a LVT, a V or and LV we
3824                should keep going.
3825
3826                There is a subtlety with Prepend* which showed up in testing.
3827                Note that the Begin, and only the Begin is required in:
3828                 | Prepend* Begin Extend*
3829                Also, Begin contains '! Control'.  A Prepend must be a '!
3830                Control', which means it must be a Begin.  What it comes down to
3831                is that if we match Prepend* and then find no suitable Begin
3832                afterwards, that if we backtrack the last Prepend, that one will
3833                be a suitable Begin.
3834             */
3835
3836             if (locinput >= PL_regeol)
3837                 sayNO;
3838             if  (! utf8_target) {
3839
3840                 /* Match either CR LF  or '.', as all the other possibilities
3841                  * require utf8 */
3842                 locinput++;         /* Match the . or CR */
3843                 if (nextchr == '\r'
3844                     && locinput < PL_regeol
3845                     && UCHARAT(locinput) == '\n') locinput++;
3846             }
3847             else {
3848
3849                 /* Utf8: See if is ( CR LF ); already know that locinput <
3850                  * PL_regeol, so locinput+1 is in bounds */
3851                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3852                     locinput += 2;
3853                 }
3854                 else {
3855                     /* In case have to backtrack to beginning, then match '.' */
3856                     char *starting = locinput;
3857
3858                     /* In case have to backtrack the last prepend */
3859                     char *previous_prepend = 0;
3860
3861                     LOAD_UTF8_CHARCLASS_GCB();
3862
3863                     /* Match (prepend)* */
3864                     while (locinput < PL_regeol
3865                            && swash_fetch(PL_utf8_X_prepend,
3866                                           (U8*)locinput, utf8_target))
3867                     {
3868                         previous_prepend = locinput;
3869                         locinput += UTF8SKIP(locinput);
3870                     }
3871
3872                     /* As noted above, if we matched a prepend character, but
3873                      * the next thing won't match, back off the last prepend we
3874                      * matched, as it is guaranteed to match the begin */
3875                     if (previous_prepend
3876                         && (locinput >=  PL_regeol
3877                             || ! swash_fetch(PL_utf8_X_begin,
3878                                              (U8*)locinput, utf8_target)))
3879                     {
3880                         locinput = previous_prepend;
3881                     }
3882
3883                     /* Note that here we know PL_regeol > locinput, as we
3884                      * tested that upon input to this switch case, and if we
3885                      * moved locinput forward, we tested the result just above
3886                      * and it either passed, or we backed off so that it will
3887                      * now pass */
3888                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3889
3890                         /* Here did not match the required 'Begin' in the
3891                          * second term.  So just match the very first
3892                          * character, the '.' of the final term of the regex */
3893                         locinput = starting + UTF8SKIP(starting);
3894                     } else {
3895
3896                         /* Here is the beginning of a character that can have
3897                          * an extender.  It is either a hangul syllable, or a
3898                          * non-control */
3899                         if (swash_fetch(PL_utf8_X_non_hangul,
3900                                         (U8*)locinput, utf8_target))
3901                         {
3902
3903                             /* Here not a Hangul syllable, must be a
3904                              * ('!  * Control') */
3905                             locinput += UTF8SKIP(locinput);
3906                         } else {
3907
3908                             /* Here is a Hangul syllable.  It can be composed
3909                              * of several individual characters.  One
3910                              * possibility is T+ */
3911                             if (swash_fetch(PL_utf8_X_T,
3912                                             (U8*)locinput, utf8_target))
3913                             {
3914                                 while (locinput < PL_regeol
3915                                         && swash_fetch(PL_utf8_X_T,
3916                                                         (U8*)locinput, utf8_target))
3917                                 {
3918                                     locinput += UTF8SKIP(locinput);
3919                                 }
3920                             } else {
3921
3922                                 /* Here, not T+, but is a Hangul.  That means
3923                                  * it is one of the others: L, LV, LVT or V,
3924                                  * and matches:
3925                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3926
3927                                 /* Match L*           */
3928                                 while (locinput < PL_regeol
3929                                         && swash_fetch(PL_utf8_X_L,
3930                                                         (U8*)locinput, utf8_target))
3931                                 {
3932                                     locinput += UTF8SKIP(locinput);
3933                                 }
3934
3935                                 /* Here, have exhausted L*.  If the next
3936                                  * character is not an LV, LVT nor V, it means
3937                                  * we had to have at least one L, so matches L+
3938                                  * in the original equation, we have a complete
3939                                  * hangul syllable.  Are done. */
3940
3941                                 if (locinput < PL_regeol
3942                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3943                                                     (U8*)locinput, utf8_target))
3944                                 {
3945
3946                                     /* Otherwise keep going.  Must be LV, LVT
3947                                      * or V.  See if LVT */
3948                                     if (swash_fetch(PL_utf8_X_LVT,
3949                                                     (U8*)locinput, utf8_target))
3950                                     {
3951                                         locinput += UTF8SKIP(locinput);
3952                                     } else {
3953
3954                                         /* Must be  V or LV.  Take it, then
3955                                          * match V*     */
3956                                         locinput += UTF8SKIP(locinput);
3957                                         while (locinput < PL_regeol
3958                                                 && swash_fetch(PL_utf8_X_V,
3959                                                          (U8*)locinput, utf8_target))
3960                                         {
3961                                             locinput += UTF8SKIP(locinput);
3962                                         }
3963                                     }
3964
3965                                     /* And any of LV, LVT, or V can be followed
3966                                      * by T*            */
3967                                     while (locinput < PL_regeol
3968                                            && swash_fetch(PL_utf8_X_T,
3969                                                            (U8*)locinput,
3970                                                            utf8_target))
3971                                     {
3972                                         locinput += UTF8SKIP(locinput);
3973                                     }
3974                                 }
3975                             }
3976                         }
3977
3978                         /* Match any extender */
3979                         while (locinput < PL_regeol
3980                                 && swash_fetch(PL_utf8_X_extend,
3981                                                 (U8*)locinput, utf8_target))
3982                         {
3983                             locinput += UTF8SKIP(locinput);
3984                         }
3985                     }
3986                 }
3987                 if (locinput > PL_regeol) sayNO;
3988             }
3989             nextchr = UCHARAT(locinput);
3990             break;
3991
3992         case NREFFL:
3993         {   /* The capture buffer cases.  The ones beginning with N for the
3994                named buffers just convert to the equivalent numbered and
3995                pretend they were called as the corresponding numbered buffer
3996                op.  */
3997             /* don't initialize these in the declaration, it makes C++
3998                unhappy */
3999             char *s;
4000             char type;
4001             re_fold_t folder;
4002             const U8 *fold_array;
4003             UV utf8_fold_flags;
4004
4005             PL_reg_flags |= RF_tainted;
4006             folder = foldEQ_locale;
4007             fold_array = PL_fold_locale;
4008             type = REFFL;
4009             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4010             goto do_nref;
4011
4012         case NREFFA:
4013             folder = foldEQ_latin1;
4014             fold_array = PL_fold_latin1;
4015             type = REFFA;
4016             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4017             goto do_nref;
4018
4019         case NREFFU:
4020             folder = foldEQ_latin1;
4021             fold_array = PL_fold_latin1;
4022             type = REFFU;
4023             utf8_fold_flags = 0;
4024             goto do_nref;
4025
4026         case NREFF:
4027             folder = foldEQ;
4028             fold_array = PL_fold;
4029             type = REFF;
4030             utf8_fold_flags = 0;
4031             goto do_nref;
4032
4033         case NREF:
4034             type = REF;
4035             folder = NULL;
4036             fold_array = NULL;
4037             utf8_fold_flags = 0;
4038           do_nref:
4039
4040             /* For the named back references, find the corresponding buffer
4041              * number */
4042             n = reg_check_named_buff_matched(rex,scan);
4043
4044             if ( ! n ) {
4045                 sayNO;
4046             }
4047             goto do_nref_ref_common;
4048
4049         case REFFL:
4050             PL_reg_flags |= RF_tainted;
4051             folder = foldEQ_locale;
4052             fold_array = PL_fold_locale;
4053             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4054             goto do_ref;
4055
4056         case REFFA:
4057             folder = foldEQ_latin1;
4058             fold_array = PL_fold_latin1;
4059             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4060             goto do_ref;
4061
4062         case REFFU:
4063             folder = foldEQ_latin1;
4064             fold_array = PL_fold_latin1;
4065             utf8_fold_flags = 0;
4066             goto do_ref;
4067
4068         case REFF:
4069             folder = foldEQ;
4070             fold_array = PL_fold;
4071             utf8_fold_flags = 0;
4072             goto do_ref;
4073
4074         case REF:
4075             folder = NULL;
4076             fold_array = NULL;
4077             utf8_fold_flags = 0;
4078
4079           do_ref:
4080             type = OP(scan);
4081             n = ARG(scan);  /* which paren pair */
4082
4083           do_nref_ref_common:
4084             ln = PL_regoffs[n].start;
4085             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4086             if (*PL_reglastparen < n || ln == -1)
4087                 sayNO;                  /* Do not match unless seen CLOSEn. */
4088             if (ln == PL_regoffs[n].end)
4089                 break;
4090
4091             s = PL_bostr + ln;
4092             if (type != REF     /* REF can do byte comparison */
4093                 && (utf8_target || type == REFFU))
4094             { /* XXX handle REFFL better */
4095                 char * limit = PL_regeol;
4096
4097                 /* This call case insensitively compares the entire buffer
4098                     * at s, with the current input starting at locinput, but
4099                     * not going off the end given by PL_regeol, and returns in
4100                     * limit upon success, how much of the current input was
4101                     * matched */
4102                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4103                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4104                 {
4105                     sayNO;
4106                 }
4107                 locinput = limit;
4108                 nextchr = UCHARAT(locinput);
4109                 break;
4110             }
4111
4112             /* Not utf8:  Inline the first character, for speed. */
4113             if (UCHARAT(s) != nextchr &&
4114                 (type == REF ||
4115                  UCHARAT(s) != fold_array[nextchr]))
4116                 sayNO;
4117             ln = PL_regoffs[n].end - ln;
4118             if (locinput + ln > PL_regeol)
4119                 sayNO;
4120             if (ln > 1 && (type == REF
4121                            ? memNE(s, locinput, ln)
4122                            : ! folder(s, locinput, ln)))
4123                 sayNO;
4124             locinput += ln;
4125             nextchr = UCHARAT(locinput);
4126             break;
4127         }
4128         case NOTHING:
4129         case TAIL:
4130             break;
4131         case BACK:
4132             break;
4133
4134 #undef  ST
4135 #define ST st->u.eval
4136         {
4137             SV *ret;
4138             REGEXP *re_sv;
4139             regexp *re;
4140             regexp_internal *rei;
4141             regnode *startpoint;
4142
4143         case GOSTART:
4144         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4145             if (cur_eval && cur_eval->locinput==locinput) {
4146                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4147                     Perl_croak(aTHX_ "Infinite recursion in regex");
4148                 if ( ++nochange_depth > max_nochange_depth )
4149                     Perl_croak(aTHX_
4150                         "Pattern subroutine nesting without pos change"
4151                         " exceeded limit in regex");
4152             } else {
4153                 nochange_depth = 0;
4154             }
4155             re_sv = rex_sv;
4156             re = rex;
4157             rei = rexi;
4158             (void)ReREFCNT_inc(rex_sv);
4159             if (OP(scan)==GOSUB) {
4160                 startpoint = scan + ARG2L(scan);
4161                 ST.close_paren = ARG(scan);
4162             } else {
4163                 startpoint = rei->program+1;
4164                 ST.close_paren = 0;
4165             }
4166             goto eval_recurse_doit;
4167             /* NOTREACHED */
4168         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4169             if (cur_eval && cur_eval->locinput==locinput) {
4170                 if ( ++nochange_depth > max_nochange_depth )
4171                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4172             } else {
4173                 nochange_depth = 0;
4174             }
4175             {
4176                 /* execute the code in the {...} */
4177                 dSP;
4178                 SV ** const before = SP;
4179                 OP_4tree * const oop = PL_op;
4180                 COP * const ocurcop = PL_curcop;
4181                 PAD *old_comppad;
4182                 char *saved_regeol = PL_regeol;
4183                 struct re_save_state saved_state;
4184
4185                 /* To not corrupt the existing regex state while executing the
4186                  * eval we would normally put it on the save stack, like with
4187                  * save_re_context. However, re-evals have a weird scoping so we
4188                  * can't just add ENTER/LEAVE here. With that, things like
4189                  *
4190                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4191                  *
4192                  * would break, as they expect the localisation to be unwound
4193                  * only when the re-engine backtracks through the bit that
4194                  * localised it.
4195                  *
4196                  * What we do instead is just saving the state in a local c
4197                  * variable.
4198                  */
4199                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4200
4201                 n = ARG(scan);
4202                 PL_op = (OP_4tree*)rexi->data->data[n];
4203                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4204                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4205                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4206                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4207
4208                 if (sv_yes_mark) {
4209                     SV *sv_mrk = get_sv("REGMARK", 1);
4210                     sv_setsv(sv_mrk, sv_yes_mark);
4211                 }
4212
4213                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4214                 SPAGAIN;
4215                 if (SP == before)
4216                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4217                 else {
4218                     ret = POPs;
4219                     PUTBACK;
4220                 }
4221
4222                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4223
4224                 PL_op = oop;
4225                 PAD_RESTORE_LOCAL(old_comppad);
4226                 PL_curcop = ocurcop;
4227                 PL_regeol = saved_regeol;
4228                 if (!logical) {
4229                     /* /(?{...})/ */
4230                     sv_setsv(save_scalar(PL_replgv), ret);
4231                     break;
4232                 }
4233             }
4234             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4235                 logical = 0;
4236                 {
4237                     /* extract RE object from returned value; compiling if
4238                      * necessary */
4239                     MAGIC *mg = NULL;
4240                     REGEXP *rx = NULL;
4241
4242                     if (SvROK(ret)) {
4243                         SV *const sv = SvRV(ret);
4244
4245                         if (SvTYPE(sv) == SVt_REGEXP) {
4246                             rx = (REGEXP*) sv;
4247                         } else if (SvSMAGICAL(sv)) {
4248                             mg = mg_find(sv, PERL_MAGIC_qr);
4249                             assert(mg);
4250                         }
4251                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4252                         rx = (REGEXP*) ret;
4253                     } else if (SvSMAGICAL(ret)) {
4254                         if (SvGMAGICAL(ret)) {
4255                             /* I don't believe that there is ever qr magic
4256                                here.  */
4257                             assert(!mg_find(ret, PERL_MAGIC_qr));
4258                             sv_unmagic(ret, PERL_MAGIC_qr);
4259                         }
4260                         else {
4261                             mg = mg_find(ret, PERL_MAGIC_qr);
4262                             /* testing suggests mg only ends up non-NULL for
4263                                scalars who were upgraded and compiled in the
4264                                else block below. In turn, this is only
4265                                triggered in the "postponed utf8 string" tests
4266                                in t/op/pat.t  */
4267                         }
4268                     }
4269
4270                     if (mg) {
4271                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4272                         assert(rx);
4273                     }
4274                     if (rx) {
4275                         rx = reg_temp_copy(NULL, rx);
4276                     }
4277                     else {
4278                         U32 pm_flags = 0;
4279                         const I32 osize = PL_regsize;
4280
4281                         if (DO_UTF8(ret)) {
4282                             assert (SvUTF8(ret));
4283                         } else if (SvUTF8(ret)) {
4284                             /* Not doing UTF-8, despite what the SV says. Is
4285                                this only if we're trapped in use 'bytes'?  */
4286                             /* Make a copy of the octet sequence, but without
4287                                the flag on, as the compiler now honours the
4288                                SvUTF8 flag on ret.  */
4289                             STRLEN len;
4290                             const char *const p = SvPV(ret, len);
4291                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4292                         }
4293                         rx = CALLREGCOMP(ret, pm_flags);
4294                         if (!(SvFLAGS(ret)
4295                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4296                                  | SVs_GMG))) {
4297                             /* This isn't a first class regexp. Instead, it's
4298                                caching a regexp onto an existing, Perl visible
4299                                scalar.  */
4300                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4301                         }
4302                         PL_regsize = osize;
4303                     }
4304                     re_sv = rx;
4305                     re = (struct regexp *)SvANY(rx);
4306                 }
4307                 RXp_MATCH_COPIED_off(re);
4308                 re->subbeg = rex->subbeg;
4309                 re->sublen = rex->sublen;
4310                 rei = RXi_GET(re);
4311                 DEBUG_EXECUTE_r(
4312                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4313                         "Matching embedded");
4314                 );
4315                 startpoint = rei->program + 1;
4316                 ST.close_paren = 0; /* only used for GOSUB */
4317                 /* borrowed from regtry */
4318                 if (PL_reg_start_tmpl <= re->nparens) {
4319                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4320                     if(PL_reg_start_tmp)
4321                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4322                     else
4323                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4324                 }
4325
4326         eval_recurse_doit: /* Share code with GOSUB below this line */
4327                 /* run the pattern returned from (??{...}) */
4328                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4329                 REGCP_SET(ST.lastcp);
4330
4331                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4332
4333                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4334                 PL_reglastparen = &re->lastparen;
4335                 PL_reglastcloseparen = &re->lastcloseparen;
4336                 re->lastparen = 0;
4337                 re->lastcloseparen = 0;
4338
4339                 PL_reginput = locinput;
4340                 PL_regsize = 0;
4341
4342                 /* XXXX This is too dramatic a measure... */
4343                 PL_reg_maxiter = 0;
4344
4345                 ST.toggle_reg_flags = PL_reg_flags;
4346                 if (RX_UTF8(re_sv))
4347                     PL_reg_flags |= RF_utf8;
4348                 else
4349                     PL_reg_flags &= ~RF_utf8;
4350                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4351
4352                 ST.prev_rex = rex_sv;
4353                 ST.prev_curlyx = cur_curlyx;
4354                 SETREX(rex_sv,re_sv);
4355                 rex = re;
4356                 rexi = rei;
4357                 cur_curlyx = NULL;
4358                 ST.B = next;
4359                 ST.prev_eval = cur_eval;
4360                 cur_eval = st;
4361                 /* now continue from first node in postoned RE */
4362                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4363                 /* NOTREACHED */
4364             }
4365             /* logical is 1,   /(?(?{...})X|Y)/ */
4366             sw = cBOOL(SvTRUE(ret));
4367             logical = 0;
4368             break;
4369         }
4370
4371         case EVAL_AB: /* cleanup after a successful (??{A})B */
4372             /* note: this is called twice; first after popping B, then A */
4373             PL_reg_flags ^= ST.toggle_reg_flags;
4374             ReREFCNT_dec(rex_sv);
4375             SETREX(rex_sv,ST.prev_rex);
4376             rex = (struct regexp *)SvANY(rex_sv);
4377             rexi = RXi_GET(rex);
4378             regcpblow(ST.cp);
4379             cur_eval = ST.prev_eval;
4380             cur_curlyx = ST.prev_curlyx;
4381
4382             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4383             PL_reglastparen = &rex->lastparen;
4384             PL_reglastcloseparen = &rex->lastcloseparen;
4385             /* also update PL_regoffs */
4386             PL_regoffs = rex->offs;
4387
4388             /* XXXX This is too dramatic a measure... */
4389             PL_reg_maxiter = 0;
4390             if ( nochange_depth )
4391                 nochange_depth--;
4392             sayYES;
4393
4394
4395         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4396             /* note: this is called twice; first after popping B, then A */
4397             PL_reg_flags ^= ST.toggle_reg_flags;
4398             ReREFCNT_dec(rex_sv);
4399             SETREX(rex_sv,ST.prev_rex);
4400             rex = (struct regexp *)SvANY(rex_sv);
4401             rexi = RXi_GET(rex);
4402             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4403             PL_reglastparen = &rex->lastparen;
4404             PL_reglastcloseparen = &rex->lastcloseparen;
4405
4406             PL_reginput = locinput;
4407             REGCP_UNWIND(ST.lastcp);
4408             regcppop(rex);
4409             cur_eval = ST.prev_eval;
4410             cur_curlyx = ST.prev_curlyx;
4411             /* XXXX This is too dramatic a measure... */
4412             PL_reg_maxiter = 0;
4413             if ( nochange_depth )
4414                 nochange_depth--;
4415             sayNO_SILENT;
4416 #undef ST
4417
4418         case OPEN:
4419             n = ARG(scan);  /* which paren pair */
4420             PL_reg_start_tmp[n] = locinput;
4421             if (n > PL_regsize)
4422                 PL_regsize = n;
4423             lastopen = n;
4424             break;
4425         case CLOSE:
4426             n = ARG(scan);  /* which paren pair */
4427             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4428             PL_regoffs[n].end = locinput - PL_bostr;
4429             /*if (n > PL_regsize)
4430                 PL_regsize = n;*/
4431             if (n > *PL_reglastparen)
4432                 *PL_reglastparen = n;
4433             *PL_reglastcloseparen = n;
4434             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4435                 goto fake_end;
4436             }
4437             break;
4438         case ACCEPT:
4439             if (ARG(scan)){
4440                 regnode *cursor;
4441                 for (cursor=scan;
4442                      cursor && OP(cursor)!=END;
4443                      cursor=regnext(cursor))
4444                 {
4445                     if ( OP(cursor)==CLOSE ){
4446                         n = ARG(cursor);
4447                         if ( n <= lastopen ) {
4448                             PL_regoffs[n].start
4449                                 = PL_reg_start_tmp[n] - PL_bostr;
4450                             PL_regoffs[n].end = locinput - PL_bostr;
4451                             /*if (n > PL_regsize)
4452                             PL_regsize = n;*/
4453                             if (n > *PL_reglastparen)
4454                                 *PL_reglastparen = n;
4455                             *PL_reglastcloseparen = n;
4456                             if ( n == ARG(scan) || (cur_eval &&
4457                                 cur_eval->u.eval.close_paren == n))
4458                                 break;
4459                         }
4460                     }
4461                 }
4462             }
4463             goto fake_end;
4464             /*NOTREACHED*/
4465         case GROUPP:
4466             n = ARG(scan);  /* which paren pair */
4467             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4468             break;
4469         case NGROUPP:
4470             /* reg_check_named_buff_matched returns 0 for no match */
4471             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4472             break;
4473         case INSUBP:
4474             n = ARG(scan);
4475             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4476             break;
4477         case DEFINEP:
4478             sw = 0;
4479             break;
4480         case IFTHEN:
4481             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4482             if (sw)
4483                 next = NEXTOPER(NEXTOPER(scan));
4484             else {
4485                 next = scan + ARG(scan);
4486                 if (OP(next) == IFTHEN) /* Fake one. */
4487                     next = NEXTOPER(NEXTOPER(next));
4488             }
4489             break;
4490         case LOGICAL:
4491             logical = scan->flags;
4492             break;
4493
4494 /*******************************************************************
4495
4496 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4497 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4498 STAR/PLUS/CURLY/CURLYN are used instead.)
4499
4500 A*B is compiled as <CURLYX><A><WHILEM><B>
4501
4502 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4503 state, which contains the current count, initialised to -1. It also sets
4504 cur_curlyx to point to this state, with any previous value saved in the
4505 state block.
4506
4507 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4508 since the pattern may possibly match zero times (i.e. it's a while {} loop
4509 rather than a do {} while loop).
4510
4511 Each entry to WHILEM represents a successful match of A. The count in the
4512 CURLYX block is incremented, another WHILEM state is pushed, and execution
4513 passes to A or B depending on greediness and the current count.
4514
4515 For example, if matching against the string a1a2a3b (where the aN are
4516 substrings that match /A/), then the match progresses as follows: (the
4517 pushed states are interspersed with the bits of strings matched so far):
4518
4519     <CURLYX cnt=-1>
4520     <CURLYX cnt=0><WHILEM>
4521     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4522     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4523     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4524     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4525
4526 (Contrast this with something like CURLYM, which maintains only a single
4527 backtrack state:
4528
4529     <CURLYM cnt=0> a1
4530     a1 <CURLYM cnt=1> a2
4531     a1 a2 <CURLYM cnt=2> a3
4532     a1 a2 a3 <CURLYM cnt=3> b
4533 )
4534
4535 Each WHILEM state block marks a point to backtrack to upon partial failure
4536 of A or B, and also contains some minor state data related to that
4537 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4538 overall state, such as the count, and pointers to the A and B ops.
4539
4540 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4541 must always point to the *current* CURLYX block, the rules are:
4542
4543 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4544 and set cur_curlyx to point the new block.
4545
4546 When popping the CURLYX block after a successful or unsuccessful match,
4547 restore the previous cur_curlyx.
4548
4549 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4550 to the outer one saved in the CURLYX block.
4551
4552 When popping the WHILEM block after a successful or unsuccessful B match,
4553 restore the previous cur_curlyx.
4554
4555 Here's an example for the pattern (AI* BI)*BO
4556 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4557
4558 cur_
4559 curlyx backtrack stack
4560 ------ ---------------
4561 NULL
4562 CO     <CO prev=NULL> <WO>
4563 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4564 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4565 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4566
4567 At this point the pattern succeeds, and we work back down the stack to
4568 clean up, restoring as we go:
4569
4570 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4571 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4572 CO     <CO prev=NULL> <WO>
4573 NULL
4574
4575 *******************************************************************/
4576
4577 #define ST st->u.curlyx
4578
4579         case CURLYX:    /* start of /A*B/  (for complex A) */
4580         {
4581             /* No need to save/restore up to this paren */
4582             I32 parenfloor = scan->flags;
4583
4584             assert(next); /* keep Coverity happy */
4585             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4586                 next += ARG(next);
4587
4588             /* XXXX Probably it is better to teach regpush to support
4589                parenfloor > PL_regsize... */
4590             if (parenfloor > (I32)*PL_reglastparen)
4591                 parenfloor = *PL_reglastparen; /* Pessimization... */
4592
4593             ST.prev_curlyx= cur_curlyx;
4594             cur_curlyx = st;
4595             ST.cp = PL_savestack_ix;
4596
4597             /* these fields contain the state of the current curly.
4598              * they are accessed by subsequent WHILEMs */
4599             ST.parenfloor = parenfloor;
4600             ST.me = scan;
4601             ST.B = next;
4602             ST.minmod = minmod;
4603             minmod = 0;
4604             ST.count = -1;      /* this will be updated by WHILEM */
4605             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4606
4607             PL_reginput = locinput;
4608             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4609             /* NOTREACHED */
4610         }
4611
4612         case CURLYX_end: /* just finished matching all of A*B */
4613             cur_curlyx = ST.prev_curlyx;
4614             sayYES;
4615             /* NOTREACHED */
4616
4617         case CURLYX_end_fail: /* just failed to match all of A*B */
4618             regcpblow(ST.cp);
4619             cur_curlyx = ST.prev_curlyx;
4620             sayNO;
4621             /* NOTREACHED */
4622
4623
4624 #undef ST
4625 #define ST st->u.whilem
4626
4627         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4628         {
4629             /* see the discussion above about CURLYX/WHILEM */
4630             I32 n;
4631             int min = ARG1(cur_curlyx->u.curlyx.me);
4632             int max = ARG2(cur_curlyx->u.curlyx.me);
4633             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4634
4635             assert(cur_curlyx); /* keep Coverity happy */
4636             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4637             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4638             ST.cache_offset = 0;
4639             ST.cache_mask = 0;
4640
4641             PL_reginput = locinput;
4642
4643             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4644                   "%*s  whilem: matched %ld out of %d..%d\n",
4645                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4646             );
4647
4648             /* First just match a string of min A's. */
4649
4650             if (n < min) {
4651                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4652                 cur_curlyx->u.curlyx.lastloc = locinput;
4653                 REGCP_SET(ST.lastcp);
4654
4655                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4656                 /* NOTREACHED */
4657             }
4658
4659             /* If degenerate A matches "", assume A done. */
4660
4661             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4662                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4663                    "%*s  whilem: empty match detected, trying continuation...\n",
4664                    REPORT_CODE_OFF+depth*2, "")
4665                 );
4666                 goto do_whilem_B_max;
4667             }
4668
4669             /* super-linear cache processing */
4670
4671             if (scan->flags) {
4672
4673                 if (!PL_reg_maxiter) {
4674                     /* start the countdown: Postpone detection until we
4675                      * know the match is not *that* much linear. */
4676                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4677                     /* possible overflow for long strings and many CURLYX's */
4678                     if (PL_reg_maxiter < 0)
4679                         PL_reg_maxiter = I32_MAX;
4680                     PL_reg_leftiter = PL_reg_maxiter;
4681                 }
4682
4683                 if (PL_reg_leftiter-- == 0) {
4684                     /* initialise cache */
4685                     const I32 size = (PL_reg_maxiter + 7)/8;
4686                     if (PL_reg_poscache) {
4687                         if ((I32)PL_reg_poscache_size < size) {
4688                             Renew(PL_reg_poscache, size, char);
4689                             PL_reg_poscache_size = size;
4690                         }
4691                         Zero(PL_reg_poscache, size, char);
4692                     }
4693                     else {
4694                         PL_reg_poscache_size = size;
4695                         Newxz(PL_reg_poscache, size, char);
4696                     }
4697                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4698       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4699                               PL_colors[4], PL_colors[5])
4700                     );
4701                 }
4702
4703                 if (PL_reg_leftiter < 0) {
4704                     /* have we already failed at this position? */
4705                     I32 offset, mask;
4706                     offset  = (scan->flags & 0xf) - 1
4707                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4708                     mask    = 1 << (offset % 8);
4709                     offset /= 8;
4710                     if (PL_reg_poscache[offset] & mask) {
4711                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4712                             "%*s  whilem: (cache) already tried at this position...\n",
4713                             REPORT_CODE_OFF+depth*2, "")
4714                         );
4715                         sayNO; /* cache records failure */
4716                     }
4717                     ST.cache_offset = offset;
4718                     ST.cache_mask   = mask;
4719                 }
4720             }
4721
4722             /* Prefer B over A for minimal matching. */
4723
4724             if (cur_curlyx->u.curlyx.minmod) {
4725                 ST.save_curlyx = cur_curlyx;
4726                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4727                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4728                 REGCP_SET(ST.lastcp);
4729                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4730                 /* NOTREACHED */
4731             }
4732
4733             /* Prefer A over B for maximal matching. */
4734
4735             if (n < max) { /* More greed allowed? */
4736                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4737                 cur_curlyx->u.curlyx.lastloc = locinput;
4738                 REGCP_SET(ST.lastcp);
4739                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4740                 /* NOTREACHED */
4741             }
4742             goto do_whilem_B_max;
4743         }
4744         /* NOTREACHED */
4745
4746         case WHILEM_B_min: /* just matched B in a minimal match */
4747         case WHILEM_B_max: /* just matched B in a maximal match */
4748             cur_curlyx = ST.save_curlyx;
4749             sayYES;
4750             /* NOTREACHED */
4751
4752         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4753             cur_curlyx = ST.save_curlyx;
4754             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4755             cur_curlyx->u.curlyx.count--;
4756             CACHEsayNO;
4757             /* NOTREACHED */
4758
4759         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4760             /* FALL THROUGH */
4761         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4762             REGCP_UNWIND(ST.lastcp);
4763             regcppop(rex);
4764             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4765             cur_curlyx->u.curlyx.count--;
4766             CACHEsayNO;
4767             /* NOTREACHED */
4768
4769         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4770             REGCP_UNWIND(ST.lastcp);
4771             regcppop(rex);      /* Restore some previous $<digit>s? */
4772             PL_reginput = locinput;
4773             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4774                 "%*s  whilem: failed, trying continuation...\n",
4775                 REPORT_CODE_OFF+depth*2, "")
4776             );
4777           do_whilem_B_max:
4778             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4779                 && ckWARN(WARN_REGEXP)
4780                 && !(PL_reg_flags & RF_warned))
4781             {
4782                 PL_reg_flags |= RF_warned;
4783                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4784                      "Complex regular subexpression recursion",
4785                      REG_INFTY - 1);
4786             }
4787
4788             /* now try B */
4789             ST.save_curlyx = cur_curlyx;
4790             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4791             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4792             /* NOTREACHED */
4793
4794         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4795             cur_curlyx = ST.save_curlyx;
4796             REGCP_UNWIND(ST.lastcp);
4797             regcppop(rex);
4798
4799             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4800                 /* Maximum greed exceeded */
4801                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4802                     && ckWARN(WARN_REGEXP)
4803                     && !(PL_reg_flags & RF_warned))
4804                 {
4805                     PL_reg_flags |= RF_warned;
4806                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4807                         "%s limit (%d) exceeded",
4808                         "Complex regular subexpression recursion",
4809                         REG_INFTY - 1);
4810                 }
4811                 cur_curlyx->u.curlyx.count--;
4812                 CACHEsayNO;
4813             }
4814
4815             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4816                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4817             );
4818             /* Try grabbing another A and see if it helps. */
4819             PL_reginput = locinput;
4820             cur_curlyx->u.curlyx.lastloc = locinput;
4821             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4822             REGCP_SET(ST.lastcp);
4823             PUSH_STATE_GOTO(WHILEM_A_min,
4824                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4825             /* NOTREACHED */
4826
4827 #undef  ST
4828 #define ST st->u.branch
4829
4830         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4831             next = scan + ARG(scan);
4832             if (next == scan)
4833                 next = NULL;
4834             scan = NEXTOPER(scan);
4835             /* FALL THROUGH */
4836
4837         case BRANCH:        /*  /(...|A|...)/ */
4838             scan = NEXTOPER(scan); /* scan now points to inner node */
4839             ST.lastparen = *PL_reglastparen;
4840             ST.next_branch = next;
4841             REGCP_SET(ST.cp);
4842             PL_reginput = locinput;
4843
4844             /* Now go into the branch */
4845             if (has_cutgroup) {
4846                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4847             } else {
4848                 PUSH_STATE_GOTO(BRANCH_next, scan);
4849             }
4850             /* NOTREACHED */
4851         case CUTGROUP:
4852             PL_reginput = locinput;
4853             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4854                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4855             PUSH_STATE_GOTO(CUTGROUP_next,next);
4856             /* NOTREACHED */
4857         case CUTGROUP_next_fail:
4858             do_cutgroup = 1;
4859             no_final = 1;
4860             if (st->u.mark.mark_name)
4861                 sv_commit = st->u.mark.mark_name;
4862             sayNO;
4863             /* NOTREACHED */
4864         case BRANCH_next:
4865             sayYES;
4866             /* NOTREACHED */
4867         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4868             if (do_cutgroup) {
4869                 do_cutgroup = 0;
4870                 no_final = 0;
4871             }
4872             REGCP_UNWIND(ST.cp);
4873             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4874                 PL_regoffs[n].end = -1;
4875             *PL_reglastparen = n;
4876             /*dmq: *PL_reglastcloseparen = n; */
4877             scan = ST.next_branch;
4878             /* no more branches? */
4879             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4880                 DEBUG_EXECUTE_r({
4881                     PerlIO_printf( Perl_debug_log,
4882                         "%*s  %sBRANCH failed...%s\n",
4883                         REPORT_CODE_OFF+depth*2, "",
4884                         PL_colors[4],
4885                         PL_colors[5] );
4886                 });
4887                 sayNO_SILENT;
4888             }
4889             continue; /* execute next BRANCH[J] op */
4890             /* NOTREACHED */
4891
4892         case MINMOD:
4893             minmod = 1;
4894             break;
4895
4896 #undef  ST
4897 #define ST st->u.curlym
4898
4899         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4900
4901             /* This is an optimisation of CURLYX that enables us to push
4902              * only a single backtracking state, no matter how many matches
4903              * there are in {m,n}. It relies on the pattern being constant
4904              * length, with no parens to influence future backrefs
4905              */
4906
4907             ST.me = scan;
4908             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4909
4910             /* if paren positive, emulate an OPEN/CLOSE around A */
4911             if (ST.me->flags) {
4912                 U32 paren = ST.me->flags;
4913                 if (paren > PL_regsize)
4914                     PL_regsize = paren;
4915                 if (paren > *PL_reglastparen)
4916                     *PL_reglastparen = paren;
4917                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4918             }
4919             ST.A = scan;
4920             ST.B = next;
4921             ST.alen = 0;
4922             ST.count = 0;
4923             ST.minmod = minmod;
4924             minmod = 0;
4925             ST.c1 = CHRTEST_UNINIT;
4926             REGCP_SET(ST.cp);
4927
4928             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4929                 goto curlym_do_B;
4930
4931           curlym_do_A: /* execute the A in /A{m,n}B/  */
4932             PL_reginput = locinput;
4933             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4934             /* NOTREACHED */
4935
4936         case CURLYM_A: /* we've just matched an A */
4937             locinput = st->locinput;
4938             nextchr = UCHARAT(locinput);
4939
4940             ST.count++;
4941             /* after first match, determine A's length: u.curlym.alen */
4942             if (ST.count == 1) {
4943                 if (PL_reg_match_utf8) {
4944                     char *s = locinput;
4945                     while (s < PL_reginput) {
4946                         ST.alen++;
4947                         s += UTF8SKIP(s);
4948                     }
4949                 }
4950                 else {
4951                     ST.alen = PL_reginput - locinput;
4952                 }
4953                 if (ST.alen == 0)
4954                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4955             }
4956             DEBUG_EXECUTE_r(
4957                 PerlIO_printf(Perl_debug_log,
4958                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4959                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4960                           (IV) ST.count, (IV)ST.alen)
4961             );
4962
4963             locinput = PL_reginput;
4964
4965             if (cur_eval && cur_eval->u.eval.close_paren &&
4966                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4967                 goto fake_end;
4968
4969             {
4970                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4971                 if ( max == REG_INFTY || ST.count < max )
4972                     goto curlym_do_A; /* try to match another A */
4973             }
4974             goto curlym_do_B; /* try to match B */
4975
4976         case CURLYM_A_fail: /* just failed to match an A */
4977             REGCP_UNWIND(ST.cp);
4978
4979             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4980                 || (cur_eval && cur_eval->u.eval.close_paren &&
4981                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4982                 sayNO;
4983
4984           curlym_do_B: /* execute the B in /A{m,n}B/  */
4985             PL_reginput = locinput;
4986             if (ST.c1 == CHRTEST_UNINIT) {
4987                 /* calculate c1 and c2 for possible match of 1st char
4988                  * following curly */
4989                 ST.c1 = ST.c2 = CHRTEST_VOID;
4990                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4991                     regnode *text_node = ST.B;
4992                     if (! HAS_TEXT(text_node))
4993                         FIND_NEXT_IMPT(text_node);
4994                     /* this used to be
4995
4996                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4997
4998                         But the former is redundant in light of the latter.
4999
5000                         if this changes back then the macro for
5001                         IS_TEXT and friends need to change.
5002                      */
5003                     if (PL_regkind[OP(text_node)] == EXACT)
5004                     {
5005
5006                         ST.c1 = (U8)*STRING(text_node);
5007                         switch (OP(text_node)) {
5008                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5009                             case EXACTFA:
5010                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5011                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5012                             default: ST.c2 = ST.c1;
5013                         }
5014                     }
5015                 }
5016             }
5017
5018             DEBUG_EXECUTE_r(
5019                 PerlIO_printf(Perl_debug_log,
5020                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5021                     (int)(REPORT_CODE_OFF+(depth*2)),
5022                     "", (IV)ST.count)
5023                 );
5024             if (ST.c1 != CHRTEST_VOID
5025                     && UCHARAT(PL_reginput) != ST.c1
5026                     && UCHARAT(PL_reginput) != ST.c2)
5027             {
5028                 /* simulate B failing */
5029                 DEBUG_OPTIMISE_r(
5030                     PerlIO_printf(Perl_debug_log,
5031                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5032                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5033                         (IV)ST.c1,(IV)ST.c2
5034                 ));
5035                 state_num = CURLYM_B_fail;
5036                 goto reenter_switch;
5037             }
5038
5039             if (ST.me->flags) {
5040                 /* mark current A as captured */
5041                 I32 paren = ST.me->flags;
5042                 if (ST.count) {
5043                     PL_regoffs[paren].start
5044                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5045                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5046                     /*dmq: *PL_reglastcloseparen = paren; */
5047                 }
5048                 else
5049                     PL_regoffs[paren].end = -1;
5050                 if (cur_eval && cur_eval->u.eval.close_paren &&
5051                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5052                 {
5053                     if (ST.count)
5054                         goto fake_end;
5055                     else
5056                         sayNO;
5057                 }
5058             }
5059
5060             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5061             /* NOTREACHED */
5062
5063         case CURLYM_B_fail: /* just failed to match a B */
5064             REGCP_UNWIND(ST.cp);
5065             if (ST.minmod) {
5066                 I32 max = ARG2(ST.me);
5067                 if (max != REG_INFTY && ST.count == max)
5068                     sayNO;
5069                 goto curlym_do_A; /* try to match a further A */
5070             }
5071             /* backtrack one A */
5072             if (ST.count == ARG1(ST.me) /* min */)
5073                 sayNO;
5074             ST.count--;
5075             locinput = HOPc(locinput, -ST.alen);
5076             goto curlym_do_B; /* try to match B */
5077
5078 #undef ST
5079 #define ST st->u.curly
5080
5081 #define CURLY_SETPAREN(paren, success) \
5082     if (paren) { \
5083         if (success) { \
5084             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5085             PL_regoffs[paren].end = locinput - PL_bostr; \
5086             *PL_reglastcloseparen = paren; \
5087         } \
5088         else \
5089             PL_regoffs[paren].end = -1; \
5090     }
5091
5092         case STAR:              /*  /A*B/ where A is width 1 */
5093             ST.paren = 0;
5094             ST.min = 0;
5095             ST.max = REG_INFTY;
5096             scan = NEXTOPER(scan);
5097             goto repeat;
5098         case PLUS:              /*  /A+B/ where A is width 1 */
5099             ST.paren = 0;
5100             ST.min = 1;
5101             ST.max = REG_INFTY;
5102             scan = NEXTOPER(scan);
5103             goto repeat;
5104         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5105             ST.paren = scan->flags;     /* Which paren to set */
5106             if (ST.paren > PL_regsize)
5107                 PL_regsize = ST.paren;
5108             if (ST.paren > *PL_reglastparen)
5109                 *PL_reglastparen = ST.paren;
5110             ST.min = ARG1(scan);  /* min to match */
5111             ST.max = ARG2(scan);  /* max to match */
5112             if (cur_eval && cur_eval->u.eval.close_paren &&
5113                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5114                 ST.min=1;
5115                 ST.max=1;
5116             }
5117             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5118             goto repeat;
5119         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5120             ST.paren = 0;
5121             ST.min = ARG1(scan);  /* min to match */
5122             ST.max = ARG2(scan);  /* max to match */
5123             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5124           repeat:
5125             /*
5126             * Lookahead to avoid useless match attempts
5127             * when we know what character comes next.
5128             *
5129             * Used to only do .*x and .*?x, but now it allows
5130             * for )'s, ('s and (?{ ... })'s to be in the way
5131             * of the quantifier and the EXACT-like node.  -- japhy
5132             */
5133
5134             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5135                 sayNO;
5136             if (HAS_TEXT(next) || JUMPABLE(next)) {
5137                 U8 *s;
5138                 regnode *text_node = next;
5139
5140                 if (! HAS_TEXT(text_node))
5141                     FIND_NEXT_IMPT(text_node);
5142
5143                 if (! HAS_TEXT(text_node))
5144                     ST.c1 = ST.c2 = CHRTEST_VOID;
5145                 else {
5146                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5147                         ST.c1 = ST.c2 = CHRTEST_VOID;
5148                         goto assume_ok_easy;
5149                     }
5150                     else
5151                         s = (U8*)STRING(text_node);
5152
5153                     /*  Currently we only get here when
5154
5155                         PL_rekind[OP(text_node)] == EXACT
5156
5157                         if this changes back then the macro for IS_TEXT and
5158                         friends need to change. */
5159                     if (!UTF_PATTERN) {
5160                         ST.c1 = *s;
5161                         switch (OP(text_node)) {
5162                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5163                             case EXACTFA:
5164                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5165                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5166                             default: ST.c2 = ST.c1; break;
5167                         }
5168                     }
5169                     else { /* UTF_PATTERN */
5170                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5171                              STRLEN ulen1, ulen2;
5172                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5173                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5174
5175                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5176                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5177 #ifdef EBCDIC
5178                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5179                                                     ckWARN(WARN_UTF8) ?
5180                                                     0 : UTF8_ALLOW_ANY);
5181                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5182                                                     ckWARN(WARN_UTF8) ?
5183                                                     0 : UTF8_ALLOW_ANY);
5184 #else
5185                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5186                                                     uniflags);
5187                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5188                                                     uniflags);
5189 #endif
5190                         }
5191                         else {
5192                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5193                                                      uniflags);
5194                         }
5195                     }
5196                 }
5197             }
5198             else
5199                 ST.c1 = ST.c2 = CHRTEST_VOID;
5200         assume_ok_easy:
5201
5202             ST.A = scan;
5203             ST.B = next;
5204             PL_reginput = locinput;
5205             if (minmod) {
5206                 minmod = 0;
5207                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5208                     sayNO;
5209                 ST.count = ST.min;
5210                 locinput = PL_reginput;
5211                 REGCP_SET(ST.cp);
5212                 if (ST.c1 == CHRTEST_VOID)
5213                     goto curly_try_B_min;
5214
5215                 ST.oldloc = locinput;
5216
5217                 /* set ST.maxpos to the furthest point along the
5218                  * string that could possibly match */
5219                 if  (ST.max == REG_INFTY) {
5220                     ST.maxpos = PL_regeol - 1;
5221                     if (utf8_target)
5222                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5223                             ST.maxpos--;
5224                 }
5225                 else if (utf8_target) {
5226                     int m = ST.max - ST.min;
5227                     for (ST.maxpos = locinput;
5228                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5229                         ST.maxpos += UTF8SKIP(ST.maxpos);
5230                 }
5231                 else {
5232                     ST.maxpos = locinput + ST.max - ST.min;
5233                     if (ST.maxpos >= PL_regeol)
5234                         ST.maxpos = PL_regeol - 1;
5235                 }
5236                 goto curly_try_B_min_known;
5237
5238             }
5239             else {
5240                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5241                 locinput = PL_reginput;
5242                 if (ST.count < ST.min)
5243                     sayNO;
5244                 if ((ST.count > ST.min)
5245                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5246                 {
5247                     /* A{m,n} must come at the end of the string, there's
5248                      * no point in backing off ... */
5249                     ST.min = ST.count;
5250                     /* ...except that $ and \Z can match before *and* after
5251                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5252                        We may back off by one in this case. */
5253                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5254                         ST.min--;
5255                 }
5256                 REGCP_SET(ST.cp);
5257                 goto curly_try_B_max;
5258             }
5259             /* NOTREACHED */
5260
5261
5262         case CURLY_B_min_known_fail:
5263             /* failed to find B in a non-greedy match where c1,c2 valid */
5264             if (ST.paren && ST.count)
5265                 PL_regoffs[ST.paren].end = -1;
5266
5267             PL_reginput = locinput;     /* Could be reset... */
5268             REGCP_UNWIND(ST.cp);
5269             /* Couldn't or didn't -- move forward. */
5270             ST.oldloc = locinput;
5271             if (utf8_target)
5272                 locinput += UTF8SKIP(locinput);
5273             else
5274                 locinput++;
5275             ST.count++;
5276           curly_try_B_min_known:
5277              /* find the next place where 'B' could work, then call B */
5278             {
5279                 int n;
5280                 if (utf8_target) {
5281                     n = (ST.oldloc == locinput) ? 0 : 1;
5282                     if (ST.c1 == ST.c2) {
5283                         STRLEN len;
5284                         /* set n to utf8_distance(oldloc, locinput) */
5285                         while (locinput <= ST.maxpos &&
5286                                utf8n_to_uvchr((U8*)locinput,
5287                                               UTF8_MAXBYTES, &len,
5288                                               uniflags) != (UV)ST.c1) {
5289                             locinput += len;
5290                             n++;
5291                         }
5292                     }
5293                     else {
5294                         /* set n to utf8_distance(oldloc, locinput) */
5295                         while (locinput <= ST.maxpos) {
5296                             STRLEN len;
5297                             const UV c = utf8n_to_uvchr((U8*)locinput,
5298                                                   UTF8_MAXBYTES, &len,
5299                                                   uniflags);
5300                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5301                                 break;
5302                             locinput += len;
5303                             n++;
5304                         }
5305                     }
5306                 }
5307                 else {
5308                     if (ST.c1 == ST.c2) {
5309                         while (locinput <= ST.maxpos &&
5310                                UCHARAT(locinput) != ST.c1)
5311                             locinput++;
5312                     }
5313                     else {
5314                         while (locinput <= ST.maxpos
5315                                && UCHARAT(locinput) != ST.c1
5316                                && UCHARAT(locinput) != ST.c2)
5317                             locinput++;
5318                     }
5319                     n = locinput - ST.oldloc;
5320                 }
5321                 if (locinput > ST.maxpos)
5322                     sayNO;
5323                 /* PL_reginput == oldloc now */
5324                 if (n) {
5325                     ST.count += n;
5326                     if (regrepeat(rex, ST.A, n, depth) < n)
5327                         sayNO;
5328                 }
5329                 PL_reginput = locinput;
5330                 CURLY_SETPAREN(ST.paren, ST.count);
5331                 if (cur_eval && cur_eval->u.eval.close_paren &&
5332                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5333                     goto fake_end;
5334                 }
5335                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5336             }
5337             /* NOTREACHED */
5338
5339
5340         case CURLY_B_min_fail:
5341             /* failed to find B in a non-greedy match where c1,c2 invalid */
5342             if (ST.paren && ST.count)
5343                 PL_regoffs[ST.paren].end = -1;
5344
5345             REGCP_UNWIND(ST.cp);
5346             /* failed -- move forward one */
5347             PL_reginput = locinput;
5348             if (regrepeat(rex, ST.A, 1, depth)) {
5349                 ST.count++;
5350                 locinput = PL_reginput;
5351                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5352                         ST.count > 0)) /* count overflow ? */
5353                 {
5354                   curly_try_B_min:
5355                     CURLY_SETPAREN(ST.paren, ST.count);
5356                     if (cur_eval && cur_eval->u.eval.close_paren &&
5357                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5358                         goto fake_end;
5359                     }
5360                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5361                 }
5362             }
5363             sayNO;
5364             /* NOTREACHED */
5365
5366
5367         curly_try_B_max:
5368             /* a successful greedy match: now try to match B */
5369             if (cur_eval && cur_eval->u.eval.close_paren &&
5370                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5371                 goto fake_end;
5372             }
5373             {
5374                 UV c = 0;
5375                 if (ST.c1 != CHRTEST_VOID)
5376                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5377                                            UTF8_MAXBYTES, 0, uniflags)
5378                                 : (UV) UCHARAT(PL_reginput);
5379                 /* If it could work, try it. */
5380                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5381                     CURLY_SETPAREN(ST.paren, ST.count);
5382                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5383                     /* NOTREACHED */
5384                 }
5385             }
5386             /* FALL THROUGH */
5387         case CURLY_B_max_fail:
5388             /* failed to find B in a greedy match */
5389             if (ST.paren && ST.count)
5390                 PL_regoffs[ST.paren].end = -1;
5391
5392             REGCP_UNWIND(ST.cp);
5393             /*  back up. */
5394             if (--ST.count < ST.min)
5395                 sayNO;
5396             PL_reginput = locinput = HOPc(locinput, -1);
5397             goto curly_try_B_max;
5398
5399 #undef ST
5400
5401         case END:
5402             fake_end:
5403             if (cur_eval) {
5404                 /* we've just finished A in /(??{A})B/; now continue with B */
5405                 I32 tmpix;
5406                 st->u.eval.toggle_reg_flags
5407                             = cur_eval->u.eval.toggle_reg_flags;
5408                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5409
5410                 st->u.eval.prev_rex = rex_sv;           /* inner */
5411                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5412                 rex = (struct regexp *)SvANY(rex_sv);
5413                 rexi = RXi_GET(rex);
5414                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5415                 ReREFCNT_inc(rex_sv);
5416                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5417
5418                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5419                 PL_reglastparen = &rex->lastparen;
5420                 PL_reglastcloseparen = &rex->lastcloseparen;
5421
5422                 REGCP_SET(st->u.eval.lastcp);
5423                 PL_reginput = locinput;
5424
5425                 /* Restore parens of the outer rex without popping the
5426                  * savestack */
5427                 tmpix = PL_savestack_ix;
5428                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5429                 regcppop(rex);
5430                 PL_savestack_ix = tmpix;
5431
5432                 st->u.eval.prev_eval = cur_eval;
5433                 cur_eval = cur_eval->u.eval.prev_eval;
5434                 DEBUG_EXECUTE_r(
5435                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5436                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5437                 if ( nochange_depth )
5438                     nochange_depth--;
5439
5440                 PUSH_YES_STATE_GOTO(EVAL_AB,
5441                         st->u.eval.prev_eval->u.eval.B); /* match B */
5442             }
5443
5444             if (locinput < reginfo->till) {
5445                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5446                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5447                                       PL_colors[4],
5448                                       (long)(locinput - PL_reg_starttry),
5449                                       (long)(reginfo->till - PL_reg_starttry),
5450                                       PL_colors[5]));
5451
5452                 sayNO_SILENT;           /* Cannot match: too short. */
5453             }
5454             PL_reginput = locinput;     /* put where regtry can find it */
5455             sayYES;                     /* Success! */
5456
5457         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5458             DEBUG_EXECUTE_r(
5459             PerlIO_printf(Perl_debug_log,
5460                 "%*s  %ssubpattern success...%s\n",
5461                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5462             PL_reginput = locinput;     /* put where regtry can find it */
5463             sayYES;                     /* Success! */
5464
5465 #undef  ST
5466 #define ST st->u.ifmatch
5467
5468         case SUSPEND:   /* (?>A) */
5469             ST.wanted = 1;
5470             PL_reginput = locinput;
5471             goto do_ifmatch;
5472
5473         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5474             ST.wanted = 0;
5475             goto ifmatch_trivial_fail_test;
5476
5477         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5478             ST.wanted = 1;
5479           ifmatch_trivial_fail_test:
5480             if (scan->flags) {
5481                 char * const s = HOPBACKc(locinput, scan->flags);
5482                 if (!s) {
5483                     /* trivial fail */
5484                     if (logical) {
5485                         logical = 0;
5486                         sw = 1 - cBOOL(ST.wanted);
5487                     }
5488                     else if (ST.wanted)
5489                         sayNO;
5490                     next = scan + ARG(scan);
5491                     if (next == scan)
5492                         next = NULL;
5493                     break;
5494                 }
5495                 PL_reginput = s;
5496             }
5497             else
5498                 PL_reginput = locinput;
5499
5500           do_ifmatch:
5501             ST.me = scan;
5502             ST.logical = logical;
5503             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5504
5505             /* execute body of (?...A) */
5506             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5507             /* NOTREACHED */
5508
5509         case IFMATCH_A_fail: /* body of (?...A) failed */
5510             ST.wanted = !ST.wanted;
5511             /* FALL THROUGH */
5512
5513         case IFMATCH_A: /* body of (?...A) succeeded */
5514             if (ST.logical) {
5515                 sw = cBOOL(ST.wanted);
5516             }
5517             else if (!ST.wanted)
5518                 sayNO;
5519
5520             if (OP(ST.me) == SUSPEND)
5521                 locinput = PL_reginput;
5522             else {
5523                 locinput = PL_reginput = st->locinput;
5524                 nextchr = UCHARAT(locinput);
5525             }
5526             scan = ST.me + ARG(ST.me);
5527             if (scan == ST.me)
5528                 scan = NULL;
5529             continue; /* execute B */
5530
5531 #undef ST
5532
5533         case LONGJMP:
5534             next = scan + ARG(scan);
5535             if (next == scan)
5536                 next = NULL;
5537             break;
5538         case COMMIT:
5539             reginfo->cutpoint = PL_regeol;
5540             /* FALLTHROUGH */
5541         case PRUNE:
5542             PL_reginput = locinput;
5543             if (!scan->flags)
5544                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5545             PUSH_STATE_GOTO(COMMIT_next,next);
5546             /* NOTREACHED */
5547         case COMMIT_next_fail:
5548             no_final = 1;
5549             /* FALLTHROUGH */
5550         case OPFAIL:
5551             sayNO;
5552             /* NOTREACHED */
5553
5554 #define ST st->u.mark
5555         case MARKPOINT:
5556             ST.prev_mark = mark_state;
5557             ST.mark_name = sv_commit = sv_yes_mark
5558                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5559             mark_state = st;
5560             ST.mark_loc = PL_reginput = locinput;
5561             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5562             /* NOTREACHED */
5563         case MARKPOINT_next:
5564             mark_state = ST.prev_mark;
5565             sayYES;
5566             /* NOTREACHED */
5567         case MARKPOINT_next_fail:
5568             if (popmark && sv_eq(ST.mark_name,popmark))
5569             {
5570                 if (ST.mark_loc > startpoint)
5571                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5572                 popmark = NULL; /* we found our mark */
5573                 sv_commit = ST.mark_name;
5574
5575                 DEBUG_EXECUTE_r({
5576                         PerlIO_printf(Perl_debug_log,
5577                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5578                             REPORT_CODE_OFF+depth*2, "",
5579                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5580                 });
5581             }
5582             mark_state = ST.prev_mark;
5583             sv_yes_mark = mark_state ?
5584                 mark_state->u.mark.mark_name : NULL;
5585             sayNO;
5586             /* NOTREACHED */
5587         case SKIP:
5588             PL_reginput = locinput;
5589             if (scan->flags) {
5590                 /* (*SKIP) : if we fail we cut here*/
5591                 ST.mark_name = NULL;
5592                 ST.mark_loc = locinput;
5593                 PUSH_STATE_GOTO(SKIP_next,next);
5594             } else {
5595                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5596                    otherwise do nothing.  Meaning we need to scan
5597                  */
5598                 regmatch_state *cur = mark_state;
5599                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5600
5601                 while (cur) {
5602                     if ( sv_eq( cur->u.mark.mark_name,
5603                                 find ) )
5604                     {
5605                         ST.mark_name = find;
5606                         PUSH_STATE_GOTO( SKIP_next, next );
5607                     }
5608                     cur = cur->u.mark.prev_mark;
5609                 }
5610             }
5611             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5612             break;
5613         case SKIP_next_fail:
5614             if (ST.mark_name) {
5615                 /* (*CUT:NAME) - Set up to search for the name as we
5616                    collapse the stack*/
5617                 popmark = ST.mark_name;
5618             } else {
5619                 /* (*CUT) - No name, we cut here.*/
5620                 if (ST.mark_loc > startpoint)
5621                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5622                 /* but we set sv_commit to latest mark_name if there
5623                    is one so they can test to see how things lead to this
5624                    cut */
5625                 if (mark_state)
5626                     sv_commit=mark_state->u.mark.mark_name;
5627             }
5628             no_final = 1;
5629             sayNO;
5630             /* NOTREACHED */
5631 #undef ST
5632         case FOLDCHAR:
5633             n = ARG(scan);
5634             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5635                 locinput += ln;
5636             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5637                 sayNO;
5638             } else  {
5639                 U8 folded[UTF8_MAXBYTES_CASE+1];
5640                 STRLEN foldlen;
5641                 const char * const l = locinput;
5642                 char *e = PL_regeol;
5643                 to_uni_fold(n, folded, &foldlen);
5644
5645                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5646                                l, &e, 0,  utf8_target)) {
5647                         sayNO;
5648                 }
5649                 locinput = e;
5650             }
5651             nextchr = UCHARAT(locinput);
5652             break;
5653         case LNBREAK:
5654             if ((n=is_LNBREAK(locinput,utf8_target))) {
5655                 locinput += n;
5656                 nextchr = UCHARAT(locinput);
5657             } else
5658                 sayNO;
5659             break;
5660
5661 #define CASE_CLASS(nAmE)                              \
5662         case nAmE:                                    \
5663             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5664                 locinput += n;                        \
5665                 nextchr = UCHARAT(locinput);          \
5666             } else                                    \
5667                 sayNO;                                \
5668             break;                                    \
5669         case N##nAmE:                                 \
5670             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5671                 sayNO;                                \
5672             } else {                                  \
5673                 locinput += UTF8SKIP(locinput);       \
5674                 nextchr = UCHARAT(locinput);          \
5675             }                                         \
5676             break
5677
5678         CASE_CLASS(VERTWS);
5679         CASE_CLASS(HORIZWS);
5680 #undef CASE_CLASS
5681
5682         default:
5683             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5684                           PTR2UV(scan), OP(scan));
5685             Perl_croak(aTHX_ "regexp memory corruption");
5686
5687         } /* end switch */
5688
5689         /* switch break jumps here */
5690         scan = next; /* prepare to execute the next op and ... */
5691         continue;    /* ... jump back to the top, reusing st */
5692         /* NOTREACHED */
5693
5694       push_yes_state:
5695         /* push a state that backtracks on success */
5696         st->u.yes.prev_yes_state = yes_state;
5697         yes_state = st;
5698         /* FALL THROUGH */
5699       push_state:
5700         /* push a new regex state, then continue at scan  */
5701         {
5702             regmatch_state *newst;
5703
5704             DEBUG_STACK_r({
5705                 regmatch_state *cur = st;
5706                 regmatch_state *curyes = yes_state;
5707                 int curd = depth;
5708                 regmatch_slab *slab = PL_regmatch_slab;
5709                 for (;curd > -1;cur--,curd--) {
5710                     if (cur < SLAB_FIRST(slab)) {
5711                         slab = slab->prev;
5712                         cur = SLAB_LAST(slab);
5713                     }
5714                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5715                         REPORT_CODE_OFF + 2 + depth * 2,"",
5716                         curd, PL_reg_name[cur->resume_state],
5717                         (curyes == cur) ? "yes" : ""
5718                     );
5719                     if (curyes == cur)
5720                         curyes = cur->u.yes.prev_yes_state;
5721                 }
5722             } else
5723                 DEBUG_STATE_pp("push")
5724             );
5725             depth++;
5726             st->locinput = locinput;
5727             newst = st+1;
5728             if (newst >  SLAB_LAST(PL_regmatch_slab))
5729                 newst = S_push_slab(aTHX);
5730             PL_regmatch_state = newst;
5731
5732             locinput = PL_reginput;
5733             nextchr = UCHARAT(locinput);
5734             st = newst;
5735             continue;
5736             /* NOTREACHED */
5737         }
5738     }
5739
5740     /*
5741     * We get here only if there's trouble -- normally "case END" is
5742     * the terminating point.
5743     */
5744     Perl_croak(aTHX_ "corrupted regexp pointers");
5745     /*NOTREACHED*/
5746     sayNO;
5747
5748 yes:
5749     if (yes_state) {
5750         /* we have successfully completed a subexpression, but we must now
5751          * pop to the state marked by yes_state and continue from there */
5752         assert(st != yes_state);
5753 #ifdef DEBUGGING
5754         while (st != yes_state) {
5755             st--;
5756             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5757                 PL_regmatch_slab = PL_regmatch_slab->prev;
5758                 st = SLAB_LAST(PL_regmatch_slab);
5759             }
5760             DEBUG_STATE_r({
5761                 if (no_final) {
5762                     DEBUG_STATE_pp("pop (no final)");
5763                 } else {
5764                     DEBUG_STATE_pp("pop (yes)");
5765                 }
5766             });
5767             depth--;
5768         }
5769 #else
5770         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5771             || yes_state > SLAB_LAST(PL_regmatch_slab))
5772         {
5773             /* not in this slab, pop slab */
5774             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5775             PL_regmatch_slab = PL_regmatch_slab->prev;
5776             st = SLAB_LAST(PL_regmatch_slab);
5777         }
5778         depth -= (st - yes_state);
5779 #endif
5780         st = yes_state;
5781         yes_state = st->u.yes.prev_yes_state;
5782         PL_regmatch_state = st;
5783
5784         if (no_final) {
5785             locinput= st->locinput;
5786             nextchr = UCHARAT(locinput);
5787         }
5788         state_num = st->resume_state + no_final;
5789         goto reenter_switch;
5790     }
5791
5792     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5793                           PL_colors[4], PL_colors[5]));
5794
5795     if (PL_reg_eval_set) {
5796         /* each successfully executed (?{...}) block does the equivalent of
5797          *   local $^R = do {...}
5798          * When popping the save stack, all these locals would be undone;
5799          * bypass this by setting the outermost saved $^R to the latest
5800          * value */
5801         if (oreplsv != GvSV(PL_replgv))
5802             sv_setsv(oreplsv, GvSV(PL_replgv));
5803     }
5804     result = 1;
5805     goto final_exit;
5806
5807 no:
5808     DEBUG_EXECUTE_r(
5809         PerlIO_printf(Perl_debug_log,
5810             "%*s  %sfailed...%s\n",
5811             REPORT_CODE_OFF+depth*2, "",
5812             PL_colors[4], PL_colors[5])
5813         );
5814
5815 no_silent:
5816     if (no_final) {
5817         if (yes_state) {
5818             goto yes;
5819         } else {
5820             goto final_exit;
5821         }
5822     }
5823     if (depth) {
5824         /* there's a previous state to backtrack to */
5825         st--;
5826         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5827             PL_regmatch_slab = PL_regmatch_slab->prev;
5828             st = SLAB_LAST(PL_regmatch_slab);
5829         }
5830         PL_regmatch_state = st;
5831         locinput= st->locinput;
5832         nextchr = UCHARAT(locinput);
5833
5834         DEBUG_STATE_pp("pop");
5835         depth--;
5836         if (yes_state == st)
5837             yes_state = st->u.yes.prev_yes_state;
5838
5839         state_num = st->resume_state + 1; /* failure = success + 1 */
5840         goto reenter_switch;
5841     }
5842     result = 0;
5843
5844   final_exit:
5845     if (rex->intflags & PREGf_VERBARG_SEEN) {
5846         SV *sv_err = get_sv("REGERROR", 1);
5847         SV *sv_mrk = get_sv("REGMARK", 1);
5848         if (result) {
5849             sv_commit = &PL_sv_no;
5850             if (!sv_yes_mark)
5851                 sv_yes_mark = &PL_sv_yes;
5852         } else {
5853             if (!sv_commit)
5854                 sv_commit = &PL_sv_yes;
5855             sv_yes_mark = &PL_sv_no;
5856         }
5857         sv_setsv(sv_err, sv_commit);
5858         sv_setsv(sv_mrk, sv_yes_mark);
5859     }
5860
5861     /* clean up; in particular, free all slabs above current one */
5862     LEAVE_SCOPE(oldsave);
5863
5864     return result;
5865 }
5866
5867 /*
5868  - regrepeat - repeatedly match something simple, report how many
5869  */
5870 /*
5871  * [This routine now assumes that it will only match on things of length 1.
5872  * That was true before, but now we assume scan - reginput is the count,
5873  * rather than incrementing count on every character.  [Er, except utf8.]]
5874  */
5875 STATIC I32
5876 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5877 {
5878     dVAR;
5879     register char *scan;
5880     register I32 c;
5881     register char *loceol = PL_regeol;
5882     register I32 hardcount = 0;
5883     register bool utf8_target = PL_reg_match_utf8;
5884     UV utf8_flags;
5885 #ifndef DEBUGGING
5886     PERL_UNUSED_ARG(depth);
5887 #endif
5888
5889     PERL_ARGS_ASSERT_REGREPEAT;
5890
5891     scan = PL_reginput;
5892     if (max == REG_INFTY)
5893         max = I32_MAX;
5894     else if (max < loceol - scan)
5895         loceol = scan + max;
5896     switch (OP(p)) {
5897     case REG_ANY:
5898         if (utf8_target) {
5899             loceol = PL_regeol;
5900             while (scan < loceol && hardcount < max && *scan != '\n') {
5901                 scan += UTF8SKIP(scan);
5902                 hardcount++;
5903             }
5904         } else {
5905             while (scan < loceol && *scan != '\n')
5906                 scan++;
5907         }
5908         break;
5909     case SANY:
5910         if (utf8_target) {
5911             loceol = PL_regeol;
5912             while (scan < loceol && hardcount < max) {
5913                 scan += UTF8SKIP(scan);
5914                 hardcount++;
5915             }
5916         }
5917         else
5918             scan = loceol;
5919         break;
5920     case CANY:
5921         scan = loceol;
5922         break;
5923     case EXACT:
5924         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5925          * means they match only characters in the string that can be expressed
5926          * as a single byte.  For non-utf8 strings, that means a simple match.
5927          * For utf8 strings, the character matched must be an invariant, or
5928          * downgradable to a single byte.  The pattern's utf8ness is
5929          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5930          * it is, it's an invariant */
5931
5932         c = (U8)*STRING(p);
5933         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5934
5935         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5936             while (scan < loceol && UCHARAT(scan) == c) {
5937                 scan++;
5938             }
5939         }
5940         else {
5941
5942             /* Here, the string is utf8, and the pattern char is different
5943              * in utf8 than not, so can't compare them directly.  Outside the
5944              * loop, find find the two utf8 bytes that represent c, and then
5945              * look for those in sequence in the utf8 string */
5946             U8 high = UTF8_TWO_BYTE_HI(c);
5947             U8 low = UTF8_TWO_BYTE_LO(c);
5948             loceol = PL_regeol;
5949
5950             while (hardcount < max
5951                     && scan + 1 < loceol
5952                     && UCHARAT(scan) == high
5953                     && UCHARAT(scan + 1) == low)
5954             {
5955                 scan += 2;
5956                 hardcount++;
5957             }
5958         }
5959         break;
5960     case EXACTFA:
5961         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5962         goto do_exactf;
5963
5964     case EXACTFL:
5965         PL_reg_flags |= RF_tainted;
5966         utf8_flags = FOLDEQ_UTF8_LOCALE;
5967         goto do_exactf;
5968
5969     case EXACTF:
5970     case EXACTFU:
5971         utf8_flags = 0;
5972
5973         /* The comments for the EXACT case above apply as well to these fold
5974          * ones */
5975
5976     do_exactf:
5977         c = (U8)*STRING(p);
5978         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5979
5980         if (utf8_target) { /* Use full Unicode fold matching */
5981             char *tmpeol = loceol;
5982             while (hardcount < max
5983                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
5984                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
5985             {
5986                 scan = tmpeol;
5987                 tmpeol = loceol;
5988                 hardcount++;
5989             }
5990
5991             /* XXX Note that the above handles properly the German sharp s in
5992              * the pattern matching ss in the string.  But it doesn't handle
5993              * properly cases where the string contains say 'LIGATURE ff' and
5994              * the pattern is 'f+'.  This would require, say, a new function or
5995              * revised interface to foldEQ_utf8(), in which the maximum number
5996              * of characters to match could be passed and it would return how
5997              * many actually did.  This is just one of many cases where
5998              * multi-char folds don't work properly, and so the fix is being
5999              * deferred */
6000         }
6001         else {
6002             U8 folded;
6003
6004             /* Here, the string isn't utf8 and c is a single byte; and either
6005              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6006              * doesn't affect c.  Can just do simple comparisons for exact or
6007              * fold matching. */
6008             switch (OP(p)) {
6009                 case EXACTF: folded = PL_fold[c]; break;
6010                 case EXACTFA:
6011                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6012                 case EXACTFL: folded = PL_fold_locale[c]; break;
6013                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6014             }
6015             while (scan < loceol &&
6016                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6017             {
6018                 scan++;
6019             }
6020         }
6021         break;
6022     case ANYOFV:
6023     case ANYOF:
6024         if (utf8_target || OP(p) == ANYOFV) {
6025             STRLEN inclasslen;
6026             loceol = PL_regeol;
6027             inclasslen = loceol - scan;
6028             while (hardcount < max
6029                    && ((inclasslen = loceol - scan) > 0)
6030                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6031             {
6032                 scan += inclasslen;
6033                 hardcount++;
6034             }
6035         } else {
6036             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6037                 scan++;
6038         }
6039         break;
6040     case ALNUMU:
6041         if (utf8_target) {
6042     utf8_wordchar:
6043             loceol = PL_regeol;
6044             LOAD_UTF8_CHARCLASS_ALNUM();
6045             while (hardcount < max && scan < loceol &&
6046                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6047             {
6048                 scan += UTF8SKIP(scan);
6049                 hardcount++;
6050             }
6051         } else {
6052             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6053                 scan++;
6054             }
6055         }
6056         break;
6057     case ALNUM:
6058         if (utf8_target)
6059             goto utf8_wordchar;
6060         while (scan < loceol && isALNUM((U8) *scan)) {
6061             scan++;
6062         }
6063         break;
6064     case ALNUMA:
6065         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6066             scan++;
6067         }
6068         break;
6069     case ALNUML:
6070         PL_reg_flags |= RF_tainted;
6071         if (utf8_target) {
6072             loceol = PL_regeol;
6073             while (hardcount < max && scan < loceol &&
6074                    isALNUM_LC_utf8((U8*)scan)) {
6075                 scan += UTF8SKIP(scan);
6076                 hardcount++;
6077             }
6078         } else {
6079             while (scan < loceol && isALNUM_LC(*scan))
6080                 scan++;
6081         }
6082         break;
6083     case NALNUMU:
6084         if (utf8_target) {
6085
6086     utf8_Nwordchar:
6087
6088             loceol = PL_regeol;
6089             LOAD_UTF8_CHARCLASS_ALNUM();
6090             while (hardcount < max && scan < loceol &&
6091                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6092             {
6093                 scan += UTF8SKIP(scan);
6094                 hardcount++;
6095             }
6096         } else {
6097             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6098                 scan++;
6099             }
6100         }
6101         break;
6102     case NALNUM:
6103         if (utf8_target)
6104             goto utf8_Nwordchar;
6105         while (scan < loceol && ! isALNUM((U8) *scan)) {
6106             scan++;
6107         }
6108         break;
6109     case NALNUMA:
6110         if (utf8_target) {
6111             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6112                 scan += UTF8SKIP(scan);
6113             }
6114         }
6115         else {
6116             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6117                 scan++;
6118             }
6119         }
6120         break;
6121     case NALNUML:
6122         PL_reg_flags |= RF_tainted;
6123         if (utf8_target) {
6124             loceol = PL_regeol;
6125             while (hardcount < max && scan < loceol &&
6126                    !isALNUM_LC_utf8((U8*)scan)) {
6127                 scan += UTF8SKIP(scan);
6128                 hardcount++;
6129             }
6130         } else {
6131             while (scan < loceol && !isALNUM_LC(*scan))
6132                 scan++;
6133         }
6134         break;
6135     case SPACEU:
6136         if (utf8_target) {
6137
6138     utf8_space:
6139
6140             loceol = PL_regeol;
6141             LOAD_UTF8_CHARCLASS_SPACE();
6142             while (hardcount < max && scan < loceol &&
6143                    (*scan == ' ' ||
6144                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6145             {
6146                 scan += UTF8SKIP(scan);
6147                 hardcount++;
6148             }
6149             break;
6150         }
6151         else {
6152             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6153                 scan++;
6154             }
6155             break;
6156         }
6157     case SPACE:
6158         if (utf8_target)
6159             goto utf8_space;
6160
6161         while (scan < loceol && isSPACE((U8) *scan)) {
6162             scan++;
6163         }
6164         break;
6165     case SPACEA:
6166         while (scan < loceol && isSPACE_A((U8) *scan)) {
6167             scan++;
6168         }
6169         break;
6170     case SPACEL:
6171         PL_reg_flags |= RF_tainted;
6172         if (utf8_target) {
6173             loceol = PL_regeol;
6174             while (hardcount < max && scan < loceol &&
6175                    isSPACE_LC_utf8((U8*)scan)) {
6176                 scan += UTF8SKIP(scan);
6177                 hardcount++;
6178             }
6179         } else {
6180             while (scan < loceol && isSPACE_LC(*scan))
6181                 scan++;
6182         }
6183         break;
6184     case NSPACEU:
6185         if (utf8_target) {
6186
6187     utf8_Nspace:
6188
6189             loceol = PL_regeol;
6190             LOAD_UTF8_CHARCLASS_SPACE();
6191             while (hardcount < max && scan < loceol &&
6192                    ! (*scan == ' ' ||
6193                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6194             {
6195                 scan += UTF8SKIP(scan);
6196                 hardcount++;
6197             }
6198             break;
6199         }
6200         else {
6201             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6202                 scan++;
6203             }
6204         }
6205         break;
6206     case NSPACE:
6207         if (utf8_target)
6208             goto utf8_Nspace;
6209
6210         while (scan < loceol && ! isSPACE((U8) *scan)) {
6211             scan++;
6212         }
6213         break;
6214     case NSPACEA:
6215         if (utf8_target) {
6216             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6217                 scan += UTF8SKIP(scan);
6218             }
6219         }
6220         else {
6221             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6222                 scan++;
6223             }
6224         }
6225         break;
6226     case NSPACEL:
6227         PL_reg_flags |= RF_tainted;
6228         if (utf8_target) {
6229             loceol = PL_regeol;
6230             while (hardcount < max && scan < loceol &&
6231                    !isSPACE_LC_utf8((U8*)scan)) {
6232                 scan += UTF8SKIP(scan);
6233                 hardcount++;
6234             }
6235         } else {
6236             while (scan < loceol && !isSPACE_LC(*scan))
6237                 scan++;
6238         }
6239         break;
6240     case DIGIT:
6241         if (utf8_target) {
6242             loceol = PL_regeol;
6243             LOAD_UTF8_CHARCLASS_DIGIT();
6244             while (hardcount < max && scan < loceol &&
6245                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6246                 scan += UTF8SKIP(scan);
6247                 hardcount++;
6248             }
6249         } else {
6250             while (scan < loceol && isDIGIT(*scan))
6251                 scan++;
6252         }
6253         break;
6254     case DIGITA:
6255         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6256             scan++;
6257         }
6258         break;
6259     case DIGITL:
6260         PL_reg_flags |= RF_tainted;
6261         if (utf8_target) {
6262             loceol = PL_regeol;
6263             while (hardcount < max && scan < loceol &&
6264                    isDIGIT_LC_utf8((U8*)scan)) {
6265                 scan += UTF8SKIP(scan);
6266                 hardcount++;
6267             }
6268         } else {
6269             while (scan < loceol && isDIGIT_LC(*scan))
6270                 scan++;
6271         }
6272         break;
6273     case NDIGIT:
6274         if (utf8_target) {
6275             loceol = PL_regeol;
6276             LOAD_UTF8_CHARCLASS_DIGIT();
6277             while (hardcount < max && scan < loceol &&
6278                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6279                 scan += UTF8SKIP(scan);
6280                 hardcount++;
6281             }
6282         } else {
6283             while (scan < loceol && !isDIGIT(*scan))
6284                 scan++;
6285         }
6286         break;
6287     case NDIGITA:
6288         if (utf8_target) {
6289             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6290                 scan += UTF8SKIP(scan);
6291             }
6292         }
6293         else {
6294             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6295                 scan++;
6296             }
6297         }
6298         break;
6299     case NDIGITL:
6300         PL_reg_flags |= RF_tainted;
6301         if (utf8_target) {
6302             loceol = PL_regeol;
6303             while (hardcount < max && scan < loceol &&
6304                    !isDIGIT_LC_utf8((U8*)scan)) {
6305                 scan += UTF8SKIP(scan);
6306                 hardcount++;
6307             }
6308         } else {
6309             while (scan < loceol && !isDIGIT_LC(*scan))
6310                 scan++;
6311         }
6312         break;
6313     case LNBREAK:
6314         if (utf8_target) {
6315             loceol = PL_regeol;
6316             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6317                 scan += c;
6318                 hardcount++;
6319             }
6320         } else {
6321             /*
6322               LNBREAK can match two latin chars, which is ok,
6323               because we have a null terminated string, but we
6324               have to use hardcount in this situation
6325             */
6326             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6327                 scan+=c;
6328                 hardcount++;
6329             }
6330         }
6331         break;
6332     case HORIZWS:
6333         if (utf8_target) {
6334             loceol = PL_regeol;
6335             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6336                 scan += c;
6337                 hardcount++;
6338             }
6339         } else {
6340             while (scan < loceol && is_HORIZWS_latin1(scan))
6341                 scan++;
6342         }
6343         break;
6344     case NHORIZWS:
6345         if (utf8_target) {
6346             loceol = PL_regeol;
6347             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6348                 scan += UTF8SKIP(scan);
6349                 hardcount++;
6350             }
6351         } else {
6352             while (scan < loceol && !is_HORIZWS_latin1(scan))
6353                 scan++;
6354
6355         }
6356         break;
6357     case VERTWS:
6358         if (utf8_target) {
6359             loceol = PL_regeol;
6360             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6361                 scan += c;
6362                 hardcount++;
6363             }
6364         } else {
6365             while (scan < loceol && is_VERTWS_latin1(scan))
6366                 scan++;
6367
6368         }
6369         break;
6370     case NVERTWS:
6371         if (utf8_target) {
6372             loceol = PL_regeol;
6373             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6374                 scan += UTF8SKIP(scan);
6375                 hardcount++;
6376             }
6377         } else {
6378             while (scan < loceol && !is_VERTWS_latin1(scan))
6379                 scan++;
6380
6381         }
6382         break;
6383
6384     default:            /* Called on something of 0 width. */
6385         break;          /* So match right here or not at all. */
6386     }
6387
6388     if (hardcount)
6389         c = hardcount;
6390     else
6391         c = scan - PL_reginput;
6392     PL_reginput = scan;
6393
6394     DEBUG_r({
6395         GET_RE_DEBUG_FLAGS_DECL;
6396         DEBUG_EXECUTE_r({
6397             SV * const prop = sv_newmortal();
6398             regprop(prog, prop, p);
6399             PerlIO_printf(Perl_debug_log,
6400                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6401                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6402         });
6403     });
6404
6405     return(c);
6406 }
6407
6408
6409 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6410 /*
6411 - regclass_swash - prepare the utf8 swash
6412 */
6413
6414 SV *
6415 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6416 {
6417     dVAR;
6418     SV *sw  = NULL;
6419     SV *si  = NULL;
6420     SV *alt = NULL;
6421     RXi_GET_DECL(prog,progi);
6422     const struct reg_data * const data = prog ? progi->data : NULL;
6423
6424     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6425
6426     assert(ANYOF_NONBITMAP(node));
6427
6428     if (data && data->count) {
6429         const U32 n = ARG(node);
6430
6431         if (data->what[n] == 's') {
6432             SV * const rv = MUTABLE_SV(data->data[n]);
6433             AV * const av = MUTABLE_AV(SvRV(rv));
6434             SV **const ary = AvARRAY(av);
6435             SV **a, **b;
6436
6437             /* See the end of regcomp.c:S_regclass() for
6438              * documentation of these array elements. */
6439
6440             si = *ary;
6441             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6442             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6443
6444             if (a)
6445                 sw = *a;
6446             else if (si && doinit) {
6447                 sw = swash_init("utf8", "", si, 1, 0);
6448                 (void)av_store(av, 1, sw);
6449             }
6450             if (b)
6451                 alt = *b;
6452         }
6453     }
6454
6455     if (listsvp)
6456         *listsvp = si;
6457     if (altsvp)
6458         *altsvp  = alt;
6459
6460     return sw;
6461 }
6462 #endif
6463
6464 /*
6465  - reginclass - determine if a character falls into a character class
6466
6467   n is the ANYOF regnode
6468   p is the target string
6469   lenp is pointer to the maximum number of bytes of how far to go in p
6470     (This is assumed wthout checking to always be at least the current
6471     character's size)
6472   utf8_target tells whether p is in UTF-8.
6473
6474   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6475   from a successful match, the value it points to will be updated to how many
6476   bytes in p were matched.  If there was no match, the value is undefined,
6477   possibly changed from the input.
6478
6479   Note that this can be a synthetic start class, a combination of various
6480   nodes, so things you think might be mutually exclusive, such as locale,
6481   aren't.  It can match both locale and non-locale
6482
6483  */
6484
6485 STATIC bool
6486 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6487 {
6488     dVAR;
6489     const char flags = ANYOF_FLAGS(n);
6490     bool match = FALSE;
6491     UV c = *p;
6492     STRLEN c_len = 0;
6493     STRLEN maxlen;
6494
6495     PERL_ARGS_ASSERT_REGINCLASS;
6496
6497     /* If c is not already the code point, get it */
6498     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6499         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6500                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6501                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6502                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6503                  * UTF8_ALLOW_FFFF */
6504         if (c_len == (STRLEN)-1)
6505             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6506     }
6507     else {
6508         c_len = 1;
6509     }
6510
6511     /* Use passed in max length, or one character if none passed in or less
6512      * than one character.  And assume will match just one character.  This is
6513      * overwritten later if matched more. */
6514     if (lenp) {
6515         maxlen = (*lenp > c_len) ? *lenp : c_len;
6516         *lenp = c_len;
6517
6518     }
6519     else {
6520         maxlen = c_len;
6521     }
6522
6523     /* If this character is potentially in the bitmap, check it */
6524     if (c < 256) {
6525         if (ANYOF_BITMAP_TEST(n, c))
6526             match = TRUE;
6527         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6528                 && ! utf8_target
6529                 && ! isASCII(c))
6530         {
6531             match = TRUE;
6532         }
6533
6534         else if (flags & ANYOF_LOCALE) {
6535             PL_reg_flags |= RF_tainted;
6536
6537             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6538                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6539             {
6540                 match = TRUE;
6541             }
6542             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6543                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6544                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6545                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6546                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6547                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6548                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6549                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6550                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6551                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6552                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6553                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6554                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6555                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6556                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6557                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6558                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6559                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6560                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6561                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6562                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6563                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6564                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6565                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6566                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6567                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6568                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6569                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6570                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6571                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6572                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6573                      ) /* How's that for a conditional? */
6574             ) {
6575                 match = TRUE;
6576             }
6577         }
6578     }
6579
6580     /* If the bitmap didn't (or couldn't) match, and something outside the
6581      * bitmap could match, try that.  Locale nodes specifiy completely the
6582      * behavior of code points in the bit map (otherwise, a utf8 target would
6583      * cause them to be treated as Unicode and not locale), except in
6584      * the very unlikely event when this node is a synthetic start class, which
6585      * could be a combination of locale and non-locale nodes.  So allow locale
6586      * to match for the synthetic start class, which will give a false
6587      * positive that will be resolved when the match is done again as not part
6588      * of the synthetic start class */
6589     if (!match) {
6590         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6591             match = TRUE;       /* Everything above 255 matches */
6592         }
6593         else if (ANYOF_NONBITMAP(n)
6594                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6595                      || (utf8_target
6596                          && (c >=256
6597                              || (! (flags & ANYOF_LOCALE))
6598                              || (flags & ANYOF_IS_SYNTHETIC)))))
6599         {
6600             AV *av;
6601             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6602
6603             if (sw) {
6604                 U8 * utf8_p;
6605                 if (utf8_target) {
6606                     utf8_p = (U8 *) p;
6607                 } else {
6608
6609                     /* Not utf8.  Convert as much of the string as available up
6610                      * to the limit of how far the (single) character in the
6611                      * pattern can possibly match (no need to go further).  If
6612                      * the node is a straight ANYOF or not folding, it can't
6613                      * match more than one.  Otherwise, It can match up to how
6614                      * far a single char can fold to.  Since not utf8, each
6615                      * character is a single byte, so the max it can be in
6616                      * bytes is the same as the max it can be in characters */
6617                     STRLEN len = (OP(n) == ANYOF
6618                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6619                                   ? 1
6620                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6621                                     ? maxlen
6622                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6623                     utf8_p = bytes_to_utf8(p, &len);
6624                 }
6625
6626                 if (swash_fetch(sw, utf8_p, TRUE))
6627                     match = TRUE;
6628                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6629
6630                     /* Here, we need to test if the fold of the target string
6631                      * matches.  The non-multi char folds have all been moved to
6632                      * the compilation phase, and the multi-char folds have
6633                      * been stored by regcomp into 'av'; we linearly check to
6634                      * see if any match the target string (folded).   We know
6635                      * that the originals were each one character, but we don't
6636                      * currently know how many characters/bytes each folded to,
6637                      * except we do know that there are small limits imposed by
6638                      * Unicode.  XXX A performance enhancement would be to have
6639                      * regcomp.c store the max number of chars/bytes that are
6640                      * in an av entry, as, say the 0th element.  Even better
6641                      * would be to have a hash of the few characters that can
6642                      * start a multi-char fold to the max number of chars of
6643                      * those folds.
6644                      *
6645                      * If there is a match, we will need to advance (if lenp is
6646                      * specified) the match pointer in the target string.  But
6647                      * what we are comparing here isn't that string directly,
6648                      * but its fold, whose length may differ from the original.
6649                      * As we go along in constructing the fold, therefore, we
6650                      * create a map so that we know how many bytes in the
6651                      * source to advance given that we have matched a certain
6652                      * number of bytes in the fold.  This map is stored in
6653                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6654                      * the fold of the first character that we are folding.
6655                      * Then map_fold_len_back[n] is set to the number of bytes
6656                      * in that first character.  Similarly let m be the
6657                      * corresponding number for the second character to be
6658                      * folded.  Then map_fold_len_back[n+m] is set to the
6659                      * number of bytes occupied by the first two source
6660                      * characters. ... */
6661                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6662                     U8 folded[UTF8_MAXBYTES_CASE+1];
6663                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6664                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6665                                                   chars */
6666
6667                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6668
6669                         /* Here, only need to fold the first char of the target
6670                          * string.  It the source wasn't utf8, is 1 byte long */
6671                         to_utf8_fold(utf8_p, folded, &foldlen);
6672                         total_foldlen = foldlen;
6673                         map_fold_len_back[foldlen] = (utf8_target)
6674                                                      ? UTF8SKIP(utf8_p)
6675                                                      : 1;
6676                     }
6677                     else {
6678
6679                         /* Here, need to fold more than the first char.  Do so
6680                          * up to the limits */
6681                         U8* source_ptr = utf8_p;    /* The source for the fold
6682                                                        is the regex target
6683                                                        string */
6684                         U8* folded_ptr = folded;
6685                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6686                                                        available byte in the
6687                                                        target string */
6688                         U8 i;
6689                         for (i = 0;
6690                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6691                              i++)
6692                         {
6693
6694                             /* Fold the next character */
6695                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6696                             STRLEN this_char_foldlen;
6697                             to_utf8_fold(source_ptr,
6698                                          this_char_folded,
6699                                          &this_char_foldlen);
6700
6701                             /* Bail if it would exceed the byte limit for
6702                              * folding a single char. */
6703                             if (this_char_foldlen + folded_ptr - folded >
6704                                                             UTF8_MAXBYTES_CASE)
6705                             {
6706                                 break;
6707                             }
6708
6709                             /* Add the fold of this character */
6710                             Copy(this_char_folded,
6711                                  folded_ptr,
6712                                  this_char_foldlen,
6713                                  U8);
6714                             source_ptr += UTF8SKIP(source_ptr);
6715                             folded_ptr += this_char_foldlen;
6716                             total_foldlen = folded_ptr - folded;
6717
6718                             /* Create map from the number of bytes in the fold
6719                              * back to the number of bytes in the source.  If
6720                              * the source isn't utf8, the byte count is just
6721                              * the number of characters so far */
6722                             map_fold_len_back[total_foldlen]
6723                                                       = (utf8_target)
6724                                                         ? source_ptr - utf8_p
6725                                                         : i + 1;
6726                         }
6727                         *folded_ptr = '\0';
6728                     }
6729
6730
6731                     /* Do the linear search to see if the fold is in the list
6732                      * of multi-char folds. */
6733                     if (av) {
6734                         I32 i;
6735                         for (i = 0; i <= av_len(av); i++) {
6736                             SV* const sv = *av_fetch(av, i, FALSE);
6737                             STRLEN len;
6738                             const char * const s = SvPV_const(sv, len);
6739
6740                             if (len <= total_foldlen && memEQ(s,
6741                                                                (char*)folded,
6742                                                                len))
6743                             {
6744
6745                                 /* Advance the target string ptr to account for
6746                                  * this fold, but have to translate from the
6747                                  * folded length to the corresponding source
6748                                  * length. */
6749                                 if (lenp) {
6750                                     *lenp = map_fold_len_back[len];
6751                                     assert(*lenp != 0); /* Otherwise will loop */
6752                                 }
6753                                 match = TRUE;
6754                                 break;
6755                             }
6756                         }
6757                     }
6758                 }
6759
6760                 /* If we allocated a string above, free it */
6761                 if (! utf8_target) Safefree(utf8_p);
6762             }
6763         }
6764     }
6765
6766     return (flags & ANYOF_INVERT) ? !match : match;
6767 }
6768
6769 STATIC U8 *
6770 S_reghop3(U8 *s, I32 off, const U8* lim)
6771 {
6772     dVAR;
6773
6774     PERL_ARGS_ASSERT_REGHOP3;
6775
6776     if (off >= 0) {
6777         while (off-- && s < lim) {
6778             /* XXX could check well-formedness here */
6779             s += UTF8SKIP(s);
6780         }
6781     }
6782     else {
6783         while (off++ && s > lim) {
6784             s--;
6785             if (UTF8_IS_CONTINUED(*s)) {
6786                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6787                     s--;
6788             }
6789             /* XXX could check well-formedness here */
6790         }
6791     }
6792     return s;
6793 }
6794
6795 #ifdef XXX_dmq
6796 /* there are a bunch of places where we use two reghop3's that should
6797    be replaced with this routine. but since thats not done yet
6798    we ifdef it out - dmq
6799 */
6800 STATIC U8 *
6801 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6802 {
6803     dVAR;
6804
6805     PERL_ARGS_ASSERT_REGHOP4;
6806
6807     if (off >= 0) {
6808         while (off-- && s < rlim) {
6809             /* XXX could check well-formedness here */
6810             s += UTF8SKIP(s);
6811         }
6812     }
6813     else {
6814         while (off++ && s > llim) {
6815             s--;
6816             if (UTF8_IS_CONTINUED(*s)) {
6817                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6818                     s--;
6819             }
6820             /* XXX could check well-formedness here */
6821         }
6822     }
6823     return s;
6824 }
6825 #endif
6826
6827 STATIC U8 *
6828 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6829 {
6830     dVAR;
6831
6832     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6833
6834     if (off >= 0) {
6835         while (off-- && s < lim) {
6836             /* XXX could check well-formedness here */
6837             s += UTF8SKIP(s);
6838         }
6839         if (off >= 0)
6840             return NULL;
6841     }
6842     else {
6843         while (off++ && s > lim) {
6844             s--;
6845             if (UTF8_IS_CONTINUED(*s)) {
6846                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6847                     s--;
6848             }
6849             /* XXX could check well-formedness here */
6850         }
6851         if (off <= 0)
6852             return NULL;
6853     }
6854     return s;
6855 }
6856
6857 static void
6858 restore_pos(pTHX_ void *arg)
6859 {
6860     dVAR;
6861     regexp * const rex = (regexp *)arg;
6862     if (PL_reg_eval_set) {
6863         if (PL_reg_oldsaved) {
6864             rex->subbeg = PL_reg_oldsaved;
6865             rex->sublen = PL_reg_oldsavedlen;
6866 #ifdef PERL_OLD_COPY_ON_WRITE
6867             rex->saved_copy = PL_nrs;
6868 #endif
6869             RXp_MATCH_COPIED_on(rex);
6870         }
6871         PL_reg_magic->mg_len = PL_reg_oldpos;
6872         PL_reg_eval_set = 0;
6873         PL_curpm = PL_reg_oldcurpm;
6874     }
6875 }
6876
6877 STATIC void
6878 S_to_utf8_substr(pTHX_ register regexp *prog)
6879 {
6880     int i = 1;
6881
6882     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6883
6884     do {
6885         if (prog->substrs->data[i].substr
6886             && !prog->substrs->data[i].utf8_substr) {
6887             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6888             prog->substrs->data[i].utf8_substr = sv;
6889             sv_utf8_upgrade(sv);
6890             if (SvVALID(prog->substrs->data[i].substr)) {
6891                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6892                 if (flags & FBMcf_TAIL) {
6893                     /* Trim the trailing \n that fbm_compile added last
6894                        time.  */
6895                     SvCUR_set(sv, SvCUR(sv) - 1);
6896                     /* Whilst this makes the SV technically "invalid" (as its
6897                        buffer is no longer followed by "\0") when fbm_compile()
6898                        adds the "\n" back, a "\0" is restored.  */
6899                 }
6900                 fbm_compile(sv, flags);
6901             }
6902             if (prog->substrs->data[i].substr == prog->check_substr)
6903                 prog->check_utf8 = sv;
6904         }
6905     } while (i--);
6906 }
6907
6908 STATIC void
6909 S_to_byte_substr(pTHX_ register regexp *prog)
6910 {
6911     dVAR;
6912     int i = 1;
6913
6914     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6915
6916     do {
6917         if (prog->substrs->data[i].utf8_substr
6918             && !prog->substrs->data[i].substr) {
6919             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6920             if (sv_utf8_downgrade(sv, TRUE)) {
6921                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6922                     const U8 flags
6923                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6924                     if (flags & FBMcf_TAIL) {
6925                         /* Trim the trailing \n that fbm_compile added last
6926                            time.  */
6927                         SvCUR_set(sv, SvCUR(sv) - 1);
6928                     }
6929                     fbm_compile(sv, flags);
6930                 }
6931             } else {
6932                 SvREFCNT_dec(sv);
6933                 sv = &PL_sv_undef;
6934             }
6935             prog->substrs->data[i].substr = sv;
6936             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6937                 prog->check_substr = sv;
6938         }
6939     } while (i--);
6940 }
6941
6942 /*
6943  * Local variables:
6944  * c-indentation-style: bsd
6945  * c-basic-offset: 4
6946  * indent-tabs-mode: t
6947  * End:
6948  *
6949  * ex: set ts=8 sts=4 sw=4 noet:
6950  */