src/5015000/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1       /* tainted information used? e.g. locale */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  99  * call if there are no complications: i.e., if everything matchable is
 100  * straight forward in the bitmap */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102                                               : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 110
 111 #define HOPc(pos,off) \
 112         (char *)(PL_reg_match_utf8 \
 113             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 114             : (U8*)(pos + off))
 115 #define HOPBACKc(pos, off) \
 116         (char*)(PL_reg_match_utf8\
 117             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 118             : (pos - off >= PL_bostr)           \
 119                 ? (U8*)pos - off                \
 120                 : NULL)
 121
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defined */
 126 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 127     if (!CAT2(PL_utf8_,class)) { \
 128         bool ok; \
 129         ENTER; save_re_context(); \
 130         ok=CAT2(is_utf8_,class)((const U8*)str); \
 131         assert(ok); LEAVE; } } STMT_END
 132
 133 /* Doesn't do an assert to verify that is correct */
 134 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 135     if (!CAT2(PL_utf8_,class)) { \
 136         bool throw_away __attribute__unused__; \
 137         ENTER; save_re_context(); \
 138         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 139         LEAVE; } } STMT_END
 140
 141 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 142 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 143 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 144
 145 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 146         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 147         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 148         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 149             * assert should likely and hopefully fail on an EBCDIC machine */ \
 150         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 151                                                                             \
 152         /* No asserts are done for these, in case called on an early        \
 153             * Unicode version in which they map to nothing */               \
 154         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 155         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 156         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 157         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 158         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 159         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 160         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 161
 162 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 163
 164 /* The actual code for CCC_TRY, which uses several variables from the routine
 165  * it's callable from.  It is designed to be the bulk of a case statement.
 166  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 167  *      nextchr matches the class.
 168  * UTF8_TEST is the whole test string to use for utf8 targets
 169  * LOAD is what to use to test, and if not present to load in the swash for the
 170  *      class
 171  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 172  *      UTF8_TEST test.
 173  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 174  * utf8 and a variant, load the swash if necessary and test using the utf8
 175  * test.  Advance to the next character if test is ok, otherwise fail; If not
 176  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 177  * fails, or advance to the next character */
 178
 179 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 180     if (locinput >= PL_regeol) {                                              \
 181         sayNO;                                                                \
 182     }                                                                         \
 183     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 184         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 185         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 186             sayNO;                                                            \
 187         }                                                                     \
 188         locinput += PL_utf8skip[nextchr];                                     \
 189         nextchr = UCHARAT(locinput);                                          \
 190         break;                                                                \
 191     }                                                                         \
 192     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 193         sayNO;                                                                \
 194     }                                                                         \
 195     nextchr = UCHARAT(++locinput);                                            \
 196     break;
 197
 198 /* Handle the non-locale cases for a character class and its complement.  It
 199  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 200  * This is because that code fails when the test succeeds, so we want to have
 201  * the test fail so that the code succeeds.  The swash is stored in a
 202  * predictable PL_ place */
 203 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 204                            CLASS, STR)                                        \
 205     case NAME:                                                                \
 206         _CCC_TRY_CODE( !, FUNC,                                               \
 207                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 208                                             (U8*)locinput, TRUE)),            \
 209                           CLASS, STR)                                         \
 210     case NNAME:                                                               \
 211         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 212                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 213                                             (U8*)locinput, TRUE)),            \
 214                           CLASS, STR)                                         \
 215
 216 /* Generate the case statements for both locale and non-locale character
 217  * classes in regmatch for classes that don't have special unicode semantics.
 218  * Locales don't use an immediate swash, but an intermediary special locale
 219  * function that is called on the pointer to the current place in the input
 220  * string.  That function will resolve to needing the same swash.  One might
 221  * think that because we don't know what the locale will match, we shouldn't
 222  * check with the swash loading function that it loaded properly; ie, that we
 223  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 224  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 225  * irrelevant here */
 226 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 227                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 228                 NAMEA, NNAMEA, FUNCA,                                         \
 229                 CLASS, STR)                                                   \
 230     case NAMEL:                                                               \
 231         PL_reg_flags |= RF_tainted;                                           \
 232         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 233     case NNAMEL:                                                              \
 234         PL_reg_flags |= RF_tainted;                                           \
 235         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 236                        CLASS, STR)                                            \
 237     case NAMEA:                                                               \
 238         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 239             sayNO;                                                            \
 240         }                                                                     \
 241         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 242         nextchr = UCHARAT(++locinput);                                        \
 243         break;                                                                \
 244     case NNAMEA:                                                              \
 245         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 246             sayNO;                                                            \
 247         }                                                                     \
 248         if (utf8_target) {                                                    \
 249             locinput += PL_utf8skip[nextchr];                                 \
 250             nextchr = UCHARAT(locinput);                                      \
 251         }                                                                     \
 252         else {                                                                \
 253             nextchr = UCHARAT(++locinput);                                    \
 254         }                                                                     \
 255         break;                                                                \
 256     /* Generate the non-locale cases */                                       \
 257     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 258
 259 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 260  * statements to handle separate Unicode semantics nodes */
 261 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 262                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 263                   NAMEU, NNAMEU, FUNCU,                                        \
 264                   NAMEA, NNAMEA, FUNCA,                                        \
 265                   CLASS, STR)                                                  \
 266     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 267             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 268             NAMEA, NNAMEA, FUNCA,                                              \
 269             CLASS, STR)                                                        \
 270     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 271
 272 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 273
 274 /* for use after a quantifier and before an EXACT-like node -- japhy */
 275 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 276  *
 277  * NOTE that *nothing* that affects backtracking should be in here, specifically
 278  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 279  * node that is in between two EXACT like nodes when ascertaining what the required
 280  * "follow" character is. This should probably be moved to regex compile time
 281  * although it may be done at run time beause of the REF possibility - more
 282  * investigation required. -- demerphq
 283 */
 284 #define JUMPABLE(rn) (      \
 285     OP(rn) == OPEN ||       \
 286     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 287     OP(rn) == EVAL ||   \
 288     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 289     OP(rn) == PLUS || OP(rn) == MINMOD || \
 290     OP(rn) == KEEPS || \
 291     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 292 )
 293 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 294
 295 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 296
 297 #if 0
 298 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 299    we don't need this definition. */
 300 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 301 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 302 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 303
 304 #else
 305 /* ... so we use this as its faster. */
 306 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 307 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 308 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 309 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 310
 311 #endif
 312
 313 /*
 314   Search for mandatory following text node; for lookahead, the text must
 315   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 316 */
 317 #define FIND_NEXT_IMPT(rn) STMT_START { \
 318     while (JUMPABLE(rn)) { \
 319         const OPCODE type = OP(rn); \
 320         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 321             rn = NEXTOPER(NEXTOPER(rn)); \
 322         else if (type == PLUS) \
 323             rn = NEXTOPER(rn); \
 324         else if (type == IFMATCH) \
 325             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 326         else rn += NEXT_OFF(rn); \
 327     } \
 328 } STMT_END
 329
 330
 331 static void restore_pos(pTHX_ void *arg);
 332
 333 #define REGCP_PAREN_ELEMS 4
 334 #define REGCP_OTHER_ELEMS 5
 335 #define REGCP_FRAME_ELEMS 1
 336 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 337  * are needed for the regexp context stack bookkeeping. */
 338
 339 STATIC CHECKPOINT
 340 S_regcppush(pTHX_ I32 parenfloor)
 341 {
 342     dVAR;
 343     const int retval = PL_savestack_ix;
 344     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 345     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 346     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 347     int p;
 348     GET_RE_DEBUG_FLAGS_DECL;
 349
 350     if (paren_elems_to_push < 0)
 351         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 352
 353     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 354         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 355                    " out of range (%lu-%ld)",
 356                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 357
 358     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 359
 360     for (p = PL_regsize; p > parenfloor; p--) {
 361 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 362         SSPUSHINT(PL_regoffs[p].end);
 363         SSPUSHINT(PL_regoffs[p].start);
 364         SSPUSHPTR(PL_reg_start_tmp[p]);
 365         SSPUSHINT(p);
 366         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 367           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 368                       (UV)p, (IV)PL_regoffs[p].start,
 369                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 370                       (IV)PL_regoffs[p].end
 371         ));
 372     }
 373 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 374     SSPUSHPTR(PL_regoffs);
 375     SSPUSHINT(PL_regsize);
 376     SSPUSHINT(*PL_reglastparen);
 377     SSPUSHINT(*PL_reglastcloseparen);
 378     SSPUSHPTR(PL_reginput);
 379     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 380
 381     return retval;
 382 }
 383
 384 /* These are needed since we do not localize EVAL nodes: */
 385 #define REGCP_SET(cp)                                           \
 386     DEBUG_STATE_r(                                              \
 387             PerlIO_printf(Perl_debug_log,                       \
 388                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 389                 (IV)PL_savestack_ix));                          \
 390     cp = PL_savestack_ix
 391
 392 #define REGCP_UNWIND(cp)                                        \
 393     DEBUG_STATE_r(                                              \
 394         if (cp != PL_savestack_ix)                              \
 395             PerlIO_printf(Perl_debug_log,                       \
 396                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 397                 (IV)(cp), (IV)PL_savestack_ix));                \
 398     regcpblow(cp)
 399
 400 STATIC char *
 401 S_regcppop(pTHX_ const regexp *rex)
 402 {
 403     dVAR;
 404     UV i;
 405     char *input;
 406     GET_RE_DEBUG_FLAGS_DECL;
 407
 408     PERL_ARGS_ASSERT_REGCPPOP;
 409
 410     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 411     i = SSPOPUV;
 412     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 413     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 414     input = (char *) SSPOPPTR;
 415     *PL_reglastcloseparen = SSPOPINT;
 416     *PL_reglastparen = SSPOPINT;
 417     PL_regsize = SSPOPINT;
 418     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 419
 420     i -= REGCP_OTHER_ELEMS;
 421     /* Now restore the parentheses context. */
 422     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 423         I32 tmps;
 424         U32 paren = (U32)SSPOPINT;
 425         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 426         PL_regoffs[paren].start = SSPOPINT;
 427         tmps = SSPOPINT;
 428         if (paren <= *PL_reglastparen)
 429             PL_regoffs[paren].end = tmps;
 430         DEBUG_BUFFERS_r(
 431             PerlIO_printf(Perl_debug_log,
 432                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 433                           (UV)paren, (IV)PL_regoffs[paren].start,
 434                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 435                           (IV)PL_regoffs[paren].end,
 436                           (paren > *PL_reglastparen ? "(no)" : ""));
 437         );
 438     }
 439     DEBUG_BUFFERS_r(
 440         if (*PL_reglastparen + 1 <= rex->nparens) {
 441             PerlIO_printf(Perl_debug_log,
 442                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 443                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 444         }
 445     );
 446 #if 1
 447     /* It would seem that the similar code in regtry()
 448      * already takes care of this, and in fact it is in
 449      * a better location to since this code can #if 0-ed out
 450      * but the code in regtry() is needed or otherwise tests
 451      * requiring null fields (pat.t#187 and split.t#{13,14}
 452      * (as of patchlevel 7877)  will fail.  Then again,
 453      * this code seems to be necessary or otherwise
 454      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 455      * --jhi updated by dapm */
 456     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 457         if (i > PL_regsize)
 458             PL_regoffs[i].start = -1;
 459         PL_regoffs[i].end = -1;
 460     }
 461 #endif
 462     return input;
 463 }
 464
 465 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 466
 467 /*
 468  * pregexec and friends
 469  */
 470
 471 #ifndef PERL_IN_XSUB_RE
 472 /*
 473  - pregexec - match a regexp against a string
 474  */
 475 I32
 476 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 477          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 478 /* strend: pointer to null at end of string */
 479 /* strbeg: real beginning of string */
 480 /* minend: end of match must be >=minend after stringarg. */
 481 /* nosave: For optimizations. */
 482 {
 483     PERL_ARGS_ASSERT_PREGEXEC;
 484
 485     return
 486         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 487                       nosave ? 0 : REXEC_COPY_STR);
 488 }
 489 #endif
 490
 491 /*
 492  * Need to implement the following flags for reg_anch:
 493  *
 494  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 495  * USE_INTUIT_ML
 496  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 497  * INTUIT_AUTORITATIVE_ML
 498  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 499  * INTUIT_ONCE_ML
 500  *
 501  * Another flag for this function: SECOND_TIME (so that float substrs
 502  * with giant delta may be not rechecked).
 503  */
 504
 505 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 506
 507 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 508    Otherwise, only SvCUR(sv) is used to get strbeg. */
 509
 510 /* XXXX We assume that strpos is strbeg unless sv. */
 511
 512 /* XXXX Some places assume that there is a fixed substring.
 513         An update may be needed if optimizer marks as "INTUITable"
 514         RExen without fixed substrings.  Similarly, it is assumed that
 515         lengths of all the strings are no more than minlen, thus they
 516         cannot come from lookahead.
 517         (Or minlen should take into account lookahead.)
 518   NOTE: Some of this comment is not correct. minlen does now take account
 519   of lookahead/behind. Further research is required. -- demerphq
 520
 521 */
 522
 523 /* A failure to find a constant substring means that there is no need to make
 524    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 525    finding a substring too deep into the string means that less calls to
 526    regtry() should be needed.
 527
 528    REx compiler's optimizer found 4 possible hints:
 529         a) Anchored substring;
 530         b) Fixed substring;
 531         c) Whether we are anchored (beginning-of-line or \G);
 532         d) First node (of those at offset 0) which may distinguish positions;
 533    We use a)b)d) and multiline-part of c), and try to find a position in the
 534    string which does not contradict any of them.
 535  */
 536
 537 /* Most of decisions we do here should have been done at compile time.
 538    The nodes of the REx which we used for the search should have been
 539    deleted from the finite automaton. */
 540
 541 char *
 542 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 543                      char *strend, const U32 flags, re_scream_pos_data *data)
 544 {
 545     dVAR;
 546     struct regexp *const prog = (struct regexp *)SvANY(rx);
 547     register I32 start_shift = 0;
 548     /* Should be nonnegative! */
 549     register I32 end_shift   = 0;
 550     register char *s;
 551     register SV *check;
 552     char *strbeg;
 553     char *t;
 554     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 555     I32 ml_anch;
 556     register char *other_last = NULL;   /* other substr checked before this */
 557     char *check_at = NULL;              /* check substr found at this pos */
 558     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 559     RXi_GET_DECL(prog,progi);
 560 #ifdef DEBUGGING
 561     const char * const i_strpos = strpos;
 562 #endif
 563     GET_RE_DEBUG_FLAGS_DECL;
 564
 565     PERL_ARGS_ASSERT_RE_INTUIT_START;
 566
 567     RX_MATCH_UTF8_set(rx,utf8_target);
 568
 569     if (RX_UTF8(rx)) {
 570         PL_reg_flags |= RF_utf8;
 571     }
 572     DEBUG_EXECUTE_r(
 573         debug_start_match(rx, utf8_target, strpos, strend,
 574             sv ? "Guessing start of match in sv for"
 575                : "Guessing start of match in string for");
 576               );
 577
 578     /* CHR_DIST() would be more correct here but it makes things slow. */
 579     if (prog->minlen > strend - strpos) {
 580         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 581                               "String too short... [re_intuit_start]\n"));
 582         goto fail;
 583     }
 584
 585     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 586     PL_regeol = strend;
 587     if (utf8_target) {
 588         if (!prog->check_utf8 && prog->check_substr)
 589             to_utf8_substr(prog);
 590         check = prog->check_utf8;
 591     } else {
 592         if (!prog->check_substr && prog->check_utf8)
 593             to_byte_substr(prog);
 594         check = prog->check_substr;
 595     }
 596     if (check == &PL_sv_undef) {
 597         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 598                 "Non-utf8 string cannot match utf8 check string\n"));
 599         goto fail;
 600     }
 601     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 602         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 603                      || ( (prog->extflags & RXf_ANCH_BOL)
 604                           && !multiline ) );    /* Check after \n? */
 605
 606         if (!ml_anch) {
 607           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 608                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 609                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 610                && sv && !SvROK(sv)
 611                && (strpos != strbeg)) {
 612               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 613               goto fail;
 614           }
 615           if (prog->check_offset_min == prog->check_offset_max &&
 616               !(prog->extflags & RXf_CANY_SEEN)) {
 617             /* Substring at constant offset from beg-of-str... */
 618             I32 slen;
 619
 620             s = HOP3c(strpos, prog->check_offset_min, strend);
 621
 622             if (SvTAIL(check)) {
 623                 slen = SvCUR(check);    /* >= 1 */
 624
 625                 if ( strend - s > slen || strend - s < slen - 1
 626                      || (strend - s == slen && strend[-1] != '\n')) {
 627                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 628                     goto fail_finish;
 629                 }
 630                 /* Now should match s[0..slen-2] */
 631                 slen--;
 632                 if (slen && (*SvPVX_const(check) != *s
 633                              || (slen > 1
 634                                  && memNE(SvPVX_const(check), s, slen)))) {
 635                   report_neq:
 636                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 637                     goto fail_finish;
 638                 }
 639             }
 640             else if (*SvPVX_const(check) != *s
 641                      || ((slen = SvCUR(check)) > 1
 642                          && memNE(SvPVX_const(check), s, slen)))
 643                 goto report_neq;
 644             check_at = s;
 645             goto success_at_start;
 646           }
 647         }
 648         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 649         s = strpos;
 650         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 651         end_shift = prog->check_end_shift;
 652
 653         if (!ml_anch) {
 654             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 655                                          - (SvTAIL(check) != 0);
 656             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 657
 658             if (end_shift < eshift)
 659                 end_shift = eshift;
 660         }
 661     }
 662     else {                              /* Can match at random position */
 663         ml_anch = 0;
 664         s = strpos;
 665         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 666         end_shift = prog->check_end_shift;
 667
 668         /* end shift should be non negative here */
 669     }
 670
 671 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 672     if (end_shift < 0)
 673         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 674                    (IV)end_shift, RX_PRECOMP(prog));
 675 #endif
 676
 677   restart:
 678     /* Find a possible match in the region s..strend by looking for
 679        the "check" substring in the region corrected by start/end_shift. */
 680
 681     {
 682         I32 srch_start_shift = start_shift;
 683         I32 srch_end_shift = end_shift;
 684         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 685             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 686             srch_start_shift = strbeg - s;
 687         }
 688     DEBUG_OPTIMISE_MORE_r({
 689         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 690             (IV)prog->check_offset_min,
 691             (IV)srch_start_shift,
 692             (IV)srch_end_shift,
 693             (IV)prog->check_end_shift);
 694     });
 695
 696     if (flags & REXEC_SCREAM) {
 697         I32 p = -1;                     /* Internal iterator of scream. */
 698         I32 * const pp = data ? data->scream_pos : &p;
 699
 700         if (PL_screamfirst[BmRARE(check)] >= 0
 701             || ( BmRARE(check) == '\n'
 702                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 703                  && SvTAIL(check) ))
 704             s = screaminstr(sv, check,
 705                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 706         else
 707             goto fail_finish;
 708         /* we may be pointing at the wrong string */
 709         if (s && RXp_MATCH_COPIED(prog))
 710             s = strbeg + (s - SvPVX_const(sv));
 711         if (data)
 712             *data->scream_olds = s;
 713     }
 714     else {
 715         U8* start_point;
 716         U8* end_point;
 717         if (prog->extflags & RXf_CANY_SEEN) {
 718             start_point= (U8*)(s + srch_start_shift);
 719             end_point= (U8*)(strend - srch_end_shift);
 720         } else {
 721             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 722             end_point= HOP3(strend, -srch_end_shift, strbeg);
 723         }
 724         DEBUG_OPTIMISE_MORE_r({
 725             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 726                 (int)(end_point - start_point),
 727                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 728                 start_point);
 729         });
 730
 731         s = fbm_instr( start_point, end_point,
 732                       check, multiline ? FBMrf_MULTILINE : 0);
 733     }
 734     }
 735     /* Update the count-of-usability, remove useless subpatterns,
 736         unshift s.  */
 737
 738     DEBUG_EXECUTE_r({
 739         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 740             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 741         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 742                           (s ? "Found" : "Did not find"),
 743             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 744                 ? "anchored" : "floating"),
 745             quoted,
 746             RE_SV_TAIL(check),
 747             (s ? " at offset " : "...\n") );
 748     });
 749
 750     if (!s)
 751         goto fail_finish;
 752     /* Finish the diagnostic message */
 753     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 754
 755     /* XXX dmq: first branch is for positive lookbehind...
 756        Our check string is offset from the beginning of the pattern.
 757        So we need to do any stclass tests offset forward from that
 758        point. I think. :-(
 759      */
 760
 761
 762
 763     check_at=s;
 764
 765
 766     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 767        Start with the other substr.
 768        XXXX no SCREAM optimization yet - and a very coarse implementation
 769        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 770                 *always* match.  Probably should be marked during compile...
 771        Probably it is right to do no SCREAM here...
 772      */
 773
 774     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 775                 : (prog->float_substr && prog->anchored_substr))
 776     {
 777         /* Take into account the "other" substring. */
 778         /* XXXX May be hopelessly wrong for UTF... */
 779         if (!other_last)
 780             other_last = strpos;
 781         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 782           do_other_anchored:
 783             {
 784                 char * const last = HOP3c(s, -start_shift, strbeg);
 785                 char *last1, *last2;
 786                 char * const saved_s = s;
 787                 SV* must;
 788
 789                 t = s - prog->check_offset_max;
 790                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 791                     && (!utf8_target
 792                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 793                             && t > strpos)))
 794                     NOOP;
 795                 else
 796                     t = strpos;
 797                 t = HOP3c(t, prog->anchored_offset, strend);
 798                 if (t < other_last)     /* These positions already checked */
 799                     t = other_last;
 800                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 801                 if (last < last1)
 802                     last1 = last;
 803                 /* XXXX It is not documented what units *_offsets are in.
 804                    We assume bytes, but this is clearly wrong.
 805                    Meaning this code needs to be carefully reviewed for errors.
 806                    dmq.
 807                   */
 808
 809                 /* On end-of-str: see comment below. */
 810                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 811                 if (must == &PL_sv_undef) {
 812                     s = (char*)NULL;
 813                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 814                 }
 815                 else
 816                     s = fbm_instr(
 817                         (unsigned char*)t,
 818                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 819                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 820                         must,
 821                         multiline ? FBMrf_MULTILINE : 0
 822                     );
 823                 DEBUG_EXECUTE_r({
 824                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 825                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 826                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 827                         (s ? "Found" : "Contradicts"),
 828                         quoted, RE_SV_TAIL(must));
 829                 });
 830
 831
 832                 if (!s) {
 833                     if (last1 >= last2) {
 834                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 835                                                 ", giving up...\n"));
 836                         goto fail_finish;
 837                     }
 838                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 839                         ", trying floating at offset %ld...\n",
 840                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 841                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 842                     s = HOP3c(last, 1, strend);
 843                     goto restart;
 844                 }
 845                 else {
 846                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 847                           (long)(s - i_strpos)));
 848                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 849                     other_last = HOP3c(s, 1, strend);
 850                     s = saved_s;
 851                     if (t == strpos)
 852                         goto try_at_start;
 853                     goto try_at_offset;
 854                 }
 855             }
 856         }
 857         else {          /* Take into account the floating substring. */
 858             char *last, *last1;
 859             char * const saved_s = s;
 860             SV* must;
 861
 862             t = HOP3c(s, -start_shift, strbeg);
 863             last1 = last =
 864                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 865             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 866                 last = HOP3c(t, prog->float_max_offset, strend);
 867             s = HOP3c(t, prog->float_min_offset, strend);
 868             if (s < other_last)
 869                 s = other_last;
 870  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 871             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 872             /* fbm_instr() takes into account exact value of end-of-str
 873                if the check is SvTAIL(ed).  Since false positives are OK,
 874                and end-of-str is not later than strend we are OK. */
 875             if (must == &PL_sv_undef) {
 876                 s = (char*)NULL;
 877                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 878             }
 879             else
 880                 s = fbm_instr((unsigned char*)s,
 881                               (unsigned char*)last + SvCUR(must)
 882                                   - (SvTAIL(must)!=0),
 883                               must, multiline ? FBMrf_MULTILINE : 0);
 884             DEBUG_EXECUTE_r({
 885                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 886                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 887                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 888                     (s ? "Found" : "Contradicts"),
 889                     quoted, RE_SV_TAIL(must));
 890             });
 891             if (!s) {
 892                 if (last1 == last) {
 893                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 894                                             ", giving up...\n"));
 895                     goto fail_finish;
 896                 }
 897                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 898                     ", trying anchored starting at offset %ld...\n",
 899                     (long)(saved_s + 1 - i_strpos)));
 900                 other_last = last;
 901                 s = HOP3c(t, 1, strend);
 902                 goto restart;
 903             }
 904             else {
 905                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 906                       (long)(s - i_strpos)));
 907                 other_last = s; /* Fix this later. --Hugo */
 908                 s = saved_s;
 909                 if (t == strpos)
 910                     goto try_at_start;
 911                 goto try_at_offset;
 912             }
 913         }
 914     }
 915
 916
 917     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 918
 919     DEBUG_OPTIMISE_MORE_r(
 920         PerlIO_printf(Perl_debug_log,
 921             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 922             (IV)prog->check_offset_min,
 923             (IV)prog->check_offset_max,
 924             (IV)(s-strpos),
 925             (IV)(t-strpos),
 926             (IV)(t-s),
 927             (IV)(strend-strpos)
 928         )
 929     );
 930
 931     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 932         && (!utf8_target
 933             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 934                  && t > strpos)))
 935     {
 936         /* Fixed substring is found far enough so that the match
 937            cannot start at strpos. */
 938       try_at_offset:
 939         if (ml_anch && t[-1] != '\n') {
 940             /* Eventually fbm_*() should handle this, but often
 941                anchored_offset is not 0, so this check will not be wasted. */
 942             /* XXXX In the code below we prefer to look for "^" even in
 943                presence of anchored substrings.  And we search even
 944                beyond the found float position.  These pessimizations
 945                are historical artefacts only.  */
 946           find_anchor:
 947             while (t < strend - prog->minlen) {
 948                 if (*t == '\n') {
 949                     if (t < check_at - prog->check_offset_min) {
 950                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 951                             /* Since we moved from the found position,
 952                                we definitely contradict the found anchored
 953                                substr.  Due to the above check we do not
 954                                contradict "check" substr.
 955                                Thus we can arrive here only if check substr
 956                                is float.  Redo checking for "other"=="fixed".
 957                              */
 958                             strpos = t + 1;
 959                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 960                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 961                             goto do_other_anchored;
 962                         }
 963                         /* We don't contradict the found floating substring. */
 964                         /* XXXX Why not check for STCLASS? */
 965                         s = t + 1;
 966                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 967                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 968                         goto set_useful;
 969                     }
 970                     /* Position contradicts check-string */
 971                     /* XXXX probably better to look for check-string
 972                        than for "\n", so one should lower the limit for t? */
 973                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 974                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 975                     other_last = strpos = s = t + 1;
 976                     goto restart;
 977                 }
 978                 t++;
 979             }
 980             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 981                         PL_colors[0], PL_colors[1]));
 982             goto fail_finish;
 983         }
 984         else {
 985             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 986                         PL_colors[0], PL_colors[1]));
 987         }
 988         s = t;
 989       set_useful:
 990         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 991     }
 992     else {
 993         /* The found string does not prohibit matching at strpos,
 994            - no optimization of calling REx engine can be performed,
 995            unless it was an MBOL and we are not after MBOL,
 996            or a future STCLASS check will fail this. */
 997       try_at_start:
 998         /* Even in this situation we may use MBOL flag if strpos is offset
 999            wrt the start of the string. */
1000         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1001             && (strpos != strbeg) && strpos[-1] != '\n'
1002             /* May be due to an implicit anchor of m{.*foo}  */
1003             && !(prog->intflags & PREGf_IMPLICIT))
1004         {
1005             t = strpos;
1006             goto find_anchor;
1007         }
1008         DEBUG_EXECUTE_r( if (ml_anch)
1009             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1010                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1011         );
1012       success_at_start:
1013         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1014             && (utf8_target ? (
1015                 prog->check_utf8                /* Could be deleted already */
1016                 && --BmUSEFUL(prog->check_utf8) < 0
1017                 && (prog->check_utf8 == prog->float_utf8)
1018             ) : (
1019                 prog->check_substr              /* Could be deleted already */
1020                 && --BmUSEFUL(prog->check_substr) < 0
1021                 && (prog->check_substr == prog->float_substr)
1022             )))
1023         {
1024             /* If flags & SOMETHING - do not do it many times on the same match */
1025             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1026             /* XXX Does the destruction order has to change with utf8_target? */
1027             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1028             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1029             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1030             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1031             check = NULL;                       /* abort */
1032             s = strpos;
1033             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1034                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1035             if (prog->intflags & PREGf_IMPLICIT)
1036                 prog->extflags &= ~RXf_ANCH_MBOL;
1037             /* XXXX This is a remnant of the old implementation.  It
1038                     looks wasteful, since now INTUIT can use many
1039                     other heuristics. */
1040             prog->extflags &= ~RXf_USE_INTUIT;
1041             /* XXXX What other flags might need to be cleared in this branch? */
1042         }
1043         else
1044             s = strpos;
1045     }
1046
1047     /* Last resort... */
1048     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1049     /* trie stclasses are too expensive to use here, we are better off to
1050        leave it to regmatch itself */
1051     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1052         /* minlen == 0 is possible if regstclass is \b or \B,
1053            and the fixed substr is ''$.
1054            Since minlen is already taken into account, s+1 is before strend;
1055            accidentally, minlen >= 1 guaranties no false positives at s + 1
1056            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1057            regstclass does not come from lookahead...  */
1058         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1059            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1060         const U8* const str = (U8*)STRING(progi->regstclass);
1061         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1062                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1063                     : 1);
1064         char * endpos;
1065         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1066             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1067         else if (prog->float_substr || prog->float_utf8)
1068             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1069         else
1070             endpos= strend;
1071
1072         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1073                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1074
1075         t = s;
1076         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1077         if (!s) {
1078 #ifdef DEBUGGING
1079             const char *what = NULL;
1080 #endif
1081             if (endpos == strend) {
1082                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1083                                 "Could not match STCLASS...\n") );
1084                 goto fail;
1085             }
1086             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1087                                    "This position contradicts STCLASS...\n") );
1088             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1089                 goto fail;
1090             /* Contradict one of substrings */
1091             if (prog->anchored_substr || prog->anchored_utf8) {
1092                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1093                     DEBUG_EXECUTE_r( what = "anchored" );
1094                   hop_and_restart:
1095                     s = HOP3c(t, 1, strend);
1096                     if (s + start_shift + end_shift > strend) {
1097                         /* XXXX Should be taken into account earlier? */
1098                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1099                                                "Could not match STCLASS...\n") );
1100                         goto fail;
1101                     }
1102                     if (!check)
1103                         goto giveup;
1104                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1105                                 "Looking for %s substr starting at offset %ld...\n",
1106                                  what, (long)(s + start_shift - i_strpos)) );
1107                     goto restart;
1108                 }
1109                 /* Have both, check_string is floating */
1110                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1111                     goto retry_floating_check;
1112                 /* Recheck anchored substring, but not floating... */
1113                 s = check_at;
1114                 if (!check)
1115                     goto giveup;
1116                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1117                           "Looking for anchored substr starting at offset %ld...\n",
1118                           (long)(other_last - i_strpos)) );
1119                 goto do_other_anchored;
1120             }
1121             /* Another way we could have checked stclass at the
1122                current position only: */
1123             if (ml_anch) {
1124                 s = t = t + 1;
1125                 if (!check)
1126                     goto giveup;
1127                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1128                           "Looking for /%s^%s/m starting at offset %ld...\n",
1129                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1130                 goto try_at_offset;
1131             }
1132             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1133                 goto fail;
1134             /* Check is floating substring. */
1135           retry_floating_check:
1136             t = check_at - start_shift;
1137             DEBUG_EXECUTE_r( what = "floating" );
1138             goto hop_and_restart;
1139         }
1140         if (t != s) {
1141             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1142                         "By STCLASS: moving %ld --> %ld\n",
1143                                   (long)(t - i_strpos), (long)(s - i_strpos))
1144                    );
1145         }
1146         else {
1147             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1148                                   "Does not contradict STCLASS...\n");
1149                    );
1150         }
1151     }
1152   giveup:
1153     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1154                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1155                           PL_colors[5], (long)(s - i_strpos)) );
1156     return s;
1157
1158   fail_finish:                          /* Substring not found */
1159     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1160         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1161   fail:
1162     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1163                           PL_colors[4], PL_colors[5]));
1164     return NULL;
1165 }
1166
1167 #define DECL_TRIE_TYPE(scan) \
1168     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1169                     trie_type = (scan->flags != EXACT) \
1170                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1171                               : (utf8_target ? trie_utf8 : trie_plain)
1172
1173 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1174 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1175     switch (trie_type) {                                                    \
1176     case trie_utf8_fold:                                                    \
1177         if ( foldlen>0 ) {                                                  \
1178             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1179             foldlen -= len;                                                 \
1180             uscan += len;                                                   \
1181             len=0;                                                          \
1182         } else {                                                            \
1183             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1184             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1185             foldlen -= UNISKIP( uvc );                                      \
1186             uscan = foldbuf + UNISKIP( uvc );                               \
1187         }                                                                   \
1188         break;                                                              \
1189     case trie_latin_utf8_fold:                                              \
1190         if ( foldlen>0 ) {                                                  \
1191             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1192             foldlen -= len;                                                 \
1193             uscan += len;                                                   \
1194             len=0;                                                          \
1195         } else {                                                            \
1196             len = 1;                                                        \
1197             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1198             foldlen -= UNISKIP( uvc );                                      \
1199             uscan = foldbuf + UNISKIP( uvc );                               \
1200         }                                                                   \
1201         break;                                                              \
1202     case trie_utf8:                                                         \
1203         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1204         break;                                                              \
1205     case trie_plain:                                                        \
1206         uvc = (UV)*uc;                                                      \
1207         len = 1;                                                            \
1208     }                                                                       \
1209     if (uvc < 256) {                                                        \
1210         charid = trie->charmap[ uvc ];                                      \
1211     }                                                                       \
1212     else {                                                                  \
1213         charid = 0;                                                         \
1214         if (widecharmap) {                                                  \
1215             SV** const svpp = hv_fetch(widecharmap,                         \
1216                         (char*)&uvc, sizeof(UV), 0);                        \
1217             if (svpp)                                                       \
1218                 charid = (U16)SvIV(*svpp);                                  \
1219         }                                                                   \
1220     }                                                                       \
1221 } STMT_END
1222
1223 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1224 STMT_START {                                              \
1225     while (s <= e) {                                      \
1226         if ( (CoNd)                                       \
1227              && (ln == 1 || folder(s, pat_string, ln))    \
1228              && (!reginfo || regtry(reginfo, &s)) )       \
1229             goto got_it;                                  \
1230         s++;                                              \
1231     }                                                     \
1232 } STMT_END
1233
1234 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1235 STMT_START {                                          \
1236     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1237         CoDe                                          \
1238         s += uskip;                                   \
1239     }                                                 \
1240 } STMT_END
1241
1242 #define REXEC_FBC_SCAN(CoDe)                          \
1243 STMT_START {                                          \
1244     while (s < strend) {                              \
1245         CoDe                                          \
1246         s++;                                          \
1247     }                                                 \
1248 } STMT_END
1249
1250 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1251 REXEC_FBC_UTF8_SCAN(                                  \
1252     if (CoNd) {                                       \
1253         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1254             goto got_it;                              \
1255         else                                          \
1256             tmp = doevery;                            \
1257     }                                                 \
1258     else                                              \
1259         tmp = 1;                                      \
1260 )
1261
1262 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1263 REXEC_FBC_SCAN(                                       \
1264     if (CoNd) {                                       \
1265         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1266             goto got_it;                              \
1267         else                                          \
1268             tmp = doevery;                            \
1269     }                                                 \
1270     else                                              \
1271         tmp = 1;                                      \
1272 )
1273
1274 #define REXEC_FBC_TRYIT               \
1275 if ((!reginfo || regtry(reginfo, &s))) \
1276     goto got_it
1277
1278 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1279     if (utf8_target) {                                             \
1280         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1281     }                                                          \
1282     else {                                                     \
1283         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1284     }
1285
1286 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1287     if (utf8_target) {                                             \
1288         UtFpReLoAd;                                            \
1289         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1290     }                                                          \
1291     else {                                                     \
1292         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1293     }
1294
1295 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1296     PL_reg_flags |= RF_tainted;                                \
1297     if (utf8_target) {                                             \
1298         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1299     }                                                          \
1300     else {                                                     \
1301         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1302     }
1303
1304 #define DUMP_EXEC_POS(li,s,doutf8) \
1305     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1306
1307
1308 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1309         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1310         tmp = TEST_NON_UTF8(tmp);                                              \
1311         REXEC_FBC_UTF8_SCAN(                                                   \
1312             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1313                 tmp = !tmp;                                                    \
1314                 IF_SUCCESS;                                                    \
1315             }                                                                  \
1316             else {                                                             \
1317                 IF_FAIL;                                                       \
1318             }                                                                  \
1319         );                                                                     \
1320
1321 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1322         if (s == PL_bostr) {                                                   \
1323             tmp = '\n';                                                        \
1324         }                                                                      \
1325         else {                                                                 \
1326             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1327             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1328         }                                                                      \
1329         tmp = TeSt1_UtF8;                                                      \
1330         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1331         REXEC_FBC_UTF8_SCAN(                                                   \
1332             if (tmp == ! (TeSt2_UtF8)) { \
1333                 tmp = !tmp;                                                    \
1334                 IF_SUCCESS;                                                    \
1335             }                                                                  \
1336             else {                                                             \
1337                 IF_FAIL;                                                       \
1338             }                                                                  \
1339         );                                                                     \
1340
1341 /* The only difference between the BOUND and NBOUND cases is that
1342  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1343  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1344  * with the other one being empty */
1345 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1346     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1347
1348 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1349     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1350
1351 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1352     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1353
1354 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1355     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1356
1357
1358 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1359  * be passed in completely with the variable name being tested, which isn't
1360  * such a clean interface, but this is easier to read than it was before.  We
1361  * are looking for the boundary (or non-boundary between a word and non-word
1362  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1363  * must be different.  Find the "wordness" of the character just prior to this
1364  * one, and compare it with the wordness of this one.  If they differ, we have
1365  * a boundary.  At the beginning of the string, pretend that the previous
1366  * character was a new-line */
1367 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1368     if (utf8_target) {                                                         \
1369                 UTF8_CODE \
1370     }                                                                          \
1371     else {  /* Not utf8 */                                                     \
1372         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1373         tmp = TEST_NON_UTF8(tmp);                                              \
1374         REXEC_FBC_SCAN(                                                        \
1375             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1376                 tmp = !tmp;                                                    \
1377                 IF_SUCCESS;                                                    \
1378             }                                                                  \
1379             else {                                                             \
1380                 IF_FAIL;                                                       \
1381             }                                                                  \
1382         );                                                                     \
1383     }                                                                          \
1384     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1385         goto got_it;
1386
1387 /* We know what class REx starts with.  Try to find this position... */
1388 /* if reginfo is NULL, its a dryrun */
1389 /* annoyingly all the vars in this routine have different names from their counterparts
1390    in regmatch. /grrr */
1391
1392 STATIC char *
1393 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1394     const char *strend, regmatch_info *reginfo)
1395 {
1396         dVAR;
1397         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1398         char *pat_string;   /* The pattern's exactish string */
1399         char *pat_end;      /* ptr to end char of pat_string */
1400         re_fold_t folder;       /* Function for computing non-utf8 folds */
1401         const U8 *fold_array;   /* array for folding ords < 256 */
1402         STRLEN ln;
1403         STRLEN lnc;
1404         register STRLEN uskip;
1405         U8 c1;
1406         U8 c2;
1407         char *e;
1408         register I32 tmp = 1;   /* Scratch variable? */
1409         register const bool utf8_target = PL_reg_match_utf8;
1410         UV utf8_fold_flags = 0;
1411         RXi_GET_DECL(prog,progi);
1412
1413         PERL_ARGS_ASSERT_FIND_BYCLASS;
1414
1415         /* We know what class it must start with. */
1416         switch (OP(c)) {
1417         case ANYOFV:
1418         case ANYOF:
1419             if (utf8_target || OP(c) == ANYOFV) {
1420                 STRLEN inclasslen = strend - s;
1421                 REXEC_FBC_UTF8_CLASS_SCAN(
1422                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1423             }
1424             else {
1425                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1426             }
1427             break;
1428         case CANY:
1429             REXEC_FBC_SCAN(
1430                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1431                     goto got_it;
1432                 else
1433                     tmp = doevery;
1434             );
1435             break;
1436
1437         case EXACTFA:
1438             if (UTF_PATTERN || utf8_target) {
1439                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1440                 goto do_exactf_utf8;
1441             }
1442             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1443             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1444             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1445
1446         case EXACTFU:
1447             if (UTF_PATTERN || utf8_target) {
1448                 utf8_fold_flags = 0;
1449                 goto do_exactf_utf8;
1450             }
1451
1452             /* Any 'ss' in the pattern should have been replaced by regcomp,
1453              * so we don't have to worry here about this single special case
1454              * in the Latin1 range */
1455             fold_array = PL_fold_latin1;
1456             folder = foldEQ_latin1;
1457             goto do_exactf_non_utf8;
1458
1459         case EXACTF:
1460             if (UTF_PATTERN || utf8_target) {
1461                 utf8_fold_flags = 0;
1462                 goto do_exactf_utf8;
1463             }
1464             fold_array = PL_fold;
1465             folder = foldEQ;
1466             goto do_exactf_non_utf8;
1467
1468         case EXACTFL:
1469             if (UTF_PATTERN || utf8_target) {
1470                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1471                 goto do_exactf_utf8;
1472             }
1473             fold_array = PL_fold_locale;
1474             folder = foldEQ_locale;
1475
1476             /* FALL THROUGH */
1477
1478         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1479
1480             /* The idea in the non-utf8 EXACTF* cases is to first find the
1481              * first character of the EXACTF* node and then, if necessary,
1482              * case-insensitively compare the full text of the node.  c1 is the
1483              * first character.  c2 is its fold.  This logic will not work for
1484              * Unicode semantics and the german sharp ss, which hence should
1485              * not be compiled into a node that gets here. */
1486             pat_string = STRING(c);
1487             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1488
1489             e = HOP3c(strend, -((I32)ln), s);
1490
1491             if (!reginfo && e < s) {
1492                 e = s;                  /* Due to minlen logic of intuit() */
1493             }
1494
1495             c1 = *pat_string;
1496             c2 = fold_array[c1];
1497             if (c1 == c2) { /* If char and fold are the same */
1498                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1499             }
1500             else {
1501                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1502             }
1503             break;
1504
1505         do_exactf_utf8:
1506
1507             /* If one of the operands is in utf8, we can't use the simpler
1508              * folding above, due to the fact that many different characters
1509              * can have the same fold, or portion of a fold, or different-
1510              * length fold */
1511             pat_string = STRING(c);
1512             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1513             pat_end = pat_string + ln;
1514             lnc = (UTF_PATTERN) /* length to match in characters */
1515                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1516                     : ln;
1517
1518             e = HOP3c(strend, -((I32)lnc), s);
1519
1520             if (!reginfo && e < s) {
1521                 e = s;                  /* Due to minlen logic of intuit() */
1522             }
1523
1524             while (s <= e) {
1525                 char *my_strend= (char *)strend;
1526                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1527                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1528                     && (!reginfo || regtry(reginfo, &s)) )
1529                 {
1530                     goto got_it;
1531                 }
1532                 s += UTF8SKIP(s);
1533             }
1534             break;
1535         case BOUNDL:
1536             PL_reg_flags |= RF_tainted;
1537             FBC_BOUND(isALNUM_LC,
1538                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1539                       isALNUM_LC_utf8((U8*)s));
1540             break;
1541         case NBOUNDL:
1542             PL_reg_flags |= RF_tainted;
1543             FBC_NBOUND(isALNUM_LC,
1544                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1545                        isALNUM_LC_utf8((U8*)s));
1546             break;
1547         case BOUND:
1548             FBC_BOUND(isWORDCHAR,
1549                       isALNUM_uni(tmp),
1550                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1551             break;
1552         case BOUNDA:
1553             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1554                              isWORDCHAR_A(tmp),
1555                              isWORDCHAR_A((U8*)s));
1556             break;
1557         case NBOUND:
1558             FBC_NBOUND(isWORDCHAR,
1559                        isALNUM_uni(tmp),
1560                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1561             break;
1562         case NBOUNDA:
1563             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1564                               isWORDCHAR_A(tmp),
1565                               isWORDCHAR_A((U8*)s));
1566             break;
1567         case BOUNDU:
1568             FBC_BOUND(isWORDCHAR_L1,
1569                       isALNUM_uni(tmp),
1570                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1571             break;
1572         case NBOUNDU:
1573             FBC_NBOUND(isWORDCHAR_L1,
1574                        isALNUM_uni(tmp),
1575                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1576             break;
1577         case ALNUML:
1578             REXEC_FBC_CSCAN_TAINT(
1579                 isALNUM_LC_utf8((U8*)s),
1580                 isALNUM_LC(*s)
1581             );
1582             break;
1583         case ALNUMU:
1584             REXEC_FBC_CSCAN_PRELOAD(
1585                 LOAD_UTF8_CHARCLASS_ALNUM(),
1586                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1587                 isWORDCHAR_L1((U8) *s)
1588             );
1589             break;
1590         case ALNUM:
1591             REXEC_FBC_CSCAN_PRELOAD(
1592                 LOAD_UTF8_CHARCLASS_ALNUM(),
1593                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1594                 isWORDCHAR((U8) *s)
1595             );
1596             break;
1597         case ALNUMA:
1598             /* Don't need to worry about utf8, as it can match only a single
1599              * byte invariant character */
1600             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1601             break;
1602         case NALNUMU:
1603             REXEC_FBC_CSCAN_PRELOAD(
1604                 LOAD_UTF8_CHARCLASS_ALNUM(),
1605                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1606                 ! isWORDCHAR_L1((U8) *s)
1607             );
1608             break;
1609         case NALNUM:
1610             REXEC_FBC_CSCAN_PRELOAD(
1611                 LOAD_UTF8_CHARCLASS_ALNUM(),
1612                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1613                 ! isALNUM(*s)
1614             );
1615             break;
1616         case NALNUMA:
1617             REXEC_FBC_CSCAN(
1618                 !isWORDCHAR_A(*s),
1619                 !isWORDCHAR_A(*s)
1620             );
1621             break;
1622         case NALNUML:
1623             REXEC_FBC_CSCAN_TAINT(
1624                 !isALNUM_LC_utf8((U8*)s),
1625                 !isALNUM_LC(*s)
1626             );
1627             break;
1628         case SPACEU:
1629             REXEC_FBC_CSCAN_PRELOAD(
1630                 LOAD_UTF8_CHARCLASS_SPACE(),
1631                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1632                 isSPACE_L1((U8) *s)
1633             );
1634             break;
1635         case SPACE:
1636             REXEC_FBC_CSCAN_PRELOAD(
1637                 LOAD_UTF8_CHARCLASS_SPACE(),
1638                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1639                 isSPACE((U8) *s)
1640             );
1641             break;
1642         case SPACEA:
1643             /* Don't need to worry about utf8, as it can match only a single
1644              * byte invariant character */
1645             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1646             break;
1647         case SPACEL:
1648             REXEC_FBC_CSCAN_TAINT(
1649                 isSPACE_LC_utf8((U8*)s),
1650                 isSPACE_LC(*s)
1651             );
1652             break;
1653         case NSPACEU:
1654             REXEC_FBC_CSCAN_PRELOAD(
1655                 LOAD_UTF8_CHARCLASS_SPACE(),
1656                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1657                 ! isSPACE_L1((U8) *s)
1658             );
1659             break;
1660         case NSPACE:
1661             REXEC_FBC_CSCAN_PRELOAD(
1662                 LOAD_UTF8_CHARCLASS_SPACE(),
1663                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1664                 ! isSPACE((U8) *s)
1665             );
1666             break;
1667         case NSPACEA:
1668             REXEC_FBC_CSCAN(
1669                 !isSPACE_A(*s),
1670                 !isSPACE_A(*s)
1671             );
1672             break;
1673         case NSPACEL:
1674             REXEC_FBC_CSCAN_TAINT(
1675                 !isSPACE_LC_utf8((U8*)s),
1676                 !isSPACE_LC(*s)
1677             );
1678             break;
1679         case DIGIT:
1680             REXEC_FBC_CSCAN_PRELOAD(
1681                 LOAD_UTF8_CHARCLASS_DIGIT(),
1682                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1683                 isDIGIT(*s)
1684             );
1685             break;
1686         case DIGITA:
1687             /* Don't need to worry about utf8, as it can match only a single
1688              * byte invariant character */
1689             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1690             break;
1691         case DIGITL:
1692             REXEC_FBC_CSCAN_TAINT(
1693                 isDIGIT_LC_utf8((U8*)s),
1694                 isDIGIT_LC(*s)
1695             );
1696             break;
1697         case NDIGIT:
1698             REXEC_FBC_CSCAN_PRELOAD(
1699                 LOAD_UTF8_CHARCLASS_DIGIT(),
1700                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1701                 !isDIGIT(*s)
1702             );
1703             break;
1704         case NDIGITA:
1705             REXEC_FBC_CSCAN(
1706                 !isDIGIT_A(*s),
1707                 !isDIGIT_A(*s)
1708             );
1709             break;
1710         case NDIGITL:
1711             REXEC_FBC_CSCAN_TAINT(
1712                 !isDIGIT_LC_utf8((U8*)s),
1713                 !isDIGIT_LC(*s)
1714             );
1715             break;
1716         case LNBREAK:
1717             REXEC_FBC_CSCAN(
1718                 is_LNBREAK_utf8(s),
1719                 is_LNBREAK_latin1(s)
1720             );
1721             break;
1722         case VERTWS:
1723             REXEC_FBC_CSCAN(
1724                 is_VERTWS_utf8(s),
1725                 is_VERTWS_latin1(s)
1726             );
1727             break;
1728         case NVERTWS:
1729             REXEC_FBC_CSCAN(
1730                 !is_VERTWS_utf8(s),
1731                 !is_VERTWS_latin1(s)
1732             );
1733             break;
1734         case HORIZWS:
1735             REXEC_FBC_CSCAN(
1736                 is_HORIZWS_utf8(s),
1737                 is_HORIZWS_latin1(s)
1738             );
1739             break;
1740         case NHORIZWS:
1741             REXEC_FBC_CSCAN(
1742                 !is_HORIZWS_utf8(s),
1743                 !is_HORIZWS_latin1(s)
1744             );
1745             break;
1746         case AHOCORASICKC:
1747         case AHOCORASICK:
1748             {
1749                 DECL_TRIE_TYPE(c);
1750                 /* what trie are we using right now */
1751                 reg_ac_data *aho
1752                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1753                 reg_trie_data *trie
1754                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1755                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1756
1757                 const char *last_start = strend - trie->minlen;
1758 #ifdef DEBUGGING
1759                 const char *real_start = s;
1760 #endif
1761                 STRLEN maxlen = trie->maxlen;
1762                 SV *sv_points;
1763                 U8 **points; /* map of where we were in the input string
1764                                 when reading a given char. For ASCII this
1765                                 is unnecessary overhead as the relationship
1766                                 is always 1:1, but for Unicode, especially
1767                                 case folded Unicode this is not true. */
1768                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1769                 U8 *bitmap=NULL;
1770
1771
1772                 GET_RE_DEBUG_FLAGS_DECL;
1773
1774                 /* We can't just allocate points here. We need to wrap it in
1775                  * an SV so it gets freed properly if there is a croak while
1776                  * running the match */
1777                 ENTER;
1778                 SAVETMPS;
1779                 sv_points=newSV(maxlen * sizeof(U8 *));
1780                 SvCUR_set(sv_points,
1781                     maxlen * sizeof(U8 *));
1782                 SvPOK_on(sv_points);
1783                 sv_2mortal(sv_points);
1784                 points=(U8**)SvPV_nolen(sv_points );
1785                 if ( trie_type != trie_utf8_fold
1786                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1787                 {
1788                     if (trie->bitmap)
1789                         bitmap=(U8*)trie->bitmap;
1790                     else
1791                         bitmap=(U8*)ANYOF_BITMAP(c);
1792                 }
1793                 /* this is the Aho-Corasick algorithm modified a touch
1794                    to include special handling for long "unknown char"
1795                    sequences. The basic idea being that we use AC as long
1796                    as we are dealing with a possible matching char, when
1797                    we encounter an unknown char (and we have not encountered
1798                    an accepting state) we scan forward until we find a legal
1799                    starting char.
1800                    AC matching is basically that of trie matching, except
1801                    that when we encounter a failing transition, we fall back
1802                    to the current states "fail state", and try the current char
1803                    again, a process we repeat until we reach the root state,
1804                    state 1, or a legal transition. If we fail on the root state
1805                    then we can either terminate if we have reached an accepting
1806                    state previously, or restart the entire process from the beginning
1807                    if we have not.
1808
1809                  */
1810                 while (s <= last_start) {
1811                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1812                     U8 *uc = (U8*)s;
1813                     U16 charid = 0;
1814                     U32 base = 1;
1815                     U32 state = 1;
1816                     UV uvc = 0;
1817                     STRLEN len = 0;
1818                     STRLEN foldlen = 0;
1819                     U8 *uscan = (U8*)NULL;
1820                     U8 *leftmost = NULL;
1821 #ifdef DEBUGGING
1822                     U32 accepted_word= 0;
1823 #endif
1824                     U32 pointpos = 0;
1825
1826                     while ( state && uc <= (U8*)strend ) {
1827                         int failed=0;
1828                         U32 word = aho->states[ state ].wordnum;
1829
1830                         if( state==1 ) {
1831                             if ( bitmap ) {
1832                                 DEBUG_TRIE_EXECUTE_r(
1833                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1834                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1835                                             (char *)uc, utf8_target );
1836                                         PerlIO_printf( Perl_debug_log,
1837                                             " Scanning for legal start char...\n");
1838                                     }
1839                                 );
1840                                 if (utf8_target) {
1841                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1842                                         uc += UTF8SKIP(uc);
1843                                     }
1844                                 } else {
1845                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1846                                         uc++;
1847                                     }
1848                                 }
1849                                 s= (char *)uc;
1850                             }
1851                             if (uc >(U8*)last_start) break;
1852                         }
1853
1854                         if ( word ) {
1855                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1856                             if (!leftmost || lpos < leftmost) {
1857                                 DEBUG_r(accepted_word=word);
1858                                 leftmost= lpos;
1859                             }
1860                             if (base==0) break;
1861
1862                         }
1863                         points[pointpos++ % maxlen]= uc;
1864                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1865                                              uscan, len, uvc, charid, foldlen,
1866                                              foldbuf, uniflags);
1867                         DEBUG_TRIE_EXECUTE_r({
1868                             dump_exec_pos( (char *)uc, c, strend, real_start,
1869                                 s,   utf8_target );
1870                             PerlIO_printf(Perl_debug_log,
1871                                 " Charid:%3u CP:%4"UVxf" ",
1872                                  charid, uvc);
1873                         });
1874
1875                         do {
1876 #ifdef DEBUGGING
1877                             word = aho->states[ state ].wordnum;
1878 #endif
1879                             base = aho->states[ state ].trans.base;
1880
1881                             DEBUG_TRIE_EXECUTE_r({
1882                                 if (failed)
1883                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1884                                         s,   utf8_target );
1885                                 PerlIO_printf( Perl_debug_log,
1886                                     "%sState: %4"UVxf", word=%"UVxf,
1887                                     failed ? " Fail transition to " : "",
1888                                     (UV)state, (UV)word);
1889                             });
1890                             if ( base ) {
1891                                 U32 tmp;
1892                                 I32 offset;
1893                                 if (charid &&
1894                                      ( ((offset = base + charid
1895                                         - 1 - trie->uniquecharcount)) >= 0)
1896                                      && ((U32)offset < trie->lasttrans)
1897                                      && trie->trans[offset].check == state
1898                                      && (tmp=trie->trans[offset].next))
1899                                 {
1900                                     DEBUG_TRIE_EXECUTE_r(
1901                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1902                                     state = tmp;
1903                                     break;
1904                                 }
1905                                 else {
1906                                     DEBUG_TRIE_EXECUTE_r(
1907                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1908                                     failed = 1;
1909                                     state = aho->fail[state];
1910                                 }
1911                             }
1912                             else {
1913                                 /* we must be accepting here */
1914                                 DEBUG_TRIE_EXECUTE_r(
1915                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1916                                 failed = 1;
1917                                 break;
1918                             }
1919                         } while(state);
1920                         uc += len;
1921                         if (failed) {
1922                             if (leftmost)
1923                                 break;
1924                             if (!state) state = 1;
1925                         }
1926                     }
1927                     if ( aho->states[ state ].wordnum ) {
1928                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1929                         if (!leftmost || lpos < leftmost) {
1930                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1931                             leftmost = lpos;
1932                         }
1933                     }
1934                     if (leftmost) {
1935                         s = (char*)leftmost;
1936                         DEBUG_TRIE_EXECUTE_r({
1937                             PerlIO_printf(
1938                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1939                                 (UV)accepted_word, (IV)(s - real_start)
1940                             );
1941                         });
1942                         if (!reginfo || regtry(reginfo, &s)) {
1943                             FREETMPS;
1944                             LEAVE;
1945                             goto got_it;
1946                         }
1947                         s = HOPc(s,1);
1948                         DEBUG_TRIE_EXECUTE_r({
1949                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1950                         });
1951                     } else {
1952                         DEBUG_TRIE_EXECUTE_r(
1953                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1954                         break;
1955                     }
1956                 }
1957                 FREETMPS;
1958                 LEAVE;
1959             }
1960             break;
1961         default:
1962             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1963             break;
1964         }
1965         return 0;
1966       got_it:
1967         return s;
1968 }
1969
1970
1971 /*
1972  - regexec_flags - match a regexp against a string
1973  */
1974 I32
1975 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1976               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1977 /* strend: pointer to null at end of string */
1978 /* strbeg: real beginning of string */
1979 /* minend: end of match must be >=minend after stringarg. */
1980 /* data: May be used for some additional optimizations.
1981          Currently its only used, with a U32 cast, for transmitting
1982          the ganch offset when doing a /g match. This will change */
1983 /* nosave: For optimizations. */
1984 {
1985     dVAR;
1986     struct regexp *const prog = (struct regexp *)SvANY(rx);
1987     /*register*/ char *s;
1988     register regnode *c;
1989     /*register*/ char *startpos = stringarg;
1990     I32 minlen;         /* must match at least this many chars */
1991     I32 dontbother = 0; /* how many characters not to try at end */
1992     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1993     I32 scream_pos = -1;                /* Internal iterator of scream. */
1994     char *scream_olds = NULL;
1995     const bool utf8_target = cBOOL(DO_UTF8(sv));
1996     I32 multiline;
1997     RXi_GET_DECL(prog,progi);
1998     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1999     regexp_paren_pair *swap = NULL;
2000     GET_RE_DEBUG_FLAGS_DECL;
2001
2002     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2003     PERL_UNUSED_ARG(data);
2004
2005     /* Be paranoid... */
2006     if (prog == NULL || startpos == NULL) {
2007         Perl_croak(aTHX_ "NULL regexp parameter");
2008         return 0;
2009     }
2010
2011     multiline = prog->extflags & RXf_PMf_MULTILINE;
2012     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2013
2014     RX_MATCH_UTF8_set(rx, utf8_target);
2015     DEBUG_EXECUTE_r(
2016         debug_start_match(rx, utf8_target, startpos, strend,
2017         "Matching");
2018     );
2019
2020     minlen = prog->minlen;
2021
2022     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2023         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2024                               "String too short [regexec_flags]...\n"));
2025         goto phooey;
2026     }
2027
2028
2029     /* Check validity of program. */
2030     if (UCHARAT(progi->program) != REG_MAGIC) {
2031         Perl_croak(aTHX_ "corrupted regexp program");
2032     }
2033
2034     PL_reg_flags = 0;
2035     PL_reg_eval_set = 0;
2036     PL_reg_maxiter = 0;
2037
2038     if (RX_UTF8(rx))
2039         PL_reg_flags |= RF_utf8;
2040
2041     /* Mark beginning of line for ^ and lookbehind. */
2042     reginfo.bol = startpos; /* XXX not used ??? */
2043     PL_bostr  = strbeg;
2044     reginfo.sv = sv;
2045
2046     /* Mark end of line for $ (and such) */
2047     PL_regeol = strend;
2048
2049     /* see how far we have to get to not match where we matched before */
2050     reginfo.till = startpos+minend;
2051
2052     /* If there is a "must appear" string, look for it. */
2053     s = startpos;
2054
2055     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2056         MAGIC *mg;
2057         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2058             reginfo.ganch = startpos + prog->gofs;
2059             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2060               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2061         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2062                   && SvMAGIC(sv)
2063                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2064                   && mg->mg_len >= 0) {
2065             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2066             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2067                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2068
2069             if (prog->extflags & RXf_ANCH_GPOS) {
2070                 if (s > reginfo.ganch)
2071                     goto phooey;
2072                 s = reginfo.ganch - prog->gofs;
2073                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2074                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2075                 if (s < strbeg)
2076                     goto phooey;
2077             }
2078         }
2079         else if (data) {
2080             reginfo.ganch = strbeg + PTR2UV(data);
2081             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2082                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2083
2084         } else {                                /* pos() not defined */
2085             reginfo.ganch = strbeg;
2086             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2087                  "GPOS: reginfo.ganch = strbeg\n"));
2088         }
2089     }
2090     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2091         /* We have to be careful. If the previous successful match
2092            was from this regex we don't want a subsequent partially
2093            successful match to clobber the old results.
2094            So when we detect this possibility we add a swap buffer
2095            to the re, and switch the buffer each match. If we fail
2096            we switch it back, otherwise we leave it swapped.
2097         */
2098         swap = prog->offs;
2099         /* do we need a save destructor here for eval dies? */
2100         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2101     }
2102     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2103         re_scream_pos_data d;
2104
2105         d.scream_olds = &scream_olds;
2106         d.scream_pos = &scream_pos;
2107         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2108         if (!s) {
2109             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2110             goto phooey;        /* not present */
2111         }
2112     }
2113
2114
2115
2116     /* Simplest case:  anchored match need be tried only once. */
2117     /*  [unless only anchor is BOL and multiline is set] */
2118     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2119         if (s == startpos && regtry(&reginfo, &startpos))
2120             goto got_it;
2121         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2122                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2123         {
2124             char *end;
2125
2126             if (minlen)
2127                 dontbother = minlen - 1;
2128             end = HOP3c(strend, -dontbother, strbeg) - 1;
2129             /* for multiline we only have to try after newlines */
2130             if (prog->check_substr || prog->check_utf8) {
2131                 /* because of the goto we can not easily reuse the macros for bifurcating the
2132                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2133                 if (utf8_target) {
2134                     if (s == startpos)
2135                         goto after_try_utf8;
2136                     while (1) {
2137                         if (regtry(&reginfo, &s)) {
2138                             goto got_it;
2139                         }
2140                       after_try_utf8:
2141                         if (s > end) {
2142                             goto phooey;
2143                         }
2144                         if (prog->extflags & RXf_USE_INTUIT) {
2145                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2146                             if (!s) {
2147                                 goto phooey;
2148                             }
2149                         }
2150                         else {
2151                             s += UTF8SKIP(s);
2152                         }
2153                     }
2154                 } /* end search for check string in unicode */
2155                 else {
2156                     if (s == startpos) {
2157                         goto after_try_latin;
2158                     }
2159                     while (1) {
2160                         if (regtry(&reginfo, &s)) {
2161                             goto got_it;
2162                         }
2163                       after_try_latin:
2164                         if (s > end) {
2165                             goto phooey;
2166                         }
2167                         if (prog->extflags & RXf_USE_INTUIT) {
2168                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2169                             if (!s) {
2170                                 goto phooey;
2171                             }
2172                         }
2173                         else {
2174                             s++;
2175                         }
2176                     }
2177                 } /* end search for check string in latin*/
2178             } /* end search for check string */
2179             else { /* search for newline */
2180                 if (s > startpos) {
2181                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2182                     s--;
2183                 }
2184                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2185                 while (s < end) {
2186                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2187                         if (regtry(&reginfo, &s))
2188                             goto got_it;
2189                     }
2190                 }
2191             } /* end search for newline */
2192         } /* end anchored/multiline check string search */
2193         goto phooey;
2194     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2195     {
2196         /* the warning about reginfo.ganch being used without initialization
2197            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2198            and we only enter this block when the same bit is set. */
2199         char *tmp_s = reginfo.ganch - prog->gofs;
2200
2201         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2202             goto got_it;
2203         goto phooey;
2204     }
2205
2206     /* Messy cases:  unanchored match. */
2207     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2208         /* we have /x+whatever/ */
2209         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2210         char ch;
2211 #ifdef DEBUGGING
2212         int did_match = 0;
2213 #endif
2214         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2215             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2216         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2217
2218         if (utf8_target) {
2219             REXEC_FBC_SCAN(
2220                 if (*s == ch) {
2221                     DEBUG_EXECUTE_r( did_match = 1 );
2222                     if (regtry(&reginfo, &s)) goto got_it;
2223                     s += UTF8SKIP(s);
2224                     while (s < strend && *s == ch)
2225                         s += UTF8SKIP(s);
2226                 }
2227             );
2228         }
2229         else {
2230             REXEC_FBC_SCAN(
2231                 if (*s == ch) {
2232                     DEBUG_EXECUTE_r( did_match = 1 );
2233                     if (regtry(&reginfo, &s)) goto got_it;
2234                     s++;
2235                     while (s < strend && *s == ch)
2236                         s++;
2237                 }
2238             );
2239         }
2240         DEBUG_EXECUTE_r(if (!did_match)
2241                 PerlIO_printf(Perl_debug_log,
2242                                   "Did not find anchored character...\n")
2243                );
2244     }
2245     else if (prog->anchored_substr != NULL
2246               || prog->anchored_utf8 != NULL
2247               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2248                   && prog->float_max_offset < strend - s)) {
2249         SV *must;
2250         I32 back_max;
2251         I32 back_min;
2252         char *last;
2253         char *last1;            /* Last position checked before */
2254 #ifdef DEBUGGING
2255         int did_match = 0;
2256 #endif
2257         if (prog->anchored_substr || prog->anchored_utf8) {
2258             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2259                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2260             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2261             back_max = back_min = prog->anchored_offset;
2262         } else {
2263             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2264                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2265             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2266             back_max = prog->float_max_offset;
2267             back_min = prog->float_min_offset;
2268         }
2269
2270
2271         if (must == &PL_sv_undef)
2272             /* could not downgrade utf8 check substring, so must fail */
2273             goto phooey;
2274
2275         if (back_min<0) {
2276             last = strend;
2277         } else {
2278             last = HOP3c(strend,        /* Cannot start after this */
2279                   -(I32)(CHR_SVLEN(must)
2280                          - (SvTAIL(must) != 0) + back_min), strbeg);
2281         }
2282         if (s > PL_bostr)
2283             last1 = HOPc(s, -1);
2284         else
2285             last1 = s - 1;      /* bogus */
2286
2287         /* XXXX check_substr already used to find "s", can optimize if
2288            check_substr==must. */
2289         scream_pos = -1;
2290         dontbother = end_shift;
2291         strend = HOPc(strend, -dontbother);
2292         while ( (s <= last) &&
2293                 ((flags & REXEC_SCREAM)
2294                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2295                                     end_shift, &scream_pos, 0))
2296                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2297                                   (unsigned char*)strend, must,
2298                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2299             /* we may be pointing at the wrong string */
2300             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2301                 s = strbeg + (s - SvPVX_const(sv));
2302             DEBUG_EXECUTE_r( did_match = 1 );
2303             if (HOPc(s, -back_max) > last1) {
2304                 last1 = HOPc(s, -back_min);
2305                 s = HOPc(s, -back_max);
2306             }
2307             else {
2308                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2309
2310                 last1 = HOPc(s, -back_min);
2311                 s = t;
2312             }
2313             if (utf8_target) {
2314                 while (s <= last1) {
2315                     if (regtry(&reginfo, &s))
2316                         goto got_it;
2317                     s += UTF8SKIP(s);
2318                 }
2319             }
2320             else {
2321                 while (s <= last1) {
2322                     if (regtry(&reginfo, &s))
2323                         goto got_it;
2324                     s++;
2325                 }
2326             }
2327         }
2328         DEBUG_EXECUTE_r(if (!did_match) {
2329             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2330                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2331             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2332                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2333                                ? "anchored" : "floating"),
2334                 quoted, RE_SV_TAIL(must));
2335         });
2336         goto phooey;
2337     }
2338     else if ( (c = progi->regstclass) ) {
2339         if (minlen) {
2340             const OPCODE op = OP(progi->regstclass);
2341             /* don't bother with what can't match */
2342             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2343                 strend = HOPc(strend, -(minlen - 1));
2344         }
2345         DEBUG_EXECUTE_r({
2346             SV * const prop = sv_newmortal();
2347             regprop(prog, prop, c);
2348             {
2349                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2350                     s,strend-s,60);
2351                 PerlIO_printf(Perl_debug_log,
2352                     "Matching stclass %.*s against %s (%d bytes)\n",
2353                     (int)SvCUR(prop), SvPVX_const(prop),
2354                      quoted, (int)(strend - s));
2355             }
2356         });
2357         if (find_byclass(prog, c, s, strend, &reginfo))
2358             goto got_it;
2359         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2360     }
2361     else {
2362         dontbother = 0;
2363         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2364             /* Trim the end. */
2365             char *last;
2366             SV* float_real;
2367
2368             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2369                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2370             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2371
2372             if (flags & REXEC_SCREAM) {
2373                 last = screaminstr(sv, float_real, s - strbeg,
2374                                    end_shift, &scream_pos, 1); /* last one */
2375                 if (!last)
2376                     last = scream_olds; /* Only one occurrence. */
2377                 /* we may be pointing at the wrong string */
2378                 else if (RXp_MATCH_COPIED(prog))
2379                     s = strbeg + (s - SvPVX_const(sv));
2380             }
2381             else {
2382                 STRLEN len;
2383                 const char * const little = SvPV_const(float_real, len);
2384
2385                 if (SvTAIL(float_real)) {
2386                     if (memEQ(strend - len + 1, little, len - 1))
2387                         last = strend - len + 1;
2388                     else if (!multiline)
2389                         last = memEQ(strend - len, little, len)
2390                             ? strend - len : NULL;
2391                     else
2392                         goto find_last;
2393                 } else {
2394                   find_last:
2395                     if (len)
2396                         last = rninstr(s, strend, little, little + len);
2397                     else
2398                         last = strend;  /* matching "$" */
2399                 }
2400             }
2401             if (last == NULL) {
2402                 DEBUG_EXECUTE_r(
2403                     PerlIO_printf(Perl_debug_log,
2404                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2405                         PL_colors[4], PL_colors[5]));
2406                 goto phooey; /* Should not happen! */
2407             }
2408             dontbother = strend - last + prog->float_min_offset;
2409         }
2410         if (minlen && (dontbother < minlen))
2411             dontbother = minlen - 1;
2412         strend -= dontbother;              /* this one's always in bytes! */
2413         /* We don't know much -- general case. */
2414         if (utf8_target) {
2415             for (;;) {
2416                 if (regtry(&reginfo, &s))
2417                     goto got_it;
2418                 if (s >= strend)
2419                     break;
2420                 s += UTF8SKIP(s);
2421             };
2422         }
2423         else {
2424             do {
2425                 if (regtry(&reginfo, &s))
2426                     goto got_it;
2427             } while (s++ < strend);
2428         }
2429     }
2430
2431     /* Failure. */
2432     goto phooey;
2433
2434 got_it:
2435     Safefree(swap);
2436     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2437
2438     if (PL_reg_eval_set)
2439         restore_pos(aTHX_ prog);
2440     if (RXp_PAREN_NAMES(prog))
2441         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2442
2443     /* make sure $`, $&, $', and $digit will work later */
2444     if ( !(flags & REXEC_NOT_FIRST) ) {
2445         RX_MATCH_COPY_FREE(rx);
2446         if (flags & REXEC_COPY_STR) {
2447             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2448 #ifdef PERL_OLD_COPY_ON_WRITE
2449             if ((SvIsCOW(sv)
2450                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2451                 if (DEBUG_C_TEST) {
2452                     PerlIO_printf(Perl_debug_log,
2453                                   "Copy on write: regexp capture, type %d\n",
2454                                   (int) SvTYPE(sv));
2455                 }
2456                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2457                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2458                 assert (SvPOKp(prog->saved_copy));
2459             } else
2460 #endif
2461             {
2462                 RX_MATCH_COPIED_on(rx);
2463                 s = savepvn(strbeg, i);
2464                 prog->subbeg = s;
2465             }
2466             prog->sublen = i;
2467         }
2468         else {
2469             prog->subbeg = strbeg;
2470             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2471         }
2472     }
2473
2474     return 1;
2475
2476 phooey:
2477     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2478                           PL_colors[4], PL_colors[5]));
2479     if (PL_reg_eval_set)
2480         restore_pos(aTHX_ prog);
2481     if (swap) {
2482         /* we failed :-( roll it back */
2483         Safefree(prog->offs);
2484         prog->offs = swap;
2485     }
2486
2487     return 0;
2488 }
2489
2490
2491 /*
2492  - regtry - try match at specific point
2493  */
2494 STATIC I32                      /* 0 failure, 1 success */
2495 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2496 {
2497     dVAR;
2498     CHECKPOINT lastcp;
2499     REGEXP *const rx = reginfo->prog;
2500     regexp *const prog = (struct regexp *)SvANY(rx);
2501     RXi_GET_DECL(prog,progi);
2502     GET_RE_DEBUG_FLAGS_DECL;
2503
2504     PERL_ARGS_ASSERT_REGTRY;
2505
2506     reginfo->cutpoint=NULL;
2507
2508     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2509         MAGIC *mg;
2510
2511         PL_reg_eval_set = RS_init;
2512         DEBUG_EXECUTE_r(DEBUG_s(
2513             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2514                           (IV)(PL_stack_sp - PL_stack_base));
2515             ));
2516         SAVESTACK_CXPOS();
2517         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2518         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2519         SAVETMPS;
2520         /* Apparently this is not needed, judging by wantarray. */
2521         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2522            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2523
2524         if (reginfo->sv) {
2525             /* Make $_ available to executed code. */
2526             if (reginfo->sv != DEFSV) {
2527                 SAVE_DEFSV;
2528                 DEFSV_set(reginfo->sv);
2529             }
2530
2531             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2532                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2533                 /* prepare for quick setting of pos */
2534 #ifdef PERL_OLD_COPY_ON_WRITE
2535                 if (SvIsCOW(reginfo->sv))
2536                     sv_force_normal_flags(reginfo->sv, 0);
2537 #endif
2538                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2539                                  &PL_vtbl_mglob, NULL, 0);
2540                 mg->mg_len = -1;
2541             }
2542             PL_reg_magic    = mg;
2543             PL_reg_oldpos   = mg->mg_len;
2544             SAVEDESTRUCTOR_X(restore_pos, prog);
2545         }
2546         if (!PL_reg_curpm) {
2547             Newxz(PL_reg_curpm, 1, PMOP);
2548 #ifdef USE_ITHREADS
2549             {
2550                 SV* const repointer = &PL_sv_undef;
2551                 /* this regexp is also owned by the new PL_reg_curpm, which
2552                    will try to free it.  */
2553                 av_push(PL_regex_padav, repointer);
2554                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2555                 PL_regex_pad = AvARRAY(PL_regex_padav);
2556             }
2557 #endif
2558         }
2559 #ifdef USE_ITHREADS
2560         /* It seems that non-ithreads works both with and without this code.
2561            So for efficiency reasons it seems best not to have the code
2562            compiled when it is not needed.  */
2563         /* This is safe against NULLs: */
2564         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2565         /* PM_reg_curpm owns a reference to this regexp.  */
2566         (void)ReREFCNT_inc(rx);
2567 #endif
2568         PM_SETRE(PL_reg_curpm, rx);
2569         PL_reg_oldcurpm = PL_curpm;
2570         PL_curpm = PL_reg_curpm;
2571         if (RXp_MATCH_COPIED(prog)) {
2572             /*  Here is a serious problem: we cannot rewrite subbeg,
2573                 since it may be needed if this match fails.  Thus
2574                 $` inside (?{}) could fail... */
2575             PL_reg_oldsaved = prog->subbeg;
2576             PL_reg_oldsavedlen = prog->sublen;
2577 #ifdef PERL_OLD_COPY_ON_WRITE
2578             PL_nrs = prog->saved_copy;
2579 #endif
2580             RXp_MATCH_COPIED_off(prog);
2581         }
2582         else
2583             PL_reg_oldsaved = NULL;
2584         prog->subbeg = PL_bostr;
2585         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2586     }
2587     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2588     prog->offs[0].start = *startpos - PL_bostr;
2589     PL_reginput = *startpos;
2590     PL_reglastparen = &prog->lastparen;
2591     PL_reglastcloseparen = &prog->lastcloseparen;
2592     prog->lastparen = 0;
2593     prog->lastcloseparen = 0;
2594     PL_regsize = 0;
2595     PL_regoffs = prog->offs;
2596     if (PL_reg_start_tmpl <= prog->nparens) {
2597         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2598         if(PL_reg_start_tmp)
2599             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2600         else
2601             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2602     }
2603
2604     /* XXXX What this code is doing here?!!!  There should be no need
2605        to do this again and again, PL_reglastparen should take care of
2606        this!  --ilya*/
2607
2608     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2609      * Actually, the code in regcppop() (which Ilya may be meaning by
2610      * PL_reglastparen), is not needed at all by the test suite
2611      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2612      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2613      * Meanwhile, this code *is* needed for the
2614      * above-mentioned test suite tests to succeed.  The common theme
2615      * on those tests seems to be returning null fields from matches.
2616      * --jhi updated by dapm */
2617 #if 1
2618     if (prog->nparens) {
2619         regexp_paren_pair *pp = PL_regoffs;
2620         register I32 i;
2621         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2622             ++pp;
2623             pp->start = -1;
2624             pp->end = -1;
2625         }
2626     }
2627 #endif
2628     REGCP_SET(lastcp);
2629     if (regmatch(reginfo, progi->program + 1)) {
2630         PL_regoffs[0].end = PL_reginput - PL_bostr;
2631         return 1;
2632     }
2633     if (reginfo->cutpoint)
2634         *startpos= reginfo->cutpoint;
2635     REGCP_UNWIND(lastcp);
2636     return 0;
2637 }
2638
2639
2640 #define sayYES goto yes
2641 #define sayNO goto no
2642 #define sayNO_SILENT goto no_silent
2643
2644 /* we dont use STMT_START/END here because it leads to
2645    "unreachable code" warnings, which are bogus, but distracting. */
2646 #define CACHEsayNO \
2647     if (ST.cache_mask) \
2648        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2649     sayNO
2650
2651 /* this is used to determine how far from the left messages like
2652    'failed...' are printed. It should be set such that messages
2653    are inline with the regop output that created them.
2654 */
2655 #define REPORT_CODE_OFF 32
2656
2657
2658 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2659 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2660
2661 #define SLAB_FIRST(s) (&(s)->states[0])
2662 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2663
2664 /* grab a new slab and return the first slot in it */
2665
2666 STATIC regmatch_state *
2667 S_push_slab(pTHX)
2668 {
2669 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2670     dMY_CXT;
2671 #endif
2672     regmatch_slab *s = PL_regmatch_slab->next;
2673     if (!s) {
2674         Newx(s, 1, regmatch_slab);
2675         s->prev = PL_regmatch_slab;
2676         s->next = NULL;
2677         PL_regmatch_slab->next = s;
2678     }
2679     PL_regmatch_slab = s;
2680     return SLAB_FIRST(s);
2681 }
2682
2683
2684 /* push a new state then goto it */
2685
2686 #define PUSH_STATE_GOTO(state, node) \
2687     scan = node; \
2688     st->resume_state = state; \
2689     goto push_state;
2690
2691 /* push a new state with success backtracking, then goto it */
2692
2693 #define PUSH_YES_STATE_GOTO(state, node) \
2694     scan = node; \
2695     st->resume_state = state; \
2696     goto push_yes_state;
2697
2698
2699
2700 /*
2701
2702 regmatch() - main matching routine
2703
2704 This is basically one big switch statement in a loop. We execute an op,
2705 set 'next' to point the next op, and continue. If we come to a point which
2706 we may need to backtrack to on failure such as (A|B|C), we push a
2707 backtrack state onto the backtrack stack. On failure, we pop the top
2708 state, and re-enter the loop at the state indicated. If there are no more
2709 states to pop, we return failure.
2710
2711 Sometimes we also need to backtrack on success; for example /A+/, where
2712 after successfully matching one A, we need to go back and try to
2713 match another one; similarly for lookahead assertions: if the assertion
2714 completes successfully, we backtrack to the state just before the assertion
2715 and then carry on.  In these cases, the pushed state is marked as
2716 'backtrack on success too'. This marking is in fact done by a chain of
2717 pointers, each pointing to the previous 'yes' state. On success, we pop to
2718 the nearest yes state, discarding any intermediate failure-only states.
2719 Sometimes a yes state is pushed just to force some cleanup code to be
2720 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2721 it to free the inner regex.
2722
2723 Note that failure backtracking rewinds the cursor position, while
2724 success backtracking leaves it alone.
2725
2726 A pattern is complete when the END op is executed, while a subpattern
2727 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2728 ops trigger the "pop to last yes state if any, otherwise return true"
2729 behaviour.
2730
2731 A common convention in this function is to use A and B to refer to the two
2732 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2733 the subpattern to be matched possibly multiple times, while B is the entire
2734 rest of the pattern. Variable and state names reflect this convention.
2735
2736 The states in the main switch are the union of ops and failure/success of
2737 substates associated with with that op.  For example, IFMATCH is the op
2738 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2739 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2740 successfully matched A and IFMATCH_A_fail is a state saying that we have
2741 just failed to match A. Resume states always come in pairs. The backtrack
2742 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2743 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2744 on success or failure.
2745
2746 The struct that holds a backtracking state is actually a big union, with
2747 one variant for each major type of op. The variable st points to the
2748 top-most backtrack struct. To make the code clearer, within each
2749 block of code we #define ST to alias the relevant union.
2750
2751 Here's a concrete example of a (vastly oversimplified) IFMATCH
2752 implementation:
2753
2754     switch (state) {
2755     ....
2756
2757 #define ST st->u.ifmatch
2758
2759     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2760         ST.foo = ...; // some state we wish to save
2761         ...
2762         // push a yes backtrack state with a resume value of
2763         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2764         // first node of A:
2765         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2766         // NOTREACHED
2767
2768     case IFMATCH_A: // we have successfully executed A; now continue with B
2769         next = B;
2770         bar = ST.foo; // do something with the preserved value
2771         break;
2772
2773     case IFMATCH_A_fail: // A failed, so the assertion failed
2774         ...;   // do some housekeeping, then ...
2775         sayNO; // propagate the failure
2776
2777 #undef ST
2778
2779     ...
2780     }
2781
2782 For any old-timers reading this who are familiar with the old recursive
2783 approach, the code above is equivalent to:
2784
2785     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2786     {
2787         int foo = ...
2788         ...
2789         if (regmatch(A)) {
2790             next = B;
2791             bar = foo;
2792             break;
2793         }
2794         ...;   // do some housekeeping, then ...
2795         sayNO; // propagate the failure
2796     }
2797
2798 The topmost backtrack state, pointed to by st, is usually free. If you
2799 want to claim it, populate any ST.foo fields in it with values you wish to
2800 save, then do one of
2801
2802         PUSH_STATE_GOTO(resume_state, node);
2803         PUSH_YES_STATE_GOTO(resume_state, node);
2804
2805 which sets that backtrack state's resume value to 'resume_state', pushes a
2806 new free entry to the top of the backtrack stack, then goes to 'node'.
2807 On backtracking, the free slot is popped, and the saved state becomes the
2808 new free state. An ST.foo field in this new top state can be temporarily
2809 accessed to retrieve values, but once the main loop is re-entered, it
2810 becomes available for reuse.
2811
2812 Note that the depth of the backtrack stack constantly increases during the
2813 left-to-right execution of the pattern, rather than going up and down with
2814 the pattern nesting. For example the stack is at its maximum at Z at the
2815 end of the pattern, rather than at X in the following:
2816
2817     /(((X)+)+)+....(Y)+....Z/
2818
2819 The only exceptions to this are lookahead/behind assertions and the cut,
2820 (?>A), which pop all the backtrack states associated with A before
2821 continuing.
2822
2823 Backtrack state structs are allocated in slabs of about 4K in size.
2824 PL_regmatch_state and st always point to the currently active state,
2825 and PL_regmatch_slab points to the slab currently containing
2826 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2827 allocated, and is never freed until interpreter destruction. When the slab
2828 is full, a new one is allocated and chained to the end. At exit from
2829 regmatch(), slabs allocated since entry are freed.
2830
2831 */
2832
2833
2834 #define DEBUG_STATE_pp(pp)                                  \
2835     DEBUG_STATE_r({                                         \
2836         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2837         PerlIO_printf(Perl_debug_log,                       \
2838             "    %*s"pp" %s%s%s%s%s\n",                     \
2839             depth*2, "",                                    \
2840             PL_reg_name[st->resume_state],                     \
2841             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2842             ((st==yes_state) ? "Y" : ""),                   \
2843             ((st==mark_state) ? "M" : ""),                  \
2844             ((st==yes_state||st==mark_state) ? "]" : "")    \
2845         );                                                  \
2846     });
2847
2848
2849 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2850
2851 #ifdef DEBUGGING
2852
2853 STATIC void
2854 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2855     const char *start, const char *end, const char *blurb)
2856 {
2857     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2858
2859     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2860
2861     if (!PL_colorset)
2862             reginitcolors();
2863     {
2864         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2865             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2866
2867         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2868             start, end - start, 60);
2869
2870         PerlIO_printf(Perl_debug_log,
2871             "%s%s REx%s %s against %s\n",
2872                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2873
2874         if (utf8_target||utf8_pat)
2875             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2876                 utf8_pat ? "pattern" : "",
2877                 utf8_pat && utf8_target ? " and " : "",
2878                 utf8_target ? "string" : ""
2879             );
2880     }
2881 }
2882
2883 STATIC void
2884 S_dump_exec_pos(pTHX_ const char *locinput,
2885                       const regnode *scan,
2886                       const char *loc_regeol,
2887                       const char *loc_bostr,
2888                       const char *loc_reg_starttry,
2889                       const bool utf8_target)
2890 {
2891     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2892     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2893     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2894     /* The part of the string before starttry has one color
2895        (pref0_len chars), between starttry and current
2896        position another one (pref_len - pref0_len chars),
2897        after the current position the third one.
2898        We assume that pref0_len <= pref_len, otherwise we
2899        decrease pref0_len.  */
2900     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2901         ? (5 + taill) - l : locinput - loc_bostr;
2902     int pref0_len;
2903
2904     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2905
2906     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2907         pref_len++;
2908     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2909     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2910         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2911               ? (5 + taill) - pref_len : loc_regeol - locinput);
2912     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2913         l--;
2914     if (pref0_len < 0)
2915         pref0_len = 0;
2916     if (pref0_len > pref_len)
2917         pref0_len = pref_len;
2918     {
2919         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2920
2921         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2922             (locinput - pref_len),pref0_len, 60, 4, 5);
2923
2924         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2925                     (locinput - pref_len + pref0_len),
2926                     pref_len - pref0_len, 60, 2, 3);
2927
2928         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2929                     locinput, loc_regeol - locinput, 10, 0, 1);
2930
2931         const STRLEN tlen=len0+len1+len2;
2932         PerlIO_printf(Perl_debug_log,
2933                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2934                     (IV)(locinput - loc_bostr),
2935                     len0, s0,
2936                     len1, s1,
2937                     (docolor ? "" : "> <"),
2938                     len2, s2,
2939                     (int)(tlen > 19 ? 0 :  19 - tlen),
2940                     "");
2941     }
2942 }
2943
2944 #endif
2945
2946 /* reg_check_named_buff_matched()
2947  * Checks to see if a named buffer has matched. The data array of
2948  * buffer numbers corresponding to the buffer is expected to reside
2949  * in the regexp->data->data array in the slot stored in the ARG() of
2950  * node involved. Note that this routine doesn't actually care about the
2951  * name, that information is not preserved from compilation to execution.
2952  * Returns the index of the leftmost defined buffer with the given name
2953  * or 0 if non of the buffers matched.
2954  */
2955 STATIC I32
2956 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2957 {
2958     I32 n;
2959     RXi_GET_DECL(rex,rexi);
2960     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2961     I32 *nums=(I32*)SvPVX(sv_dat);
2962
2963     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2964
2965     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2966         if ((I32)*PL_reglastparen >= nums[n] &&
2967             PL_regoffs[nums[n]].end != -1)
2968         {
2969             return nums[n];
2970         }
2971     }
2972     return 0;
2973 }
2974
2975
2976 /* free all slabs above current one  - called during LEAVE_SCOPE */
2977
2978 STATIC void
2979 S_clear_backtrack_stack(pTHX_ void *p)
2980 {
2981     regmatch_slab *s = PL_regmatch_slab->next;
2982     PERL_UNUSED_ARG(p);
2983
2984     if (!s)
2985         return;
2986     PL_regmatch_slab->next = NULL;
2987     while (s) {
2988         regmatch_slab * const osl = s;
2989         s = s->next;
2990         Safefree(osl);
2991     }
2992 }
2993
2994
2995 #define SETREX(Re1,Re2) \
2996     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2997     Re1 = (Re2)
2998
2999 STATIC I32                      /* 0 failure, 1 success */
3000 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3001 {
3002 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3003     dMY_CXT;
3004 #endif
3005     dVAR;
3006     register const bool utf8_target = PL_reg_match_utf8;
3007     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3008     REGEXP *rex_sv = reginfo->prog;
3009     regexp *rex = (struct regexp *)SvANY(rex_sv);
3010     RXi_GET_DECL(rex,rexi);
3011     I32 oldsave;
3012     /* the current state. This is a cached copy of PL_regmatch_state */
3013     register regmatch_state *st;
3014     /* cache heavy used fields of st in registers */
3015     register regnode *scan;
3016     register regnode *next;
3017     register U32 n = 0; /* general value; init to avoid compiler warning */
3018     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3019     register char *locinput = PL_reginput;
3020     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3021
3022     bool result = 0;        /* return value of S_regmatch */
3023     int depth = 0;          /* depth of backtrack stack */
3024     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3025     const U32 max_nochange_depth =
3026         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3027         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3028     regmatch_state *yes_state = NULL; /* state to pop to on success of
3029                                                             subpattern */
3030     /* mark_state piggy backs on the yes_state logic so that when we unwind
3031        the stack on success we can update the mark_state as we go */
3032     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3033     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3034     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3035     U32 state_num;
3036     bool no_final = 0;      /* prevent failure from backtracking? */
3037     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3038     char *startpoint = PL_reginput;
3039     SV *popmark = NULL;     /* are we looking for a mark? */
3040     SV *sv_commit = NULL;   /* last mark name seen in failure */
3041     SV *sv_yes_mark = NULL; /* last mark name we have seen
3042                                during a successful match */
3043     U32 lastopen = 0;       /* last open we saw */
3044     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3045     SV* const oreplsv = GvSV(PL_replgv);
3046     /* these three flags are set by various ops to signal information to
3047      * the very next op. They have a useful lifetime of exactly one loop
3048      * iteration, and are not preserved or restored by state pushes/pops
3049      */
3050     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3051     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3052     int logical = 0;        /* the following EVAL is:
3053                                 0: (?{...})
3054                                 1: (?(?{...})X|Y)
3055                                 2: (??{...})
3056                                or the following IFMATCH/UNLESSM is:
3057                                 false: plain (?=foo)
3058                                 true:  used as a condition: (?(?=foo))
3059                             */
3060 #ifdef DEBUGGING
3061     GET_RE_DEBUG_FLAGS_DECL;
3062 #endif
3063
3064     PERL_ARGS_ASSERT_REGMATCH;
3065
3066     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3067             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3068     }));
3069     /* on first ever call to regmatch, allocate first slab */
3070     if (!PL_regmatch_slab) {
3071         Newx(PL_regmatch_slab, 1, regmatch_slab);
3072         PL_regmatch_slab->prev = NULL;
3073         PL_regmatch_slab->next = NULL;
3074         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3075     }
3076
3077     oldsave = PL_savestack_ix;
3078     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3079     SAVEVPTR(PL_regmatch_slab);
3080     SAVEVPTR(PL_regmatch_state);
3081
3082     /* grab next free state slot */
3083     st = ++PL_regmatch_state;
3084     if (st >  SLAB_LAST(PL_regmatch_slab))
3085         st = PL_regmatch_state = S_push_slab(aTHX);
3086
3087     /* Note that nextchr is a byte even in UTF */
3088     nextchr = UCHARAT(locinput);
3089     scan = prog;
3090     while (scan != NULL) {
3091
3092         DEBUG_EXECUTE_r( {
3093             SV * const prop = sv_newmortal();
3094             regnode *rnext=regnext(scan);
3095             DUMP_EXEC_POS( locinput, scan, utf8_target );
3096             regprop(rex, prop, scan);
3097
3098             PerlIO_printf(Perl_debug_log,
3099                     "%3"IVdf":%*s%s(%"IVdf")\n",
3100                     (IV)(scan - rexi->program), depth*2, "",
3101                     SvPVX_const(prop),
3102                     (PL_regkind[OP(scan)] == END || !rnext) ?
3103                         0 : (IV)(rnext - rexi->program));
3104         });
3105
3106         next = scan + NEXT_OFF(scan);
3107         if (next == scan)
3108             next = NULL;
3109         state_num = OP(scan);
3110
3111         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3112       reenter_switch:
3113
3114         assert(PL_reglastparen == &rex->lastparen);
3115         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3116         assert(PL_regoffs == rex->offs);
3117
3118         switch (state_num) {
3119         case BOL:
3120             if (locinput == PL_bostr)
3121             {
3122                 /* reginfo->till = reginfo->bol; */
3123                 break;
3124             }
3125             sayNO;
3126         case MBOL:
3127             if (locinput == PL_bostr ||
3128                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3129             {
3130                 break;
3131             }
3132             sayNO;
3133         case SBOL:
3134             if (locinput == PL_bostr)
3135                 break;
3136             sayNO;
3137         case GPOS:
3138             if (locinput == reginfo->ganch)
3139                 break;
3140             sayNO;
3141
3142         case KEEPS:
3143             /* update the startpoint */
3144             st->u.keeper.val = PL_regoffs[0].start;
3145             PL_reginput = locinput;
3146             PL_regoffs[0].start = locinput - PL_bostr;
3147             PUSH_STATE_GOTO(KEEPS_next, next);
3148             /*NOT-REACHED*/
3149         case KEEPS_next_fail:
3150             /* rollback the start point change */
3151             PL_regoffs[0].start = st->u.keeper.val;
3152             sayNO_SILENT;
3153             /*NOT-REACHED*/
3154         case EOL:
3155                 goto seol;
3156         case MEOL:
3157             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3158                 sayNO;
3159             break;
3160         case SEOL:
3161           seol:
3162             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3163                 sayNO;
3164             if (PL_regeol - locinput > 1)
3165                 sayNO;
3166             break;
3167         case EOS:
3168             if (PL_regeol != locinput)
3169                 sayNO;
3170             break;
3171         case SANY:
3172             if (!nextchr && locinput >= PL_regeol)
3173                 sayNO;
3174             if (utf8_target) {
3175                 locinput += PL_utf8skip[nextchr];
3176                 if (locinput > PL_regeol)
3177                     sayNO;
3178                 nextchr = UCHARAT(locinput);
3179             }
3180             else
3181                 nextchr = UCHARAT(++locinput);
3182             break;
3183         case CANY:
3184             if (!nextchr && locinput >= PL_regeol)
3185                 sayNO;
3186             nextchr = UCHARAT(++locinput);
3187             break;
3188         case REG_ANY:
3189             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3190                 sayNO;
3191             if (utf8_target) {
3192                 locinput += PL_utf8skip[nextchr];
3193                 if (locinput > PL_regeol)
3194                     sayNO;
3195                 nextchr = UCHARAT(locinput);
3196             }
3197             else
3198                 nextchr = UCHARAT(++locinput);
3199             break;
3200
3201 #undef  ST
3202 #define ST st->u.trie
3203         case TRIEC:
3204             /* In this case the charclass data is available inline so
3205                we can fail fast without a lot of extra overhead.
3206              */
3207             if (scan->flags == EXACT || !utf8_target) {
3208                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3209                     DEBUG_EXECUTE_r(
3210                         PerlIO_printf(Perl_debug_log,
3211                                   "%*s  %sfailed to match trie start class...%s\n",
3212                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3213                     );
3214                     sayNO_SILENT;
3215                     /* NOTREACHED */
3216                 }
3217             }
3218             /* FALL THROUGH */
3219         case TRIE:
3220             /* the basic plan of execution of the trie is:
3221              * At the beginning, run though all the states, and
3222              * find the longest-matching word. Also remember the position
3223              * of the shortest matching word. For example, this pattern:
3224              *    1  2 3 4    5
3225              *    ab|a|x|abcd|abc
3226              * when matched against the string "abcde", will generate
3227              * accept states for all words except 3, with the longest
3228              * matching word being 4, and the shortest being 1 (with
3229              * the position being after char 1 of the string).
3230              *
3231              * Then for each matching word, in word order (i.e. 1,2,4,5),
3232              * we run the remainder of the pattern; on each try setting
3233              * the current position to the character following the word,
3234              * returning to try the next word on failure.
3235              *
3236              * We avoid having to build a list of words at runtime by
3237              * using a compile-time structure, wordinfo[].prev, which
3238              * gives, for each word, the previous accepting word (if any).
3239              * In the case above it would contain the mappings 1->2, 2->0,
3240              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3241              * the longest word (4 above), a list of all words, by
3242              * following the list of prev pointers; this gives us the
3243              * unordered list 4,5,1,2. Then given the current word we have
3244              * just tried, we can go through the list and find the
3245              * next-biggest word to try (so if we just failed on word 2,
3246              * the next in the list is 4).
3247              *
3248              * Since at runtime we don't record the matching position in
3249              * the string for each word, we have to work that out for
3250              * each word we're about to process. The wordinfo table holds
3251              * the character length of each word; given that we recorded
3252              * at the start: the position of the shortest word and its
3253              * length in chars, we just need to move the pointer the
3254              * difference between the two char lengths. Depending on
3255              * Unicode status and folding, that's cheap or expensive.
3256              *
3257              * This algorithm is optimised for the case where are only a
3258              * small number of accept states, i.e. 0,1, or maybe 2.
3259              * With lots of accepts states, and having to try all of them,
3260              * it becomes quadratic on number of accept states to find all
3261              * the next words.
3262              */
3263
3264             {
3265                 /* what type of TRIE am I? (utf8 makes this contextual) */
3266                 DECL_TRIE_TYPE(scan);
3267
3268                 /* what trie are we using right now */
3269                 reg_trie_data * const trie
3270                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3271                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3272                 U32 state = trie->startstate;
3273
3274                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3275                     !TRIE_BITMAP_TEST(trie,*locinput)
3276                 ) {
3277                     if (trie->states[ state ].wordnum) {
3278                          DEBUG_EXECUTE_r(
3279                             PerlIO_printf(Perl_debug_log,
3280                                           "%*s  %smatched empty string...%s\n",
3281                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3282                         );
3283                         if (!trie->jump)
3284                             break;
3285                     } else {
3286                         DEBUG_EXECUTE_r(
3287                             PerlIO_printf(Perl_debug_log,
3288                                           "%*s  %sfailed to match trie start class...%s\n",
3289                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3290                         );
3291                         sayNO_SILENT;
3292                    }
3293                 }
3294
3295             {
3296                 U8 *uc = ( U8* )locinput;
3297
3298                 STRLEN len = 0;
3299                 STRLEN foldlen = 0;
3300                 U8 *uscan = (U8*)NULL;
3301                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3302                 U32 charcount = 0; /* how many input chars we have matched */
3303                 U32 accepted = 0; /* have we seen any accepting states? */
3304
3305                 ST.B = next;
3306                 ST.jump = trie->jump;
3307                 ST.me = scan;
3308                 ST.firstpos = NULL;
3309                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3310                 ST.nextword = 0;
3311
3312                 /* fully traverse the TRIE; note the position of the
3313                    shortest accept state and the wordnum of the longest
3314                    accept state */
3315
3316                 while ( state && uc <= (U8*)PL_regeol ) {
3317                     U32 base = trie->states[ state ].trans.base;
3318                     UV uvc = 0;
3319                     U16 charid = 0;
3320                     U16 wordnum;
3321                     wordnum = trie->states[ state ].wordnum;
3322
3323                     if (wordnum) { /* it's an accept state */
3324                         if (!accepted) {
3325                             accepted = 1;
3326                             /* record first match position */
3327                             if (ST.longfold) {
3328                                 ST.firstpos = (U8*)locinput;
3329                                 ST.firstchars = 0;
3330                             }
3331                             else {
3332                                 ST.firstpos = uc;
3333                                 ST.firstchars = charcount;
3334                             }
3335                         }
3336                         if (!ST.nextword || wordnum < ST.nextword)
3337                             ST.nextword = wordnum;
3338                         ST.topword = wordnum;
3339                     }
3340
3341                     DEBUG_TRIE_EXECUTE_r({
3342                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3343                                 PerlIO_printf( Perl_debug_log,
3344                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3345                                     2+depth * 2, "", PL_colors[4],
3346                                     (UV)state, (accepted ? 'Y' : 'N'));
3347                     });
3348
3349                     /* read a char and goto next state */
3350                     if ( base ) {
3351                         I32 offset;
3352                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3353                                              uscan, len, uvc, charid, foldlen,
3354                                              foldbuf, uniflags);
3355                         charcount++;
3356                         if (foldlen>0)
3357                             ST.longfold = TRUE;
3358                         if (charid &&
3359                              ( ((offset =
3360                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3361
3362                              && ((U32)offset < trie->lasttrans)
3363                              && trie->trans[offset].check == state)
3364                         {
3365                             state = trie->trans[offset].next;
3366                         }
3367                         else {
3368                             state = 0;
3369                         }
3370                         uc += len;
3371
3372                     }
3373                     else {
3374                         state = 0;
3375                     }
3376                     DEBUG_TRIE_EXECUTE_r(
3377                         PerlIO_printf( Perl_debug_log,
3378                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3379                             charid, uvc, (UV)state, PL_colors[5] );
3380                     );
3381                 }
3382                 if (!accepted)
3383                    sayNO;
3384
3385                 /* calculate total number of accept states */
3386                 {
3387                     U16 w = ST.topword;
3388                     accepted = 0;
3389                     while (w) {
3390                         w = trie->wordinfo[w].prev;
3391                         accepted++;
3392                     }
3393                     ST.accepted = accepted;
3394                 }
3395
3396                 DEBUG_EXECUTE_r(
3397                     PerlIO_printf( Perl_debug_log,
3398                         "%*s  %sgot %"IVdf" possible matches%s\n",
3399                         REPORT_CODE_OFF + depth * 2, "",
3400                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3401                 );
3402                 goto trie_first_try; /* jump into the fail handler */
3403             }}
3404             /* NOTREACHED */
3405
3406         case TRIE_next_fail: /* we failed - try next alternative */
3407             if ( ST.jump) {
3408                 REGCP_UNWIND(ST.cp);
3409                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3410                     PL_regoffs[n].end = -1;
3411                 *PL_reglastparen = n;
3412             }
3413             if (!--ST.accepted) {
3414                 DEBUG_EXECUTE_r({
3415                     PerlIO_printf( Perl_debug_log,
3416                         "%*s  %sTRIE failed...%s\n",
3417                         REPORT_CODE_OFF+depth*2, "",
3418                         PL_colors[4],
3419                         PL_colors[5] );
3420                 });
3421                 sayNO_SILENT;
3422             }
3423             {
3424                 /* Find next-highest word to process.  Note that this code
3425                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3426                 register U16 min = 0;
3427                 register U16 word;
3428                 register U16 const nextword = ST.nextword;
3429                 register reg_trie_wordinfo * const wordinfo
3430                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3431                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3432                     if (word > nextword && (!min || word < min))
3433                         min = word;
3434                 }
3435                 ST.nextword = min;
3436             }
3437
3438           trie_first_try:
3439             if (do_cutgroup) {
3440                 do_cutgroup = 0;
3441                 no_final = 0;
3442             }
3443
3444             if ( ST.jump) {
3445                 ST.lastparen = *PL_reglastparen;
3446                 REGCP_SET(ST.cp);
3447             }
3448
3449             /* find start char of end of current word */
3450             {
3451                 U32 chars; /* how many chars to skip */
3452                 U8 *uc = ST.firstpos;
3453                 reg_trie_data * const trie
3454                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3455
3456                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3457                             >=  ST.firstchars);
3458                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3459                             - ST.firstchars;
3460
3461                 if (ST.longfold) {
3462                     /* the hard option - fold each char in turn and find
3463                      * its folded length (which may be different */
3464                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3465                     STRLEN foldlen;
3466                     STRLEN len;
3467                     UV uvc;
3468                     U8 *uscan;
3469
3470                     while (chars) {
3471                         if (utf8_target) {
3472                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3473                                                     uniflags);
3474                             uc += len;
3475                         }
3476                         else {
3477                             uvc = *uc;
3478                             uc++;
3479                         }
3480                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3481                         uscan = foldbuf;
3482                         while (foldlen) {
3483                             if (!--chars)
3484                                 break;
3485                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3486                                             uniflags);
3487                             uscan += len;
3488                             foldlen -= len;
3489                         }
3490                     }
3491                 }
3492                 else {
3493                     if (utf8_target)
3494                         while (chars--)
3495                             uc += UTF8SKIP(uc);
3496                     else
3497                         uc += chars;
3498                 }
3499                 PL_reginput = (char *)uc;
3500             }
3501
3502             scan = (ST.jump && ST.jump[ST.nextword])
3503                         ? ST.me + ST.jump[ST.nextword]
3504                         : ST.B;
3505
3506             DEBUG_EXECUTE_r({
3507                 PerlIO_printf( Perl_debug_log,
3508                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3509                     REPORT_CODE_OFF+depth*2, "",
3510                     PL_colors[4],
3511                     ST.nextword,
3512                     PL_colors[5]
3513                     );
3514             });
3515
3516             if (ST.accepted > 1 || has_cutgroup) {
3517                 PUSH_STATE_GOTO(TRIE_next, scan);
3518                 /* NOTREACHED */
3519             }
3520             /* only one choice left - just continue */
3521             DEBUG_EXECUTE_r({
3522                 AV *const trie_words
3523                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3524                 SV ** const tmp = av_fetch( trie_words,
3525                     ST.nextword-1, 0 );
3526                 SV *sv= tmp ? sv_newmortal() : NULL;
3527
3528                 PerlIO_printf( Perl_debug_log,
3529                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3530                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3531                     ST.nextword,
3532                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3533                             PL_colors[0], PL_colors[1],
3534                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3535                         )
3536                     : "not compiled under -Dr",
3537                     PL_colors[5] );
3538             });
3539
3540             locinput = PL_reginput;
3541             nextchr = UCHARAT(locinput);
3542             continue; /* execute rest of RE */
3543             /* NOTREACHED */
3544 #undef  ST
3545
3546         case EXACT: {
3547             char *s = STRING(scan);
3548             ln = STR_LEN(scan);
3549             if (utf8_target != UTF_PATTERN) {
3550                 /* The target and the pattern have differing utf8ness. */
3551                 char *l = locinput;
3552                 const char * const e = s + ln;
3553
3554                 if (utf8_target) {
3555                     /* The target is utf8, the pattern is not utf8. */
3556                     while (s < e) {
3557                         STRLEN ulen;
3558                         if (l >= PL_regeol)
3559                              sayNO;
3560                         if (NATIVE_TO_UNI(*(U8*)s) !=
3561                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3562                                             uniflags))
3563                              sayNO;
3564                         l += ulen;
3565                         s ++;
3566                     }
3567                 }
3568                 else {
3569                     /* The target is not utf8, the pattern is utf8. */
3570                     while (s < e) {
3571                         STRLEN ulen;
3572                         if (l >= PL_regeol)
3573                             sayNO;
3574                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3575                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3576                                            uniflags))
3577                             sayNO;
3578                         s += ulen;
3579                         l ++;
3580                     }
3581                 }
3582                 locinput = l;
3583                 nextchr = UCHARAT(locinput);
3584                 break;
3585             }
3586             /* The target and the pattern have the same utf8ness. */
3587             /* Inline the first character, for speed. */
3588             if (UCHARAT(s) != nextchr)
3589                 sayNO;
3590             if (PL_regeol - locinput < ln)
3591                 sayNO;
3592             if (ln > 1 && memNE(s, locinput, ln))
3593                 sayNO;
3594             locinput += ln;
3595             nextchr = UCHARAT(locinput);
3596             break;
3597             }
3598         case EXACTFL: {
3599             re_fold_t folder;
3600             const U8 * fold_array;
3601             const char * s;
3602             U32 fold_utf8_flags;
3603
3604             PL_reg_flags |= RF_tainted;
3605             folder = foldEQ_locale;
3606             fold_array = PL_fold_locale;
3607             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3608             goto do_exactf;
3609
3610         case EXACTFU:
3611             folder = foldEQ_latin1;
3612             fold_array = PL_fold_latin1;
3613             fold_utf8_flags = 0;
3614             goto do_exactf;
3615
3616         case EXACTFA:
3617             folder = foldEQ_latin1;
3618             fold_array = PL_fold_latin1;
3619             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3620             goto do_exactf;
3621
3622         case EXACTF:
3623             folder = foldEQ;
3624             fold_array = PL_fold;
3625             fold_utf8_flags = 0;
3626
3627           do_exactf:
3628             s = STRING(scan);
3629             ln = STR_LEN(scan);
3630
3631             if (utf8_target || UTF_PATTERN) {
3632               /* Either target or the pattern are utf8. */
3633                 const char * const l = locinput;
3634                 char *e = PL_regeol;
3635
3636                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3637                                l, &e, 0,  utf8_target, fold_utf8_flags))
3638                 {
3639                     sayNO;
3640                 }
3641                 locinput = e;
3642                 nextchr = UCHARAT(locinput);
3643                 break;
3644             }
3645
3646             /* Neither the target nor the pattern are utf8 */
3647             if (UCHARAT(s) != nextchr &&
3648                 UCHARAT(s) != fold_array[nextchr])
3649             {
3650                 sayNO;
3651             }
3652             if (PL_regeol - locinput < ln)
3653                 sayNO;
3654             if (ln > 1 && ! folder(s, locinput, ln))
3655                 sayNO;
3656             locinput += ln;
3657             nextchr = UCHARAT(locinput);
3658             break;
3659         }
3660
3661         /* XXX Could improve efficiency by separating these all out using a
3662          * macro or in-line function.  At that point regcomp.c would no longer
3663          * have to set the FLAGS fields of these */
3664         case BOUNDL:
3665         case NBOUNDL:
3666             PL_reg_flags |= RF_tainted;
3667             /* FALL THROUGH */
3668         case BOUND:
3669         case BOUNDU:
3670         case BOUNDA:
3671         case NBOUND:
3672         case NBOUNDU:
3673         case NBOUNDA:
3674             /* was last char in word? */
3675             if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3676                 if (locinput == PL_bostr)
3677                     ln = '\n';
3678                 else {
3679                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3680
3681                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3682                 }
3683                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3684                     ln = isALNUM_uni(ln);
3685                     LOAD_UTF8_CHARCLASS_ALNUM();
3686                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3687                 }
3688                 else {
3689                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3690                     n = isALNUM_LC_utf8((U8*)locinput);
3691                 }
3692             }
3693             else {
3694
3695                 /* Here the string isn't utf8, or is utf8 and only ascii
3696                  * characters are to match \w.  In the latter case looking at
3697                  * the byte just prior to the current one may be just the final
3698                  * byte of a multi-byte character.  This is ok.  There are two
3699                  * cases:
3700                  * 1) it is a single byte character, and then the test is doing
3701                  *      just what it's supposed to.
3702                  * 2) it is a multi-byte character, in which case the final
3703                  *      byte is never mistakable for ASCII, and so the test
3704                  *      will say it is not a word character, which is the
3705                  *      correct answer. */
3706                 ln = (locinput != PL_bostr) ?
3707                     UCHARAT(locinput - 1) : '\n';
3708                 switch (FLAGS(scan)) {
3709                     case REGEX_UNICODE_CHARSET:
3710                         ln = isWORDCHAR_L1(ln);
3711                         n = isWORDCHAR_L1(nextchr);
3712                         break;
3713                     case REGEX_LOCALE_CHARSET:
3714                         ln = isALNUM_LC(ln);
3715                         n = isALNUM_LC(nextchr);
3716                         break;
3717                     case REGEX_DEPENDS_CHARSET:
3718                         ln = isALNUM(ln);
3719                         n = isALNUM(nextchr);
3720                         break;
3721                     case REGEX_ASCII_RESTRICTED_CHARSET:
3722                         ln = isWORDCHAR_A(ln);
3723                         n = isWORDCHAR_A(nextchr);
3724                         break;
3725                     default:
3726                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3727                         break;
3728                 }
3729             }
3730             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3731              * regcomp.sym */
3732             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3733                     sayNO;
3734             break;
3735         case ANYOFV:
3736         case ANYOF:
3737             if (utf8_target || state_num == ANYOFV) {
3738                 STRLEN inclasslen = PL_regeol - locinput;
3739                 if (locinput >= PL_regeol)
3740                     sayNO;
3741
3742                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3743                     sayNO;
3744                 locinput += inclasslen;
3745                 nextchr = UCHARAT(locinput);
3746                 break;
3747             }
3748             else {
3749                 if (nextchr < 0)
3750                     nextchr = UCHARAT(locinput);
3751                 if (!nextchr && locinput >= PL_regeol)
3752                     sayNO;
3753                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3754                     sayNO;
3755                 nextchr = UCHARAT(++locinput);
3756                 break;
3757             }
3758             break;
3759         /* Special char classes - The defines start on line 129 or so */
3760         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3761                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3762                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3763                   ALNUMA, NALNUMA, isWORDCHAR_A,
3764                   alnum, "a");
3765
3766         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3767                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3768                   SPACEU, NSPACEU, isSPACE_L1,
3769                   SPACEA, NSPACEA, isSPACE_A,
3770                   space, " ");
3771
3772         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3773                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3774                 DIGITA, NDIGITA, isDIGIT_A,
3775                 digit, "0");
3776
3777         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3778                        a Unicode extended Grapheme Cluster */
3779             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3780               extended Grapheme Cluster is:
3781
3782                CR LF
3783                | Prepend* Begin Extend*
3784                | .
3785
3786                Begin is (Hangul-syllable | ! Control)
3787                Extend is (Grapheme_Extend | Spacing_Mark)
3788                Control is [ GCB_Control CR LF ]
3789
3790                The discussion below shows how the code for CLUMP is derived
3791                from this regex.  Note that most of these concepts are from
3792                property values of the Grapheme Cluster Boundary (GCB) property.
3793                No code point can have multiple property values for a given
3794                property.  Thus a code point in Prepend can't be in Control, but
3795                it must be in !Control.  This is why Control above includes
3796                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3797                property separately, and so can't be in GCB_Control, even though
3798                they logically are controls.  Control is not the same as gc=cc,
3799                but includes format and other characters as well.
3800
3801                The Unicode definition of Hangul-syllable is:
3802                    L+
3803                    | (L* ( ( V | LV ) V* | LVT ) T*)
3804                    | T+
3805                   )
3806                Each of these is a value for the GCB property, and hence must be
3807                disjoint, so the order they are tested is immaterial, so the
3808                above can safely be changed to
3809                    T+
3810                    | L+
3811                    | (L* ( LVT | ( V | LV ) V*) T*)
3812
3813                The last two terms can be combined like this:
3814                    L* ( L
3815                         | (( LVT | ( V | LV ) V*) T*))
3816
3817                And refactored into this:
3818                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3819
3820                That means that if we have seen any L's at all we can quit
3821                there, but if the next character is a LVT, a V or and LV we
3822                should keep going.
3823
3824                There is a subtlety with Prepend* which showed up in testing.
3825                Note that the Begin, and only the Begin is required in:
3826                 | Prepend* Begin Extend*
3827                Also, Begin contains '! Control'.  A Prepend must be a '!
3828                Control', which means it must be a Begin.  What it comes down to
3829                is that if we match Prepend* and then find no suitable Begin
3830                afterwards, that if we backtrack the last Prepend, that one will
3831                be a suitable Begin.
3832             */
3833
3834             if (locinput >= PL_regeol)
3835                 sayNO;
3836             if  (! utf8_target) {
3837
3838                 /* Match either CR LF  or '.', as all the other possibilities
3839                  * require utf8 */
3840                 locinput++;         /* Match the . or CR */
3841                 if (nextchr == '\r'
3842                     && locinput < PL_regeol
3843                     && UCHARAT(locinput) == '\n') locinput++;
3844             }
3845             else {
3846
3847                 /* Utf8: See if is ( CR LF ); already know that locinput <
3848                  * PL_regeol, so locinput+1 is in bounds */
3849                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3850                     locinput += 2;
3851                 }
3852                 else {
3853                     /* In case have to backtrack to beginning, then match '.' */
3854                     char *starting = locinput;
3855
3856                     /* In case have to backtrack the last prepend */
3857                     char *previous_prepend = 0;
3858
3859                     LOAD_UTF8_CHARCLASS_GCB();
3860
3861                     /* Match (prepend)* */
3862                     while (locinput < PL_regeol
3863                            && swash_fetch(PL_utf8_X_prepend,
3864                                           (U8*)locinput, utf8_target))
3865                     {
3866                         previous_prepend = locinput;
3867                         locinput += UTF8SKIP(locinput);
3868                     }
3869
3870                     /* As noted above, if we matched a prepend character, but
3871                      * the next thing won't match, back off the last prepend we
3872                      * matched, as it is guaranteed to match the begin */
3873                     if (previous_prepend
3874                         && (locinput >=  PL_regeol
3875                             || ! swash_fetch(PL_utf8_X_begin,
3876                                              (U8*)locinput, utf8_target)))
3877                     {
3878                         locinput = previous_prepend;
3879                     }
3880
3881                     /* Note that here we know PL_regeol > locinput, as we
3882                      * tested that upon input to this switch case, and if we
3883                      * moved locinput forward, we tested the result just above
3884                      * and it either passed, or we backed off so that it will
3885                      * now pass */
3886                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3887
3888                         /* Here did not match the required 'Begin' in the
3889                          * second term.  So just match the very first
3890                          * character, the '.' of the final term of the regex */
3891                         locinput = starting + UTF8SKIP(starting);
3892                     } else {
3893
3894                         /* Here is the beginning of a character that can have
3895                          * an extender.  It is either a hangul syllable, or a
3896                          * non-control */
3897                         if (swash_fetch(PL_utf8_X_non_hangul,
3898                                         (U8*)locinput, utf8_target))
3899                         {
3900
3901                             /* Here not a Hangul syllable, must be a
3902                              * ('!  * Control') */
3903                             locinput += UTF8SKIP(locinput);
3904                         } else {
3905
3906                             /* Here is a Hangul syllable.  It can be composed
3907                              * of several individual characters.  One
3908                              * possibility is T+ */
3909                             if (swash_fetch(PL_utf8_X_T,
3910                                             (U8*)locinput, utf8_target))
3911                             {
3912                                 while (locinput < PL_regeol
3913                                         && swash_fetch(PL_utf8_X_T,
3914                                                         (U8*)locinput, utf8_target))
3915                                 {
3916                                     locinput += UTF8SKIP(locinput);
3917                                 }
3918                             } else {
3919
3920                                 /* Here, not T+, but is a Hangul.  That means
3921                                  * it is one of the others: L, LV, LVT or V,
3922                                  * and matches:
3923                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3924
3925                                 /* Match L*           */
3926                                 while (locinput < PL_regeol
3927                                         && swash_fetch(PL_utf8_X_L,
3928                                                         (U8*)locinput, utf8_target))
3929                                 {
3930                                     locinput += UTF8SKIP(locinput);
3931                                 }
3932
3933                                 /* Here, have exhausted L*.  If the next
3934                                  * character is not an LV, LVT nor V, it means
3935                                  * we had to have at least one L, so matches L+
3936                                  * in the original equation, we have a complete
3937                                  * hangul syllable.  Are done. */
3938
3939                                 if (locinput < PL_regeol
3940                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3941                                                     (U8*)locinput, utf8_target))
3942                                 {
3943
3944                                     /* Otherwise keep going.  Must be LV, LVT
3945                                      * or V.  See if LVT */
3946                                     if (swash_fetch(PL_utf8_X_LVT,
3947                                                     (U8*)locinput, utf8_target))
3948                                     {
3949                                         locinput += UTF8SKIP(locinput);
3950                                     } else {
3951
3952                                         /* Must be  V or LV.  Take it, then
3953                                          * match V*     */
3954                                         locinput += UTF8SKIP(locinput);
3955                                         while (locinput < PL_regeol
3956                                                 && swash_fetch(PL_utf8_X_V,
3957                                                          (U8*)locinput, utf8_target))
3958                                         {
3959                                             locinput += UTF8SKIP(locinput);
3960                                         }
3961                                     }
3962
3963                                     /* And any of LV, LVT, or V can be followed
3964                                      * by T*            */
3965                                     while (locinput < PL_regeol
3966                                            && swash_fetch(PL_utf8_X_T,
3967                                                            (U8*)locinput,
3968                                                            utf8_target))
3969                                     {
3970                                         locinput += UTF8SKIP(locinput);
3971                                     }
3972                                 }
3973                             }
3974                         }
3975
3976                         /* Match any extender */
3977                         while (locinput < PL_regeol
3978                                 && swash_fetch(PL_utf8_X_extend,
3979                                                 (U8*)locinput, utf8_target))
3980                         {
3981                             locinput += UTF8SKIP(locinput);
3982                         }
3983                     }
3984                 }
3985                 if (locinput > PL_regeol) sayNO;
3986             }
3987             nextchr = UCHARAT(locinput);
3988             break;
3989
3990         case NREFFL:
3991         {   /* The capture buffer cases.  The ones beginning with N for the
3992                named buffers just convert to the equivalent numbered and
3993                pretend they were called as the corresponding numbered buffer
3994                op.  */
3995             /* don't initialize these in the declaration, it makes C++
3996                unhappy */
3997             char *s;
3998             char type;
3999             re_fold_t folder;
4000             const U8 *fold_array;
4001             UV utf8_fold_flags;
4002
4003             PL_reg_flags |= RF_tainted;
4004             folder = foldEQ_locale;
4005             fold_array = PL_fold_locale;
4006             type = REFFL;
4007             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4008             goto do_nref;
4009
4010         case NREFFA:
4011             folder = foldEQ_latin1;
4012             fold_array = PL_fold_latin1;
4013             type = REFFA;
4014             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4015             goto do_nref;
4016
4017         case NREFFU:
4018             folder = foldEQ_latin1;
4019             fold_array = PL_fold_latin1;
4020             type = REFFU;
4021             utf8_fold_flags = 0;
4022             goto do_nref;
4023
4024         case NREFF:
4025             folder = foldEQ;
4026             fold_array = PL_fold;
4027             type = REFF;
4028             utf8_fold_flags = 0;
4029             goto do_nref;
4030
4031         case NREF:
4032             type = REF;
4033             folder = NULL;
4034             fold_array = NULL;
4035             utf8_fold_flags = 0;
4036           do_nref:
4037
4038             /* For the named back references, find the corresponding buffer
4039              * number */
4040             n = reg_check_named_buff_matched(rex,scan);
4041
4042             if ( ! n ) {
4043                 sayNO;
4044             }
4045             goto do_nref_ref_common;
4046
4047         case REFFL:
4048             PL_reg_flags |= RF_tainted;
4049             folder = foldEQ_locale;
4050             fold_array = PL_fold_locale;
4051             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4052             goto do_ref;
4053
4054         case REFFA:
4055             folder = foldEQ_latin1;
4056             fold_array = PL_fold_latin1;
4057             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4058             goto do_ref;
4059
4060         case REFFU:
4061             folder = foldEQ_latin1;
4062             fold_array = PL_fold_latin1;
4063             utf8_fold_flags = 0;
4064             goto do_ref;
4065
4066         case REFF:
4067             folder = foldEQ;
4068             fold_array = PL_fold;
4069             utf8_fold_flags = 0;
4070             goto do_ref;
4071
4072         case REF:
4073             folder = NULL;
4074             fold_array = NULL;
4075             utf8_fold_flags = 0;
4076
4077           do_ref:
4078             type = OP(scan);
4079             n = ARG(scan);  /* which paren pair */
4080
4081           do_nref_ref_common:
4082             ln = PL_regoffs[n].start;
4083             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4084             if (*PL_reglastparen < n || ln == -1)
4085                 sayNO;                  /* Do not match unless seen CLOSEn. */
4086             if (ln == PL_regoffs[n].end)
4087                 break;
4088
4089             s = PL_bostr + ln;
4090             if (type != REF     /* REF can do byte comparison */
4091                 && (utf8_target || type == REFFU))
4092             { /* XXX handle REFFL better */
4093                 char * limit = PL_regeol;
4094
4095                 /* This call case insensitively compares the entire buffer
4096                     * at s, with the current input starting at locinput, but
4097                     * not going off the end given by PL_regeol, and returns in
4098                     * limit upon success, how much of the current input was
4099                     * matched */
4100                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4101                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4102                 {
4103                     sayNO;
4104                 }
4105                 locinput = limit;
4106                 nextchr = UCHARAT(locinput);
4107                 break;
4108             }
4109
4110             /* Not utf8:  Inline the first character, for speed. */
4111             if (UCHARAT(s) != nextchr &&
4112                 (type == REF ||
4113                  UCHARAT(s) != fold_array[nextchr]))
4114                 sayNO;
4115             ln = PL_regoffs[n].end - ln;
4116             if (locinput + ln > PL_regeol)
4117                 sayNO;
4118             if (ln > 1 && (type == REF
4119                            ? memNE(s, locinput, ln)
4120                            : ! folder(s, locinput, ln)))
4121                 sayNO;
4122             locinput += ln;
4123             nextchr = UCHARAT(locinput);
4124             break;
4125         }
4126         case NOTHING:
4127         case TAIL:
4128             break;
4129         case BACK:
4130             break;
4131
4132 #undef  ST
4133 #define ST st->u.eval
4134         {
4135             SV *ret;
4136             REGEXP *re_sv;
4137             regexp *re;
4138             regexp_internal *rei;
4139             regnode *startpoint;
4140
4141         case GOSTART:
4142         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4143             if (cur_eval && cur_eval->locinput==locinput) {
4144                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4145                     Perl_croak(aTHX_ "Infinite recursion in regex");
4146                 if ( ++nochange_depth > max_nochange_depth )
4147                     Perl_croak(aTHX_
4148                         "Pattern subroutine nesting without pos change"
4149                         " exceeded limit in regex");
4150             } else {
4151                 nochange_depth = 0;
4152             }
4153             re_sv = rex_sv;
4154             re = rex;
4155             rei = rexi;
4156             (void)ReREFCNT_inc(rex_sv);
4157             if (OP(scan)==GOSUB) {
4158                 startpoint = scan + ARG2L(scan);
4159                 ST.close_paren = ARG(scan);
4160             } else {
4161                 startpoint = rei->program+1;
4162                 ST.close_paren = 0;
4163             }
4164             goto eval_recurse_doit;
4165             /* NOTREACHED */
4166         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4167             if (cur_eval && cur_eval->locinput==locinput) {
4168                 if ( ++nochange_depth > max_nochange_depth )
4169                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4170             } else {
4171                 nochange_depth = 0;
4172             }
4173             {
4174                 /* execute the code in the {...} */
4175                 dSP;
4176                 SV ** const before = SP;
4177                 OP_4tree * const oop = PL_op;
4178                 COP * const ocurcop = PL_curcop;
4179                 PAD *old_comppad;
4180                 char *saved_regeol = PL_regeol;
4181                 struct re_save_state saved_state;
4182
4183                 /* To not corrupt the existing regex state while executing the
4184                  * eval we would normally put it on the save stack, like with
4185                  * save_re_context. However, re-evals have a weird scoping so we
4186                  * can't just add ENTER/LEAVE here. With that, things like
4187                  *
4188                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4189                  *
4190                  * would break, as they expect the localisation to be unwound
4191                  * only when the re-engine backtracks through the bit that
4192                  * localised it.
4193                  *
4194                  * What we do instead is just saving the state in a local c
4195                  * variable.
4196                  */
4197                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4198
4199                 n = ARG(scan);
4200                 PL_op = (OP_4tree*)rexi->data->data[n];
4201                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4202                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4203                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4204                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4205
4206                 if (sv_yes_mark) {
4207                     SV *sv_mrk = get_sv("REGMARK", 1);
4208                     sv_setsv(sv_mrk, sv_yes_mark);
4209                 }
4210
4211                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4212                 SPAGAIN;
4213                 if (SP == before)
4214                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4215                 else {
4216                     ret = POPs;
4217                     PUTBACK;
4218                 }
4219
4220                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4221
4222                 PL_op = oop;
4223                 PAD_RESTORE_LOCAL(old_comppad);
4224                 PL_curcop = ocurcop;
4225                 PL_regeol = saved_regeol;
4226                 if (!logical) {
4227                     /* /(?{...})/ */
4228                     sv_setsv(save_scalar(PL_replgv), ret);
4229                     break;
4230                 }
4231             }
4232             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4233                 logical = 0;
4234                 {
4235                     /* extract RE object from returned value; compiling if
4236                      * necessary */
4237                     MAGIC *mg = NULL;
4238                     REGEXP *rx = NULL;
4239
4240                     if (SvROK(ret)) {
4241                         SV *const sv = SvRV(ret);
4242
4243                         if (SvTYPE(sv) == SVt_REGEXP) {
4244                             rx = (REGEXP*) sv;
4245                         } else if (SvSMAGICAL(sv)) {
4246                             mg = mg_find(sv, PERL_MAGIC_qr);
4247                             assert(mg);
4248                         }
4249                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4250                         rx = (REGEXP*) ret;
4251                     } else if (SvSMAGICAL(ret)) {
4252                         if (SvGMAGICAL(ret)) {
4253                             /* I don't believe that there is ever qr magic
4254                                here.  */
4255                             assert(!mg_find(ret, PERL_MAGIC_qr));
4256                             sv_unmagic(ret, PERL_MAGIC_qr);
4257                         }
4258                         else {
4259                             mg = mg_find(ret, PERL_MAGIC_qr);
4260                             /* testing suggests mg only ends up non-NULL for
4261                                scalars who were upgraded and compiled in the
4262                                else block below. In turn, this is only
4263                                triggered in the "postponed utf8 string" tests
4264                                in t/op/pat.t  */
4265                         }
4266                     }
4267
4268                     if (mg) {
4269                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4270                         assert(rx);
4271                     }
4272                     if (rx) {
4273                         rx = reg_temp_copy(NULL, rx);
4274                     }
4275                     else {
4276                         U32 pm_flags = 0;
4277                         const I32 osize = PL_regsize;
4278
4279                         if (DO_UTF8(ret)) {
4280                             assert (SvUTF8(ret));
4281                         } else if (SvUTF8(ret)) {
4282                             /* Not doing UTF-8, despite what the SV says. Is
4283                                this only if we're trapped in use 'bytes'?  */
4284                             /* Make a copy of the octet sequence, but without
4285                                the flag on, as the compiler now honours the
4286                                SvUTF8 flag on ret.  */
4287                             STRLEN len;
4288                             const char *const p = SvPV(ret, len);
4289                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4290                         }
4291                         rx = CALLREGCOMP(ret, pm_flags);
4292                         if (!(SvFLAGS(ret)
4293                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4294                                  | SVs_GMG))) {
4295                             /* This isn't a first class regexp. Instead, it's
4296                                caching a regexp onto an existing, Perl visible
4297                                scalar.  */
4298                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4299                         }
4300                         PL_regsize = osize;
4301                     }
4302                     re_sv = rx;
4303                     re = (struct regexp *)SvANY(rx);
4304                 }
4305                 RXp_MATCH_COPIED_off(re);
4306                 re->subbeg = rex->subbeg;
4307                 re->sublen = rex->sublen;
4308                 rei = RXi_GET(re);
4309                 DEBUG_EXECUTE_r(
4310                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4311                         "Matching embedded");
4312                 );
4313                 startpoint = rei->program + 1;
4314                 ST.close_paren = 0; /* only used for GOSUB */
4315                 /* borrowed from regtry */
4316                 if (PL_reg_start_tmpl <= re->nparens) {
4317                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4318                     if(PL_reg_start_tmp)
4319                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4320                     else
4321                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4322                 }
4323
4324         eval_recurse_doit: /* Share code with GOSUB below this line */
4325                 /* run the pattern returned from (??{...}) */
4326                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4327                 REGCP_SET(ST.lastcp);
4328
4329                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4330
4331                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4332                 PL_reglastparen = &re->lastparen;
4333                 PL_reglastcloseparen = &re->lastcloseparen;
4334                 re->lastparen = 0;
4335                 re->lastcloseparen = 0;
4336
4337                 PL_reginput = locinput;
4338                 PL_regsize = 0;
4339
4340                 /* XXXX This is too dramatic a measure... */
4341                 PL_reg_maxiter = 0;
4342
4343                 ST.toggle_reg_flags = PL_reg_flags;
4344                 if (RX_UTF8(re_sv))
4345                     PL_reg_flags |= RF_utf8;
4346                 else
4347                     PL_reg_flags &= ~RF_utf8;
4348                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4349
4350                 ST.prev_rex = rex_sv;
4351                 ST.prev_curlyx = cur_curlyx;
4352                 SETREX(rex_sv,re_sv);
4353                 rex = re;
4354                 rexi = rei;
4355                 cur_curlyx = NULL;
4356                 ST.B = next;
4357                 ST.prev_eval = cur_eval;
4358                 cur_eval = st;
4359                 /* now continue from first node in postoned RE */
4360                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4361                 /* NOTREACHED */
4362             }
4363             /* logical is 1,   /(?(?{...})X|Y)/ */
4364             sw = cBOOL(SvTRUE(ret));
4365             logical = 0;
4366             break;
4367         }
4368
4369         case EVAL_AB: /* cleanup after a successful (??{A})B */
4370             /* note: this is called twice; first after popping B, then A */
4371             PL_reg_flags ^= ST.toggle_reg_flags;
4372             ReREFCNT_dec(rex_sv);
4373             SETREX(rex_sv,ST.prev_rex);
4374             rex = (struct regexp *)SvANY(rex_sv);
4375             rexi = RXi_GET(rex);
4376             regcpblow(ST.cp);
4377             cur_eval = ST.prev_eval;
4378             cur_curlyx = ST.prev_curlyx;
4379
4380             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4381             PL_reglastparen = &rex->lastparen;
4382             PL_reglastcloseparen = &rex->lastcloseparen;
4383             /* also update PL_regoffs */
4384             PL_regoffs = rex->offs;
4385
4386             /* XXXX This is too dramatic a measure... */
4387             PL_reg_maxiter = 0;
4388             if ( nochange_depth )
4389                 nochange_depth--;
4390             sayYES;
4391
4392
4393         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4394             /* note: this is called twice; first after popping B, then A */
4395             PL_reg_flags ^= ST.toggle_reg_flags;
4396             ReREFCNT_dec(rex_sv);
4397             SETREX(rex_sv,ST.prev_rex);
4398             rex = (struct regexp *)SvANY(rex_sv);
4399             rexi = RXi_GET(rex);
4400             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4401             PL_reglastparen = &rex->lastparen;
4402             PL_reglastcloseparen = &rex->lastcloseparen;
4403
4404             PL_reginput = locinput;
4405             REGCP_UNWIND(ST.lastcp);
4406             regcppop(rex);
4407             cur_eval = ST.prev_eval;
4408             cur_curlyx = ST.prev_curlyx;
4409             /* XXXX This is too dramatic a measure... */
4410             PL_reg_maxiter = 0;
4411             if ( nochange_depth )
4412                 nochange_depth--;
4413             sayNO_SILENT;
4414 #undef ST
4415
4416         case OPEN:
4417             n = ARG(scan);  /* which paren pair */
4418             PL_reg_start_tmp[n] = locinput;
4419             if (n > PL_regsize)
4420                 PL_regsize = n;
4421             lastopen = n;
4422             break;
4423         case CLOSE:
4424             n = ARG(scan);  /* which paren pair */
4425             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4426             PL_regoffs[n].end = locinput - PL_bostr;
4427             /*if (n > PL_regsize)
4428                 PL_regsize = n;*/
4429             if (n > *PL_reglastparen)
4430                 *PL_reglastparen = n;
4431             *PL_reglastcloseparen = n;
4432             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4433                 goto fake_end;
4434             }
4435             break;
4436         case ACCEPT:
4437             if (ARG(scan)){
4438                 regnode *cursor;
4439                 for (cursor=scan;
4440                      cursor && OP(cursor)!=END;
4441                      cursor=regnext(cursor))
4442                 {
4443                     if ( OP(cursor)==CLOSE ){
4444                         n = ARG(cursor);
4445                         if ( n <= lastopen ) {
4446                             PL_regoffs[n].start
4447                                 = PL_reg_start_tmp[n] - PL_bostr;
4448                             PL_regoffs[n].end = locinput - PL_bostr;
4449                             /*if (n > PL_regsize)
4450                             PL_regsize = n;*/
4451                             if (n > *PL_reglastparen)
4452                                 *PL_reglastparen = n;
4453                             *PL_reglastcloseparen = n;
4454                             if ( n == ARG(scan) || (cur_eval &&
4455                                 cur_eval->u.eval.close_paren == n))
4456                                 break;
4457                         }
4458                     }
4459                 }
4460             }
4461             goto fake_end;
4462             /*NOTREACHED*/
4463         case GROUPP:
4464             n = ARG(scan);  /* which paren pair */
4465             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4466             break;
4467         case NGROUPP:
4468             /* reg_check_named_buff_matched returns 0 for no match */
4469             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4470             break;
4471         case INSUBP:
4472             n = ARG(scan);
4473             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4474             break;
4475         case DEFINEP:
4476             sw = 0;
4477             break;
4478         case IFTHEN:
4479             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4480             if (sw)
4481                 next = NEXTOPER(NEXTOPER(scan));
4482             else {
4483                 next = scan + ARG(scan);
4484                 if (OP(next) == IFTHEN) /* Fake one. */
4485                     next = NEXTOPER(NEXTOPER(next));
4486             }
4487             break;
4488         case LOGICAL:
4489             logical = scan->flags;
4490             break;
4491
4492 /*******************************************************************
4493
4494 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4495 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4496 STAR/PLUS/CURLY/CURLYN are used instead.)
4497
4498 A*B is compiled as <CURLYX><A><WHILEM><B>
4499
4500 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4501 state, which contains the current count, initialised to -1. It also sets
4502 cur_curlyx to point to this state, with any previous value saved in the
4503 state block.
4504
4505 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4506 since the pattern may possibly match zero times (i.e. it's a while {} loop
4507 rather than a do {} while loop).
4508
4509 Each entry to WHILEM represents a successful match of A. The count in the
4510 CURLYX block is incremented, another WHILEM state is pushed, and execution
4511 passes to A or B depending on greediness and the current count.
4512
4513 For example, if matching against the string a1a2a3b (where the aN are
4514 substrings that match /A/), then the match progresses as follows: (the
4515 pushed states are interspersed with the bits of strings matched so far):
4516
4517     <CURLYX cnt=-1>
4518     <CURLYX cnt=0><WHILEM>
4519     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4520     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4521     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4522     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4523
4524 (Contrast this with something like CURLYM, which maintains only a single
4525 backtrack state:
4526
4527     <CURLYM cnt=0> a1
4528     a1 <CURLYM cnt=1> a2
4529     a1 a2 <CURLYM cnt=2> a3
4530     a1 a2 a3 <CURLYM cnt=3> b
4531 )
4532
4533 Each WHILEM state block marks a point to backtrack to upon partial failure
4534 of A or B, and also contains some minor state data related to that
4535 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4536 overall state, such as the count, and pointers to the A and B ops.
4537
4538 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4539 must always point to the *current* CURLYX block, the rules are:
4540
4541 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4542 and set cur_curlyx to point the new block.
4543
4544 When popping the CURLYX block after a successful or unsuccessful match,
4545 restore the previous cur_curlyx.
4546
4547 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4548 to the outer one saved in the CURLYX block.
4549
4550 When popping the WHILEM block after a successful or unsuccessful B match,
4551 restore the previous cur_curlyx.
4552
4553 Here's an example for the pattern (AI* BI)*BO
4554 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4555
4556 cur_
4557 curlyx backtrack stack
4558 ------ ---------------
4559 NULL
4560 CO     <CO prev=NULL> <WO>
4561 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4562 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4563 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4564
4565 At this point the pattern succeeds, and we work back down the stack to
4566 clean up, restoring as we go:
4567
4568 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4569 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4570 CO     <CO prev=NULL> <WO>
4571 NULL
4572
4573 *******************************************************************/
4574
4575 #define ST st->u.curlyx
4576
4577         case CURLYX:    /* start of /A*B/  (for complex A) */
4578         {
4579             /* No need to save/restore up to this paren */
4580             I32 parenfloor = scan->flags;
4581
4582             assert(next); /* keep Coverity happy */
4583             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4584                 next += ARG(next);
4585
4586             /* XXXX Probably it is better to teach regpush to support
4587                parenfloor > PL_regsize... */
4588             if (parenfloor > (I32)*PL_reglastparen)
4589                 parenfloor = *PL_reglastparen; /* Pessimization... */
4590
4591             ST.prev_curlyx= cur_curlyx;
4592             cur_curlyx = st;
4593             ST.cp = PL_savestack_ix;
4594
4595             /* these fields contain the state of the current curly.
4596              * they are accessed by subsequent WHILEMs */
4597             ST.parenfloor = parenfloor;
4598             ST.me = scan;
4599             ST.B = next;
4600             ST.minmod = minmod;
4601             minmod = 0;
4602             ST.count = -1;      /* this will be updated by WHILEM */
4603             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4604
4605             PL_reginput = locinput;
4606             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4607             /* NOTREACHED */
4608         }
4609
4610         case CURLYX_end: /* just finished matching all of A*B */
4611             cur_curlyx = ST.prev_curlyx;
4612             sayYES;
4613             /* NOTREACHED */
4614
4615         case CURLYX_end_fail: /* just failed to match all of A*B */
4616             regcpblow(ST.cp);
4617             cur_curlyx = ST.prev_curlyx;
4618             sayNO;
4619             /* NOTREACHED */
4620
4621
4622 #undef ST
4623 #define ST st->u.whilem
4624
4625         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4626         {
4627             /* see the discussion above about CURLYX/WHILEM */
4628             I32 n;
4629             int min = ARG1(cur_curlyx->u.curlyx.me);
4630             int max = ARG2(cur_curlyx->u.curlyx.me);
4631             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4632
4633             assert(cur_curlyx); /* keep Coverity happy */
4634             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4635             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4636             ST.cache_offset = 0;
4637             ST.cache_mask = 0;
4638
4639             PL_reginput = locinput;
4640
4641             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4642                   "%*s  whilem: matched %ld out of %d..%d\n",
4643                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4644             );
4645
4646             /* First just match a string of min A's. */
4647
4648             if (n < min) {
4649                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4650                 cur_curlyx->u.curlyx.lastloc = locinput;
4651                 REGCP_SET(ST.lastcp);
4652
4653                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4654                 /* NOTREACHED */
4655             }
4656
4657             /* If degenerate A matches "", assume A done. */
4658
4659             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4660                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4661                    "%*s  whilem: empty match detected, trying continuation...\n",
4662                    REPORT_CODE_OFF+depth*2, "")
4663                 );
4664                 goto do_whilem_B_max;
4665             }
4666
4667             /* super-linear cache processing */
4668
4669             if (scan->flags) {
4670
4671                 if (!PL_reg_maxiter) {
4672                     /* start the countdown: Postpone detection until we
4673                      * know the match is not *that* much linear. */
4674                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4675                     /* possible overflow for long strings and many CURLYX's */
4676                     if (PL_reg_maxiter < 0)
4677                         PL_reg_maxiter = I32_MAX;
4678                     PL_reg_leftiter = PL_reg_maxiter;
4679                 }
4680
4681                 if (PL_reg_leftiter-- == 0) {
4682                     /* initialise cache */
4683                     const I32 size = (PL_reg_maxiter + 7)/8;
4684                     if (PL_reg_poscache) {
4685                         if ((I32)PL_reg_poscache_size < size) {
4686                             Renew(PL_reg_poscache, size, char);
4687                             PL_reg_poscache_size = size;
4688                         }
4689                         Zero(PL_reg_poscache, size, char);
4690                     }
4691                     else {
4692                         PL_reg_poscache_size = size;
4693                         Newxz(PL_reg_poscache, size, char);
4694                     }
4695                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4696       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4697                               PL_colors[4], PL_colors[5])
4698                     );
4699                 }
4700
4701                 if (PL_reg_leftiter < 0) {
4702                     /* have we already failed at this position? */
4703                     I32 offset, mask;
4704                     offset  = (scan->flags & 0xf) - 1
4705                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4706                     mask    = 1 << (offset % 8);
4707                     offset /= 8;
4708                     if (PL_reg_poscache[offset] & mask) {
4709                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4710                             "%*s  whilem: (cache) already tried at this position...\n",
4711                             REPORT_CODE_OFF+depth*2, "")
4712                         );
4713                         sayNO; /* cache records failure */
4714                     }
4715                     ST.cache_offset = offset;
4716                     ST.cache_mask   = mask;
4717                 }
4718             }
4719
4720             /* Prefer B over A for minimal matching. */
4721
4722             if (cur_curlyx->u.curlyx.minmod) {
4723                 ST.save_curlyx = cur_curlyx;
4724                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4725                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4726                 REGCP_SET(ST.lastcp);
4727                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4728                 /* NOTREACHED */
4729             }
4730
4731             /* Prefer A over B for maximal matching. */
4732
4733             if (n < max) { /* More greed allowed? */
4734                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4735                 cur_curlyx->u.curlyx.lastloc = locinput;
4736                 REGCP_SET(ST.lastcp);
4737                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4738                 /* NOTREACHED */
4739             }
4740             goto do_whilem_B_max;
4741         }
4742         /* NOTREACHED */
4743
4744         case WHILEM_B_min: /* just matched B in a minimal match */
4745         case WHILEM_B_max: /* just matched B in a maximal match */
4746             cur_curlyx = ST.save_curlyx;
4747             sayYES;
4748             /* NOTREACHED */
4749
4750         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4751             cur_curlyx = ST.save_curlyx;
4752             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4753             cur_curlyx->u.curlyx.count--;
4754             CACHEsayNO;
4755             /* NOTREACHED */
4756
4757         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4758             /* FALL THROUGH */
4759         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4760             REGCP_UNWIND(ST.lastcp);
4761             regcppop(rex);
4762             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4763             cur_curlyx->u.curlyx.count--;
4764             CACHEsayNO;
4765             /* NOTREACHED */
4766
4767         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4768             REGCP_UNWIND(ST.lastcp);
4769             regcppop(rex);      /* Restore some previous $<digit>s? */
4770             PL_reginput = locinput;
4771             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4772                 "%*s  whilem: failed, trying continuation...\n",
4773                 REPORT_CODE_OFF+depth*2, "")
4774             );
4775           do_whilem_B_max:
4776             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4777                 && ckWARN(WARN_REGEXP)
4778                 && !(PL_reg_flags & RF_warned))
4779             {
4780                 PL_reg_flags |= RF_warned;
4781                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4782                      "Complex regular subexpression recursion",
4783                      REG_INFTY - 1);
4784             }
4785
4786             /* now try B */
4787             ST.save_curlyx = cur_curlyx;
4788             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4789             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4790             /* NOTREACHED */
4791
4792         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4793             cur_curlyx = ST.save_curlyx;
4794             REGCP_UNWIND(ST.lastcp);
4795             regcppop(rex);
4796
4797             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4798                 /* Maximum greed exceeded */
4799                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4800                     && ckWARN(WARN_REGEXP)
4801                     && !(PL_reg_flags & RF_warned))
4802                 {
4803                     PL_reg_flags |= RF_warned;
4804                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4805                         "%s limit (%d) exceeded",
4806                         "Complex regular subexpression recursion",
4807                         REG_INFTY - 1);
4808                 }
4809                 cur_curlyx->u.curlyx.count--;
4810                 CACHEsayNO;
4811             }
4812
4813             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4814                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4815             );
4816             /* Try grabbing another A and see if it helps. */
4817             PL_reginput = locinput;
4818             cur_curlyx->u.curlyx.lastloc = locinput;
4819             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4820             REGCP_SET(ST.lastcp);
4821             PUSH_STATE_GOTO(WHILEM_A_min,
4822                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4823             /* NOTREACHED */
4824
4825 #undef  ST
4826 #define ST st->u.branch
4827
4828         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4829             next = scan + ARG(scan);
4830             if (next == scan)
4831                 next = NULL;
4832             scan = NEXTOPER(scan);
4833             /* FALL THROUGH */
4834
4835         case BRANCH:        /*  /(...|A|...)/ */
4836             scan = NEXTOPER(scan); /* scan now points to inner node */
4837             ST.lastparen = *PL_reglastparen;
4838             ST.next_branch = next;
4839             REGCP_SET(ST.cp);
4840             PL_reginput = locinput;
4841
4842             /* Now go into the branch */
4843             if (has_cutgroup) {
4844                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4845             } else {
4846                 PUSH_STATE_GOTO(BRANCH_next, scan);
4847             }
4848             /* NOTREACHED */
4849         case CUTGROUP:
4850             PL_reginput = locinput;
4851             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4852                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4853             PUSH_STATE_GOTO(CUTGROUP_next,next);
4854             /* NOTREACHED */
4855         case CUTGROUP_next_fail:
4856             do_cutgroup = 1;
4857             no_final = 1;
4858             if (st->u.mark.mark_name)
4859                 sv_commit = st->u.mark.mark_name;
4860             sayNO;
4861             /* NOTREACHED */
4862         case BRANCH_next:
4863             sayYES;
4864             /* NOTREACHED */
4865         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4866             if (do_cutgroup) {
4867                 do_cutgroup = 0;
4868                 no_final = 0;
4869             }
4870             REGCP_UNWIND(ST.cp);
4871             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4872                 PL_regoffs[n].end = -1;
4873             *PL_reglastparen = n;
4874             /*dmq: *PL_reglastcloseparen = n; */
4875             scan = ST.next_branch;
4876             /* no more branches? */
4877             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4878                 DEBUG_EXECUTE_r({
4879                     PerlIO_printf( Perl_debug_log,
4880                         "%*s  %sBRANCH failed...%s\n",
4881                         REPORT_CODE_OFF+depth*2, "",
4882                         PL_colors[4],
4883                         PL_colors[5] );
4884                 });
4885                 sayNO_SILENT;
4886             }
4887             continue; /* execute next BRANCH[J] op */
4888             /* NOTREACHED */
4889
4890         case MINMOD:
4891             minmod = 1;
4892             break;
4893
4894 #undef  ST
4895 #define ST st->u.curlym
4896
4897         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4898
4899             /* This is an optimisation of CURLYX that enables us to push
4900              * only a single backtracking state, no matter how many matches
4901              * there are in {m,n}. It relies on the pattern being constant
4902              * length, with no parens to influence future backrefs
4903              */
4904
4905             ST.me = scan;
4906             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4907
4908             /* if paren positive, emulate an OPEN/CLOSE around A */
4909             if (ST.me->flags) {
4910                 U32 paren = ST.me->flags;
4911                 if (paren > PL_regsize)
4912                     PL_regsize = paren;
4913                 if (paren > *PL_reglastparen)
4914                     *PL_reglastparen = paren;
4915                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4916             }
4917             ST.A = scan;
4918             ST.B = next;
4919             ST.alen = 0;
4920             ST.count = 0;
4921             ST.minmod = minmod;
4922             minmod = 0;
4923             ST.c1 = CHRTEST_UNINIT;
4924             REGCP_SET(ST.cp);
4925
4926             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4927                 goto curlym_do_B;
4928
4929           curlym_do_A: /* execute the A in /A{m,n}B/  */
4930             PL_reginput = locinput;
4931             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4932             /* NOTREACHED */
4933
4934         case CURLYM_A: /* we've just matched an A */
4935             locinput = st->locinput;
4936             nextchr = UCHARAT(locinput);
4937
4938             ST.count++;
4939             /* after first match, determine A's length: u.curlym.alen */
4940             if (ST.count == 1) {
4941                 if (PL_reg_match_utf8) {
4942                     char *s = locinput;
4943                     while (s < PL_reginput) {
4944                         ST.alen++;
4945                         s += UTF8SKIP(s);
4946                     }
4947                 }
4948                 else {
4949                     ST.alen = PL_reginput - locinput;
4950                 }
4951                 if (ST.alen == 0)
4952                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4953             }
4954             DEBUG_EXECUTE_r(
4955                 PerlIO_printf(Perl_debug_log,
4956                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4957                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4958                           (IV) ST.count, (IV)ST.alen)
4959             );
4960
4961             locinput = PL_reginput;
4962
4963             if (cur_eval && cur_eval->u.eval.close_paren &&
4964                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4965                 goto fake_end;
4966
4967             {
4968                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4969                 if ( max == REG_INFTY || ST.count < max )
4970                     goto curlym_do_A; /* try to match another A */
4971             }
4972             goto curlym_do_B; /* try to match B */
4973
4974         case CURLYM_A_fail: /* just failed to match an A */
4975             REGCP_UNWIND(ST.cp);
4976
4977             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4978                 || (cur_eval && cur_eval->u.eval.close_paren &&
4979                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4980                 sayNO;
4981
4982           curlym_do_B: /* execute the B in /A{m,n}B/  */
4983             PL_reginput = locinput;
4984             if (ST.c1 == CHRTEST_UNINIT) {
4985                 /* calculate c1 and c2 for possible match of 1st char
4986                  * following curly */
4987                 ST.c1 = ST.c2 = CHRTEST_VOID;
4988                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4989                     regnode *text_node = ST.B;
4990                     if (! HAS_TEXT(text_node))
4991                         FIND_NEXT_IMPT(text_node);
4992                     /* this used to be
4993
4994                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4995
4996                         But the former is redundant in light of the latter.
4997
4998                         if this changes back then the macro for
4999                         IS_TEXT and friends need to change.
5000                      */
5001                     if (PL_regkind[OP(text_node)] == EXACT)
5002                     {
5003
5004                         ST.c1 = (U8)*STRING(text_node);
5005                         switch (OP(text_node)) {
5006                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5007                             case EXACTFA:
5008                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5009                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5010                             default: ST.c2 = ST.c1;
5011                         }
5012                     }
5013                 }
5014             }
5015
5016             DEBUG_EXECUTE_r(
5017                 PerlIO_printf(Perl_debug_log,
5018                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5019                     (int)(REPORT_CODE_OFF+(depth*2)),
5020                     "", (IV)ST.count)
5021                 );
5022             if (ST.c1 != CHRTEST_VOID
5023                     && UCHARAT(PL_reginput) != ST.c1
5024                     && UCHARAT(PL_reginput) != ST.c2)
5025             {
5026                 /* simulate B failing */
5027                 DEBUG_OPTIMISE_r(
5028                     PerlIO_printf(Perl_debug_log,
5029                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5030                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5031                         (IV)ST.c1,(IV)ST.c2
5032                 ));
5033                 state_num = CURLYM_B_fail;
5034                 goto reenter_switch;
5035             }
5036
5037             if (ST.me->flags) {
5038                 /* mark current A as captured */
5039                 I32 paren = ST.me->flags;
5040                 if (ST.count) {
5041                     PL_regoffs[paren].start
5042                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5043                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5044                     /*dmq: *PL_reglastcloseparen = paren; */
5045                 }
5046                 else
5047                     PL_regoffs[paren].end = -1;
5048                 if (cur_eval && cur_eval->u.eval.close_paren &&
5049                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5050                 {
5051                     if (ST.count)
5052                         goto fake_end;
5053                     else
5054                         sayNO;
5055                 }
5056             }
5057
5058             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5059             /* NOTREACHED */
5060
5061         case CURLYM_B_fail: /* just failed to match a B */
5062             REGCP_UNWIND(ST.cp);
5063             if (ST.minmod) {
5064                 I32 max = ARG2(ST.me);
5065                 if (max != REG_INFTY && ST.count == max)
5066                     sayNO;
5067                 goto curlym_do_A; /* try to match a further A */
5068             }
5069             /* backtrack one A */
5070             if (ST.count == ARG1(ST.me) /* min */)
5071                 sayNO;
5072             ST.count--;
5073             locinput = HOPc(locinput, -ST.alen);
5074             goto curlym_do_B; /* try to match B */
5075
5076 #undef ST
5077 #define ST st->u.curly
5078
5079 #define CURLY_SETPAREN(paren, success) \
5080     if (paren) { \
5081         if (success) { \
5082             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5083             PL_regoffs[paren].end = locinput - PL_bostr; \
5084             *PL_reglastcloseparen = paren; \
5085         } \
5086         else \
5087             PL_regoffs[paren].end = -1; \
5088     }
5089
5090         case STAR:              /*  /A*B/ where A is width 1 */
5091             ST.paren = 0;
5092             ST.min = 0;
5093             ST.max = REG_INFTY;
5094             scan = NEXTOPER(scan);
5095             goto repeat;
5096         case PLUS:              /*  /A+B/ where A is width 1 */
5097             ST.paren = 0;
5098             ST.min = 1;
5099             ST.max = REG_INFTY;
5100             scan = NEXTOPER(scan);
5101             goto repeat;
5102         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5103             ST.paren = scan->flags;     /* Which paren to set */
5104             if (ST.paren > PL_regsize)
5105                 PL_regsize = ST.paren;
5106             if (ST.paren > *PL_reglastparen)
5107                 *PL_reglastparen = ST.paren;
5108             ST.min = ARG1(scan);  /* min to match */
5109             ST.max = ARG2(scan);  /* max to match */
5110             if (cur_eval && cur_eval->u.eval.close_paren &&
5111                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5112                 ST.min=1;
5113                 ST.max=1;
5114             }
5115             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5116             goto repeat;
5117         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5118             ST.paren = 0;
5119             ST.min = ARG1(scan);  /* min to match */
5120             ST.max = ARG2(scan);  /* max to match */
5121             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5122           repeat:
5123             /*
5124             * Lookahead to avoid useless match attempts
5125             * when we know what character comes next.
5126             *
5127             * Used to only do .*x and .*?x, but now it allows
5128             * for )'s, ('s and (?{ ... })'s to be in the way
5129             * of the quantifier and the EXACT-like node.  -- japhy
5130             */
5131
5132             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5133                 sayNO;
5134             if (HAS_TEXT(next) || JUMPABLE(next)) {
5135                 U8 *s;
5136                 regnode *text_node = next;
5137
5138                 if (! HAS_TEXT(text_node))
5139                     FIND_NEXT_IMPT(text_node);
5140
5141                 if (! HAS_TEXT(text_node))
5142                     ST.c1 = ST.c2 = CHRTEST_VOID;
5143                 else {
5144                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5145                         ST.c1 = ST.c2 = CHRTEST_VOID;
5146                         goto assume_ok_easy;
5147                     }
5148                     else
5149                         s = (U8*)STRING(text_node);
5150
5151                     /*  Currently we only get here when
5152
5153                         PL_rekind[OP(text_node)] == EXACT
5154
5155                         if this changes back then the macro for IS_TEXT and
5156                         friends need to change. */
5157                     if (!UTF_PATTERN) {
5158                         ST.c1 = *s;
5159                         switch (OP(text_node)) {
5160                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5161                             case EXACTFA:
5162                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5163                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5164                             default: ST.c2 = ST.c1; break;
5165                         }
5166                     }
5167                     else { /* UTF_PATTERN */
5168                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5169                              STRLEN ulen1, ulen2;
5170                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5171                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5172
5173                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5174                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5175 #ifdef EBCDIC
5176                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5177                                                     ckWARN(WARN_UTF8) ?
5178                                                     0 : UTF8_ALLOW_ANY);
5179                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5180                                                     ckWARN(WARN_UTF8) ?
5181                                                     0 : UTF8_ALLOW_ANY);
5182 #else
5183                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5184                                                     uniflags);
5185                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5186                                                     uniflags);
5187 #endif
5188                         }
5189                         else {
5190                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5191                                                      uniflags);
5192                         }
5193                     }
5194                 }
5195             }
5196             else
5197                 ST.c1 = ST.c2 = CHRTEST_VOID;
5198         assume_ok_easy:
5199
5200             ST.A = scan;
5201             ST.B = next;
5202             PL_reginput = locinput;
5203             if (minmod) {
5204                 minmod = 0;
5205                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5206                     sayNO;
5207                 ST.count = ST.min;
5208                 locinput = PL_reginput;
5209                 REGCP_SET(ST.cp);
5210                 if (ST.c1 == CHRTEST_VOID)
5211                     goto curly_try_B_min;
5212
5213                 ST.oldloc = locinput;
5214
5215                 /* set ST.maxpos to the furthest point along the
5216                  * string that could possibly match */
5217                 if  (ST.max == REG_INFTY) {
5218                     ST.maxpos = PL_regeol - 1;
5219                     if (utf8_target)
5220                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5221                             ST.maxpos--;
5222                 }
5223                 else if (utf8_target) {
5224                     int m = ST.max - ST.min;
5225                     for (ST.maxpos = locinput;
5226                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5227                         ST.maxpos += UTF8SKIP(ST.maxpos);
5228                 }
5229                 else {
5230                     ST.maxpos = locinput + ST.max - ST.min;
5231                     if (ST.maxpos >= PL_regeol)
5232                         ST.maxpos = PL_regeol - 1;
5233                 }
5234                 goto curly_try_B_min_known;
5235
5236             }
5237             else {
5238                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5239                 locinput = PL_reginput;
5240                 if (ST.count < ST.min)
5241                     sayNO;
5242                 if ((ST.count > ST.min)
5243                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5244                 {
5245                     /* A{m,n} must come at the end of the string, there's
5246                      * no point in backing off ... */
5247                     ST.min = ST.count;
5248                     /* ...except that $ and \Z can match before *and* after
5249                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5250                        We may back off by one in this case. */
5251                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5252                         ST.min--;
5253                 }
5254                 REGCP_SET(ST.cp);
5255                 goto curly_try_B_max;
5256             }
5257             /* NOTREACHED */
5258
5259
5260         case CURLY_B_min_known_fail:
5261             /* failed to find B in a non-greedy match where c1,c2 valid */
5262             if (ST.paren && ST.count)
5263                 PL_regoffs[ST.paren].end = -1;
5264
5265             PL_reginput = locinput;     /* Could be reset... */
5266             REGCP_UNWIND(ST.cp);
5267             /* Couldn't or didn't -- move forward. */
5268             ST.oldloc = locinput;
5269             if (utf8_target)
5270                 locinput += UTF8SKIP(locinput);
5271             else
5272                 locinput++;
5273             ST.count++;
5274           curly_try_B_min_known:
5275              /* find the next place where 'B' could work, then call B */
5276             {
5277                 int n;
5278                 if (utf8_target) {
5279                     n = (ST.oldloc == locinput) ? 0 : 1;
5280                     if (ST.c1 == ST.c2) {
5281                         STRLEN len;
5282                         /* set n to utf8_distance(oldloc, locinput) */
5283                         while (locinput <= ST.maxpos &&
5284                                utf8n_to_uvchr((U8*)locinput,
5285                                               UTF8_MAXBYTES, &len,
5286                                               uniflags) != (UV)ST.c1) {
5287                             locinput += len;
5288                             n++;
5289                         }
5290                     }
5291                     else {
5292                         /* set n to utf8_distance(oldloc, locinput) */
5293                         while (locinput <= ST.maxpos) {
5294                             STRLEN len;
5295                             const UV c = utf8n_to_uvchr((U8*)locinput,
5296                                                   UTF8_MAXBYTES, &len,
5297                                                   uniflags);
5298                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5299                                 break;
5300                             locinput += len;
5301                             n++;
5302                         }
5303                     }
5304                 }
5305                 else {
5306                     if (ST.c1 == ST.c2) {
5307                         while (locinput <= ST.maxpos &&
5308                                UCHARAT(locinput) != ST.c1)
5309                             locinput++;
5310                     }
5311                     else {
5312                         while (locinput <= ST.maxpos
5313                                && UCHARAT(locinput) != ST.c1
5314                                && UCHARAT(locinput) != ST.c2)
5315                             locinput++;
5316                     }
5317                     n = locinput - ST.oldloc;
5318                 }
5319                 if (locinput > ST.maxpos)
5320                     sayNO;
5321                 /* PL_reginput == oldloc now */
5322                 if (n) {
5323                     ST.count += n;
5324                     if (regrepeat(rex, ST.A, n, depth) < n)
5325                         sayNO;
5326                 }
5327                 PL_reginput = locinput;
5328                 CURLY_SETPAREN(ST.paren, ST.count);
5329                 if (cur_eval && cur_eval->u.eval.close_paren &&
5330                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5331                     goto fake_end;
5332                 }
5333                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5334             }
5335             /* NOTREACHED */
5336
5337
5338         case CURLY_B_min_fail:
5339             /* failed to find B in a non-greedy match where c1,c2 invalid */
5340             if (ST.paren && ST.count)
5341                 PL_regoffs[ST.paren].end = -1;
5342
5343             REGCP_UNWIND(ST.cp);
5344             /* failed -- move forward one */
5345             PL_reginput = locinput;
5346             if (regrepeat(rex, ST.A, 1, depth)) {
5347                 ST.count++;
5348                 locinput = PL_reginput;
5349                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5350                         ST.count > 0)) /* count overflow ? */
5351                 {
5352                   curly_try_B_min:
5353                     CURLY_SETPAREN(ST.paren, ST.count);
5354                     if (cur_eval && cur_eval->u.eval.close_paren &&
5355                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5356                         goto fake_end;
5357                     }
5358                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5359                 }
5360             }
5361             sayNO;
5362             /* NOTREACHED */
5363
5364
5365         curly_try_B_max:
5366             /* a successful greedy match: now try to match B */
5367             if (cur_eval && cur_eval->u.eval.close_paren &&
5368                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5369                 goto fake_end;
5370             }
5371             {
5372                 UV c = 0;
5373                 if (ST.c1 != CHRTEST_VOID)
5374                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5375                                            UTF8_MAXBYTES, 0, uniflags)
5376                                 : (UV) UCHARAT(PL_reginput);
5377                 /* If it could work, try it. */
5378                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5379                     CURLY_SETPAREN(ST.paren, ST.count);
5380                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5381                     /* NOTREACHED */
5382                 }
5383             }
5384             /* FALL THROUGH */
5385         case CURLY_B_max_fail:
5386             /* failed to find B in a greedy match */
5387             if (ST.paren && ST.count)
5388                 PL_regoffs[ST.paren].end = -1;
5389
5390             REGCP_UNWIND(ST.cp);
5391             /*  back up. */
5392             if (--ST.count < ST.min)
5393                 sayNO;
5394             PL_reginput = locinput = HOPc(locinput, -1);
5395             goto curly_try_B_max;
5396
5397 #undef ST
5398
5399         case END:
5400             fake_end:
5401             if (cur_eval) {
5402                 /* we've just finished A in /(??{A})B/; now continue with B */
5403                 I32 tmpix;
5404                 st->u.eval.toggle_reg_flags
5405                             = cur_eval->u.eval.toggle_reg_flags;
5406                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5407
5408                 st->u.eval.prev_rex = rex_sv;           /* inner */
5409                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5410                 rex = (struct regexp *)SvANY(rex_sv);
5411                 rexi = RXi_GET(rex);
5412                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5413                 (void)ReREFCNT_inc(rex_sv);
5414                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5415
5416                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5417                 PL_reglastparen = &rex->lastparen;
5418                 PL_reglastcloseparen = &rex->lastcloseparen;
5419
5420                 REGCP_SET(st->u.eval.lastcp);
5421                 PL_reginput = locinput;
5422
5423                 /* Restore parens of the outer rex without popping the
5424                  * savestack */
5425                 tmpix = PL_savestack_ix;
5426                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5427                 regcppop(rex);
5428                 PL_savestack_ix = tmpix;
5429
5430                 st->u.eval.prev_eval = cur_eval;
5431                 cur_eval = cur_eval->u.eval.prev_eval;
5432                 DEBUG_EXECUTE_r(
5433                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5434                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5435                 if ( nochange_depth )
5436                     nochange_depth--;
5437
5438                 PUSH_YES_STATE_GOTO(EVAL_AB,
5439                         st->u.eval.prev_eval->u.eval.B); /* match B */
5440             }
5441
5442             if (locinput < reginfo->till) {
5443                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5444                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5445                                       PL_colors[4],
5446                                       (long)(locinput - PL_reg_starttry),
5447                                       (long)(reginfo->till - PL_reg_starttry),
5448                                       PL_colors[5]));
5449
5450                 sayNO_SILENT;           /* Cannot match: too short. */
5451             }
5452             PL_reginput = locinput;     /* put where regtry can find it */
5453             sayYES;                     /* Success! */
5454
5455         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5456             DEBUG_EXECUTE_r(
5457             PerlIO_printf(Perl_debug_log,
5458                 "%*s  %ssubpattern success...%s\n",
5459                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5460             PL_reginput = locinput;     /* put where regtry can find it */
5461             sayYES;                     /* Success! */
5462
5463 #undef  ST
5464 #define ST st->u.ifmatch
5465
5466         case SUSPEND:   /* (?>A) */
5467             ST.wanted = 1;
5468             PL_reginput = locinput;
5469             goto do_ifmatch;
5470
5471         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5472             ST.wanted = 0;
5473             goto ifmatch_trivial_fail_test;
5474
5475         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5476             ST.wanted = 1;
5477           ifmatch_trivial_fail_test:
5478             if (scan->flags) {
5479                 char * const s = HOPBACKc(locinput, scan->flags);
5480                 if (!s) {
5481                     /* trivial fail */
5482                     if (logical) {
5483                         logical = 0;
5484                         sw = 1 - cBOOL(ST.wanted);
5485                     }
5486                     else if (ST.wanted)
5487                         sayNO;
5488                     next = scan + ARG(scan);
5489                     if (next == scan)
5490                         next = NULL;
5491                     break;
5492                 }
5493                 PL_reginput = s;
5494             }
5495             else
5496                 PL_reginput = locinput;
5497
5498           do_ifmatch:
5499             ST.me = scan;
5500             ST.logical = logical;
5501             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5502
5503             /* execute body of (?...A) */
5504             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5505             /* NOTREACHED */
5506
5507         case IFMATCH_A_fail: /* body of (?...A) failed */
5508             ST.wanted = !ST.wanted;
5509             /* FALL THROUGH */
5510
5511         case IFMATCH_A: /* body of (?...A) succeeded */
5512             if (ST.logical) {
5513                 sw = cBOOL(ST.wanted);
5514             }
5515             else if (!ST.wanted)
5516                 sayNO;
5517
5518             if (OP(ST.me) == SUSPEND)
5519                 locinput = PL_reginput;
5520             else {
5521                 locinput = PL_reginput = st->locinput;
5522                 nextchr = UCHARAT(locinput);
5523             }
5524             scan = ST.me + ARG(ST.me);
5525             if (scan == ST.me)
5526                 scan = NULL;
5527             continue; /* execute B */
5528
5529 #undef ST
5530
5531         case LONGJMP:
5532             next = scan + ARG(scan);
5533             if (next == scan)
5534                 next = NULL;
5535             break;
5536         case COMMIT:
5537             reginfo->cutpoint = PL_regeol;
5538             /* FALLTHROUGH */
5539         case PRUNE:
5540             PL_reginput = locinput;
5541             if (!scan->flags)
5542                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5543             PUSH_STATE_GOTO(COMMIT_next,next);
5544             /* NOTREACHED */
5545         case COMMIT_next_fail:
5546             no_final = 1;
5547             /* FALLTHROUGH */
5548         case OPFAIL:
5549             sayNO;
5550             /* NOTREACHED */
5551
5552 #define ST st->u.mark
5553         case MARKPOINT:
5554             ST.prev_mark = mark_state;
5555             ST.mark_name = sv_commit = sv_yes_mark
5556                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5557             mark_state = st;
5558             ST.mark_loc = PL_reginput = locinput;
5559             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5560             /* NOTREACHED */
5561         case MARKPOINT_next:
5562             mark_state = ST.prev_mark;
5563             sayYES;
5564             /* NOTREACHED */
5565         case MARKPOINT_next_fail:
5566             if (popmark && sv_eq(ST.mark_name,popmark))
5567             {
5568                 if (ST.mark_loc > startpoint)
5569                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5570                 popmark = NULL; /* we found our mark */
5571                 sv_commit = ST.mark_name;
5572
5573                 DEBUG_EXECUTE_r({
5574                         PerlIO_printf(Perl_debug_log,
5575                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5576                             REPORT_CODE_OFF+depth*2, "",
5577                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5578                 });
5579             }
5580             mark_state = ST.prev_mark;
5581             sv_yes_mark = mark_state ?
5582                 mark_state->u.mark.mark_name : NULL;
5583             sayNO;
5584             /* NOTREACHED */
5585         case SKIP:
5586             PL_reginput = locinput;
5587             if (scan->flags) {
5588                 /* (*SKIP) : if we fail we cut here*/
5589                 ST.mark_name = NULL;
5590                 ST.mark_loc = locinput;
5591                 PUSH_STATE_GOTO(SKIP_next,next);
5592             } else {
5593                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5594                    otherwise do nothing.  Meaning we need to scan
5595                  */
5596                 regmatch_state *cur = mark_state;
5597                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5598
5599                 while (cur) {
5600                     if ( sv_eq( cur->u.mark.mark_name,
5601                                 find ) )
5602                     {
5603                         ST.mark_name = find;
5604                         PUSH_STATE_GOTO( SKIP_next, next );
5605                     }
5606                     cur = cur->u.mark.prev_mark;
5607                 }
5608             }
5609             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5610             break;
5611         case SKIP_next_fail:
5612             if (ST.mark_name) {
5613                 /* (*CUT:NAME) - Set up to search for the name as we
5614                    collapse the stack*/
5615                 popmark = ST.mark_name;
5616             } else {
5617                 /* (*CUT) - No name, we cut here.*/
5618                 if (ST.mark_loc > startpoint)
5619                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5620                 /* but we set sv_commit to latest mark_name if there
5621                    is one so they can test to see how things lead to this
5622                    cut */
5623                 if (mark_state)
5624                     sv_commit=mark_state->u.mark.mark_name;
5625             }
5626             no_final = 1;
5627             sayNO;
5628             /* NOTREACHED */
5629 #undef ST
5630         case FOLDCHAR:
5631             n = ARG(scan);
5632             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5633                 locinput += ln;
5634             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5635                 sayNO;
5636             } else  {
5637                 U8 folded[UTF8_MAXBYTES_CASE+1];
5638                 STRLEN foldlen;
5639                 const char * const l = locinput;
5640                 char *e = PL_regeol;
5641                 to_uni_fold(n, folded, &foldlen);
5642
5643                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5644                                l, &e, 0,  utf8_target)) {
5645                         sayNO;
5646                 }
5647                 locinput = e;
5648             }
5649             nextchr = UCHARAT(locinput);
5650             break;
5651         case LNBREAK:
5652             if ((n=is_LNBREAK(locinput,utf8_target))) {
5653                 locinput += n;
5654                 nextchr = UCHARAT(locinput);
5655             } else
5656                 sayNO;
5657             break;
5658
5659 #define CASE_CLASS(nAmE)                              \
5660         case nAmE:                                    \
5661             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5662                 locinput += n;                        \
5663                 nextchr = UCHARAT(locinput);          \
5664             } else                                    \
5665                 sayNO;                                \
5666             break;                                    \
5667         case N##nAmE:                                 \
5668             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5669                 sayNO;                                \
5670             } else {                                  \
5671                 locinput += UTF8SKIP(locinput);       \
5672                 nextchr = UCHARAT(locinput);          \
5673             }                                         \
5674             break
5675
5676         CASE_CLASS(VERTWS);
5677         CASE_CLASS(HORIZWS);
5678 #undef CASE_CLASS
5679
5680         default:
5681             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5682                           PTR2UV(scan), OP(scan));
5683             Perl_croak(aTHX_ "regexp memory corruption");
5684
5685         } /* end switch */
5686
5687         /* switch break jumps here */
5688         scan = next; /* prepare to execute the next op and ... */
5689         continue;    /* ... jump back to the top, reusing st */
5690         /* NOTREACHED */
5691
5692       push_yes_state:
5693         /* push a state that backtracks on success */
5694         st->u.yes.prev_yes_state = yes_state;
5695         yes_state = st;
5696         /* FALL THROUGH */
5697       push_state:
5698         /* push a new regex state, then continue at scan  */
5699         {
5700             regmatch_state *newst;
5701
5702             DEBUG_STACK_r({
5703                 regmatch_state *cur = st;
5704                 regmatch_state *curyes = yes_state;
5705                 int curd = depth;
5706                 regmatch_slab *slab = PL_regmatch_slab;
5707                 for (;curd > -1;cur--,curd--) {
5708                     if (cur < SLAB_FIRST(slab)) {
5709                         slab = slab->prev;
5710                         cur = SLAB_LAST(slab);
5711                     }
5712                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5713                         REPORT_CODE_OFF + 2 + depth * 2,"",
5714                         curd, PL_reg_name[cur->resume_state],
5715                         (curyes == cur) ? "yes" : ""
5716                     );
5717                     if (curyes == cur)
5718                         curyes = cur->u.yes.prev_yes_state;
5719                 }
5720             } else
5721                 DEBUG_STATE_pp("push")
5722             );
5723             depth++;
5724             st->locinput = locinput;
5725             newst = st+1;
5726             if (newst >  SLAB_LAST(PL_regmatch_slab))
5727                 newst = S_push_slab(aTHX);
5728             PL_regmatch_state = newst;
5729
5730             locinput = PL_reginput;
5731             nextchr = UCHARAT(locinput);
5732             st = newst;
5733             continue;
5734             /* NOTREACHED */
5735         }
5736     }
5737
5738     /*
5739     * We get here only if there's trouble -- normally "case END" is
5740     * the terminating point.
5741     */
5742     Perl_croak(aTHX_ "corrupted regexp pointers");
5743     /*NOTREACHED*/
5744     sayNO;
5745
5746 yes:
5747     if (yes_state) {
5748         /* we have successfully completed a subexpression, but we must now
5749          * pop to the state marked by yes_state and continue from there */
5750         assert(st != yes_state);
5751 #ifdef DEBUGGING
5752         while (st != yes_state) {
5753             st--;
5754             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5755                 PL_regmatch_slab = PL_regmatch_slab->prev;
5756                 st = SLAB_LAST(PL_regmatch_slab);
5757             }
5758             DEBUG_STATE_r({
5759                 if (no_final) {
5760                     DEBUG_STATE_pp("pop (no final)");
5761                 } else {
5762                     DEBUG_STATE_pp("pop (yes)");
5763                 }
5764             });
5765             depth--;
5766         }
5767 #else
5768         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5769             || yes_state > SLAB_LAST(PL_regmatch_slab))
5770         {
5771             /* not in this slab, pop slab */
5772             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5773             PL_regmatch_slab = PL_regmatch_slab->prev;
5774             st = SLAB_LAST(PL_regmatch_slab);
5775         }
5776         depth -= (st - yes_state);
5777 #endif
5778         st = yes_state;
5779         yes_state = st->u.yes.prev_yes_state;
5780         PL_regmatch_state = st;
5781
5782         if (no_final) {
5783             locinput= st->locinput;
5784             nextchr = UCHARAT(locinput);
5785         }
5786         state_num = st->resume_state + no_final;
5787         goto reenter_switch;
5788     }
5789
5790     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5791                           PL_colors[4], PL_colors[5]));
5792
5793     if (PL_reg_eval_set) {
5794         /* each successfully executed (?{...}) block does the equivalent of
5795          *   local $^R = do {...}
5796          * When popping the save stack, all these locals would be undone;
5797          * bypass this by setting the outermost saved $^R to the latest
5798          * value */
5799         if (oreplsv != GvSV(PL_replgv))
5800             sv_setsv(oreplsv, GvSV(PL_replgv));
5801     }
5802     result = 1;
5803     goto final_exit;
5804
5805 no:
5806     DEBUG_EXECUTE_r(
5807         PerlIO_printf(Perl_debug_log,
5808             "%*s  %sfailed...%s\n",
5809             REPORT_CODE_OFF+depth*2, "",
5810             PL_colors[4], PL_colors[5])
5811         );
5812
5813 no_silent:
5814     if (no_final) {
5815         if (yes_state) {
5816             goto yes;
5817         } else {
5818             goto final_exit;
5819         }
5820     }
5821     if (depth) {
5822         /* there's a previous state to backtrack to */
5823         st--;
5824         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5825             PL_regmatch_slab = PL_regmatch_slab->prev;
5826             st = SLAB_LAST(PL_regmatch_slab);
5827         }
5828         PL_regmatch_state = st;
5829         locinput= st->locinput;
5830         nextchr = UCHARAT(locinput);
5831
5832         DEBUG_STATE_pp("pop");
5833         depth--;
5834         if (yes_state == st)
5835             yes_state = st->u.yes.prev_yes_state;
5836
5837         state_num = st->resume_state + 1; /* failure = success + 1 */
5838         goto reenter_switch;
5839     }
5840     result = 0;
5841
5842   final_exit:
5843     if (rex->intflags & PREGf_VERBARG_SEEN) {
5844         SV *sv_err = get_sv("REGERROR", 1);
5845         SV *sv_mrk = get_sv("REGMARK", 1);
5846         if (result) {
5847             sv_commit = &PL_sv_no;
5848             if (!sv_yes_mark)
5849                 sv_yes_mark = &PL_sv_yes;
5850         } else {
5851             if (!sv_commit)
5852                 sv_commit = &PL_sv_yes;
5853             sv_yes_mark = &PL_sv_no;
5854         }
5855         sv_setsv(sv_err, sv_commit);
5856         sv_setsv(sv_mrk, sv_yes_mark);
5857     }
5858
5859     /* clean up; in particular, free all slabs above current one */
5860     LEAVE_SCOPE(oldsave);
5861
5862     return result;
5863 }
5864
5865 /*
5866  - regrepeat - repeatedly match something simple, report how many
5867  */
5868 /*
5869  * [This routine now assumes that it will only match on things of length 1.
5870  * That was true before, but now we assume scan - reginput is the count,
5871  * rather than incrementing count on every character.  [Er, except utf8.]]
5872  */
5873 STATIC I32
5874 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5875 {
5876     dVAR;
5877     register char *scan;
5878     register I32 c;
5879     register char *loceol = PL_regeol;
5880     register I32 hardcount = 0;
5881     register bool utf8_target = PL_reg_match_utf8;
5882     UV utf8_flags;
5883 #ifndef DEBUGGING
5884     PERL_UNUSED_ARG(depth);
5885 #endif
5886
5887     PERL_ARGS_ASSERT_REGREPEAT;
5888
5889     scan = PL_reginput;
5890     if (max == REG_INFTY)
5891         max = I32_MAX;
5892     else if (max < loceol - scan)
5893         loceol = scan + max;
5894     switch (OP(p)) {
5895     case REG_ANY:
5896         if (utf8_target) {
5897             loceol = PL_regeol;
5898             while (scan < loceol && hardcount < max && *scan != '\n') {
5899                 scan += UTF8SKIP(scan);
5900                 hardcount++;
5901             }
5902         } else {
5903             while (scan < loceol && *scan != '\n')
5904                 scan++;
5905         }
5906         break;
5907     case SANY:
5908         if (utf8_target) {
5909             loceol = PL_regeol;
5910             while (scan < loceol && hardcount < max) {
5911                 scan += UTF8SKIP(scan);
5912                 hardcount++;
5913             }
5914         }
5915         else
5916             scan = loceol;
5917         break;
5918     case CANY:
5919         scan = loceol;
5920         break;
5921     case EXACT:
5922         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5923          * means they match only characters in the string that can be expressed
5924          * as a single byte.  For non-utf8 strings, that means a simple match.
5925          * For utf8 strings, the character matched must be an invariant, or
5926          * downgradable to a single byte.  The pattern's utf8ness is
5927          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5928          * it is, it's an invariant */
5929
5930         c = (U8)*STRING(p);
5931         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5932
5933         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5934             while (scan < loceol && UCHARAT(scan) == c) {
5935                 scan++;
5936             }
5937         }
5938         else {
5939
5940             /* Here, the string is utf8, and the pattern char is different
5941              * in utf8 than not, so can't compare them directly.  Outside the
5942              * loop, find find the two utf8 bytes that represent c, and then
5943              * look for those in sequence in the utf8 string */
5944             U8 high = UTF8_TWO_BYTE_HI(c);
5945             U8 low = UTF8_TWO_BYTE_LO(c);
5946             loceol = PL_regeol;
5947
5948             while (hardcount < max
5949                     && scan + 1 < loceol
5950                     && UCHARAT(scan) == high
5951                     && UCHARAT(scan + 1) == low)
5952             {
5953                 scan += 2;
5954                 hardcount++;
5955             }
5956         }
5957         break;
5958     case EXACTFA:
5959         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5960         goto do_exactf;
5961
5962     case EXACTFL:
5963         PL_reg_flags |= RF_tainted;
5964         utf8_flags = FOLDEQ_UTF8_LOCALE;
5965         goto do_exactf;
5966
5967     case EXACTF:
5968     case EXACTFU:
5969         utf8_flags = 0;
5970
5971         /* The comments for the EXACT case above apply as well to these fold
5972          * ones */
5973
5974     do_exactf:
5975         c = (U8)*STRING(p);
5976         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5977
5978         if (utf8_target) { /* Use full Unicode fold matching */
5979             char *tmpeol = loceol;
5980             while (hardcount < max
5981                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
5982                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
5983             {
5984                 scan = tmpeol;
5985                 tmpeol = loceol;
5986                 hardcount++;
5987             }
5988
5989             /* XXX Note that the above handles properly the German sharp s in
5990              * the pattern matching ss in the string.  But it doesn't handle
5991              * properly cases where the string contains say 'LIGATURE ff' and
5992              * the pattern is 'f+'.  This would require, say, a new function or
5993              * revised interface to foldEQ_utf8(), in which the maximum number
5994              * of characters to match could be passed and it would return how
5995              * many actually did.  This is just one of many cases where
5996              * multi-char folds don't work properly, and so the fix is being
5997              * deferred */
5998         }
5999         else {
6000             U8 folded;
6001
6002             /* Here, the string isn't utf8 and c is a single byte; and either
6003              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6004              * doesn't affect c.  Can just do simple comparisons for exact or
6005              * fold matching. */
6006             switch (OP(p)) {
6007                 case EXACTF: folded = PL_fold[c]; break;
6008                 case EXACTFA:
6009                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6010                 case EXACTFL: folded = PL_fold_locale[c]; break;
6011                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6012             }
6013             while (scan < loceol &&
6014                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6015             {
6016                 scan++;
6017             }
6018         }
6019         break;
6020     case ANYOFV:
6021     case ANYOF:
6022         if (utf8_target || OP(p) == ANYOFV) {
6023             STRLEN inclasslen;
6024             loceol = PL_regeol;
6025             inclasslen = loceol - scan;
6026             while (hardcount < max
6027                    && ((inclasslen = loceol - scan) > 0)
6028                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6029             {
6030                 scan += inclasslen;
6031                 hardcount++;
6032             }
6033         } else {
6034             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6035                 scan++;
6036         }
6037         break;
6038     case ALNUMU:
6039         if (utf8_target) {
6040     utf8_wordchar:
6041             loceol = PL_regeol;
6042             LOAD_UTF8_CHARCLASS_ALNUM();
6043             while (hardcount < max && scan < loceol &&
6044                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6045             {
6046                 scan += UTF8SKIP(scan);
6047                 hardcount++;
6048             }
6049         } else {
6050             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6051                 scan++;
6052             }
6053         }
6054         break;
6055     case ALNUM:
6056         if (utf8_target)
6057             goto utf8_wordchar;
6058         while (scan < loceol && isALNUM((U8) *scan)) {
6059             scan++;
6060         }
6061         break;
6062     case ALNUMA:
6063         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6064             scan++;
6065         }
6066         break;
6067     case ALNUML:
6068         PL_reg_flags |= RF_tainted;
6069         if (utf8_target) {
6070             loceol = PL_regeol;
6071             while (hardcount < max && scan < loceol &&
6072                    isALNUM_LC_utf8((U8*)scan)) {
6073                 scan += UTF8SKIP(scan);
6074                 hardcount++;
6075             }
6076         } else {
6077             while (scan < loceol && isALNUM_LC(*scan))
6078                 scan++;
6079         }
6080         break;
6081     case NALNUMU:
6082         if (utf8_target) {
6083
6084     utf8_Nwordchar:
6085
6086             loceol = PL_regeol;
6087             LOAD_UTF8_CHARCLASS_ALNUM();
6088             while (hardcount < max && scan < loceol &&
6089                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6090             {
6091                 scan += UTF8SKIP(scan);
6092                 hardcount++;
6093             }
6094         } else {
6095             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6096                 scan++;
6097             }
6098         }
6099         break;
6100     case NALNUM:
6101         if (utf8_target)
6102             goto utf8_Nwordchar;
6103         while (scan < loceol && ! isALNUM((U8) *scan)) {
6104             scan++;
6105         }
6106         break;
6107     case NALNUMA:
6108         if (utf8_target) {
6109             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6110                 scan += UTF8SKIP(scan);
6111             }
6112         }
6113         else {
6114             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6115                 scan++;
6116             }
6117         }
6118         break;
6119     case NALNUML:
6120         PL_reg_flags |= RF_tainted;
6121         if (utf8_target) {
6122             loceol = PL_regeol;
6123             while (hardcount < max && scan < loceol &&
6124                    !isALNUM_LC_utf8((U8*)scan)) {
6125                 scan += UTF8SKIP(scan);
6126                 hardcount++;
6127             }
6128         } else {
6129             while (scan < loceol && !isALNUM_LC(*scan))
6130                 scan++;
6131         }
6132         break;
6133     case SPACEU:
6134         if (utf8_target) {
6135
6136     utf8_space:
6137
6138             loceol = PL_regeol;
6139             LOAD_UTF8_CHARCLASS_SPACE();
6140             while (hardcount < max && scan < loceol &&
6141                    (*scan == ' ' ||
6142                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6143             {
6144                 scan += UTF8SKIP(scan);
6145                 hardcount++;
6146             }
6147             break;
6148         }
6149         else {
6150             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6151                 scan++;
6152             }
6153             break;
6154         }
6155     case SPACE:
6156         if (utf8_target)
6157             goto utf8_space;
6158
6159         while (scan < loceol && isSPACE((U8) *scan)) {
6160             scan++;
6161         }
6162         break;
6163     case SPACEA:
6164         while (scan < loceol && isSPACE_A((U8) *scan)) {
6165             scan++;
6166         }
6167         break;
6168     case SPACEL:
6169         PL_reg_flags |= RF_tainted;
6170         if (utf8_target) {
6171             loceol = PL_regeol;
6172             while (hardcount < max && scan < loceol &&
6173                    isSPACE_LC_utf8((U8*)scan)) {
6174                 scan += UTF8SKIP(scan);
6175                 hardcount++;
6176             }
6177         } else {
6178             while (scan < loceol && isSPACE_LC(*scan))
6179                 scan++;
6180         }
6181         break;
6182     case NSPACEU:
6183         if (utf8_target) {
6184
6185     utf8_Nspace:
6186
6187             loceol = PL_regeol;
6188             LOAD_UTF8_CHARCLASS_SPACE();
6189             while (hardcount < max && scan < loceol &&
6190                    ! (*scan == ' ' ||
6191                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6192             {
6193                 scan += UTF8SKIP(scan);
6194                 hardcount++;
6195             }
6196             break;
6197         }
6198         else {
6199             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6200                 scan++;
6201             }
6202         }
6203         break;
6204     case NSPACE:
6205         if (utf8_target)
6206             goto utf8_Nspace;
6207
6208         while (scan < loceol && ! isSPACE((U8) *scan)) {
6209             scan++;
6210         }
6211         break;
6212     case NSPACEA:
6213         if (utf8_target) {
6214             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6215                 scan += UTF8SKIP(scan);
6216             }
6217         }
6218         else {
6219             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6220                 scan++;
6221             }
6222         }
6223         break;
6224     case NSPACEL:
6225         PL_reg_flags |= RF_tainted;
6226         if (utf8_target) {
6227             loceol = PL_regeol;
6228             while (hardcount < max && scan < loceol &&
6229                    !isSPACE_LC_utf8((U8*)scan)) {
6230                 scan += UTF8SKIP(scan);
6231                 hardcount++;
6232             }
6233         } else {
6234             while (scan < loceol && !isSPACE_LC(*scan))
6235                 scan++;
6236         }
6237         break;
6238     case DIGIT:
6239         if (utf8_target) {
6240             loceol = PL_regeol;
6241             LOAD_UTF8_CHARCLASS_DIGIT();
6242             while (hardcount < max && scan < loceol &&
6243                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6244                 scan += UTF8SKIP(scan);
6245                 hardcount++;
6246             }
6247         } else {
6248             while (scan < loceol && isDIGIT(*scan))
6249                 scan++;
6250         }
6251         break;
6252     case DIGITA:
6253         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6254             scan++;
6255         }
6256         break;
6257     case DIGITL:
6258         PL_reg_flags |= RF_tainted;
6259         if (utf8_target) {
6260             loceol = PL_regeol;
6261             while (hardcount < max && scan < loceol &&
6262                    isDIGIT_LC_utf8((U8*)scan)) {
6263                 scan += UTF8SKIP(scan);
6264                 hardcount++;
6265             }
6266         } else {
6267             while (scan < loceol && isDIGIT_LC(*scan))
6268                 scan++;
6269         }
6270         break;
6271     case NDIGIT:
6272         if (utf8_target) {
6273             loceol = PL_regeol;
6274             LOAD_UTF8_CHARCLASS_DIGIT();
6275             while (hardcount < max && scan < loceol &&
6276                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6277                 scan += UTF8SKIP(scan);
6278                 hardcount++;
6279             }
6280         } else {
6281             while (scan < loceol && !isDIGIT(*scan))
6282                 scan++;
6283         }
6284         break;
6285     case NDIGITA:
6286         if (utf8_target) {
6287             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6288                 scan += UTF8SKIP(scan);
6289             }
6290         }
6291         else {
6292             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6293                 scan++;
6294             }
6295         }
6296         break;
6297     case NDIGITL:
6298         PL_reg_flags |= RF_tainted;
6299         if (utf8_target) {
6300             loceol = PL_regeol;
6301             while (hardcount < max && scan < loceol &&
6302                    !isDIGIT_LC_utf8((U8*)scan)) {
6303                 scan += UTF8SKIP(scan);
6304                 hardcount++;
6305             }
6306         } else {
6307             while (scan < loceol && !isDIGIT_LC(*scan))
6308                 scan++;
6309         }
6310         break;
6311     case LNBREAK:
6312         if (utf8_target) {
6313             loceol = PL_regeol;
6314             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6315                 scan += c;
6316                 hardcount++;
6317             }
6318         } else {
6319             /*
6320               LNBREAK can match two latin chars, which is ok,
6321               because we have a null terminated string, but we
6322               have to use hardcount in this situation
6323             */
6324             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6325                 scan+=c;
6326                 hardcount++;
6327             }
6328         }
6329         break;
6330     case HORIZWS:
6331         if (utf8_target) {
6332             loceol = PL_regeol;
6333             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6334                 scan += c;
6335                 hardcount++;
6336             }
6337         } else {
6338             while (scan < loceol && is_HORIZWS_latin1(scan))
6339                 scan++;
6340         }
6341         break;
6342     case NHORIZWS:
6343         if (utf8_target) {
6344             loceol = PL_regeol;
6345             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6346                 scan += UTF8SKIP(scan);
6347                 hardcount++;
6348             }
6349         } else {
6350             while (scan < loceol && !is_HORIZWS_latin1(scan))
6351                 scan++;
6352
6353         }
6354         break;
6355     case VERTWS:
6356         if (utf8_target) {
6357             loceol = PL_regeol;
6358             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6359                 scan += c;
6360                 hardcount++;
6361             }
6362         } else {
6363             while (scan < loceol && is_VERTWS_latin1(scan))
6364                 scan++;
6365
6366         }
6367         break;
6368     case NVERTWS:
6369         if (utf8_target) {
6370             loceol = PL_regeol;
6371             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6372                 scan += UTF8SKIP(scan);
6373                 hardcount++;
6374             }
6375         } else {
6376             while (scan < loceol && !is_VERTWS_latin1(scan))
6377                 scan++;
6378
6379         }
6380         break;
6381
6382     default:            /* Called on something of 0 width. */
6383         break;          /* So match right here or not at all. */
6384     }
6385
6386     if (hardcount)
6387         c = hardcount;
6388     else
6389         c = scan - PL_reginput;
6390     PL_reginput = scan;
6391
6392     DEBUG_r({
6393         GET_RE_DEBUG_FLAGS_DECL;
6394         DEBUG_EXECUTE_r({
6395             SV * const prop = sv_newmortal();
6396             regprop(prog, prop, p);
6397             PerlIO_printf(Perl_debug_log,
6398                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6399                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6400         });
6401     });
6402
6403     return(c);
6404 }
6405
6406
6407 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6408 /*
6409 - regclass_swash - prepare the utf8 swash
6410 */
6411
6412 SV *
6413 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6414 {
6415     dVAR;
6416     SV *sw  = NULL;
6417     SV *si  = NULL;
6418     SV *alt = NULL;
6419     RXi_GET_DECL(prog,progi);
6420     const struct reg_data * const data = prog ? progi->data : NULL;
6421
6422     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6423
6424     assert(ANYOF_NONBITMAP(node));
6425
6426     if (data && data->count) {
6427         const U32 n = ARG(node);
6428
6429         if (data->what[n] == 's') {
6430             SV * const rv = MUTABLE_SV(data->data[n]);
6431             AV * const av = MUTABLE_AV(SvRV(rv));
6432             SV **const ary = AvARRAY(av);
6433             SV **a, **b;
6434
6435             /* See the end of regcomp.c:S_regclass() for
6436              * documentation of these array elements. */
6437
6438             si = *ary;
6439             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6440             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6441
6442             if (a)
6443                 sw = *a;
6444             else if (si && doinit) {
6445                 sw = swash_init("utf8", "", si, 1, 0);
6446                 (void)av_store(av, 1, sw);
6447             }
6448             if (b)
6449                 alt = *b;
6450         }
6451     }
6452
6453     if (listsvp)
6454         *listsvp = si;
6455     if (altsvp)
6456         *altsvp  = alt;
6457
6458     return sw;
6459 }
6460 #endif
6461
6462 /*
6463  - reginclass - determine if a character falls into a character class
6464
6465   n is the ANYOF regnode
6466   p is the target string
6467   lenp is pointer to the maximum number of bytes of how far to go in p
6468     (This is assumed wthout checking to always be at least the current
6469     character's size)
6470   utf8_target tells whether p is in UTF-8.
6471
6472   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6473   from a successful match, the value it points to will be updated to how many
6474   bytes in p were matched.  If there was no match, the value is undefined,
6475   possibly changed from the input.
6476
6477   Note that this can be a synthetic start class, a combination of various
6478   nodes, so things you think might be mutually exclusive, such as locale,
6479   aren't.  It can match both locale and non-locale
6480
6481  */
6482
6483 STATIC bool
6484 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6485 {
6486     dVAR;
6487     const char flags = ANYOF_FLAGS(n);
6488     bool match = FALSE;
6489     UV c = *p;
6490     STRLEN c_len = 0;
6491     STRLEN maxlen;
6492
6493     PERL_ARGS_ASSERT_REGINCLASS;
6494
6495     /* If c is not already the code point, get it */
6496     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6497         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6498                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6499                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6500                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6501                  * UTF8_ALLOW_FFFF */
6502         if (c_len == (STRLEN)-1)
6503             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6504     }
6505     else {
6506         c_len = 1;
6507     }
6508
6509     /* Use passed in max length, or one character if none passed in or less
6510      * than one character.  And assume will match just one character.  This is
6511      * overwritten later if matched more. */
6512     if (lenp) {
6513         maxlen = (*lenp > c_len) ? *lenp : c_len;
6514         *lenp = c_len;
6515
6516     }
6517     else {
6518         maxlen = c_len;
6519     }
6520
6521     /* If this character is potentially in the bitmap, check it */
6522     if (c < 256) {
6523         if (ANYOF_BITMAP_TEST(n, c))
6524             match = TRUE;
6525         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6526                 && ! utf8_target
6527                 && ! isASCII(c))
6528         {
6529             match = TRUE;
6530         }
6531
6532         else if (flags & ANYOF_LOCALE) {
6533             PL_reg_flags |= RF_tainted;
6534
6535             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6536                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6537             {
6538                 match = TRUE;
6539             }
6540             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6541                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6542                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6543                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6544                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6545                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6546                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6547                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6548                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6549                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6550                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6551                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6552                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6553                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6554                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6555                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6556                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6557                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6558                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6559                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6560                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6561                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6562                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6563                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6564                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6565                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6566                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6567                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6568                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6569                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6570                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6571                      ) /* How's that for a conditional? */
6572             ) {
6573                 match = TRUE;
6574             }
6575         }
6576     }
6577
6578     /* If the bitmap didn't (or couldn't) match, and something outside the
6579      * bitmap could match, try that.  Locale nodes specifiy completely the
6580      * behavior of code points in the bit map (otherwise, a utf8 target would
6581      * cause them to be treated as Unicode and not locale), except in
6582      * the very unlikely event when this node is a synthetic start class, which
6583      * could be a combination of locale and non-locale nodes.  So allow locale
6584      * to match for the synthetic start class, which will give a false
6585      * positive that will be resolved when the match is done again as not part
6586      * of the synthetic start class */
6587     if (!match) {
6588         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6589             match = TRUE;       /* Everything above 255 matches */
6590         }
6591         else if (ANYOF_NONBITMAP(n)
6592                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6593                      || (utf8_target
6594                          && (c >=256
6595                              || (! (flags & ANYOF_LOCALE))
6596                              || (flags & ANYOF_IS_SYNTHETIC)))))
6597         {
6598             AV *av;
6599             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6600
6601             if (sw) {
6602                 U8 * utf8_p;
6603                 if (utf8_target) {
6604                     utf8_p = (U8 *) p;
6605                 } else {
6606
6607                     /* Not utf8.  Convert as much of the string as available up
6608                      * to the limit of how far the (single) character in the
6609                      * pattern can possibly match (no need to go further).  If
6610                      * the node is a straight ANYOF or not folding, it can't
6611                      * match more than one.  Otherwise, It can match up to how
6612                      * far a single char can fold to.  Since not utf8, each
6613                      * character is a single byte, so the max it can be in
6614                      * bytes is the same as the max it can be in characters */
6615                     STRLEN len = (OP(n) == ANYOF
6616                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6617                                   ? 1
6618                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6619                                     ? maxlen
6620                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6621                     utf8_p = bytes_to_utf8(p, &len);
6622                 }
6623
6624                 if (swash_fetch(sw, utf8_p, TRUE))
6625                     match = TRUE;
6626                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6627
6628                     /* Here, we need to test if the fold of the target string
6629                      * matches.  The non-multi char folds have all been moved to
6630                      * the compilation phase, and the multi-char folds have
6631                      * been stored by regcomp into 'av'; we linearly check to
6632                      * see if any match the target string (folded).   We know
6633                      * that the originals were each one character, but we don't
6634                      * currently know how many characters/bytes each folded to,
6635                      * except we do know that there are small limits imposed by
6636                      * Unicode.  XXX A performance enhancement would be to have
6637                      * regcomp.c store the max number of chars/bytes that are
6638                      * in an av entry, as, say the 0th element.  Even better
6639                      * would be to have a hash of the few characters that can
6640                      * start a multi-char fold to the max number of chars of
6641                      * those folds.
6642                      *
6643                      * If there is a match, we will need to advance (if lenp is
6644                      * specified) the match pointer in the target string.  But
6645                      * what we are comparing here isn't that string directly,
6646                      * but its fold, whose length may differ from the original.
6647                      * As we go along in constructing the fold, therefore, we
6648                      * create a map so that we know how many bytes in the
6649                      * source to advance given that we have matched a certain
6650                      * number of bytes in the fold.  This map is stored in
6651                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6652                      * the fold of the first character that we are folding.
6653                      * Then map_fold_len_back[n] is set to the number of bytes
6654                      * in that first character.  Similarly let m be the
6655                      * corresponding number for the second character to be
6656                      * folded.  Then map_fold_len_back[n+m] is set to the
6657                      * number of bytes occupied by the first two source
6658                      * characters. ... */
6659                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6660                     U8 folded[UTF8_MAXBYTES_CASE+1];
6661                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6662                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6663                                                   chars */
6664
6665                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6666
6667                         /* Here, only need to fold the first char of the target
6668                          * string.  It the source wasn't utf8, is 1 byte long */
6669                         to_utf8_fold(utf8_p, folded, &foldlen);
6670                         total_foldlen = foldlen;
6671                         map_fold_len_back[foldlen] = (utf8_target)
6672                                                      ? UTF8SKIP(utf8_p)
6673                                                      : 1;
6674                     }
6675                     else {
6676
6677                         /* Here, need to fold more than the first char.  Do so
6678                          * up to the limits */
6679                         U8* source_ptr = utf8_p;    /* The source for the fold
6680                                                        is the regex target
6681                                                        string */
6682                         U8* folded_ptr = folded;
6683                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6684                                                        available byte in the
6685                                                        target string */
6686                         U8 i;
6687                         for (i = 0;
6688                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6689                              i++)
6690                         {
6691
6692                             /* Fold the next character */
6693                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6694                             STRLEN this_char_foldlen;
6695                             to_utf8_fold(source_ptr,
6696                                          this_char_folded,
6697                                          &this_char_foldlen);
6698
6699                             /* Bail if it would exceed the byte limit for
6700                              * folding a single char. */
6701                             if (this_char_foldlen + folded_ptr - folded >
6702                                                             UTF8_MAXBYTES_CASE)
6703                             {
6704                                 break;
6705                             }
6706
6707                             /* Add the fold of this character */
6708                             Copy(this_char_folded,
6709                                  folded_ptr,
6710                                  this_char_foldlen,
6711                                  U8);
6712                             source_ptr += UTF8SKIP(source_ptr);
6713                             folded_ptr += this_char_foldlen;
6714                             total_foldlen = folded_ptr - folded;
6715
6716                             /* Create map from the number of bytes in the fold
6717                              * back to the number of bytes in the source.  If
6718                              * the source isn't utf8, the byte count is just
6719                              * the number of characters so far */
6720                             map_fold_len_back[total_foldlen]
6721                                                       = (utf8_target)
6722                                                         ? source_ptr - utf8_p
6723                                                         : i + 1;
6724                         }
6725                         *folded_ptr = '\0';
6726                     }
6727
6728
6729                     /* Do the linear search to see if the fold is in the list
6730                      * of multi-char folds. */
6731                     if (av) {
6732                         I32 i;
6733                         for (i = 0; i <= av_len(av); i++) {
6734                             SV* const sv = *av_fetch(av, i, FALSE);
6735                             STRLEN len;
6736                             const char * const s = SvPV_const(sv, len);
6737
6738                             if (len <= total_foldlen
6739                                 && memEQ(s, (char*)folded, len)
6740
6741                                    /* If 0, means matched a partial char. See
6742                                     * [perl #90536] */
6743                                 && map_fold_len_back[len])
6744                             {
6745
6746                                 /* Advance the target string ptr to account for
6747                                  * this fold, but have to translate from the
6748                                  * folded length to the corresponding source
6749                                  * length. */
6750                                 if (lenp) {
6751                                     *lenp = map_fold_len_back[len];
6752                                 }
6753                                 match = TRUE;
6754                                 break;
6755                             }
6756                         }
6757                     }
6758                 }
6759
6760                 /* If we allocated a string above, free it */
6761                 if (! utf8_target) Safefree(utf8_p);
6762             }
6763         }
6764     }
6765
6766     return (flags & ANYOF_INVERT) ? !match : match;
6767 }
6768
6769 STATIC U8 *
6770 S_reghop3(U8 *s, I32 off, const U8* lim)
6771 {
6772     dVAR;
6773
6774     PERL_ARGS_ASSERT_REGHOP3;
6775
6776     if (off >= 0) {
6777         while (off-- && s < lim) {
6778             /* XXX could check well-formedness here */
6779             s += UTF8SKIP(s);
6780         }
6781     }
6782     else {
6783         while (off++ && s > lim) {
6784             s--;
6785             if (UTF8_IS_CONTINUED(*s)) {
6786                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6787                     s--;
6788             }
6789             /* XXX could check well-formedness here */
6790         }
6791     }
6792     return s;
6793 }
6794
6795 #ifdef XXX_dmq
6796 /* there are a bunch of places where we use two reghop3's that should
6797    be replaced with this routine. but since thats not done yet
6798    we ifdef it out - dmq
6799 */
6800 STATIC U8 *
6801 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6802 {
6803     dVAR;
6804
6805     PERL_ARGS_ASSERT_REGHOP4;
6806
6807     if (off >= 0) {
6808         while (off-- && s < rlim) {
6809             /* XXX could check well-formedness here */
6810             s += UTF8SKIP(s);
6811         }
6812     }
6813     else {
6814         while (off++ && s > llim) {
6815             s--;
6816             if (UTF8_IS_CONTINUED(*s)) {
6817                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6818                     s--;
6819             }
6820             /* XXX could check well-formedness here */
6821         }
6822     }
6823     return s;
6824 }
6825 #endif
6826
6827 STATIC U8 *
6828 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6829 {
6830     dVAR;
6831
6832     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6833
6834     if (off >= 0) {
6835         while (off-- && s < lim) {
6836             /* XXX could check well-formedness here */
6837             s += UTF8SKIP(s);
6838         }
6839         if (off >= 0)
6840             return NULL;
6841     }
6842     else {
6843         while (off++ && s > lim) {
6844             s--;
6845             if (UTF8_IS_CONTINUED(*s)) {
6846                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6847                     s--;
6848             }
6849             /* XXX could check well-formedness here */
6850         }
6851         if (off <= 0)
6852             return NULL;
6853     }
6854     return s;
6855 }
6856
6857 static void
6858 restore_pos(pTHX_ void *arg)
6859 {
6860     dVAR;
6861     regexp * const rex = (regexp *)arg;
6862     if (PL_reg_eval_set) {
6863         if (PL_reg_oldsaved) {
6864             rex->subbeg = PL_reg_oldsaved;
6865             rex->sublen = PL_reg_oldsavedlen;
6866 #ifdef PERL_OLD_COPY_ON_WRITE
6867             rex->saved_copy = PL_nrs;
6868 #endif
6869             RXp_MATCH_COPIED_on(rex);
6870         }
6871         PL_reg_magic->mg_len = PL_reg_oldpos;
6872         PL_reg_eval_set = 0;
6873         PL_curpm = PL_reg_oldcurpm;
6874     }
6875 }
6876
6877 STATIC void
6878 S_to_utf8_substr(pTHX_ register regexp *prog)
6879 {
6880     int i = 1;
6881
6882     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6883
6884     do {
6885         if (prog->substrs->data[i].substr
6886             && !prog->substrs->data[i].utf8_substr) {
6887             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6888             prog->substrs->data[i].utf8_substr = sv;
6889             sv_utf8_upgrade(sv);
6890             if (SvVALID(prog->substrs->data[i].substr)) {
6891                 if (SvTAIL(prog->substrs->data[i].substr)) {
6892                     /* Trim the trailing \n that fbm_compile added last
6893                        time.  */
6894                     SvCUR_set(sv, SvCUR(sv) - 1);
6895                     /* Whilst this makes the SV technically "invalid" (as its
6896                        buffer is no longer followed by "\0") when fbm_compile()
6897                        adds the "\n" back, a "\0" is restored.  */
6898                     fbm_compile(sv, FBMcf_TAIL);
6899                 } else
6900                     fbm_compile(sv, 0);
6901             }
6902             if (prog->substrs->data[i].substr == prog->check_substr)
6903                 prog->check_utf8 = sv;
6904         }
6905     } while (i--);
6906 }
6907
6908 STATIC void
6909 S_to_byte_substr(pTHX_ register regexp *prog)
6910 {
6911     dVAR;
6912     int i = 1;
6913
6914     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6915
6916     do {
6917         if (prog->substrs->data[i].utf8_substr
6918             && !prog->substrs->data[i].substr) {
6919             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6920             if (sv_utf8_downgrade(sv, TRUE)) {
6921                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6922                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
6923                         /* Trim the trailing \n that fbm_compile added last
6924                            time.  */
6925                         SvCUR_set(sv, SvCUR(sv) - 1);
6926                         fbm_compile(sv, FBMcf_TAIL);
6927                     } else
6928                         fbm_compile(sv, 0);
6929                 }
6930             } else {
6931                 SvREFCNT_dec(sv);
6932                 sv = &PL_sv_undef;
6933             }
6934             prog->substrs->data[i].substr = sv;
6935             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6936                 prog->check_substr = sv;
6937         }
6938     } while (i--);
6939 }
6940
6941 /*
6942  * Local variables:
6943  * c-indentation-style: bsd
6944  * c-basic-offset: 4
6945  * indent-tabs-mode: t
6946  * End:
6947  *
6948  * ex: set ts=8 sts=4 sw=4 noet:
6949  */