src/5013004/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1               /* tainted information used? */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
  99
 100 /*
 101  * Forwards.
 102  */
 103
 104 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 105 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 106
 107 #define HOPc(pos,off) \
 108         (char *)(PL_reg_match_utf8 \
 109             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 110             : (U8*)(pos + off))
 111 #define HOPBACKc(pos, off) \
 112         (char*)(PL_reg_match_utf8\
 113             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 114             : (pos - off >= PL_bostr)           \
 115                 ? (U8*)pos - off                \
 116                 : NULL)
 117
 118 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 119 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 120
 121 /* these are unrolled below in the CCC_TRY_XXX defined */
 122 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 123     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 124
 125 /* Doesn't do an assert to verify that is correct */
 126 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 127     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 128
 129 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 130 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 131 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 132
 133 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 134         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 135         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 136         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 137             * assert should likely and hopefully fail on an EBCDIC machine */ \
 138         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 139                                                                             \
 140         /* No asserts are done for these, in case called on an early        \
 141             * Unicode version in which they map to nothing */               \
 142         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 143         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 144         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 149
 150 /*
 151    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 152    so that it is possible to override the option here without having to
 153    rebuild the entire core. as we are required to do if we change regcomp.h
 154    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 155 */
 156 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 157 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 158 #endif
 159
 160 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 161 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 162 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 163 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 164 #define RE_utf8_perl_word   PL_utf8_alnum
 165 #define RE_utf8_perl_space  PL_utf8_space
 166 #define RE_utf8_posix_digit PL_utf8_digit
 167 #define perl_word  alnum
 168 #define perl_space space
 169 #define posix_digit digit
 170 #else
 171 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 172 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 173 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 174 #define RE_utf8_perl_word   PL_utf8_perl_word
 175 #define RE_utf8_perl_space  PL_utf8_perl_space
 176 #define RE_utf8_posix_digit PL_utf8_posix_digit
 177 #endif
 178
 179
 180 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                          \
 181         case NAMEL:                                                              \
 182             PL_reg_flags |= RF_tainted;                                                 \
 183             /* FALL THROUGH */                                                          \
 184         case NAME:                                                                     \
 185             if (!nextchr)                                                               \
 186                 sayNO;                                                                  \
 187             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                                \
 188                 if (!CAT2(PL_utf8_,CLASS)) {                                            \
 189                     bool ok;                                                            \
 190                     ENTER;                                                              \
 191                     save_re_context();                                                  \
 192                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 193                     assert(ok);                                                         \
 194                     LEAVE;                                                              \
 195                 }                                                                       \
 196                 if (!(OP(scan) == NAME                                                  \
 197                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 198                     : LCFUNC_utf8((U8*)locinput)))                                      \
 199                 {                                                                       \
 200                     sayNO;                                                              \
 201                 }                                                                       \
 202                 locinput += PL_utf8skip[nextchr];                                       \
 203                 nextchr = UCHARAT(locinput);                                            \
 204                 break;                                                                  \
 205             }                                                                           \
 206             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                  \
 207                 sayNO;                                                                  \
 208             nextchr = UCHARAT(++locinput);                                              \
 209             break
 210
 211 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                        \
 212         case NAMEL:                                                              \
 213             PL_reg_flags |= RF_tainted;                                                 \
 214             /* FALL THROUGH */                                                          \
 215         case NAME :                                                                     \
 216             if (!nextchr && locinput >= PL_regeol)                                      \
 217                 sayNO;                                                                  \
 218             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                                \
 219                 if (!CAT2(PL_utf8_,CLASS)) {                                            \
 220                     bool ok;                                                            \
 221                     ENTER;                                                              \
 222                     save_re_context();                                                  \
 223                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 224                     assert(ok);                                                         \
 225                     LEAVE;                                                              \
 226                 }                                                                       \
 227                 if ((OP(scan) == NAME                                                  \
 228                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 229                     : LCFUNC_utf8((U8*)locinput)))                                      \
 230                 {                                                                       \
 231                     sayNO;                                                              \
 232                 }                                                                       \
 233                 locinput += PL_utf8skip[nextchr];                                       \
 234                 nextchr = UCHARAT(locinput);                                            \
 235                 break;                                                                  \
 236             }                                                                           \
 237             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                   \
 238                 sayNO;                                                                  \
 239             nextchr = UCHARAT(++locinput);                                              \
 240             break
 241
 242
 243
 244
 245
 246 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 247
 248 /* for use after a quantifier and before an EXACT-like node -- japhy */
 249 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
 250 #define JUMPABLE(rn) (      \
 251     OP(rn) == OPEN ||       \
 252     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 253     OP(rn) == EVAL ||   \
 254     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 255     OP(rn) == PLUS || OP(rn) == MINMOD || \
 256     OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
 257     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 258 )
 259 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 260
 261 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 262
 263 #if 0
 264 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 265    we don't need this definition. */
 266 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 267 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
 268 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 269
 270 #else
 271 /* ... so we use this as its faster. */
 272 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 273 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 274 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 275
 276 #endif
 277
 278 /*
 279   Search for mandatory following text node; for lookahead, the text must
 280   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 281 */
 282 #define FIND_NEXT_IMPT(rn) STMT_START { \
 283     while (JUMPABLE(rn)) { \
 284         const OPCODE type = OP(rn); \
 285         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 286             rn = NEXTOPER(NEXTOPER(rn)); \
 287         else if (type == PLUS) \
 288             rn = NEXTOPER(rn); \
 289         else if (type == IFMATCH) \
 290             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 291         else rn += NEXT_OFF(rn); \
 292     } \
 293 } STMT_END
 294
 295
 296 static void restore_pos(pTHX_ void *arg);
 297
 298 #define REGCP_PAREN_ELEMS 4
 299 #define REGCP_OTHER_ELEMS 5
 300 #define REGCP_FRAME_ELEMS 1
 301 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 302  * are needed for the regexp context stack bookkeeping. */
 303
 304 STATIC CHECKPOINT
 305 S_regcppush(pTHX_ I32 parenfloor)
 306 {
 307     dVAR;
 308     const int retval = PL_savestack_ix;
 309     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 310     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 311     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 312     int p;
 313     GET_RE_DEBUG_FLAGS_DECL;
 314
 315     if (paren_elems_to_push < 0)
 316         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 317
 318     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 319         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 320                    " out of range (%lu-%ld)",
 321                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 322
 323     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 324
 325     for (p = PL_regsize; p > parenfloor; p--) {
 326 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 327         SSPUSHINT(PL_regoffs[p].end);
 328         SSPUSHINT(PL_regoffs[p].start);
 329         SSPUSHPTR(PL_reg_start_tmp[p]);
 330         SSPUSHINT(p);
 331         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 332           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 333                       (UV)p, (IV)PL_regoffs[p].start,
 334                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 335                       (IV)PL_regoffs[p].end
 336         ));
 337     }
 338 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 339     SSPUSHPTR(PL_regoffs);
 340     SSPUSHINT(PL_regsize);
 341     SSPUSHINT(*PL_reglastparen);
 342     SSPUSHINT(*PL_reglastcloseparen);
 343     SSPUSHPTR(PL_reginput);
 344     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 345
 346     return retval;
 347 }
 348
 349 /* These are needed since we do not localize EVAL nodes: */
 350 #define REGCP_SET(cp)                                           \
 351     DEBUG_STATE_r(                                              \
 352             PerlIO_printf(Perl_debug_log,                       \
 353                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 354                 (IV)PL_savestack_ix));                          \
 355     cp = PL_savestack_ix
 356
 357 #define REGCP_UNWIND(cp)                                        \
 358     DEBUG_STATE_r(                                              \
 359         if (cp != PL_savestack_ix)                              \
 360             PerlIO_printf(Perl_debug_log,                       \
 361                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 362                 (IV)(cp), (IV)PL_savestack_ix));                \
 363     regcpblow(cp)
 364
 365 STATIC char *
 366 S_regcppop(pTHX_ const regexp *rex)
 367 {
 368     dVAR;
 369     UV i;
 370     char *input;
 371     GET_RE_DEBUG_FLAGS_DECL;
 372
 373     PERL_ARGS_ASSERT_REGCPPOP;
 374
 375     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 376     i = SSPOPUV;
 377     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 378     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 379     input = (char *) SSPOPPTR;
 380     *PL_reglastcloseparen = SSPOPINT;
 381     *PL_reglastparen = SSPOPINT;
 382     PL_regsize = SSPOPINT;
 383     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 384
 385     i -= REGCP_OTHER_ELEMS;
 386     /* Now restore the parentheses context. */
 387     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 388         I32 tmps;
 389         U32 paren = (U32)SSPOPINT;
 390         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 391         PL_regoffs[paren].start = SSPOPINT;
 392         tmps = SSPOPINT;
 393         if (paren <= *PL_reglastparen)
 394             PL_regoffs[paren].end = tmps;
 395         DEBUG_BUFFERS_r(
 396             PerlIO_printf(Perl_debug_log,
 397                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 398                           (UV)paren, (IV)PL_regoffs[paren].start,
 399                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 400                           (IV)PL_regoffs[paren].end,
 401                           (paren > *PL_reglastparen ? "(no)" : ""));
 402         );
 403     }
 404     DEBUG_BUFFERS_r(
 405         if (*PL_reglastparen + 1 <= rex->nparens) {
 406             PerlIO_printf(Perl_debug_log,
 407                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 408                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 409         }
 410     );
 411 #if 1
 412     /* It would seem that the similar code in regtry()
 413      * already takes care of this, and in fact it is in
 414      * a better location to since this code can #if 0-ed out
 415      * but the code in regtry() is needed or otherwise tests
 416      * requiring null fields (pat.t#187 and split.t#{13,14}
 417      * (as of patchlevel 7877)  will fail.  Then again,
 418      * this code seems to be necessary or otherwise
 419      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 420      * --jhi updated by dapm */
 421     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 422         if (i > PL_regsize)
 423             PL_regoffs[i].start = -1;
 424         PL_regoffs[i].end = -1;
 425     }
 426 #endif
 427     return input;
 428 }
 429
 430 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 431
 432 /*
 433  * pregexec and friends
 434  */
 435
 436 #ifndef PERL_IN_XSUB_RE
 437 /*
 438  - pregexec - match a regexp against a string
 439  */
 440 I32
 441 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 442          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 443 /* strend: pointer to null at end of string */
 444 /* strbeg: real beginning of string */
 445 /* minend: end of match must be >=minend after stringarg. */
 446 /* nosave: For optimizations. */
 447 {
 448     PERL_ARGS_ASSERT_PREGEXEC;
 449
 450     return
 451         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 452                       nosave ? 0 : REXEC_COPY_STR);
 453 }
 454 #endif
 455
 456 /*
 457  * Need to implement the following flags for reg_anch:
 458  *
 459  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 460  * USE_INTUIT_ML
 461  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 462  * INTUIT_AUTORITATIVE_ML
 463  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 464  * INTUIT_ONCE_ML
 465  *
 466  * Another flag for this function: SECOND_TIME (so that float substrs
 467  * with giant delta may be not rechecked).
 468  */
 469
 470 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 471
 472 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 473    Otherwise, only SvCUR(sv) is used to get strbeg. */
 474
 475 /* XXXX We assume that strpos is strbeg unless sv. */
 476
 477 /* XXXX Some places assume that there is a fixed substring.
 478         An update may be needed if optimizer marks as "INTUITable"
 479         RExen without fixed substrings.  Similarly, it is assumed that
 480         lengths of all the strings are no more than minlen, thus they
 481         cannot come from lookahead.
 482         (Or minlen should take into account lookahead.)
 483   NOTE: Some of this comment is not correct. minlen does now take account
 484   of lookahead/behind. Further research is required. -- demerphq
 485
 486 */
 487
 488 /* A failure to find a constant substring means that there is no need to make
 489    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 490    finding a substring too deep into the string means that less calls to
 491    regtry() should be needed.
 492
 493    REx compiler's optimizer found 4 possible hints:
 494         a) Anchored substring;
 495         b) Fixed substring;
 496         c) Whether we are anchored (beginning-of-line or \G);
 497         d) First node (of those at offset 0) which may distingush positions;
 498    We use a)b)d) and multiline-part of c), and try to find a position in the
 499    string which does not contradict any of them.
 500  */
 501
 502 /* Most of decisions we do here should have been done at compile time.
 503    The nodes of the REx which we used for the search should have been
 504    deleted from the finite automaton. */
 505
 506 char *
 507 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 508                      char *strend, const U32 flags, re_scream_pos_data *data)
 509 {
 510     dVAR;
 511     struct regexp *const prog = (struct regexp *)SvANY(rx);
 512     register I32 start_shift = 0;
 513     /* Should be nonnegative! */
 514     register I32 end_shift   = 0;
 515     register char *s;
 516     register SV *check;
 517     char *strbeg;
 518     char *t;
 519     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 520     I32 ml_anch;
 521     register char *other_last = NULL;   /* other substr checked before this */
 522     char *check_at = NULL;              /* check substr found at this pos */
 523     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 524     RXi_GET_DECL(prog,progi);
 525 #ifdef DEBUGGING
 526     const char * const i_strpos = strpos;
 527 #endif
 528     GET_RE_DEBUG_FLAGS_DECL;
 529
 530     PERL_ARGS_ASSERT_RE_INTUIT_START;
 531
 532     RX_MATCH_UTF8_set(rx,utf8_target);
 533
 534     if (RX_UTF8(rx)) {
 535         PL_reg_flags |= RF_utf8;
 536     }
 537     DEBUG_EXECUTE_r(
 538         debug_start_match(rx, utf8_target, strpos, strend,
 539             sv ? "Guessing start of match in sv for"
 540                : "Guessing start of match in string for");
 541               );
 542
 543     /* CHR_DIST() would be more correct here but it makes things slow. */
 544     if (prog->minlen > strend - strpos) {
 545         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 546                               "String too short... [re_intuit_start]\n"));
 547         goto fail;
 548     }
 549
 550     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 551     PL_regeol = strend;
 552     if (utf8_target) {
 553         if (!prog->check_utf8 && prog->check_substr)
 554             to_utf8_substr(prog);
 555         check = prog->check_utf8;
 556     } else {
 557         if (!prog->check_substr && prog->check_utf8)
 558             to_byte_substr(prog);
 559         check = prog->check_substr;
 560     }
 561     if (check == &PL_sv_undef) {
 562         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 563                 "Non-utf8 string cannot match utf8 check string\n"));
 564         goto fail;
 565     }
 566     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 567         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 568                      || ( (prog->extflags & RXf_ANCH_BOL)
 569                           && !multiline ) );    /* Check after \n? */
 570
 571         if (!ml_anch) {
 572           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 573                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 574                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 575                && sv && !SvROK(sv)
 576                && (strpos != strbeg)) {
 577               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 578               goto fail;
 579           }
 580           if (prog->check_offset_min == prog->check_offset_max &&
 581               !(prog->extflags & RXf_CANY_SEEN)) {
 582             /* Substring at constant offset from beg-of-str... */
 583             I32 slen;
 584
 585             s = HOP3c(strpos, prog->check_offset_min, strend);
 586
 587             if (SvTAIL(check)) {
 588                 slen = SvCUR(check);    /* >= 1 */
 589
 590                 if ( strend - s > slen || strend - s < slen - 1
 591                      || (strend - s == slen && strend[-1] != '\n')) {
 592                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 593                     goto fail_finish;
 594                 }
 595                 /* Now should match s[0..slen-2] */
 596                 slen--;
 597                 if (slen && (*SvPVX_const(check) != *s
 598                              || (slen > 1
 599                                  && memNE(SvPVX_const(check), s, slen)))) {
 600                   report_neq:
 601                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 602                     goto fail_finish;
 603                 }
 604             }
 605             else if (*SvPVX_const(check) != *s
 606                      || ((slen = SvCUR(check)) > 1
 607                          && memNE(SvPVX_const(check), s, slen)))
 608                 goto report_neq;
 609             check_at = s;
 610             goto success_at_start;
 611           }
 612         }
 613         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 614         s = strpos;
 615         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 616         end_shift = prog->check_end_shift;
 617
 618         if (!ml_anch) {
 619             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 620                                          - (SvTAIL(check) != 0);
 621             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 622
 623             if (end_shift < eshift)
 624                 end_shift = eshift;
 625         }
 626     }
 627     else {                              /* Can match at random position */
 628         ml_anch = 0;
 629         s = strpos;
 630         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 631         end_shift = prog->check_end_shift;
 632
 633         /* end shift should be non negative here */
 634     }
 635
 636 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 637     if (end_shift < 0)
 638         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 639                    (IV)end_shift, RX_PRECOMP(prog));
 640 #endif
 641
 642   restart:
 643     /* Find a possible match in the region s..strend by looking for
 644        the "check" substring in the region corrected by start/end_shift. */
 645
 646     {
 647         I32 srch_start_shift = start_shift;
 648         I32 srch_end_shift = end_shift;
 649         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 650             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 651             srch_start_shift = strbeg - s;
 652         }
 653     DEBUG_OPTIMISE_MORE_r({
 654         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 655             (IV)prog->check_offset_min,
 656             (IV)srch_start_shift,
 657             (IV)srch_end_shift,
 658             (IV)prog->check_end_shift);
 659     });
 660
 661     if (flags & REXEC_SCREAM) {
 662         I32 p = -1;                     /* Internal iterator of scream. */
 663         I32 * const pp = data ? data->scream_pos : &p;
 664
 665         if (PL_screamfirst[BmRARE(check)] >= 0
 666             || ( BmRARE(check) == '\n'
 667                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 668                  && SvTAIL(check) ))
 669             s = screaminstr(sv, check,
 670                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 671         else
 672             goto fail_finish;
 673         /* we may be pointing at the wrong string */
 674         if (s && RXp_MATCH_COPIED(prog))
 675             s = strbeg + (s - SvPVX_const(sv));
 676         if (data)
 677             *data->scream_olds = s;
 678     }
 679     else {
 680         U8* start_point;
 681         U8* end_point;
 682         if (prog->extflags & RXf_CANY_SEEN) {
 683             start_point= (U8*)(s + srch_start_shift);
 684             end_point= (U8*)(strend - srch_end_shift);
 685         } else {
 686             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 687             end_point= HOP3(strend, -srch_end_shift, strbeg);
 688         }
 689         DEBUG_OPTIMISE_MORE_r({
 690             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 691                 (int)(end_point - start_point),
 692                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 693                 start_point);
 694         });
 695
 696         s = fbm_instr( start_point, end_point,
 697                       check, multiline ? FBMrf_MULTILINE : 0);
 698     }
 699     }
 700     /* Update the count-of-usability, remove useless subpatterns,
 701         unshift s.  */
 702
 703     DEBUG_EXECUTE_r({
 704         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 705             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 706         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 707                           (s ? "Found" : "Did not find"),
 708             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 709                 ? "anchored" : "floating"),
 710             quoted,
 711             RE_SV_TAIL(check),
 712             (s ? " at offset " : "...\n") );
 713     });
 714
 715     if (!s)
 716         goto fail_finish;
 717     /* Finish the diagnostic message */
 718     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 719
 720     /* XXX dmq: first branch is for positive lookbehind...
 721        Our check string is offset from the beginning of the pattern.
 722        So we need to do any stclass tests offset forward from that
 723        point. I think. :-(
 724      */
 725
 726
 727
 728     check_at=s;
 729
 730
 731     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 732        Start with the other substr.
 733        XXXX no SCREAM optimization yet - and a very coarse implementation
 734        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 735                 *always* match.  Probably should be marked during compile...
 736        Probably it is right to do no SCREAM here...
 737      */
 738
 739     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 740                 : (prog->float_substr && prog->anchored_substr))
 741     {
 742         /* Take into account the "other" substring. */
 743         /* XXXX May be hopelessly wrong for UTF... */
 744         if (!other_last)
 745             other_last = strpos;
 746         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 747           do_other_anchored:
 748             {
 749                 char * const last = HOP3c(s, -start_shift, strbeg);
 750                 char *last1, *last2;
 751                 char * const saved_s = s;
 752                 SV* must;
 753
 754                 t = s - prog->check_offset_max;
 755                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 756                     && (!utf8_target
 757                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 758                             && t > strpos)))
 759                     NOOP;
 760                 else
 761                     t = strpos;
 762                 t = HOP3c(t, prog->anchored_offset, strend);
 763                 if (t < other_last)     /* These positions already checked */
 764                     t = other_last;
 765                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 766                 if (last < last1)
 767                     last1 = last;
 768                 /* XXXX It is not documented what units *_offsets are in.
 769                    We assume bytes, but this is clearly wrong.
 770                    Meaning this code needs to be carefully reviewed for errors.
 771                    dmq.
 772                   */
 773
 774                 /* On end-of-str: see comment below. */
 775                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 776                 if (must == &PL_sv_undef) {
 777                     s = (char*)NULL;
 778                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 779                 }
 780                 else
 781                     s = fbm_instr(
 782                         (unsigned char*)t,
 783                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 784                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 785                         must,
 786                         multiline ? FBMrf_MULTILINE : 0
 787                     );
 788                 DEBUG_EXECUTE_r({
 789                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 790                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 791                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 792                         (s ? "Found" : "Contradicts"),
 793                         quoted, RE_SV_TAIL(must));
 794                 });
 795
 796
 797                 if (!s) {
 798                     if (last1 >= last2) {
 799                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 800                                                 ", giving up...\n"));
 801                         goto fail_finish;
 802                     }
 803                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 804                         ", trying floating at offset %ld...\n",
 805                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 806                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 807                     s = HOP3c(last, 1, strend);
 808                     goto restart;
 809                 }
 810                 else {
 811                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 812                           (long)(s - i_strpos)));
 813                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 814                     other_last = HOP3c(s, 1, strend);
 815                     s = saved_s;
 816                     if (t == strpos)
 817                         goto try_at_start;
 818                     goto try_at_offset;
 819                 }
 820             }
 821         }
 822         else {          /* Take into account the floating substring. */
 823             char *last, *last1;
 824             char * const saved_s = s;
 825             SV* must;
 826
 827             t = HOP3c(s, -start_shift, strbeg);
 828             last1 = last =
 829                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 830             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 831                 last = HOP3c(t, prog->float_max_offset, strend);
 832             s = HOP3c(t, prog->float_min_offset, strend);
 833             if (s < other_last)
 834                 s = other_last;
 835  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 836             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 837             /* fbm_instr() takes into account exact value of end-of-str
 838                if the check is SvTAIL(ed).  Since false positives are OK,
 839                and end-of-str is not later than strend we are OK. */
 840             if (must == &PL_sv_undef) {
 841                 s = (char*)NULL;
 842                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 843             }
 844             else
 845                 s = fbm_instr((unsigned char*)s,
 846                               (unsigned char*)last + SvCUR(must)
 847                                   - (SvTAIL(must)!=0),
 848                               must, multiline ? FBMrf_MULTILINE : 0);
 849             DEBUG_EXECUTE_r({
 850                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 851                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 852                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 853                     (s ? "Found" : "Contradicts"),
 854                     quoted, RE_SV_TAIL(must));
 855             });
 856             if (!s) {
 857                 if (last1 == last) {
 858                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 859                                             ", giving up...\n"));
 860                     goto fail_finish;
 861                 }
 862                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 863                     ", trying anchored starting at offset %ld...\n",
 864                     (long)(saved_s + 1 - i_strpos)));
 865                 other_last = last;
 866                 s = HOP3c(t, 1, strend);
 867                 goto restart;
 868             }
 869             else {
 870                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 871                       (long)(s - i_strpos)));
 872                 other_last = s; /* Fix this later. --Hugo */
 873                 s = saved_s;
 874                 if (t == strpos)
 875                     goto try_at_start;
 876                 goto try_at_offset;
 877             }
 878         }
 879     }
 880
 881
 882     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 883
 884     DEBUG_OPTIMISE_MORE_r(
 885         PerlIO_printf(Perl_debug_log,
 886             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 887             (IV)prog->check_offset_min,
 888             (IV)prog->check_offset_max,
 889             (IV)(s-strpos),
 890             (IV)(t-strpos),
 891             (IV)(t-s),
 892             (IV)(strend-strpos)
 893         )
 894     );
 895
 896     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 897         && (!utf8_target
 898             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 899                  && t > strpos)))
 900     {
 901         /* Fixed substring is found far enough so that the match
 902            cannot start at strpos. */
 903       try_at_offset:
 904         if (ml_anch && t[-1] != '\n') {
 905             /* Eventually fbm_*() should handle this, but often
 906                anchored_offset is not 0, so this check will not be wasted. */
 907             /* XXXX In the code below we prefer to look for "^" even in
 908                presence of anchored substrings.  And we search even
 909                beyond the found float position.  These pessimizations
 910                are historical artefacts only.  */
 911           find_anchor:
 912             while (t < strend - prog->minlen) {
 913                 if (*t == '\n') {
 914                     if (t < check_at - prog->check_offset_min) {
 915                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 916                             /* Since we moved from the found position,
 917                                we definitely contradict the found anchored
 918                                substr.  Due to the above check we do not
 919                                contradict "check" substr.
 920                                Thus we can arrive here only if check substr
 921                                is float.  Redo checking for "other"=="fixed".
 922                              */
 923                             strpos = t + 1;
 924                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 925                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 926                             goto do_other_anchored;
 927                         }
 928                         /* We don't contradict the found floating substring. */
 929                         /* XXXX Why not check for STCLASS? */
 930                         s = t + 1;
 931                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 932                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 933                         goto set_useful;
 934                     }
 935                     /* Position contradicts check-string */
 936                     /* XXXX probably better to look for check-string
 937                        than for "\n", so one should lower the limit for t? */
 938                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 939                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 940                     other_last = strpos = s = t + 1;
 941                     goto restart;
 942                 }
 943                 t++;
 944             }
 945             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 946                         PL_colors[0], PL_colors[1]));
 947             goto fail_finish;
 948         }
 949         else {
 950             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 951                         PL_colors[0], PL_colors[1]));
 952         }
 953         s = t;
 954       set_useful:
 955         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 956     }
 957     else {
 958         /* The found string does not prohibit matching at strpos,
 959            - no optimization of calling REx engine can be performed,
 960            unless it was an MBOL and we are not after MBOL,
 961            or a future STCLASS check will fail this. */
 962       try_at_start:
 963         /* Even in this situation we may use MBOL flag if strpos is offset
 964            wrt the start of the string. */
 965         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 966             && (strpos != strbeg) && strpos[-1] != '\n'
 967             /* May be due to an implicit anchor of m{.*foo}  */
 968             && !(prog->intflags & PREGf_IMPLICIT))
 969         {
 970             t = strpos;
 971             goto find_anchor;
 972         }
 973         DEBUG_EXECUTE_r( if (ml_anch)
 974             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
 975                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
 976         );
 977       success_at_start:
 978         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
 979             && (utf8_target ? (
 980                 prog->check_utf8                /* Could be deleted already */
 981                 && --BmUSEFUL(prog->check_utf8) < 0
 982                 && (prog->check_utf8 == prog->float_utf8)
 983             ) : (
 984                 prog->check_substr              /* Could be deleted already */
 985                 && --BmUSEFUL(prog->check_substr) < 0
 986                 && (prog->check_substr == prog->float_substr)
 987             )))
 988         {
 989             /* If flags & SOMETHING - do not do it many times on the same match */
 990             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
 991             /* XXX Does the destruction order has to change with utf8_target? */
 992             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
 993             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
 994             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
 995             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
 996             check = NULL;                       /* abort */
 997             s = strpos;
 998             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevent flag
 999                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1000             if (prog->intflags & PREGf_IMPLICIT)
1001                 prog->extflags &= ~RXf_ANCH_MBOL;
1002             /* XXXX This is a remnant of the old implementation.  It
1003                     looks wasteful, since now INTUIT can use many
1004                     other heuristics. */
1005             prog->extflags &= ~RXf_USE_INTUIT;
1006             /* XXXX What other flags might need to be cleared in this branch? */
1007         }
1008         else
1009             s = strpos;
1010     }
1011
1012     /* Last resort... */
1013     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1014     /* trie stclasses are too expensive to use here, we are better off to
1015        leave it to regmatch itself */
1016     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1017         /* minlen == 0 is possible if regstclass is \b or \B,
1018            and the fixed substr is ''$.
1019            Since minlen is already taken into account, s+1 is before strend;
1020            accidentally, minlen >= 1 guaranties no false positives at s + 1
1021            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1022            regstclass does not come from lookahead...  */
1023         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1024            This leaves EXACTF only, which is dealt with in find_byclass().  */
1025         const U8* const str = (U8*)STRING(progi->regstclass);
1026         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1027                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1028                     : 1);
1029         char * endpos;
1030         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1031             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1032         else if (prog->float_substr || prog->float_utf8)
1033             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1034         else
1035             endpos= strend;
1036
1037         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1038                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1039
1040         t = s;
1041         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1042         if (!s) {
1043 #ifdef DEBUGGING
1044             const char *what = NULL;
1045 #endif
1046             if (endpos == strend) {
1047                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1048                                 "Could not match STCLASS...\n") );
1049                 goto fail;
1050             }
1051             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1052                                    "This position contradicts STCLASS...\n") );
1053             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1054                 goto fail;
1055             /* Contradict one of substrings */
1056             if (prog->anchored_substr || prog->anchored_utf8) {
1057                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1058                     DEBUG_EXECUTE_r( what = "anchored" );
1059                   hop_and_restart:
1060                     s = HOP3c(t, 1, strend);
1061                     if (s + start_shift + end_shift > strend) {
1062                         /* XXXX Should be taken into account earlier? */
1063                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1064                                                "Could not match STCLASS...\n") );
1065                         goto fail;
1066                     }
1067                     if (!check)
1068                         goto giveup;
1069                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1070                                 "Looking for %s substr starting at offset %ld...\n",
1071                                  what, (long)(s + start_shift - i_strpos)) );
1072                     goto restart;
1073                 }
1074                 /* Have both, check_string is floating */
1075                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1076                     goto retry_floating_check;
1077                 /* Recheck anchored substring, but not floating... */
1078                 s = check_at;
1079                 if (!check)
1080                     goto giveup;
1081                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1082                           "Looking for anchored substr starting at offset %ld...\n",
1083                           (long)(other_last - i_strpos)) );
1084                 goto do_other_anchored;
1085             }
1086             /* Another way we could have checked stclass at the
1087                current position only: */
1088             if (ml_anch) {
1089                 s = t = t + 1;
1090                 if (!check)
1091                     goto giveup;
1092                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1093                           "Looking for /%s^%s/m starting at offset %ld...\n",
1094                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1095                 goto try_at_offset;
1096             }
1097             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1098                 goto fail;
1099             /* Check is floating subtring. */
1100           retry_floating_check:
1101             t = check_at - start_shift;
1102             DEBUG_EXECUTE_r( what = "floating" );
1103             goto hop_and_restart;
1104         }
1105         if (t != s) {
1106             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1107                         "By STCLASS: moving %ld --> %ld\n",
1108                                   (long)(t - i_strpos), (long)(s - i_strpos))
1109                    );
1110         }
1111         else {
1112             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1113                                   "Does not contradict STCLASS...\n");
1114                    );
1115         }
1116     }
1117   giveup:
1118     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1119                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1120                           PL_colors[5], (long)(s - i_strpos)) );
1121     return s;
1122
1123   fail_finish:                          /* Substring not found */
1124     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1125         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1126   fail:
1127     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1128                           PL_colors[4], PL_colors[5]));
1129     return NULL;
1130 }
1131
1132 #define DECL_TRIE_TYPE(scan) \
1133     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1134                     trie_type = (scan->flags != EXACT) \
1135                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1136                               : (utf8_target ? trie_utf8 : trie_plain)
1137
1138 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1139 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1140     switch (trie_type) {                                                    \
1141     case trie_utf8_fold:                                                    \
1142         if ( foldlen>0 ) {                                                  \
1143             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1144             foldlen -= len;                                                 \
1145             uscan += len;                                                   \
1146             len=0;                                                          \
1147         } else {                                                            \
1148             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1149             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1150             foldlen -= UNISKIP( uvc );                                      \
1151             uscan = foldbuf + UNISKIP( uvc );                               \
1152         }                                                                   \
1153         break;                                                              \
1154     case trie_latin_utf8_fold:                                              \
1155         if ( foldlen>0 ) {                                                  \
1156             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1157             foldlen -= len;                                                 \
1158             uscan += len;                                                   \
1159             len=0;                                                          \
1160         } else {                                                            \
1161             len = 1;                                                        \
1162             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1163             foldlen -= UNISKIP( uvc );                                      \
1164             uscan = foldbuf + UNISKIP( uvc );                               \
1165         }                                                                   \
1166         break;                                                              \
1167     case trie_utf8:                                                         \
1168         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1169         break;                                                              \
1170     case trie_plain:                                                        \
1171         uvc = (UV)*uc;                                                      \
1172         len = 1;                                                            \
1173     }                                                                       \
1174     if (uvc < 256) {                                                        \
1175         charid = trie->charmap[ uvc ];                                      \
1176     }                                                                       \
1177     else {                                                                  \
1178         charid = 0;                                                         \
1179         if (widecharmap) {                                                  \
1180             SV** const svpp = hv_fetch(widecharmap,                         \
1181                         (char*)&uvc, sizeof(UV), 0);                        \
1182             if (svpp)                                                       \
1183                 charid = (U16)SvIV(*svpp);                                  \
1184         }                                                                   \
1185     }                                                                       \
1186 } STMT_END
1187
1188 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1189 {                                                      \
1190     char *my_strend= (char *)strend;                   \
1191     if ( (CoNd)                                        \
1192          && (ln == len ||                              \
1193              foldEQ_utf8(s, &my_strend, 0,  utf8_target,   \
1194                         m, NULL, ln, cBOOL(UTF_PATTERN)))      \
1195          && (!reginfo || regtry(reginfo, &s)) )        \
1196         goto got_it;                                   \
1197     else {                                             \
1198          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1199          uvchr_to_utf8(tmpbuf, c);                     \
1200          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1201          if ( f != c                                   \
1202               && (f == c1 || f == c2)                  \
1203               && (ln == len ||                         \
1204                 foldEQ_utf8(s, &my_strend, 0,  utf8_target,\
1205                               m, NULL, ln, cBOOL(UTF_PATTERN)))\
1206               && (!reginfo || regtry(reginfo, &s)) )   \
1207               goto got_it;                             \
1208     }                                                  \
1209 }                                                      \
1210 s += len
1211
1212 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1213 STMT_START {                                              \
1214     while (s <= e) {                                      \
1215         if ( (CoNd)                                       \
1216              && (ln == 1 || (OP(c) == EXACTF             \
1217                               ? foldEQ(s, m, ln)           \
1218                               : foldEQ_locale(s, m, ln)))  \
1219              && (!reginfo || regtry(reginfo, &s)) )        \
1220             goto got_it;                                  \
1221         s++;                                              \
1222     }                                                     \
1223 } STMT_END
1224
1225 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1226 STMT_START {                                          \
1227     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1228         CoDe                                          \
1229         s += uskip;                                   \
1230     }                                                 \
1231 } STMT_END
1232
1233 #define REXEC_FBC_SCAN(CoDe)                          \
1234 STMT_START {                                          \
1235     while (s < strend) {                              \
1236         CoDe                                          \
1237         s++;                                          \
1238     }                                                 \
1239 } STMT_END
1240
1241 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1242 REXEC_FBC_UTF8_SCAN(                                  \
1243     if (CoNd) {                                       \
1244         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1245             goto got_it;                              \
1246         else                                          \
1247             tmp = doevery;                            \
1248     }                                                 \
1249     else                                              \
1250         tmp = 1;                                      \
1251 )
1252
1253 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1254 REXEC_FBC_SCAN(                                       \
1255     if (CoNd) {                                       \
1256         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1257             goto got_it;                              \
1258         else                                          \
1259             tmp = doevery;                            \
1260     }                                                 \
1261     else                                              \
1262         tmp = 1;                                      \
1263 )
1264
1265 #define REXEC_FBC_TRYIT               \
1266 if ((!reginfo || regtry(reginfo, &s))) \
1267     goto got_it
1268
1269 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1270     if (utf8_target) {                                             \
1271         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1272     }                                                          \
1273     else {                                                     \
1274         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1275     }                                                          \
1276     break
1277
1278 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1279     if (utf8_target) {                                             \
1280         UtFpReLoAd;                                            \
1281         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1282     }                                                          \
1283     else {                                                     \
1284         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1285     }                                                          \
1286     break
1287
1288 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1289     PL_reg_flags |= RF_tainted;                                \
1290     if (utf8_target) {                                             \
1291         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1292     }                                                          \
1293     else {                                                     \
1294         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1295     }                                                          \
1296     break
1297
1298 #define DUMP_EXEC_POS(li,s,doutf8) \
1299     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1300
1301 /* We know what class REx starts with.  Try to find this position... */
1302 /* if reginfo is NULL, its a dryrun */
1303 /* annoyingly all the vars in this routine have different names from their counterparts
1304    in regmatch. /grrr */
1305
1306 STATIC char *
1307 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1308     const char *strend, regmatch_info *reginfo)
1309 {
1310         dVAR;
1311         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1312         char *m;
1313         STRLEN ln;
1314         STRLEN lnc;
1315         register STRLEN uskip;
1316         unsigned int c1;
1317         unsigned int c2;
1318         char *e;
1319         register I32 tmp = 1;   /* Scratch variable? */
1320         register const bool utf8_target = PL_reg_match_utf8;
1321         RXi_GET_DECL(prog,progi);
1322
1323         PERL_ARGS_ASSERT_FIND_BYCLASS;
1324
1325         /* We know what class it must start with. */
1326         switch (OP(c)) {
1327         case ANYOF:
1328             if (utf8_target) {
1329                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1330                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1331                           reginclass(prog, c, (U8*)s, 0, utf8_target) :
1332                           REGINCLASS(prog, c, (U8*)s));
1333             }
1334             else {
1335                  while (s < strend) {
1336                       STRLEN skip = 1;
1337
1338                       if (REGINCLASS(prog, c, (U8*)s) ||
1339                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1340                            /* The assignment of 2 is intentional:
1341                             * for the folded sharp s, the skip is 2. */
1342                            (skip = SHARP_S_SKIP))) {
1343                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1344                                 goto got_it;
1345                            else
1346                                 tmp = doevery;
1347                       }
1348                       else
1349                            tmp = 1;
1350                       s += skip;
1351                  }
1352             }
1353             break;
1354         case CANY:
1355             REXEC_FBC_SCAN(
1356                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1357                     goto got_it;
1358                 else
1359                     tmp = doevery;
1360             );
1361             break;
1362         case EXACTF:
1363             m   = STRING(c);
1364             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1365             lnc = (I32) ln;     /* length to match in characters */
1366             if (UTF_PATTERN) {
1367                 STRLEN ulen1, ulen2;
1368                 U8 *sm = (U8 *) m;
1369                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1370                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1371                 /* used by commented-out code below */
1372                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1373
1374                 /* XXX: Since the node will be case folded at compile
1375                    time this logic is a little odd, although im not
1376                    sure that its actually wrong. --dmq */
1377
1378                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1379                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1380
1381                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1382                    codepoint of the first character in the converted
1383                    form, yet originally we did the extra step.
1384                    No tests fail by commenting this code out however
1385                    so Ive left it out. -- dmq.
1386
1387                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1388                                     0, uniflags);
1389                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1390                                     0, uniflags);
1391                 */
1392
1393                 lnc = 0;
1394                 while (sm < ((U8 *) m + ln)) {
1395                     lnc++;
1396                     sm += UTF8SKIP(sm);
1397                 }
1398             }
1399             else {
1400                 c1 = *(U8*)m;
1401                 c2 = PL_fold[c1];
1402             }
1403             goto do_exactf;
1404         case EXACTFL:
1405             m   = STRING(c);
1406             ln  = STR_LEN(c);
1407             lnc = (I32) ln;
1408             c1 = *(U8*)m;
1409             c2 = PL_fold_locale[c1];
1410           do_exactf:
1411             e = HOP3c(strend, -((I32)lnc), s);
1412
1413             if (!reginfo && e < s)
1414                 e = s;                  /* Due to minlen logic of intuit() */
1415
1416             /* The idea in the EXACTF* cases is to first find the
1417              * first character of the EXACTF* node and then, if
1418              * necessary, case-insensitively compare the full
1419              * text of the node.  The c1 and c2 are the first
1420              * characters (though in Unicode it gets a bit
1421              * more complicated because there are more cases
1422              * than just upper and lower: one needs to use
1423              * the so-called folding case for case-insensitive
1424              * matching (called "loose matching" in Unicode).
1425              * foldEQ_utf8() will do just that. */
1426
1427             if (utf8_target || UTF_PATTERN) {
1428                 UV c, f;
1429                 U8 tmpbuf [UTF8_MAXBYTES+1];
1430                 STRLEN len = 1;
1431                 STRLEN foldlen;
1432                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1433                 if (c1 == c2) {
1434                     /* Upper and lower of 1st char are equal -
1435                      * probably not a "letter". */
1436                     while (s <= e) {
1437                         if (utf8_target) {
1438                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1439                                            uniflags);
1440                         } else {
1441                             c = *((U8*)s);
1442                         }
1443                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1444                     }
1445                 }
1446                 else {
1447                     while (s <= e) {
1448                         if (utf8_target) {
1449                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1450                                            uniflags);
1451                         } else {
1452                             c = *((U8*)s);
1453                         }
1454
1455                         /* Handle some of the three Greek sigmas cases.
1456                          * Note that not all the possible combinations
1457                          * are handled here: some of them are handled
1458                          * by the standard folding rules, and some of
1459                          * them (the character class or ANYOF cases)
1460                          * are handled during compiletime in
1461                          * regexec.c:S_regclass(). */
1462                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1463                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1464                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1465
1466                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1467                     }
1468                 }
1469             }
1470             else {
1471                 /* Neither pattern nor string are UTF8 */
1472                 if (c1 == c2)
1473                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1474                 else
1475                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1476             }
1477             break;
1478         case BOUNDL:
1479             PL_reg_flags |= RF_tainted;
1480             /* FALL THROUGH */
1481         case BOUND:
1482             if (utf8_target) {
1483                 if (s == PL_bostr)
1484                     tmp = '\n';
1485                 else {
1486                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1487                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1488                 }
1489                 tmp = ((OP(c) == BOUND ?
1490                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1491                 LOAD_UTF8_CHARCLASS_ALNUM();
1492                 REXEC_FBC_UTF8_SCAN(
1493                     if (tmp == !(OP(c) == BOUND ?
1494                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1495                                  isALNUM_LC_utf8((U8*)s)))
1496                     {
1497                         tmp = !tmp;
1498                         REXEC_FBC_TRYIT;
1499                 }
1500                 );
1501             }
1502             else {
1503                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1504                 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1505                 REXEC_FBC_SCAN(
1506                     if (tmp ==
1507                         !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1508                         tmp = !tmp;
1509                         REXEC_FBC_TRYIT;
1510                 }
1511                 );
1512             }
1513             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1514                 goto got_it;
1515             break;
1516         case NBOUNDL:
1517             PL_reg_flags |= RF_tainted;
1518             /* FALL THROUGH */
1519         case NBOUND:
1520             if (utf8_target) {
1521                 if (s == PL_bostr)
1522                     tmp = '\n';
1523                 else {
1524                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1525                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1526                 }
1527                 tmp = ((OP(c) == NBOUND ?
1528                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1529                 LOAD_UTF8_CHARCLASS_ALNUM();
1530                 REXEC_FBC_UTF8_SCAN(
1531                     if (tmp == !(OP(c) == NBOUND ?
1532                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1533                                  isALNUM_LC_utf8((U8*)s)))
1534                         tmp = !tmp;
1535                     else REXEC_FBC_TRYIT;
1536                 );
1537             }
1538             else {
1539                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1540                 tmp = ((OP(c) == NBOUND ?
1541                         isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1542                 REXEC_FBC_SCAN(
1543                     if (tmp ==
1544                         !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1545                         tmp = !tmp;
1546                     else REXEC_FBC_TRYIT;
1547                 );
1548             }
1549             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1550                 goto got_it;
1551             break;
1552         case ALNUM:
1553             REXEC_FBC_CSCAN_PRELOAD(
1554                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1555                 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1556                 isALNUM(*s)
1557             );
1558         case ALNUML:
1559             REXEC_FBC_CSCAN_TAINT(
1560                 isALNUM_LC_utf8((U8*)s),
1561                 isALNUM_LC(*s)
1562             );
1563         case NALNUM:
1564             REXEC_FBC_CSCAN_PRELOAD(
1565                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1566                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1567                 !isALNUM(*s)
1568             );
1569         case NALNUML:
1570             REXEC_FBC_CSCAN_TAINT(
1571                 !isALNUM_LC_utf8((U8*)s),
1572                 !isALNUM_LC(*s)
1573             );
1574         case SPACE:
1575             REXEC_FBC_CSCAN_PRELOAD(
1576                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1577                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1578                 isSPACE(*s)
1579             );
1580         case SPACEL:
1581             REXEC_FBC_CSCAN_TAINT(
1582                 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1583                 isSPACE_LC(*s)
1584             );
1585         case NSPACE:
1586             REXEC_FBC_CSCAN_PRELOAD(
1587                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1588                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1589                 !isSPACE(*s)
1590             );
1591         case NSPACEL:
1592             REXEC_FBC_CSCAN_TAINT(
1593                 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1594                 !isSPACE_LC(*s)
1595             );
1596         case DIGIT:
1597             REXEC_FBC_CSCAN_PRELOAD(
1598                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1599                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1600                 isDIGIT(*s)
1601             );
1602         case DIGITL:
1603             REXEC_FBC_CSCAN_TAINT(
1604                 isDIGIT_LC_utf8((U8*)s),
1605                 isDIGIT_LC(*s)
1606             );
1607         case NDIGIT:
1608             REXEC_FBC_CSCAN_PRELOAD(
1609                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1610                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1611                 !isDIGIT(*s)
1612             );
1613         case NDIGITL:
1614             REXEC_FBC_CSCAN_TAINT(
1615                 !isDIGIT_LC_utf8((U8*)s),
1616                 !isDIGIT_LC(*s)
1617             );
1618         case LNBREAK:
1619             REXEC_FBC_CSCAN(
1620                 is_LNBREAK_utf8(s),
1621                 is_LNBREAK_latin1(s)
1622             );
1623         case VERTWS:
1624             REXEC_FBC_CSCAN(
1625                 is_VERTWS_utf8(s),
1626                 is_VERTWS_latin1(s)
1627             );
1628         case NVERTWS:
1629             REXEC_FBC_CSCAN(
1630                 !is_VERTWS_utf8(s),
1631                 !is_VERTWS_latin1(s)
1632             );
1633         case HORIZWS:
1634             REXEC_FBC_CSCAN(
1635                 is_HORIZWS_utf8(s),
1636                 is_HORIZWS_latin1(s)
1637             );
1638         case NHORIZWS:
1639             REXEC_FBC_CSCAN(
1640                 !is_HORIZWS_utf8(s),
1641                 !is_HORIZWS_latin1(s)
1642             );
1643         case AHOCORASICKC:
1644         case AHOCORASICK:
1645             {
1646                 DECL_TRIE_TYPE(c);
1647                 /* what trie are we using right now */
1648                 reg_ac_data *aho
1649                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1650                 reg_trie_data *trie
1651                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1652                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1653
1654                 const char *last_start = strend - trie->minlen;
1655 #ifdef DEBUGGING
1656                 const char *real_start = s;
1657 #endif
1658                 STRLEN maxlen = trie->maxlen;
1659                 SV *sv_points;
1660                 U8 **points; /* map of where we were in the input string
1661                                 when reading a given char. For ASCII this
1662                                 is unnecessary overhead as the relationship
1663                                 is always 1:1, but for Unicode, especially
1664                                 case folded Unicode this is not true. */
1665                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1666                 U8 *bitmap=NULL;
1667
1668
1669                 GET_RE_DEBUG_FLAGS_DECL;
1670
1671                 /* We can't just allocate points here. We need to wrap it in
1672                  * an SV so it gets freed properly if there is a croak while
1673                  * running the match */
1674                 ENTER;
1675                 SAVETMPS;
1676                 sv_points=newSV(maxlen * sizeof(U8 *));
1677                 SvCUR_set(sv_points,
1678                     maxlen * sizeof(U8 *));
1679                 SvPOK_on(sv_points);
1680                 sv_2mortal(sv_points);
1681                 points=(U8**)SvPV_nolen(sv_points );
1682                 if ( trie_type != trie_utf8_fold
1683                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1684                 {
1685                     if (trie->bitmap)
1686                         bitmap=(U8*)trie->bitmap;
1687                     else
1688                         bitmap=(U8*)ANYOF_BITMAP(c);
1689                 }
1690                 /* this is the Aho-Corasick algorithm modified a touch
1691                    to include special handling for long "unknown char"
1692                    sequences. The basic idea being that we use AC as long
1693                    as we are dealing with a possible matching char, when
1694                    we encounter an unknown char (and we have not encountered
1695                    an accepting state) we scan forward until we find a legal
1696                    starting char.
1697                    AC matching is basically that of trie matching, except
1698                    that when we encounter a failing transition, we fall back
1699                    to the current states "fail state", and try the current char
1700                    again, a process we repeat until we reach the root state,
1701                    state 1, or a legal transition. If we fail on the root state
1702                    then we can either terminate if we have reached an accepting
1703                    state previously, or restart the entire process from the beginning
1704                    if we have not.
1705
1706                  */
1707                 while (s <= last_start) {
1708                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1709                     U8 *uc = (U8*)s;
1710                     U16 charid = 0;
1711                     U32 base = 1;
1712                     U32 state = 1;
1713                     UV uvc = 0;
1714                     STRLEN len = 0;
1715                     STRLEN foldlen = 0;
1716                     U8 *uscan = (U8*)NULL;
1717                     U8 *leftmost = NULL;
1718 #ifdef DEBUGGING
1719                     U32 accepted_word= 0;
1720 #endif
1721                     U32 pointpos = 0;
1722
1723                     while ( state && uc <= (U8*)strend ) {
1724                         int failed=0;
1725                         U32 word = aho->states[ state ].wordnum;
1726
1727                         if( state==1 ) {
1728                             if ( bitmap ) {
1729                                 DEBUG_TRIE_EXECUTE_r(
1730                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1731                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1732                                             (char *)uc, utf8_target );
1733                                         PerlIO_printf( Perl_debug_log,
1734                                             " Scanning for legal start char...\n");
1735                                     }
1736                                 );
1737                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1738                                     uc++;
1739                                 }
1740                                 s= (char *)uc;
1741                             }
1742                             if (uc >(U8*)last_start) break;
1743                         }
1744
1745                         if ( word ) {
1746                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1747                             if (!leftmost || lpos < leftmost) {
1748                                 DEBUG_r(accepted_word=word);
1749                                 leftmost= lpos;
1750                             }
1751                             if (base==0) break;
1752
1753                         }
1754                         points[pointpos++ % maxlen]= uc;
1755                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1756                                              uscan, len, uvc, charid, foldlen,
1757                                              foldbuf, uniflags);
1758                         DEBUG_TRIE_EXECUTE_r({
1759                             dump_exec_pos( (char *)uc, c, strend, real_start,
1760                                 s,   utf8_target );
1761                             PerlIO_printf(Perl_debug_log,
1762                                 " Charid:%3u CP:%4"UVxf" ",
1763                                  charid, uvc);
1764                         });
1765
1766                         do {
1767 #ifdef DEBUGGING
1768                             word = aho->states[ state ].wordnum;
1769 #endif
1770                             base = aho->states[ state ].trans.base;
1771
1772                             DEBUG_TRIE_EXECUTE_r({
1773                                 if (failed)
1774                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1775                                         s,   utf8_target );
1776                                 PerlIO_printf( Perl_debug_log,
1777                                     "%sState: %4"UVxf", word=%"UVxf,
1778                                     failed ? " Fail transition to " : "",
1779                                     (UV)state, (UV)word);
1780                             });
1781                             if ( base ) {
1782                                 U32 tmp;
1783                                 I32 offset;
1784                                 if (charid &&
1785                                      ( ((offset = base + charid
1786                                         - 1 - trie->uniquecharcount)) >= 0)
1787                                      && ((U32)offset < trie->lasttrans)
1788                                      && trie->trans[offset].check == state
1789                                      && (tmp=trie->trans[offset].next))
1790                                 {
1791                                     DEBUG_TRIE_EXECUTE_r(
1792                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1793                                     state = tmp;
1794                                     break;
1795                                 }
1796                                 else {
1797                                     DEBUG_TRIE_EXECUTE_r(
1798                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1799                                     failed = 1;
1800                                     state = aho->fail[state];
1801                                 }
1802                             }
1803                             else {
1804                                 /* we must be accepting here */
1805                                 DEBUG_TRIE_EXECUTE_r(
1806                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1807                                 failed = 1;
1808                                 break;
1809                             }
1810                         } while(state);
1811                         uc += len;
1812                         if (failed) {
1813                             if (leftmost)
1814                                 break;
1815                             if (!state) state = 1;
1816                         }
1817                     }
1818                     if ( aho->states[ state ].wordnum ) {
1819                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1820                         if (!leftmost || lpos < leftmost) {
1821                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1822                             leftmost = lpos;
1823                         }
1824                     }
1825                     if (leftmost) {
1826                         s = (char*)leftmost;
1827                         DEBUG_TRIE_EXECUTE_r({
1828                             PerlIO_printf(
1829                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1830                                 (UV)accepted_word, (IV)(s - real_start)
1831                             );
1832                         });
1833                         if (!reginfo || regtry(reginfo, &s)) {
1834                             FREETMPS;
1835                             LEAVE;
1836                             goto got_it;
1837                         }
1838                         s = HOPc(s,1);
1839                         DEBUG_TRIE_EXECUTE_r({
1840                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1841                         });
1842                     } else {
1843                         DEBUG_TRIE_EXECUTE_r(
1844                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1845                         break;
1846                     }
1847                 }
1848                 FREETMPS;
1849                 LEAVE;
1850             }
1851             break;
1852         default:
1853             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1854             break;
1855         }
1856         return 0;
1857       got_it:
1858         return s;
1859 }
1860
1861
1862 /*
1863  - regexec_flags - match a regexp against a string
1864  */
1865 I32
1866 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1867               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1868 /* strend: pointer to null at end of string */
1869 /* strbeg: real beginning of string */
1870 /* minend: end of match must be >=minend after stringarg. */
1871 /* data: May be used for some additional optimizations.
1872          Currently its only used, with a U32 cast, for transmitting
1873          the ganch offset when doing a /g match. This will change */
1874 /* nosave: For optimizations. */
1875 {
1876     dVAR;
1877     struct regexp *const prog = (struct regexp *)SvANY(rx);
1878     /*register*/ char *s;
1879     register regnode *c;
1880     /*register*/ char *startpos = stringarg;
1881     I32 minlen;         /* must match at least this many chars */
1882     I32 dontbother = 0; /* how many characters not to try at end */
1883     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1884     I32 scream_pos = -1;                /* Internal iterator of scream. */
1885     char *scream_olds = NULL;
1886     const bool utf8_target = cBOOL(DO_UTF8(sv));
1887     I32 multiline;
1888     RXi_GET_DECL(prog,progi);
1889     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1890     regexp_paren_pair *swap = NULL;
1891     GET_RE_DEBUG_FLAGS_DECL;
1892
1893     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1894     PERL_UNUSED_ARG(data);
1895
1896     /* Be paranoid... */
1897     if (prog == NULL || startpos == NULL) {
1898         Perl_croak(aTHX_ "NULL regexp parameter");
1899         return 0;
1900     }
1901
1902     multiline = prog->extflags & RXf_PMf_MULTILINE;
1903     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1904
1905     RX_MATCH_UTF8_set(rx, utf8_target);
1906     DEBUG_EXECUTE_r(
1907         debug_start_match(rx, utf8_target, startpos, strend,
1908         "Matching");
1909     );
1910
1911     minlen = prog->minlen;
1912
1913     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1914         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1915                               "String too short [regexec_flags]...\n"));
1916         goto phooey;
1917     }
1918
1919
1920     /* Check validity of program. */
1921     if (UCHARAT(progi->program) != REG_MAGIC) {
1922         Perl_croak(aTHX_ "corrupted regexp program");
1923     }
1924
1925     PL_reg_flags = 0;
1926     PL_reg_eval_set = 0;
1927     PL_reg_maxiter = 0;
1928
1929     if (RX_UTF8(rx))
1930         PL_reg_flags |= RF_utf8;
1931
1932     /* Mark beginning of line for ^ and lookbehind. */
1933     reginfo.bol = startpos; /* XXX not used ??? */
1934     PL_bostr  = strbeg;
1935     reginfo.sv = sv;
1936
1937     /* Mark end of line for $ (and such) */
1938     PL_regeol = strend;
1939
1940     /* see how far we have to get to not match where we matched before */
1941     reginfo.till = startpos+minend;
1942
1943     /* If there is a "must appear" string, look for it. */
1944     s = startpos;
1945
1946     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1947         MAGIC *mg;
1948         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
1949             reginfo.ganch = startpos + prog->gofs;
1950             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1951               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1952         } else if (sv && SvTYPE(sv) >= SVt_PVMG
1953                   && SvMAGIC(sv)
1954                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1955                   && mg->mg_len >= 0) {
1956             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
1957             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1958                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
1959
1960             if (prog->extflags & RXf_ANCH_GPOS) {
1961                 if (s > reginfo.ganch)
1962                     goto phooey;
1963                 s = reginfo.ganch - prog->gofs;
1964                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1965                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
1966                 if (s < strbeg)
1967                     goto phooey;
1968             }
1969         }
1970         else if (data) {
1971             reginfo.ganch = strbeg + PTR2UV(data);
1972             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1973                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1974
1975         } else {                                /* pos() not defined */
1976             reginfo.ganch = strbeg;
1977             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1978                  "GPOS: reginfo.ganch = strbeg\n"));
1979         }
1980     }
1981     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1982         /* We have to be careful. If the previous successful match
1983            was from this regex we don't want a subsequent partially
1984            successful match to clobber the old results.
1985            So when we detect this possibility we add a swap buffer
1986            to the re, and switch the buffer each match. If we fail
1987            we switch it back, otherwise we leave it swapped.
1988         */
1989         swap = prog->offs;
1990         /* do we need a save destructor here for eval dies? */
1991         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
1992     }
1993     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1994         re_scream_pos_data d;
1995
1996         d.scream_olds = &scream_olds;
1997         d.scream_pos = &scream_pos;
1998         s = re_intuit_start(rx, sv, s, strend, flags, &d);
1999         if (!s) {
2000             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2001             goto phooey;        /* not present */
2002         }
2003     }
2004
2005
2006
2007     /* Simplest case:  anchored match need be tried only once. */
2008     /*  [unless only anchor is BOL and multiline is set] */
2009     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2010         if (s == startpos && regtry(&reginfo, &startpos))
2011             goto got_it;
2012         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2013                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2014         {
2015             char *end;
2016
2017             if (minlen)
2018                 dontbother = minlen - 1;
2019             end = HOP3c(strend, -dontbother, strbeg) - 1;
2020             /* for multiline we only have to try after newlines */
2021             if (prog->check_substr || prog->check_utf8) {
2022                 if (s == startpos)
2023                     goto after_try;
2024                 while (1) {
2025                     if (regtry(&reginfo, &s))
2026                         goto got_it;
2027                   after_try:
2028                     if (s > end)
2029                         goto phooey;
2030                     if (prog->extflags & RXf_USE_INTUIT) {
2031                         s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2032                         if (!s)
2033                             goto phooey;
2034                     }
2035                     else
2036                         s++;
2037                 }
2038             } else {
2039                 if (s > startpos)
2040                     s--;
2041                 while (s < end) {
2042                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2043                         if (regtry(&reginfo, &s))
2044                             goto got_it;
2045                     }
2046                 }
2047             }
2048         }
2049         goto phooey;
2050     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2051     {
2052         /* the warning about reginfo.ganch being used without intialization
2053            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2054            and we only enter this block when the same bit is set. */
2055         char *tmp_s = reginfo.ganch - prog->gofs;
2056
2057         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2058             goto got_it;
2059         goto phooey;
2060     }
2061
2062     /* Messy cases:  unanchored match. */
2063     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2064         /* we have /x+whatever/ */
2065         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2066         char ch;
2067 #ifdef DEBUGGING
2068         int did_match = 0;
2069 #endif
2070         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2071             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2072         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2073
2074         if (utf8_target) {
2075             REXEC_FBC_SCAN(
2076                 if (*s == ch) {
2077                     DEBUG_EXECUTE_r( did_match = 1 );
2078                     if (regtry(&reginfo, &s)) goto got_it;
2079                     s += UTF8SKIP(s);
2080                     while (s < strend && *s == ch)
2081                         s += UTF8SKIP(s);
2082                 }
2083             );
2084         }
2085         else {
2086             REXEC_FBC_SCAN(
2087                 if (*s == ch) {
2088                     DEBUG_EXECUTE_r( did_match = 1 );
2089                     if (regtry(&reginfo, &s)) goto got_it;
2090                     s++;
2091                     while (s < strend && *s == ch)
2092                         s++;
2093                 }
2094             );
2095         }
2096         DEBUG_EXECUTE_r(if (!did_match)
2097                 PerlIO_printf(Perl_debug_log,
2098                                   "Did not find anchored character...\n")
2099                );
2100     }
2101     else if (prog->anchored_substr != NULL
2102               || prog->anchored_utf8 != NULL
2103               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2104                   && prog->float_max_offset < strend - s)) {
2105         SV *must;
2106         I32 back_max;
2107         I32 back_min;
2108         char *last;
2109         char *last1;            /* Last position checked before */
2110 #ifdef DEBUGGING
2111         int did_match = 0;
2112 #endif
2113         if (prog->anchored_substr || prog->anchored_utf8) {
2114             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2115                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2116             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2117             back_max = back_min = prog->anchored_offset;
2118         } else {
2119             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2120                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2121             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2122             back_max = prog->float_max_offset;
2123             back_min = prog->float_min_offset;
2124         }
2125
2126
2127         if (must == &PL_sv_undef)
2128             /* could not downgrade utf8 check substring, so must fail */
2129             goto phooey;
2130
2131         if (back_min<0) {
2132             last = strend;
2133         } else {
2134             last = HOP3c(strend,        /* Cannot start after this */
2135                   -(I32)(CHR_SVLEN(must)
2136                          - (SvTAIL(must) != 0) + back_min), strbeg);
2137         }
2138         if (s > PL_bostr)
2139             last1 = HOPc(s, -1);
2140         else
2141             last1 = s - 1;      /* bogus */
2142
2143         /* XXXX check_substr already used to find "s", can optimize if
2144            check_substr==must. */
2145         scream_pos = -1;
2146         dontbother = end_shift;
2147         strend = HOPc(strend, -dontbother);
2148         while ( (s <= last) &&
2149                 ((flags & REXEC_SCREAM)
2150                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2151                                     end_shift, &scream_pos, 0))
2152                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2153                                   (unsigned char*)strend, must,
2154                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2155             /* we may be pointing at the wrong string */
2156             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2157                 s = strbeg + (s - SvPVX_const(sv));
2158             DEBUG_EXECUTE_r( did_match = 1 );
2159             if (HOPc(s, -back_max) > last1) {
2160                 last1 = HOPc(s, -back_min);
2161                 s = HOPc(s, -back_max);
2162             }
2163             else {
2164                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2165
2166                 last1 = HOPc(s, -back_min);
2167                 s = t;
2168             }
2169             if (utf8_target) {
2170                 while (s <= last1) {
2171                     if (regtry(&reginfo, &s))
2172                         goto got_it;
2173                     s += UTF8SKIP(s);
2174                 }
2175             }
2176             else {
2177                 while (s <= last1) {
2178                     if (regtry(&reginfo, &s))
2179                         goto got_it;
2180                     s++;
2181                 }
2182             }
2183         }
2184         DEBUG_EXECUTE_r(if (!did_match) {
2185             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2186                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2187             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2188                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2189                                ? "anchored" : "floating"),
2190                 quoted, RE_SV_TAIL(must));
2191         });
2192         goto phooey;
2193     }
2194     else if ( (c = progi->regstclass) ) {
2195         if (minlen) {
2196             const OPCODE op = OP(progi->regstclass);
2197             /* don't bother with what can't match */
2198             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2199                 strend = HOPc(strend, -(minlen - 1));
2200         }
2201         DEBUG_EXECUTE_r({
2202             SV * const prop = sv_newmortal();
2203             regprop(prog, prop, c);
2204             {
2205                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2206                     s,strend-s,60);
2207                 PerlIO_printf(Perl_debug_log,
2208                     "Matching stclass %.*s against %s (%d bytes)\n",
2209                     (int)SvCUR(prop), SvPVX_const(prop),
2210                      quoted, (int)(strend - s));
2211             }
2212         });
2213         if (find_byclass(prog, c, s, strend, &reginfo))
2214             goto got_it;
2215         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2216     }
2217     else {
2218         dontbother = 0;
2219         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2220             /* Trim the end. */
2221             char *last;
2222             SV* float_real;
2223
2224             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2225                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2226             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2227
2228             if (flags & REXEC_SCREAM) {
2229                 last = screaminstr(sv, float_real, s - strbeg,
2230                                    end_shift, &scream_pos, 1); /* last one */
2231                 if (!last)
2232                     last = scream_olds; /* Only one occurrence. */
2233                 /* we may be pointing at the wrong string */
2234                 else if (RXp_MATCH_COPIED(prog))
2235                     s = strbeg + (s - SvPVX_const(sv));
2236             }
2237             else {
2238                 STRLEN len;
2239                 const char * const little = SvPV_const(float_real, len);
2240
2241                 if (SvTAIL(float_real)) {
2242                     if (memEQ(strend - len + 1, little, len - 1))
2243                         last = strend - len + 1;
2244                     else if (!multiline)
2245                         last = memEQ(strend - len, little, len)
2246                             ? strend - len : NULL;
2247                     else
2248                         goto find_last;
2249                 } else {
2250                   find_last:
2251                     if (len)
2252                         last = rninstr(s, strend, little, little + len);
2253                     else
2254                         last = strend;  /* matching "$" */
2255                 }
2256             }
2257             if (last == NULL) {
2258                 DEBUG_EXECUTE_r(
2259                     PerlIO_printf(Perl_debug_log,
2260                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2261                         PL_colors[4], PL_colors[5]));
2262                 goto phooey; /* Should not happen! */
2263             }
2264             dontbother = strend - last + prog->float_min_offset;
2265         }
2266         if (minlen && (dontbother < minlen))
2267             dontbother = minlen - 1;
2268         strend -= dontbother;              /* this one's always in bytes! */
2269         /* We don't know much -- general case. */
2270         if (utf8_target) {
2271             for (;;) {
2272                 if (regtry(&reginfo, &s))
2273                     goto got_it;
2274                 if (s >= strend)
2275                     break;
2276                 s += UTF8SKIP(s);
2277             };
2278         }
2279         else {
2280             do {
2281                 if (regtry(&reginfo, &s))
2282                     goto got_it;
2283             } while (s++ < strend);
2284         }
2285     }
2286
2287     /* Failure. */
2288     goto phooey;
2289
2290 got_it:
2291     Safefree(swap);
2292     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2293
2294     if (PL_reg_eval_set)
2295         restore_pos(aTHX_ prog);
2296     if (RXp_PAREN_NAMES(prog))
2297         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2298
2299     /* make sure $`, $&, $', and $digit will work later */
2300     if ( !(flags & REXEC_NOT_FIRST) ) {
2301         RX_MATCH_COPY_FREE(rx);
2302         if (flags & REXEC_COPY_STR) {
2303             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2304 #ifdef PERL_OLD_COPY_ON_WRITE
2305             if ((SvIsCOW(sv)
2306                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2307                 if (DEBUG_C_TEST) {
2308                     PerlIO_printf(Perl_debug_log,
2309                                   "Copy on write: regexp capture, type %d\n",
2310                                   (int) SvTYPE(sv));
2311                 }
2312                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2313                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2314                 assert (SvPOKp(prog->saved_copy));
2315             } else
2316 #endif
2317             {
2318                 RX_MATCH_COPIED_on(rx);
2319                 s = savepvn(strbeg, i);
2320                 prog->subbeg = s;
2321             }
2322             prog->sublen = i;
2323         }
2324         else {
2325             prog->subbeg = strbeg;
2326             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2327         }
2328     }
2329
2330     return 1;
2331
2332 phooey:
2333     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2334                           PL_colors[4], PL_colors[5]));
2335     if (PL_reg_eval_set)
2336         restore_pos(aTHX_ prog);
2337     if (swap) {
2338         /* we failed :-( roll it back */
2339         Safefree(prog->offs);
2340         prog->offs = swap;
2341     }
2342
2343     return 0;
2344 }
2345
2346
2347 /*
2348  - regtry - try match at specific point
2349  */
2350 STATIC I32                      /* 0 failure, 1 success */
2351 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2352 {
2353     dVAR;
2354     CHECKPOINT lastcp;
2355     REGEXP *const rx = reginfo->prog;
2356     regexp *const prog = (struct regexp *)SvANY(rx);
2357     RXi_GET_DECL(prog,progi);
2358     GET_RE_DEBUG_FLAGS_DECL;
2359
2360     PERL_ARGS_ASSERT_REGTRY;
2361
2362     reginfo->cutpoint=NULL;
2363
2364     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2365         MAGIC *mg;
2366
2367         PL_reg_eval_set = RS_init;
2368         DEBUG_EXECUTE_r(DEBUG_s(
2369             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2370                           (IV)(PL_stack_sp - PL_stack_base));
2371             ));
2372         SAVESTACK_CXPOS();
2373         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2374         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2375         SAVETMPS;
2376         /* Apparently this is not needed, judging by wantarray. */
2377         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2378            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2379
2380         if (reginfo->sv) {
2381             /* Make $_ available to executed code. */
2382             if (reginfo->sv != DEFSV) {
2383                 SAVE_DEFSV;
2384                 DEFSV_set(reginfo->sv);
2385             }
2386
2387             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2388                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2389                 /* prepare for quick setting of pos */
2390 #ifdef PERL_OLD_COPY_ON_WRITE
2391                 if (SvIsCOW(reginfo->sv))
2392                     sv_force_normal_flags(reginfo->sv, 0);
2393 #endif
2394                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2395                                  &PL_vtbl_mglob, NULL, 0);
2396                 mg->mg_len = -1;
2397             }
2398             PL_reg_magic    = mg;
2399             PL_reg_oldpos   = mg->mg_len;
2400             SAVEDESTRUCTOR_X(restore_pos, prog);
2401         }
2402         if (!PL_reg_curpm) {
2403             Newxz(PL_reg_curpm, 1, PMOP);
2404 #ifdef USE_ITHREADS
2405             {
2406                 SV* const repointer = &PL_sv_undef;
2407                 /* this regexp is also owned by the new PL_reg_curpm, which
2408                    will try to free it.  */
2409                 av_push(PL_regex_padav, repointer);
2410                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2411                 PL_regex_pad = AvARRAY(PL_regex_padav);
2412             }
2413 #endif
2414         }
2415 #ifdef USE_ITHREADS
2416         /* It seems that non-ithreads works both with and without this code.
2417            So for efficiency reasons it seems best not to have the code
2418            compiled when it is not needed.  */
2419         /* This is safe against NULLs: */
2420         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2421         /* PM_reg_curpm owns a reference to this regexp.  */
2422         ReREFCNT_inc(rx);
2423 #endif
2424         PM_SETRE(PL_reg_curpm, rx);
2425         PL_reg_oldcurpm = PL_curpm;
2426         PL_curpm = PL_reg_curpm;
2427         if (RXp_MATCH_COPIED(prog)) {
2428             /*  Here is a serious problem: we cannot rewrite subbeg,
2429                 since it may be needed if this match fails.  Thus
2430                 $` inside (?{}) could fail... */
2431             PL_reg_oldsaved = prog->subbeg;
2432             PL_reg_oldsavedlen = prog->sublen;
2433 #ifdef PERL_OLD_COPY_ON_WRITE
2434             PL_nrs = prog->saved_copy;
2435 #endif
2436             RXp_MATCH_COPIED_off(prog);
2437         }
2438         else
2439             PL_reg_oldsaved = NULL;
2440         prog->subbeg = PL_bostr;
2441         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2442     }
2443     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2444     prog->offs[0].start = *startpos - PL_bostr;
2445     PL_reginput = *startpos;
2446     PL_reglastparen = &prog->lastparen;
2447     PL_reglastcloseparen = &prog->lastcloseparen;
2448     prog->lastparen = 0;
2449     prog->lastcloseparen = 0;
2450     PL_regsize = 0;
2451     PL_regoffs = prog->offs;
2452     if (PL_reg_start_tmpl <= prog->nparens) {
2453         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2454         if(PL_reg_start_tmp)
2455             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2456         else
2457             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2458     }
2459
2460     /* XXXX What this code is doing here?!!!  There should be no need
2461        to do this again and again, PL_reglastparen should take care of
2462        this!  --ilya*/
2463
2464     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2465      * Actually, the code in regcppop() (which Ilya may be meaning by
2466      * PL_reglastparen), is not needed at all by the test suite
2467      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2468      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2469      * Meanwhile, this code *is* needed for the
2470      * above-mentioned test suite tests to succeed.  The common theme
2471      * on those tests seems to be returning null fields from matches.
2472      * --jhi updated by dapm */
2473 #if 1
2474     if (prog->nparens) {
2475         regexp_paren_pair *pp = PL_regoffs;
2476         register I32 i;
2477         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2478             ++pp;
2479             pp->start = -1;
2480             pp->end = -1;
2481         }
2482     }
2483 #endif
2484     REGCP_SET(lastcp);
2485     if (regmatch(reginfo, progi->program + 1)) {
2486         PL_regoffs[0].end = PL_reginput - PL_bostr;
2487         return 1;
2488     }
2489     if (reginfo->cutpoint)
2490         *startpos= reginfo->cutpoint;
2491     REGCP_UNWIND(lastcp);
2492     return 0;
2493 }
2494
2495
2496 #define sayYES goto yes
2497 #define sayNO goto no
2498 #define sayNO_SILENT goto no_silent
2499
2500 /* we dont use STMT_START/END here because it leads to
2501    "unreachable code" warnings, which are bogus, but distracting. */
2502 #define CACHEsayNO \
2503     if (ST.cache_mask) \
2504        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2505     sayNO
2506
2507 /* this is used to determine how far from the left messages like
2508    'failed...' are printed. It should be set such that messages
2509    are inline with the regop output that created them.
2510 */
2511 #define REPORT_CODE_OFF 32
2512
2513
2514 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2515 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2516
2517 #define SLAB_FIRST(s) (&(s)->states[0])
2518 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2519
2520 /* grab a new slab and return the first slot in it */
2521
2522 STATIC regmatch_state *
2523 S_push_slab(pTHX)
2524 {
2525 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2526     dMY_CXT;
2527 #endif
2528     regmatch_slab *s = PL_regmatch_slab->next;
2529     if (!s) {
2530         Newx(s, 1, regmatch_slab);
2531         s->prev = PL_regmatch_slab;
2532         s->next = NULL;
2533         PL_regmatch_slab->next = s;
2534     }
2535     PL_regmatch_slab = s;
2536     return SLAB_FIRST(s);
2537 }
2538
2539
2540 /* push a new state then goto it */
2541
2542 #define PUSH_STATE_GOTO(state, node) \
2543     scan = node; \
2544     st->resume_state = state; \
2545     goto push_state;
2546
2547 /* push a new state with success backtracking, then goto it */
2548
2549 #define PUSH_YES_STATE_GOTO(state, node) \
2550     scan = node; \
2551     st->resume_state = state; \
2552     goto push_yes_state;
2553
2554
2555
2556 /*
2557
2558 regmatch() - main matching routine
2559
2560 This is basically one big switch statement in a loop. We execute an op,
2561 set 'next' to point the next op, and continue. If we come to a point which
2562 we may need to backtrack to on failure such as (A|B|C), we push a
2563 backtrack state onto the backtrack stack. On failure, we pop the top
2564 state, and re-enter the loop at the state indicated. If there are no more
2565 states to pop, we return failure.
2566
2567 Sometimes we also need to backtrack on success; for example /A+/, where
2568 after successfully matching one A, we need to go back and try to
2569 match another one; similarly for lookahead assertions: if the assertion
2570 completes successfully, we backtrack to the state just before the assertion
2571 and then carry on.  In these cases, the pushed state is marked as
2572 'backtrack on success too'. This marking is in fact done by a chain of
2573 pointers, each pointing to the previous 'yes' state. On success, we pop to
2574 the nearest yes state, discarding any intermediate failure-only states.
2575 Sometimes a yes state is pushed just to force some cleanup code to be
2576 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2577 it to free the inner regex.
2578
2579 Note that failure backtracking rewinds the cursor position, while
2580 success backtracking leaves it alone.
2581
2582 A pattern is complete when the END op is executed, while a subpattern
2583 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2584 ops trigger the "pop to last yes state if any, otherwise return true"
2585 behaviour.
2586
2587 A common convention in this function is to use A and B to refer to the two
2588 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2589 the subpattern to be matched possibly multiple times, while B is the entire
2590 rest of the pattern. Variable and state names reflect this convention.
2591
2592 The states in the main switch are the union of ops and failure/success of
2593 substates associated with with that op.  For example, IFMATCH is the op
2594 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2595 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2596 successfully matched A and IFMATCH_A_fail is a state saying that we have
2597 just failed to match A. Resume states always come in pairs. The backtrack
2598 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2599 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2600 on success or failure.
2601
2602 The struct that holds a backtracking state is actually a big union, with
2603 one variant for each major type of op. The variable st points to the
2604 top-most backtrack struct. To make the code clearer, within each
2605 block of code we #define ST to alias the relevant union.
2606
2607 Here's a concrete example of a (vastly oversimplified) IFMATCH
2608 implementation:
2609
2610     switch (state) {
2611     ....
2612
2613 #define ST st->u.ifmatch
2614
2615     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2616         ST.foo = ...; // some state we wish to save
2617         ...
2618         // push a yes backtrack state with a resume value of
2619         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2620         // first node of A:
2621         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2622         // NOTREACHED
2623
2624     case IFMATCH_A: // we have successfully executed A; now continue with B
2625         next = B;
2626         bar = ST.foo; // do something with the preserved value
2627         break;
2628
2629     case IFMATCH_A_fail: // A failed, so the assertion failed
2630         ...;   // do some housekeeping, then ...
2631         sayNO; // propagate the failure
2632
2633 #undef ST
2634
2635     ...
2636     }
2637
2638 For any old-timers reading this who are familiar with the old recursive
2639 approach, the code above is equivalent to:
2640
2641     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2642     {
2643         int foo = ...
2644         ...
2645         if (regmatch(A)) {
2646             next = B;
2647             bar = foo;
2648             break;
2649         }
2650         ...;   // do some housekeeping, then ...
2651         sayNO; // propagate the failure
2652     }
2653
2654 The topmost backtrack state, pointed to by st, is usually free. If you
2655 want to claim it, populate any ST.foo fields in it with values you wish to
2656 save, then do one of
2657
2658         PUSH_STATE_GOTO(resume_state, node);
2659         PUSH_YES_STATE_GOTO(resume_state, node);
2660
2661 which sets that backtrack state's resume value to 'resume_state', pushes a
2662 new free entry to the top of the backtrack stack, then goes to 'node'.
2663 On backtracking, the free slot is popped, and the saved state becomes the
2664 new free state. An ST.foo field in this new top state can be temporarily
2665 accessed to retrieve values, but once the main loop is re-entered, it
2666 becomes available for reuse.
2667
2668 Note that the depth of the backtrack stack constantly increases during the
2669 left-to-right execution of the pattern, rather than going up and down with
2670 the pattern nesting. For example the stack is at its maximum at Z at the
2671 end of the pattern, rather than at X in the following:
2672
2673     /(((X)+)+)+....(Y)+....Z/
2674
2675 The only exceptions to this are lookahead/behind assertions and the cut,
2676 (?>A), which pop all the backtrack states associated with A before
2677 continuing.
2678
2679 Bascktrack state structs are allocated in slabs of about 4K in size.
2680 PL_regmatch_state and st always point to the currently active state,
2681 and PL_regmatch_slab points to the slab currently containing
2682 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2683 allocated, and is never freed until interpreter destruction. When the slab
2684 is full, a new one is allocated and chained to the end. At exit from
2685 regmatch(), slabs allocated since entry are freed.
2686
2687 */
2688
2689
2690 #define DEBUG_STATE_pp(pp)                                  \
2691     DEBUG_STATE_r({                                         \
2692         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2693         PerlIO_printf(Perl_debug_log,                       \
2694             "    %*s"pp" %s%s%s%s%s\n",                     \
2695             depth*2, "",                                    \
2696             PL_reg_name[st->resume_state],                     \
2697             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2698             ((st==yes_state) ? "Y" : ""),                   \
2699             ((st==mark_state) ? "M" : ""),                  \
2700             ((st==yes_state||st==mark_state) ? "]" : "")    \
2701         );                                                  \
2702     });
2703
2704
2705 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2706
2707 #ifdef DEBUGGING
2708
2709 STATIC void
2710 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2711     const char *start, const char *end, const char *blurb)
2712 {
2713     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2714
2715     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2716
2717     if (!PL_colorset)
2718             reginitcolors();
2719     {
2720         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2721             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2722
2723         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2724             start, end - start, 60);
2725
2726         PerlIO_printf(Perl_debug_log,
2727             "%s%s REx%s %s against %s\n",
2728                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2729
2730         if (utf8_target||utf8_pat)
2731             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2732                 utf8_pat ? "pattern" : "",
2733                 utf8_pat && utf8_target ? " and " : "",
2734                 utf8_target ? "string" : ""
2735             );
2736     }
2737 }
2738
2739 STATIC void
2740 S_dump_exec_pos(pTHX_ const char *locinput,
2741                       const regnode *scan,
2742                       const char *loc_regeol,
2743                       const char *loc_bostr,
2744                       const char *loc_reg_starttry,
2745                       const bool utf8_target)
2746 {
2747     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2748     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2749     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2750     /* The part of the string before starttry has one color
2751        (pref0_len chars), between starttry and current
2752        position another one (pref_len - pref0_len chars),
2753        after the current position the third one.
2754        We assume that pref0_len <= pref_len, otherwise we
2755        decrease pref0_len.  */
2756     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2757         ? (5 + taill) - l : locinput - loc_bostr;
2758     int pref0_len;
2759
2760     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2761
2762     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2763         pref_len++;
2764     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2765     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2766         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2767               ? (5 + taill) - pref_len : loc_regeol - locinput);
2768     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2769         l--;
2770     if (pref0_len < 0)
2771         pref0_len = 0;
2772     if (pref0_len > pref_len)
2773         pref0_len = pref_len;
2774     {
2775         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2776
2777         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2778             (locinput - pref_len),pref0_len, 60, 4, 5);
2779
2780         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2781                     (locinput - pref_len + pref0_len),
2782                     pref_len - pref0_len, 60, 2, 3);
2783
2784         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2785                     locinput, loc_regeol - locinput, 10, 0, 1);
2786
2787         const STRLEN tlen=len0+len1+len2;
2788         PerlIO_printf(Perl_debug_log,
2789                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2790                     (IV)(locinput - loc_bostr),
2791                     len0, s0,
2792                     len1, s1,
2793                     (docolor ? "" : "> <"),
2794                     len2, s2,
2795                     (int)(tlen > 19 ? 0 :  19 - tlen),
2796                     "");
2797     }
2798 }
2799
2800 #endif
2801
2802 /* reg_check_named_buff_matched()
2803  * Checks to see if a named buffer has matched. The data array of
2804  * buffer numbers corresponding to the buffer is expected to reside
2805  * in the regexp->data->data array in the slot stored in the ARG() of
2806  * node involved. Note that this routine doesn't actually care about the
2807  * name, that information is not preserved from compilation to execution.
2808  * Returns the index of the leftmost defined buffer with the given name
2809  * or 0 if non of the buffers matched.
2810  */
2811 STATIC I32
2812 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2813 {
2814     I32 n;
2815     RXi_GET_DECL(rex,rexi);
2816     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2817     I32 *nums=(I32*)SvPVX(sv_dat);
2818
2819     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2820
2821     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2822         if ((I32)*PL_reglastparen >= nums[n] &&
2823             PL_regoffs[nums[n]].end != -1)
2824         {
2825             return nums[n];
2826         }
2827     }
2828     return 0;
2829 }
2830
2831
2832 /* free all slabs above current one  - called during LEAVE_SCOPE */
2833
2834 STATIC void
2835 S_clear_backtrack_stack(pTHX_ void *p)
2836 {
2837     regmatch_slab *s = PL_regmatch_slab->next;
2838     PERL_UNUSED_ARG(p);
2839
2840     if (!s)
2841         return;
2842     PL_regmatch_slab->next = NULL;
2843     while (s) {
2844         regmatch_slab * const osl = s;
2845         s = s->next;
2846         Safefree(osl);
2847     }
2848 }
2849
2850
2851 #define SETREX(Re1,Re2) \
2852     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2853     Re1 = (Re2)
2854
2855 STATIC I32                      /* 0 failure, 1 success */
2856 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2857 {
2858 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2859     dMY_CXT;
2860 #endif
2861     dVAR;
2862     register const bool utf8_target = PL_reg_match_utf8;
2863     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2864     REGEXP *rex_sv = reginfo->prog;
2865     regexp *rex = (struct regexp *)SvANY(rex_sv);
2866     RXi_GET_DECL(rex,rexi);
2867     I32 oldsave;
2868     /* the current state. This is a cached copy of PL_regmatch_state */
2869     register regmatch_state *st;
2870     /* cache heavy used fields of st in registers */
2871     register regnode *scan;
2872     register regnode *next;
2873     register U32 n = 0; /* general value; init to avoid compiler warning */
2874     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2875     register char *locinput = PL_reginput;
2876     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2877
2878     bool result = 0;        /* return value of S_regmatch */
2879     int depth = 0;          /* depth of backtrack stack */
2880     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2881     const U32 max_nochange_depth =
2882         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2883         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2884     regmatch_state *yes_state = NULL; /* state to pop to on success of
2885                                                             subpattern */
2886     /* mark_state piggy backs on the yes_state logic so that when we unwind
2887        the stack on success we can update the mark_state as we go */
2888     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2889     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2890     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2891     U32 state_num;
2892     bool no_final = 0;      /* prevent failure from backtracking? */
2893     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
2894     char *startpoint = PL_reginput;
2895     SV *popmark = NULL;     /* are we looking for a mark? */
2896     SV *sv_commit = NULL;   /* last mark name seen in failure */
2897     SV *sv_yes_mark = NULL; /* last mark name we have seen
2898                                during a successfull match */
2899     U32 lastopen = 0;       /* last open we saw */
2900     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2901     SV* const oreplsv = GvSV(PL_replgv);
2902     /* these three flags are set by various ops to signal information to
2903      * the very next op. They have a useful lifetime of exactly one loop
2904      * iteration, and are not preserved or restored by state pushes/pops
2905      */
2906     bool sw = 0;            /* the condition value in (?(cond)a|b) */
2907     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
2908     int logical = 0;        /* the following EVAL is:
2909                                 0: (?{...})
2910                                 1: (?(?{...})X|Y)
2911                                 2: (??{...})
2912                                or the following IFMATCH/UNLESSM is:
2913                                 false: plain (?=foo)
2914                                 true:  used as a condition: (?(?=foo))
2915                             */
2916 #ifdef DEBUGGING
2917     GET_RE_DEBUG_FLAGS_DECL;
2918 #endif
2919
2920     PERL_ARGS_ASSERT_REGMATCH;
2921
2922     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2923             PerlIO_printf(Perl_debug_log,"regmatch start\n");
2924     }));
2925     /* on first ever call to regmatch, allocate first slab */
2926     if (!PL_regmatch_slab) {
2927         Newx(PL_regmatch_slab, 1, regmatch_slab);
2928         PL_regmatch_slab->prev = NULL;
2929         PL_regmatch_slab->next = NULL;
2930         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2931     }
2932
2933     oldsave = PL_savestack_ix;
2934     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2935     SAVEVPTR(PL_regmatch_slab);
2936     SAVEVPTR(PL_regmatch_state);
2937
2938     /* grab next free state slot */
2939     st = ++PL_regmatch_state;
2940     if (st >  SLAB_LAST(PL_regmatch_slab))
2941         st = PL_regmatch_state = S_push_slab(aTHX);
2942
2943     /* Note that nextchr is a byte even in UTF */
2944     nextchr = UCHARAT(locinput);
2945     scan = prog;
2946     while (scan != NULL) {
2947
2948         DEBUG_EXECUTE_r( {
2949             SV * const prop = sv_newmortal();
2950             regnode *rnext=regnext(scan);
2951             DUMP_EXEC_POS( locinput, scan, utf8_target );
2952             regprop(rex, prop, scan);
2953
2954             PerlIO_printf(Perl_debug_log,
2955                     "%3"IVdf":%*s%s(%"IVdf")\n",
2956                     (IV)(scan - rexi->program), depth*2, "",
2957                     SvPVX_const(prop),
2958                     (PL_regkind[OP(scan)] == END || !rnext) ?
2959                         0 : (IV)(rnext - rexi->program));
2960         });
2961
2962         next = scan + NEXT_OFF(scan);
2963         if (next == scan)
2964             next = NULL;
2965         state_num = OP(scan);
2966
2967         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
2968       reenter_switch:
2969
2970         assert(PL_reglastparen == &rex->lastparen);
2971         assert(PL_reglastcloseparen == &rex->lastcloseparen);
2972         assert(PL_regoffs == rex->offs);
2973
2974         switch (state_num) {
2975         case BOL:
2976             if (locinput == PL_bostr)
2977             {
2978                 /* reginfo->till = reginfo->bol; */
2979                 break;
2980             }
2981             sayNO;
2982         case MBOL:
2983             if (locinput == PL_bostr ||
2984                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2985             {
2986                 break;
2987             }
2988             sayNO;
2989         case SBOL:
2990             if (locinput == PL_bostr)
2991                 break;
2992             sayNO;
2993         case GPOS:
2994             if (locinput == reginfo->ganch)
2995                 break;
2996             sayNO;
2997
2998         case KEEPS:
2999             /* update the startpoint */
3000             st->u.keeper.val = PL_regoffs[0].start;
3001             PL_reginput = locinput;
3002             PL_regoffs[0].start = locinput - PL_bostr;
3003             PUSH_STATE_GOTO(KEEPS_next, next);
3004             /*NOT-REACHED*/
3005         case KEEPS_next_fail:
3006             /* rollback the start point change */
3007             PL_regoffs[0].start = st->u.keeper.val;
3008             sayNO_SILENT;
3009             /*NOT-REACHED*/
3010         case EOL:
3011                 goto seol;
3012         case MEOL:
3013             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3014                 sayNO;
3015             break;
3016         case SEOL:
3017           seol:
3018             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3019                 sayNO;
3020             if (PL_regeol - locinput > 1)
3021                 sayNO;
3022             break;
3023         case EOS:
3024             if (PL_regeol != locinput)
3025                 sayNO;
3026             break;
3027         case SANY:
3028             if (!nextchr && locinput >= PL_regeol)
3029                 sayNO;
3030             if (utf8_target) {
3031                 locinput += PL_utf8skip[nextchr];
3032                 if (locinput > PL_regeol)
3033                     sayNO;
3034                 nextchr = UCHARAT(locinput);
3035             }
3036             else
3037                 nextchr = UCHARAT(++locinput);
3038             break;
3039         case CANY:
3040             if (!nextchr && locinput >= PL_regeol)
3041                 sayNO;
3042             nextchr = UCHARAT(++locinput);
3043             break;
3044         case REG_ANY:
3045             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3046                 sayNO;
3047             if (utf8_target) {
3048                 locinput += PL_utf8skip[nextchr];
3049                 if (locinput > PL_regeol)
3050                     sayNO;
3051                 nextchr = UCHARAT(locinput);
3052             }
3053             else
3054                 nextchr = UCHARAT(++locinput);
3055             break;
3056
3057 #undef  ST
3058 #define ST st->u.trie
3059         case TRIEC:
3060             /* In this case the charclass data is available inline so
3061                we can fail fast without a lot of extra overhead.
3062              */
3063             if (scan->flags == EXACT || !utf8_target) {
3064                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3065                     DEBUG_EXECUTE_r(
3066                         PerlIO_printf(Perl_debug_log,
3067                                   "%*s  %sfailed to match trie start class...%s\n",
3068                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3069                     );
3070                     sayNO_SILENT;
3071                     /* NOTREACHED */
3072                 }
3073             }
3074             /* FALL THROUGH */
3075         case TRIE:
3076             /* the basic plan of execution of the trie is:
3077              * At the beginning, run though all the states, and
3078              * find the longest-matching word. Also remember the position
3079              * of the shortest matching word. For example, this pattern:
3080              *    1  2 3 4    5
3081              *    ab|a|x|abcd|abc
3082              * when matched against the string "abcde", will generate
3083              * accept states for all words except 3, with the longest
3084              * matching word being 4, and the shortest being 1 (with
3085              * the position being after char 1 of the string).
3086              *
3087              * Then for each matching word, in word order (i.e. 1,2,4,5),
3088              * we run the remainder of the pattern; on each try setting
3089              * the current position to the character following the word,
3090              * returning to try the next word on failure.
3091              *
3092              * We avoid having to build a list of words at runtime by
3093              * using a compile-time structure, wordinfo[].prev, which
3094              * gives, for each word, the previous accepting word (if any).
3095              * In the case above it would contain the mappings 1->2, 2->0,
3096              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3097              * the longest word (4 above), a list of all words, by
3098              * following the list of prev pointers; this gives us the
3099              * unordered list 4,5,1,2. Then given the current word we have
3100              * just tried, we can go through the list and find the
3101              * next-biggest word to try (so if we just failed on word 2,
3102              * the next in the list is 4).
3103              *
3104              * Since at runtime we don't record the matching position in
3105              * the string for each word, we have to work that out for
3106              * each word we're about to process. The wordinfo table holds
3107              * the character length of each word; given that we recorded
3108              * at the start: the position of the shortest word and its
3109              * length in chars, we just need to move the pointer the
3110              * difference between the two char lengths. Depending on
3111              * Unicode status and folding, that's cheap or expensive.
3112              *
3113              * This algorithm is optimised for the case where are only a
3114              * small number of accept states, i.e. 0,1, or maybe 2.
3115              * With lots of accepts states, and having to try all of them,
3116              * it becomes quadratic on number of accept states to find all
3117              * the next words.
3118              */
3119
3120             {
3121                 /* what type of TRIE am I? (utf8 makes this contextual) */
3122                 DECL_TRIE_TYPE(scan);
3123
3124                 /* what trie are we using right now */
3125                 reg_trie_data * const trie
3126                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3127                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3128                 U32 state = trie->startstate;
3129
3130                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3131                     !TRIE_BITMAP_TEST(trie,*locinput)
3132                 ) {
3133                     if (trie->states[ state ].wordnum) {
3134                          DEBUG_EXECUTE_r(
3135                             PerlIO_printf(Perl_debug_log,
3136                                           "%*s  %smatched empty string...%s\n",
3137                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3138                         );
3139                         break;
3140                     } else {
3141                         DEBUG_EXECUTE_r(
3142                             PerlIO_printf(Perl_debug_log,
3143                                           "%*s  %sfailed to match trie start class...%s\n",
3144                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3145                         );
3146                         sayNO_SILENT;
3147                    }
3148                 }
3149
3150             {
3151                 U8 *uc = ( U8* )locinput;
3152
3153                 STRLEN len = 0;
3154                 STRLEN foldlen = 0;
3155                 U8 *uscan = (U8*)NULL;
3156                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3157                 U32 charcount = 0; /* how many input chars we have matched */
3158                 U32 accepted = 0; /* have we seen any accepting states? */
3159
3160                 ST.B = next;
3161                 ST.jump = trie->jump;
3162                 ST.me = scan;
3163                 ST.firstpos = NULL;
3164                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3165                 ST.nextword = 0;
3166
3167                 /* fully traverse the TRIE; note the position of the
3168                    shortest accept state and the wordnum of the longest
3169                    accept state */
3170
3171                 while ( state && uc <= (U8*)PL_regeol ) {
3172                     U32 base = trie->states[ state ].trans.base;
3173                     UV uvc = 0;
3174                     U16 charid = 0;
3175                     U16 wordnum;
3176                     wordnum = trie->states[ state ].wordnum;
3177
3178                     if (wordnum) { /* it's an accept state */
3179                         if (!accepted) {
3180                             accepted = 1;
3181                             /* record first match position */
3182                             if (ST.longfold) {
3183                                 ST.firstpos = (U8*)locinput;
3184                                 ST.firstchars = 0;
3185                             }
3186                             else {
3187                                 ST.firstpos = uc;
3188                                 ST.firstchars = charcount;
3189                             }
3190                         }
3191                         if (!ST.nextword || wordnum < ST.nextword)
3192                             ST.nextword = wordnum;
3193                         ST.topword = wordnum;
3194                     }
3195
3196                     DEBUG_TRIE_EXECUTE_r({
3197                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3198                                 PerlIO_printf( Perl_debug_log,
3199                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3200                                     2+depth * 2, "", PL_colors[4],
3201                                     (UV)state, (accepted ? 'Y' : 'N'));
3202                     });
3203
3204                     /* read a char and goto next state */
3205                     if ( base ) {
3206                         I32 offset;
3207                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3208                                              uscan, len, uvc, charid, foldlen,
3209                                              foldbuf, uniflags);
3210                         charcount++;
3211                         if (foldlen>0)
3212                             ST.longfold = TRUE;
3213                         if (charid &&
3214                              ( ((offset =
3215                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3216
3217                              && ((U32)offset < trie->lasttrans)
3218                              && trie->trans[offset].check == state)
3219                         {
3220                             state = trie->trans[offset].next;
3221                         }
3222                         else {
3223                             state = 0;
3224                         }
3225                         uc += len;
3226
3227                     }
3228                     else {
3229                         state = 0;
3230                     }
3231                     DEBUG_TRIE_EXECUTE_r(
3232                         PerlIO_printf( Perl_debug_log,
3233                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3234                             charid, uvc, (UV)state, PL_colors[5] );
3235                     );
3236                 }
3237                 if (!accepted)
3238                    sayNO;
3239
3240                 /* calculate total number of accept states */
3241                 {
3242                     U16 w = ST.topword;
3243                     accepted = 0;
3244                     while (w) {
3245                         w = trie->wordinfo[w].prev;
3246                         accepted++;
3247                     }
3248                     ST.accepted = accepted;
3249                 }
3250
3251                 DEBUG_EXECUTE_r(
3252                     PerlIO_printf( Perl_debug_log,
3253                         "%*s  %sgot %"IVdf" possible matches%s\n",
3254                         REPORT_CODE_OFF + depth * 2, "",
3255                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3256                 );
3257                 goto trie_first_try; /* jump into the fail handler */
3258             }}
3259             /* NOTREACHED */
3260
3261         case TRIE_next_fail: /* we failed - try next alternative */
3262             if ( ST.jump) {
3263                 REGCP_UNWIND(ST.cp);
3264                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3265                     PL_regoffs[n].end = -1;
3266                 *PL_reglastparen = n;
3267             }
3268             if (!--ST.accepted) {
3269                 DEBUG_EXECUTE_r({
3270                     PerlIO_printf( Perl_debug_log,
3271                         "%*s  %sTRIE failed...%s\n",
3272                         REPORT_CODE_OFF+depth*2, "",
3273                         PL_colors[4],
3274                         PL_colors[5] );
3275                 });
3276                 sayNO_SILENT;
3277             }
3278             {
3279                 /* Find next-highest word to process.  Note that this code
3280                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3281                 register U16 min = 0;
3282                 register U16 word;
3283                 register U16 const nextword = ST.nextword;
3284                 register reg_trie_wordinfo * const wordinfo
3285                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3286                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3287                     if (word > nextword && (!min || word < min))
3288                         min = word;
3289                 }
3290                 ST.nextword = min;
3291             }
3292
3293           trie_first_try:
3294             if (do_cutgroup) {
3295                 do_cutgroup = 0;
3296                 no_final = 0;
3297             }
3298
3299             if ( ST.jump) {
3300                 ST.lastparen = *PL_reglastparen;
3301                 REGCP_SET(ST.cp);
3302             }
3303
3304             /* find start char of end of current word */
3305             {
3306                 U32 chars; /* how many chars to skip */
3307                 U8 *uc = ST.firstpos;
3308                 reg_trie_data * const trie
3309                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3310
3311                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3312                             >=  ST.firstchars);
3313                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3314                             - ST.firstchars;
3315
3316                 if (ST.longfold) {
3317                     /* the hard option - fold each char in turn and find
3318                      * its folded length (which may be different */
3319                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3320                     STRLEN foldlen;
3321                     STRLEN len;
3322                     UV uvc;
3323                     U8 *uscan;
3324
3325                     while (chars) {
3326                         if (utf8_target) {
3327                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3328                                                     uniflags);
3329                             uc += len;
3330                         }
3331                         else {
3332                             uvc = *uc;
3333                             uc++;
3334                         }
3335                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3336                         uscan = foldbuf;
3337                         while (foldlen) {
3338                             if (!--chars)
3339                                 break;
3340                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3341                                             uniflags);
3342                             uscan += len;
3343                             foldlen -= len;
3344                         }
3345                     }
3346                 }
3347                 else {
3348                     if (utf8_target)
3349                         while (chars--)
3350                             uc += UTF8SKIP(uc);
3351                     else
3352                         uc += chars;
3353                 }
3354                 PL_reginput = (char *)uc;
3355             }
3356
3357             scan = (ST.jump && ST.jump[ST.nextword])
3358                         ? ST.me + ST.jump[ST.nextword]
3359                         : ST.B;
3360
3361             DEBUG_EXECUTE_r({
3362                 PerlIO_printf( Perl_debug_log,
3363                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3364                     REPORT_CODE_OFF+depth*2, "",
3365                     PL_colors[4],
3366                     ST.nextword,
3367                     PL_colors[5]
3368                     );
3369             });
3370
3371             if (ST.accepted > 1 || has_cutgroup) {
3372                 PUSH_STATE_GOTO(TRIE_next, scan);
3373                 /* NOTREACHED */
3374             }
3375             /* only one choice left - just continue */
3376             DEBUG_EXECUTE_r({
3377                 AV *const trie_words
3378                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3379                 SV ** const tmp = av_fetch( trie_words,
3380                     ST.nextword-1, 0 );
3381                 SV *sv= tmp ? sv_newmortal() : NULL;
3382
3383                 PerlIO_printf( Perl_debug_log,
3384                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3385                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3386                     ST.nextword,
3387                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3388                             PL_colors[0], PL_colors[1],
3389                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3390                         )
3391                     : "not compiled under -Dr",
3392                     PL_colors[5] );
3393             });
3394
3395             locinput = PL_reginput;
3396             nextchr = UCHARAT(locinput);
3397             continue; /* execute rest of RE */
3398             /* NOTREACHED */
3399 #undef  ST
3400
3401         case EXACT: {
3402             char *s = STRING(scan);
3403             ln = STR_LEN(scan);
3404             if (utf8_target != UTF_PATTERN) {
3405                 /* The target and the pattern have differing utf8ness. */
3406                 char *l = locinput;
3407                 const char * const e = s + ln;
3408
3409                 if (utf8_target) {
3410                     /* The target is utf8, the pattern is not utf8. */
3411                     while (s < e) {
3412                         STRLEN ulen;
3413                         if (l >= PL_regeol)
3414                              sayNO;
3415                         if (NATIVE_TO_UNI(*(U8*)s) !=
3416                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3417                                             uniflags))
3418                              sayNO;
3419                         l += ulen;
3420                         s ++;
3421                     }
3422                 }
3423                 else {
3424                     /* The target is not utf8, the pattern is utf8. */
3425                     while (s < e) {
3426                         STRLEN ulen;
3427                         if (l >= PL_regeol)
3428                             sayNO;
3429                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3430                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3431                                            uniflags))
3432                             sayNO;
3433                         s += ulen;
3434                         l ++;
3435                     }
3436                 }
3437                 locinput = l;
3438                 nextchr = UCHARAT(locinput);
3439                 break;
3440             }
3441             /* The target and the pattern have the same utf8ness. */
3442             /* Inline the first character, for speed. */
3443             if (UCHARAT(s) != nextchr)
3444                 sayNO;
3445             if (PL_regeol - locinput < ln)
3446                 sayNO;
3447             if (ln > 1 && memNE(s, locinput, ln))
3448                 sayNO;
3449             locinput += ln;
3450             nextchr = UCHARAT(locinput);
3451             break;
3452             }
3453         case EXACTFL:
3454             PL_reg_flags |= RF_tainted;
3455             /* FALL THROUGH */
3456         case EXACTF: {
3457             char * const s = STRING(scan);
3458             ln = STR_LEN(scan);
3459
3460             if (utf8_target || UTF_PATTERN) {
3461               /* Either target or the pattern are utf8. */
3462                 const char * const l = locinput;
3463                 char *e = PL_regeol;
3464
3465                 if (! foldEQ_utf8(s, 0,  ln, cBOOL(UTF_PATTERN),
3466                                l, &e, 0,  utf8_target)) {
3467                      /* One more case for the sharp s:
3468                       * pack("U0U*", 0xDF) =~ /ss/i,
3469                       * the 0xC3 0x9F are the UTF-8
3470                       * byte sequence for the U+00DF. */
3471
3472                      if (!(utf8_target &&
3473                            toLOWER(s[0]) == 's' &&
3474                            ln >= 2 &&
3475                            toLOWER(s[1]) == 's' &&
3476                            (U8)l[0] == 0xC3 &&
3477                            e - l >= 2 &&
3478                            (U8)l[1] == 0x9F))
3479                           sayNO;
3480                 }
3481                 locinput = e;
3482                 nextchr = UCHARAT(locinput);
3483                 break;
3484             }
3485
3486             /* Neither the target and the pattern are utf8. */
3487
3488             /* Inline the first character, for speed. */
3489             if (UCHARAT(s) != nextchr &&
3490                 UCHARAT(s) != ((OP(scan) == EXACTF)
3491                                ? PL_fold : PL_fold_locale)[nextchr])
3492                 sayNO;
3493             if (PL_regeol - locinput < ln)
3494                 sayNO;
3495             if (ln > 1 && (OP(scan) == EXACTF
3496                            ? ! foldEQ(s, locinput, ln)
3497                            : ! foldEQ_locale(s, locinput, ln)))
3498                 sayNO;
3499             locinput += ln;
3500             nextchr = UCHARAT(locinput);
3501             break;
3502             }
3503         case BOUNDL:
3504         case NBOUNDL:
3505             PL_reg_flags |= RF_tainted;
3506             /* FALL THROUGH */
3507         case BOUND:
3508         case NBOUND:
3509             /* was last char in word? */
3510             if (utf8_target) {
3511                 if (locinput == PL_bostr)
3512                     ln = '\n';
3513                 else {
3514                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3515
3516                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3517                 }
3518                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3519                     ln = isALNUM_uni(ln);
3520                     LOAD_UTF8_CHARCLASS_ALNUM();
3521                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3522                 }
3523                 else {
3524                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3525                     n = isALNUM_LC_utf8((U8*)locinput);
3526                 }
3527             }
3528             else {
3529                 ln = (locinput != PL_bostr) ?
3530                     UCHARAT(locinput - 1) : '\n';
3531                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3532                     ln = isALNUM(ln);
3533                     n = isALNUM(nextchr);
3534                 }
3535                 else {
3536                     ln = isALNUM_LC(ln);
3537                     n = isALNUM_LC(nextchr);
3538                 }
3539             }
3540             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3541                                     OP(scan) == BOUNDL))
3542                     sayNO;
3543             break;
3544         case ANYOF:
3545             if (utf8_target) {
3546                 STRLEN inclasslen = PL_regeol - locinput;
3547
3548                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3549                     goto anyof_fail;
3550                 if (locinput >= PL_regeol)
3551                     sayNO;
3552                 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3553                 nextchr = UCHARAT(locinput);
3554                 break;
3555             }
3556             else {
3557                 if (nextchr < 0)
3558                     nextchr = UCHARAT(locinput);
3559                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3560                     goto anyof_fail;
3561                 if (!nextchr && locinput >= PL_regeol)
3562                     sayNO;
3563                 nextchr = UCHARAT(++locinput);
3564                 break;
3565             }
3566         anyof_fail:
3567             /* If we might have the case of the German sharp s
3568              * in a casefolding Unicode character class. */
3569
3570             if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3571                  locinput += SHARP_S_SKIP;
3572                  nextchr = UCHARAT(locinput);
3573             }
3574             else
3575                  sayNO;
3576             break;
3577         /* Special char classes - The defines start on line 129 or so */
3578         CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3579         CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3580
3581         CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3582         CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3583
3584         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3585         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3586
3587         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3588                        a Unicode extended Grapheme Cluster */
3589             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3590               extended Grapheme Cluster is:
3591
3592                CR LF
3593                | Prepend* Begin Extend*
3594                | .
3595
3596                Begin is (Hangul-syllable | ! Control)
3597                Extend is (Grapheme_Extend | Spacing_Mark)
3598                Control is [ GCB_Control CR LF ]
3599
3600                The discussion below shows how the code for CLUMP is derived
3601                from this regex.  Note that most of these concepts are from
3602                property values of the Grapheme Cluster Boundary (GCB) property.
3603                No code point can have multiple property values for a given
3604                property.  Thus a code point in Prepend can't be in Control, but
3605                it must be in !Control.  This is why Control above includes
3606                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3607                property separately, and so can't be in GCB_Control, even though
3608                they logically are controls.  Control is not the same as gc=cc,
3609                but includes format and other characters as well.
3610
3611                The Unicode definition of Hangul-syllable is:
3612                    L+
3613                    | (L* ( ( V | LV ) V* | LVT ) T*)
3614                    | T+
3615                   )
3616                Each of these is a value for the GCB property, and hence must be
3617                disjoint, so the order they are tested is immaterial, so the
3618                above can safely be changed to
3619                    T+
3620                    | L+
3621                    | (L* ( LVT | ( V | LV ) V*) T*)
3622
3623                The last two terms can be combined like this:
3624                    L* ( L
3625                         | (( LVT | ( V | LV ) V*) T*))
3626
3627                And refactored into this:
3628                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3629
3630                That means that if we have seen any L's at all we can quit
3631                there, but if the next character is a LVT, a V or and LV we
3632                should keep going.
3633
3634                There is a subtlety with Prepend* which showed up in testing.
3635                Note that the Begin, and only the Begin is required in:
3636                 | Prepend* Begin Extend*
3637                Also, Begin contains '! Control'.  A Prepend must be a '!
3638                Control', which means it must be a Begin.  What it comes down to
3639                is that if we match Prepend* and then find no suitable Begin
3640                afterwards, that if we backtrack the last Prepend, that one will
3641                be a suitable Begin.
3642             */
3643
3644             if (locinput >= PL_regeol)
3645                 sayNO;
3646             if  (! utf8_target) {
3647
3648                 /* Match either CR LF  or '.', as all the other possibilities
3649                  * require utf8 */
3650                 locinput++;         /* Match the . or CR */
3651                 if (nextchr == '\r'
3652                     && locinput < PL_regeol
3653                     && UCHARAT(locinput) == '\n') locinput++;
3654             }
3655             else {
3656
3657                 /* Utf8: See if is ( CR LF ); already know that locinput <
3658                  * PL_regeol, so locinput+1 is in bounds */
3659                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3660                     locinput += 2;
3661                 }
3662                 else {
3663                     /* In case have to backtrack to beginning, then match '.' */
3664                     char *starting = locinput;
3665
3666                     /* In case have to backtrack the last prepend */
3667                     char *previous_prepend = 0;
3668
3669                     LOAD_UTF8_CHARCLASS_GCB();
3670
3671                     /* Match (prepend)* */
3672                     while (locinput < PL_regeol
3673                            && swash_fetch(PL_utf8_X_prepend,
3674                                           (U8*)locinput, utf8_target))
3675                     {
3676                         previous_prepend = locinput;
3677                         locinput += UTF8SKIP(locinput);
3678                     }
3679
3680                     /* As noted above, if we matched a prepend character, but
3681                      * the next thing won't match, back off the last prepend we
3682                      * matched, as it is guaranteed to match the begin */
3683                     if (previous_prepend
3684                         && (locinput >=  PL_regeol
3685                             || ! swash_fetch(PL_utf8_X_begin,
3686                                              (U8*)locinput, utf8_target)))
3687                     {
3688                         locinput = previous_prepend;
3689                     }
3690
3691                     /* Note that here we know PL_regeol > locinput, as we
3692                      * tested that upon input to this switch case, and if we
3693                      * moved locinput forward, we tested the result just above
3694                      * and it either passed, or we backed off so that it will
3695                      * now pass */
3696                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3697
3698                         /* Here did not match the required 'Begin' in the
3699                          * second term.  So just match the very first
3700                          * character, the '.' of the final term of the regex */
3701                         locinput = starting + UTF8SKIP(starting);
3702                     } else {
3703
3704                         /* Here is the beginning of a character that can have
3705                          * an extender.  It is either a hangul syllable, or a
3706                          * non-control */
3707                         if (swash_fetch(PL_utf8_X_non_hangul,
3708                                         (U8*)locinput, utf8_target))
3709                         {
3710
3711                             /* Here not a Hangul syllable, must be a
3712                              * ('!  * Control') */
3713                             locinput += UTF8SKIP(locinput);
3714                         } else {
3715
3716                             /* Here is a Hangul syllable.  It can be composed
3717                              * of several individual characters.  One
3718                              * possibility is T+ */
3719                             if (swash_fetch(PL_utf8_X_T,
3720                                             (U8*)locinput, utf8_target))
3721                             {
3722                                 while (locinput < PL_regeol
3723                                         && swash_fetch(PL_utf8_X_T,
3724                                                         (U8*)locinput, utf8_target))
3725                                 {
3726                                     locinput += UTF8SKIP(locinput);
3727                                 }
3728                             } else {
3729
3730                                 /* Here, not T+, but is a Hangul.  That means
3731                                  * it is one of the others: L, LV, LVT or V,
3732                                  * and matches:
3733                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3734
3735                                 /* Match L*           */
3736                                 while (locinput < PL_regeol
3737                                         && swash_fetch(PL_utf8_X_L,
3738                                                         (U8*)locinput, utf8_target))
3739                                 {
3740                                     locinput += UTF8SKIP(locinput);
3741                                 }
3742
3743                                 /* Here, have exhausted L*.  If the next
3744                                  * character is not an LV, LVT nor V, it means
3745                                  * we had to have at least one L, so matches L+
3746                                  * in the original equation, we have a complete
3747                                  * hangul syllable.  Are done. */
3748
3749                                 if (locinput < PL_regeol
3750                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3751                                                     (U8*)locinput, utf8_target))
3752                                 {
3753
3754                                     /* Otherwise keep going.  Must be LV, LVT
3755                                      * or V.  See if LVT */
3756                                     if (swash_fetch(PL_utf8_X_LVT,
3757                                                     (U8*)locinput, utf8_target))
3758                                     {
3759                                         locinput += UTF8SKIP(locinput);
3760                                     } else {
3761
3762                                         /* Must be  V or LV.  Take it, then
3763                                          * match V*     */
3764                                         locinput += UTF8SKIP(locinput);
3765                                         while (locinput < PL_regeol
3766                                                 && swash_fetch(PL_utf8_X_V,
3767                                                          (U8*)locinput, utf8_target))
3768                                         {
3769                                             locinput += UTF8SKIP(locinput);
3770                                         }
3771                                     }
3772
3773                                     /* And any of LV, LVT, or V can be followed
3774                                      * by T*            */
3775                                     while (locinput < PL_regeol
3776                                            && swash_fetch(PL_utf8_X_T,
3777                                                            (U8*)locinput,
3778                                                            utf8_target))
3779                                     {
3780                                         locinput += UTF8SKIP(locinput);
3781                                     }
3782                                 }
3783                             }
3784                         }
3785
3786                         /* Match any extender */
3787                         while (locinput < PL_regeol
3788                                 && swash_fetch(PL_utf8_X_extend,
3789                                                 (U8*)locinput, utf8_target))
3790                         {
3791                             locinput += UTF8SKIP(locinput);
3792                         }
3793                     }
3794                 }
3795                 if (locinput > PL_regeol) sayNO;
3796             }
3797             nextchr = UCHARAT(locinput);
3798             break;
3799
3800         case NREFFL:
3801         {
3802             char *s;
3803             char type;
3804             PL_reg_flags |= RF_tainted;
3805             /* FALL THROUGH */
3806         case NREF:
3807         case NREFF:
3808             type = OP(scan);
3809             n = reg_check_named_buff_matched(rex,scan);
3810
3811             if ( n ) {
3812                 type = REF + ( type - NREF );
3813                 goto do_ref;
3814             } else {
3815                 sayNO;
3816             }
3817             /* unreached */
3818         case REFFL:
3819             PL_reg_flags |= RF_tainted;
3820             /* FALL THROUGH */
3821         case REF:
3822         case REFF:
3823             n = ARG(scan);  /* which paren pair */
3824             type = OP(scan);
3825           do_ref:
3826             ln = PL_regoffs[n].start;
3827             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
3828             if (*PL_reglastparen < n || ln == -1)
3829                 sayNO;                  /* Do not match unless seen CLOSEn. */
3830             if (ln == PL_regoffs[n].end)
3831                 break;
3832
3833             s = PL_bostr + ln;
3834             if (utf8_target && type != REF) {   /* REF can do byte comparison */
3835                 char *l = locinput;
3836                 const char *e = PL_bostr + PL_regoffs[n].end;
3837                 /*
3838                  * Note that we can't do the "other character" lookup trick as
3839                  * in the 8-bit case (no pun intended) because in Unicode we
3840                  * have to map both upper and title case to lower case.
3841                  */
3842                 if (type == REFF) {
3843                     while (s < e) {
3844                         STRLEN ulen1, ulen2;
3845                         U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3846                         U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3847
3848                         if (l >= PL_regeol)
3849                             sayNO;
3850                         toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3851                         toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3852                         if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3853                             sayNO;
3854                         s += ulen1;
3855                         l += ulen2;
3856                     }
3857                 }
3858                 locinput = l;
3859                 nextchr = UCHARAT(locinput);
3860                 break;
3861             }
3862
3863             /* Inline the first character, for speed. */
3864             if (UCHARAT(s) != nextchr &&
3865                 (type == REF ||
3866                  (UCHARAT(s) != (type == REFF
3867                                   ? PL_fold : PL_fold_locale)[nextchr])))
3868                 sayNO;
3869             ln = PL_regoffs[n].end - ln;
3870             if (locinput + ln > PL_regeol)
3871                 sayNO;
3872             if (ln > 1 && (type == REF
3873                            ? memNE(s, locinput, ln)
3874                            : (type == REFF
3875                               ? ! foldEQ(s, locinput, ln)
3876                               : ! foldEQ_locale(s, locinput, ln))))
3877                 sayNO;
3878             locinput += ln;
3879             nextchr = UCHARAT(locinput);
3880             break;
3881         }
3882         case NOTHING:
3883         case TAIL:
3884             break;
3885         case BACK:
3886             break;
3887
3888 #undef  ST
3889 #define ST st->u.eval
3890         {
3891             SV *ret;
3892             REGEXP *re_sv;
3893             regexp *re;
3894             regexp_internal *rei;
3895             regnode *startpoint;
3896
3897         case GOSTART:
3898         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
3899             if (cur_eval && cur_eval->locinput==locinput) {
3900                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3901                     Perl_croak(aTHX_ "Infinite recursion in regex");
3902                 if ( ++nochange_depth > max_nochange_depth )
3903                     Perl_croak(aTHX_
3904                         "Pattern subroutine nesting without pos change"
3905                         " exceeded limit in regex");
3906             } else {
3907                 nochange_depth = 0;
3908             }
3909             re_sv = rex_sv;
3910             re = rex;
3911             rei = rexi;
3912             (void)ReREFCNT_inc(rex_sv);
3913             if (OP(scan)==GOSUB) {
3914                 startpoint = scan + ARG2L(scan);
3915                 ST.close_paren = ARG(scan);
3916             } else {
3917                 startpoint = rei->program+1;
3918                 ST.close_paren = 0;
3919             }
3920             goto eval_recurse_doit;
3921             /* NOTREACHED */
3922         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
3923             if (cur_eval && cur_eval->locinput==locinput) {
3924                 if ( ++nochange_depth > max_nochange_depth )
3925                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3926             } else {
3927                 nochange_depth = 0;
3928             }
3929             {
3930                 /* execute the code in the {...} */
3931                 dSP;
3932                 SV ** const before = SP;
3933                 OP_4tree * const oop = PL_op;
3934                 COP * const ocurcop = PL_curcop;
3935                 PAD *old_comppad;
3936                 char *saved_regeol = PL_regeol;
3937
3938                 n = ARG(scan);
3939                 PL_op = (OP_4tree*)rexi->data->data[n];
3940                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3941                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3942                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3943                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3944
3945                 if (sv_yes_mark) {
3946                     SV *sv_mrk = get_sv("REGMARK", 1);
3947                     sv_setsv(sv_mrk, sv_yes_mark);
3948                 }
3949
3950                 CALLRUNOPS(aTHX);                       /* Scalar context. */
3951                 SPAGAIN;
3952                 if (SP == before)
3953                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
3954                 else {
3955                     ret = POPs;
3956                     PUTBACK;
3957                 }
3958
3959                 PL_op = oop;
3960                 PAD_RESTORE_LOCAL(old_comppad);
3961                 PL_curcop = ocurcop;
3962                 PL_regeol = saved_regeol;
3963                 if (!logical) {
3964                     /* /(?{...})/ */
3965                     sv_setsv(save_scalar(PL_replgv), ret);
3966                     break;
3967                 }
3968             }
3969             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3970                 logical = 0;
3971                 {
3972                     /* extract RE object from returned value; compiling if
3973                      * necessary */
3974                     MAGIC *mg = NULL;
3975                     REGEXP *rx = NULL;
3976
3977                     if (SvROK(ret)) {
3978                         SV *const sv = SvRV(ret);
3979
3980                         if (SvTYPE(sv) == SVt_REGEXP) {
3981                             rx = (REGEXP*) sv;
3982                         } else if (SvSMAGICAL(sv)) {
3983                             mg = mg_find(sv, PERL_MAGIC_qr);
3984                             assert(mg);
3985                         }
3986                     } else if (SvTYPE(ret) == SVt_REGEXP) {
3987                         rx = (REGEXP*) ret;
3988                     } else if (SvSMAGICAL(ret)) {
3989                         if (SvGMAGICAL(ret)) {
3990                             /* I don't believe that there is ever qr magic
3991                                here.  */
3992                             assert(!mg_find(ret, PERL_MAGIC_qr));
3993                             sv_unmagic(ret, PERL_MAGIC_qr);
3994                         }
3995                         else {
3996                             mg = mg_find(ret, PERL_MAGIC_qr);
3997                             /* testing suggests mg only ends up non-NULL for
3998                                scalars who were upgraded and compiled in the
3999                                else block below. In turn, this is only
4000                                triggered in the "postponed utf8 string" tests
4001                                in t/op/pat.t  */
4002                         }
4003                     }
4004
4005                     if (mg) {
4006                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4007                         assert(rx);
4008                     }
4009                     if (rx) {
4010                         rx = reg_temp_copy(NULL, rx);
4011                     }
4012                     else {
4013                         U32 pm_flags = 0;
4014                         const I32 osize = PL_regsize;
4015
4016                         if (DO_UTF8(ret)) {
4017                             assert (SvUTF8(ret));
4018                         } else if (SvUTF8(ret)) {
4019                             /* Not doing UTF-8, despite what the SV says. Is
4020                                this only if we're trapped in use 'bytes'?  */
4021                             /* Make a copy of the octet sequence, but without
4022                                the flag on, as the compiler now honours the
4023                                SvUTF8 flag on ret.  */
4024                             STRLEN len;
4025                             const char *const p = SvPV(ret, len);
4026                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4027                         }
4028                         rx = CALLREGCOMP(ret, pm_flags);
4029                         if (!(SvFLAGS(ret)
4030                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4031                                  | SVs_GMG))) {
4032                             /* This isn't a first class regexp. Instead, it's
4033                                caching a regexp onto an existing, Perl visible
4034                                scalar.  */
4035                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4036                         }
4037                         PL_regsize = osize;
4038                     }
4039                     re_sv = rx;
4040                     re = (struct regexp *)SvANY(rx);
4041                 }
4042                 RXp_MATCH_COPIED_off(re);
4043                 re->subbeg = rex->subbeg;
4044                 re->sublen = rex->sublen;
4045                 rei = RXi_GET(re);
4046                 DEBUG_EXECUTE_r(
4047                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4048                         "Matching embedded");
4049                 );
4050                 startpoint = rei->program + 1;
4051                 ST.close_paren = 0; /* only used for GOSUB */
4052                 /* borrowed from regtry */
4053                 if (PL_reg_start_tmpl <= re->nparens) {
4054                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4055                     if(PL_reg_start_tmp)
4056                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4057                     else
4058                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4059                 }
4060
4061         eval_recurse_doit: /* Share code with GOSUB below this line */
4062                 /* run the pattern returned from (??{...}) */
4063                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4064                 REGCP_SET(ST.lastcp);
4065
4066                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4067
4068                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4069                 PL_reglastparen = &re->lastparen;
4070                 PL_reglastcloseparen = &re->lastcloseparen;
4071                 re->lastparen = 0;
4072                 re->lastcloseparen = 0;
4073
4074                 PL_reginput = locinput;
4075                 PL_regsize = 0;
4076
4077                 /* XXXX This is too dramatic a measure... */
4078                 PL_reg_maxiter = 0;
4079
4080                 ST.toggle_reg_flags = PL_reg_flags;
4081                 if (RX_UTF8(re_sv))
4082                     PL_reg_flags |= RF_utf8;
4083                 else
4084                     PL_reg_flags &= ~RF_utf8;
4085                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4086
4087                 ST.prev_rex = rex_sv;
4088                 ST.prev_curlyx = cur_curlyx;
4089                 SETREX(rex_sv,re_sv);
4090                 rex = re;
4091                 rexi = rei;
4092                 cur_curlyx = NULL;
4093                 ST.B = next;
4094                 ST.prev_eval = cur_eval;
4095                 cur_eval = st;
4096                 /* now continue from first node in postoned RE */
4097                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4098                 /* NOTREACHED */
4099             }
4100             /* logical is 1,   /(?(?{...})X|Y)/ */
4101             sw = cBOOL(SvTRUE(ret));
4102             logical = 0;
4103             break;
4104         }
4105
4106         case EVAL_AB: /* cleanup after a successful (??{A})B */
4107             /* note: this is called twice; first after popping B, then A */
4108             PL_reg_flags ^= ST.toggle_reg_flags;
4109             ReREFCNT_dec(rex_sv);
4110             SETREX(rex_sv,ST.prev_rex);
4111             rex = (struct regexp *)SvANY(rex_sv);
4112             rexi = RXi_GET(rex);
4113             regcpblow(ST.cp);
4114             cur_eval = ST.prev_eval;
4115             cur_curlyx = ST.prev_curlyx;
4116
4117             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4118             PL_reglastparen = &rex->lastparen;
4119             PL_reglastcloseparen = &rex->lastcloseparen;
4120             /* also update PL_regoffs */
4121             PL_regoffs = rex->offs;
4122
4123             /* XXXX This is too dramatic a measure... */
4124             PL_reg_maxiter = 0;
4125             if ( nochange_depth )
4126                 nochange_depth--;
4127             sayYES;
4128
4129
4130         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4131             /* note: this is called twice; first after popping B, then A */
4132             PL_reg_flags ^= ST.toggle_reg_flags;
4133             ReREFCNT_dec(rex_sv);
4134             SETREX(rex_sv,ST.prev_rex);
4135             rex = (struct regexp *)SvANY(rex_sv);
4136             rexi = RXi_GET(rex);
4137             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4138             PL_reglastparen = &rex->lastparen;
4139             PL_reglastcloseparen = &rex->lastcloseparen;
4140
4141             PL_reginput = locinput;
4142             REGCP_UNWIND(ST.lastcp);
4143             regcppop(rex);
4144             cur_eval = ST.prev_eval;
4145             cur_curlyx = ST.prev_curlyx;
4146             /* XXXX This is too dramatic a measure... */
4147             PL_reg_maxiter = 0;
4148             if ( nochange_depth )
4149                 nochange_depth--;
4150             sayNO_SILENT;
4151 #undef ST
4152
4153         case OPEN:
4154             n = ARG(scan);  /* which paren pair */
4155             PL_reg_start_tmp[n] = locinput;
4156             if (n > PL_regsize)
4157                 PL_regsize = n;
4158             lastopen = n;
4159             break;
4160         case CLOSE:
4161             n = ARG(scan);  /* which paren pair */
4162             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4163             PL_regoffs[n].end = locinput - PL_bostr;
4164             /*if (n > PL_regsize)
4165                 PL_regsize = n;*/
4166             if (n > *PL_reglastparen)
4167                 *PL_reglastparen = n;
4168             *PL_reglastcloseparen = n;
4169             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4170                 goto fake_end;
4171             }
4172             break;
4173         case ACCEPT:
4174             if (ARG(scan)){
4175                 regnode *cursor;
4176                 for (cursor=scan;
4177                      cursor && OP(cursor)!=END;
4178                      cursor=regnext(cursor))
4179                 {
4180                     if ( OP(cursor)==CLOSE ){
4181                         n = ARG(cursor);
4182                         if ( n <= lastopen ) {
4183                             PL_regoffs[n].start
4184                                 = PL_reg_start_tmp[n] - PL_bostr;
4185                             PL_regoffs[n].end = locinput - PL_bostr;
4186                             /*if (n > PL_regsize)
4187                             PL_regsize = n;*/
4188                             if (n > *PL_reglastparen)
4189                                 *PL_reglastparen = n;
4190                             *PL_reglastcloseparen = n;
4191                             if ( n == ARG(scan) || (cur_eval &&
4192                                 cur_eval->u.eval.close_paren == n))
4193                                 break;
4194                         }
4195                     }
4196                 }
4197             }
4198             goto fake_end;
4199             /*NOTREACHED*/
4200         case GROUPP:
4201             n = ARG(scan);  /* which paren pair */
4202             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4203             break;
4204         case NGROUPP:
4205             /* reg_check_named_buff_matched returns 0 for no match */
4206             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4207             break;
4208         case INSUBP:
4209             n = ARG(scan);
4210             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4211             break;
4212         case DEFINEP:
4213             sw = 0;
4214             break;
4215         case IFTHEN:
4216             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4217             if (sw)
4218                 next = NEXTOPER(NEXTOPER(scan));
4219             else {
4220                 next = scan + ARG(scan);
4221                 if (OP(next) == IFTHEN) /* Fake one. */
4222                     next = NEXTOPER(NEXTOPER(next));
4223             }
4224             break;
4225         case LOGICAL:
4226             logical = scan->flags;
4227             break;
4228
4229 /*******************************************************************
4230
4231 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4232 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4233 STAR/PLUS/CURLY/CURLYN are used instead.)
4234
4235 A*B is compiled as <CURLYX><A><WHILEM><B>
4236
4237 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4238 state, which contains the current count, initialised to -1. It also sets
4239 cur_curlyx to point to this state, with any previous value saved in the
4240 state block.
4241
4242 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4243 since the pattern may possibly match zero times (i.e. it's a while {} loop
4244 rather than a do {} while loop).
4245
4246 Each entry to WHILEM represents a successful match of A. The count in the
4247 CURLYX block is incremented, another WHILEM state is pushed, and execution
4248 passes to A or B depending on greediness and the current count.
4249
4250 For example, if matching against the string a1a2a3b (where the aN are
4251 substrings that match /A/), then the match progresses as follows: (the
4252 pushed states are interspersed with the bits of strings matched so far):
4253
4254     <CURLYX cnt=-1>
4255     <CURLYX cnt=0><WHILEM>
4256     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4257     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4258     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4259     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4260
4261 (Contrast this with something like CURLYM, which maintains only a single
4262 backtrack state:
4263
4264     <CURLYM cnt=0> a1
4265     a1 <CURLYM cnt=1> a2
4266     a1 a2 <CURLYM cnt=2> a3
4267     a1 a2 a3 <CURLYM cnt=3> b
4268 )
4269
4270 Each WHILEM state block marks a point to backtrack to upon partial failure
4271 of A or B, and also contains some minor state data related to that
4272 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4273 overall state, such as the count, and pointers to the A and B ops.
4274
4275 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4276 must always point to the *current* CURLYX block, the rules are:
4277
4278 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4279 and set cur_curlyx to point the new block.
4280
4281 When popping the CURLYX block after a successful or unsuccessful match,
4282 restore the previous cur_curlyx.
4283
4284 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4285 to the outer one saved in the CURLYX block.
4286
4287 When popping the WHILEM block after a successful or unsuccessful B match,
4288 restore the previous cur_curlyx.
4289
4290 Here's an example for the pattern (AI* BI)*BO
4291 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4292
4293 cur_
4294 curlyx backtrack stack
4295 ------ ---------------
4296 NULL
4297 CO     <CO prev=NULL> <WO>
4298 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4299 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4300 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4301
4302 At this point the pattern succeeds, and we work back down the stack to
4303 clean up, restoring as we go:
4304
4305 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4306 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4307 CO     <CO prev=NULL> <WO>
4308 NULL
4309
4310 *******************************************************************/
4311
4312 #define ST st->u.curlyx
4313
4314         case CURLYX:    /* start of /A*B/  (for complex A) */
4315         {
4316             /* No need to save/restore up to this paren */
4317             I32 parenfloor = scan->flags;
4318
4319             assert(next); /* keep Coverity happy */
4320             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4321                 next += ARG(next);
4322
4323             /* XXXX Probably it is better to teach regpush to support
4324                parenfloor > PL_regsize... */
4325             if (parenfloor > (I32)*PL_reglastparen)
4326                 parenfloor = *PL_reglastparen; /* Pessimization... */
4327
4328             ST.prev_curlyx= cur_curlyx;
4329             cur_curlyx = st;
4330             ST.cp = PL_savestack_ix;
4331
4332             /* these fields contain the state of the current curly.
4333              * they are accessed by subsequent WHILEMs */
4334             ST.parenfloor = parenfloor;
4335             ST.me = scan;
4336             ST.B = next;
4337             ST.minmod = minmod;
4338             minmod = 0;
4339             ST.count = -1;      /* this will be updated by WHILEM */
4340             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4341
4342             PL_reginput = locinput;
4343             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4344             /* NOTREACHED */
4345         }
4346
4347         case CURLYX_end: /* just finished matching all of A*B */
4348             cur_curlyx = ST.prev_curlyx;
4349             sayYES;
4350             /* NOTREACHED */
4351
4352         case CURLYX_end_fail: /* just failed to match all of A*B */
4353             regcpblow(ST.cp);
4354             cur_curlyx = ST.prev_curlyx;
4355             sayNO;
4356             /* NOTREACHED */
4357
4358
4359 #undef ST
4360 #define ST st->u.whilem
4361
4362         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4363         {
4364             /* see the discussion above about CURLYX/WHILEM */
4365             I32 n;
4366             int min = ARG1(cur_curlyx->u.curlyx.me);
4367             int max = ARG2(cur_curlyx->u.curlyx.me);
4368             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4369
4370             assert(cur_curlyx); /* keep Coverity happy */
4371             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4372             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4373             ST.cache_offset = 0;
4374             ST.cache_mask = 0;
4375
4376             PL_reginput = locinput;
4377
4378             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4379                   "%*s  whilem: matched %ld out of %d..%d\n",
4380                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4381             );
4382
4383             /* First just match a string of min A's. */
4384
4385             if (n < min) {
4386                 cur_curlyx->u.curlyx.lastloc = locinput;
4387                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4388                 /* NOTREACHED */
4389             }
4390
4391             /* If degenerate A matches "", assume A done. */
4392
4393             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4394                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4395                    "%*s  whilem: empty match detected, trying continuation...\n",
4396                    REPORT_CODE_OFF+depth*2, "")
4397                 );
4398                 goto do_whilem_B_max;
4399             }
4400
4401             /* super-linear cache processing */
4402
4403             if (scan->flags) {
4404
4405                 if (!PL_reg_maxiter) {
4406                     /* start the countdown: Postpone detection until we
4407                      * know the match is not *that* much linear. */
4408                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4409                     /* possible overflow for long strings and many CURLYX's */
4410                     if (PL_reg_maxiter < 0)
4411                         PL_reg_maxiter = I32_MAX;
4412                     PL_reg_leftiter = PL_reg_maxiter;
4413                 }
4414
4415                 if (PL_reg_leftiter-- == 0) {
4416                     /* initialise cache */
4417                     const I32 size = (PL_reg_maxiter + 7)/8;
4418                     if (PL_reg_poscache) {
4419                         if ((I32)PL_reg_poscache_size < size) {
4420                             Renew(PL_reg_poscache, size, char);
4421                             PL_reg_poscache_size = size;
4422                         }
4423                         Zero(PL_reg_poscache, size, char);
4424                     }
4425                     else {
4426                         PL_reg_poscache_size = size;
4427                         Newxz(PL_reg_poscache, size, char);
4428                     }
4429                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4430       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4431                               PL_colors[4], PL_colors[5])
4432                     );
4433                 }
4434
4435                 if (PL_reg_leftiter < 0) {
4436                     /* have we already failed at this position? */
4437                     I32 offset, mask;
4438                     offset  = (scan->flags & 0xf) - 1
4439                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4440                     mask    = 1 << (offset % 8);
4441                     offset /= 8;
4442                     if (PL_reg_poscache[offset] & mask) {
4443                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4444                             "%*s  whilem: (cache) already tried at this position...\n",
4445                             REPORT_CODE_OFF+depth*2, "")
4446                         );
4447                         sayNO; /* cache records failure */
4448                     }
4449                     ST.cache_offset = offset;
4450                     ST.cache_mask   = mask;
4451                 }
4452             }
4453
4454             /* Prefer B over A for minimal matching. */
4455
4456             if (cur_curlyx->u.curlyx.minmod) {
4457                 ST.save_curlyx = cur_curlyx;
4458                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4459                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4460                 REGCP_SET(ST.lastcp);
4461                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4462                 /* NOTREACHED */
4463             }
4464
4465             /* Prefer A over B for maximal matching. */
4466
4467             if (n < max) { /* More greed allowed? */
4468                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4469                 cur_curlyx->u.curlyx.lastloc = locinput;
4470                 REGCP_SET(ST.lastcp);
4471                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4472                 /* NOTREACHED */
4473             }
4474             goto do_whilem_B_max;
4475         }
4476         /* NOTREACHED */
4477
4478         case WHILEM_B_min: /* just matched B in a minimal match */
4479         case WHILEM_B_max: /* just matched B in a maximal match */
4480             cur_curlyx = ST.save_curlyx;
4481             sayYES;
4482             /* NOTREACHED */
4483
4484         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4485             cur_curlyx = ST.save_curlyx;
4486             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4487             cur_curlyx->u.curlyx.count--;
4488             CACHEsayNO;
4489             /* NOTREACHED */
4490
4491         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4492             REGCP_UNWIND(ST.lastcp);
4493             regcppop(rex);
4494             /* FALL THROUGH */
4495         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4496             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4497             cur_curlyx->u.curlyx.count--;
4498             CACHEsayNO;
4499             /* NOTREACHED */
4500
4501         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4502             REGCP_UNWIND(ST.lastcp);
4503             regcppop(rex);      /* Restore some previous $<digit>s? */
4504             PL_reginput = locinput;
4505             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4506                 "%*s  whilem: failed, trying continuation...\n",
4507                 REPORT_CODE_OFF+depth*2, "")
4508             );
4509           do_whilem_B_max:
4510             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4511                 && ckWARN(WARN_REGEXP)
4512                 && !(PL_reg_flags & RF_warned))
4513             {
4514                 PL_reg_flags |= RF_warned;
4515                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4516                      "Complex regular subexpression recursion",
4517                      REG_INFTY - 1);
4518             }
4519
4520             /* now try B */
4521             ST.save_curlyx = cur_curlyx;
4522             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4523             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4524             /* NOTREACHED */
4525
4526         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4527             cur_curlyx = ST.save_curlyx;
4528             REGCP_UNWIND(ST.lastcp);
4529             regcppop(rex);
4530
4531             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4532                 /* Maximum greed exceeded */
4533                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4534                     && ckWARN(WARN_REGEXP)
4535                     && !(PL_reg_flags & RF_warned))
4536                 {
4537                     PL_reg_flags |= RF_warned;
4538                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4539                         "%s limit (%d) exceeded",
4540                         "Complex regular subexpression recursion",
4541                         REG_INFTY - 1);
4542                 }
4543                 cur_curlyx->u.curlyx.count--;
4544                 CACHEsayNO;
4545             }
4546
4547             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4548                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4549             );
4550             /* Try grabbing another A and see if it helps. */
4551             PL_reginput = locinput;
4552             cur_curlyx->u.curlyx.lastloc = locinput;
4553             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4554             REGCP_SET(ST.lastcp);
4555             PUSH_STATE_GOTO(WHILEM_A_min,
4556                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4557             /* NOTREACHED */
4558
4559 #undef  ST
4560 #define ST st->u.branch
4561
4562         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4563             next = scan + ARG(scan);
4564             if (next == scan)
4565                 next = NULL;
4566             scan = NEXTOPER(scan);
4567             /* FALL THROUGH */
4568
4569         case BRANCH:        /*  /(...|A|...)/ */
4570             scan = NEXTOPER(scan); /* scan now points to inner node */
4571             ST.lastparen = *PL_reglastparen;
4572             ST.next_branch = next;
4573             REGCP_SET(ST.cp);
4574             PL_reginput = locinput;
4575
4576             /* Now go into the branch */
4577             if (has_cutgroup) {
4578                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4579             } else {
4580                 PUSH_STATE_GOTO(BRANCH_next, scan);
4581             }
4582             /* NOTREACHED */
4583         case CUTGROUP:
4584             PL_reginput = locinput;
4585             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4586                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4587             PUSH_STATE_GOTO(CUTGROUP_next,next);
4588             /* NOTREACHED */
4589         case CUTGROUP_next_fail:
4590             do_cutgroup = 1;
4591             no_final = 1;
4592             if (st->u.mark.mark_name)
4593                 sv_commit = st->u.mark.mark_name;
4594             sayNO;
4595             /* NOTREACHED */
4596         case BRANCH_next:
4597             sayYES;
4598             /* NOTREACHED */
4599         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4600             if (do_cutgroup) {
4601                 do_cutgroup = 0;
4602                 no_final = 0;
4603             }
4604             REGCP_UNWIND(ST.cp);
4605             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4606                 PL_regoffs[n].end = -1;
4607             *PL_reglastparen = n;
4608             /*dmq: *PL_reglastcloseparen = n; */
4609             scan = ST.next_branch;
4610             /* no more branches? */
4611             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4612                 DEBUG_EXECUTE_r({
4613                     PerlIO_printf( Perl_debug_log,
4614                         "%*s  %sBRANCH failed...%s\n",
4615                         REPORT_CODE_OFF+depth*2, "",
4616                         PL_colors[4],
4617                         PL_colors[5] );
4618                 });
4619                 sayNO_SILENT;
4620             }
4621             continue; /* execute next BRANCH[J] op */
4622             /* NOTREACHED */
4623
4624         case MINMOD:
4625             minmod = 1;
4626             break;
4627
4628 #undef  ST
4629 #define ST st->u.curlym
4630
4631         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4632
4633             /* This is an optimisation of CURLYX that enables us to push
4634              * only a single backtracking state, no matter how many matches
4635              * there are in {m,n}. It relies on the pattern being constant
4636              * length, with no parens to influence future backrefs
4637              */
4638
4639             ST.me = scan;
4640             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4641
4642             /* if paren positive, emulate an OPEN/CLOSE around A */
4643             if (ST.me->flags) {
4644                 U32 paren = ST.me->flags;
4645                 if (paren > PL_regsize)
4646                     PL_regsize = paren;
4647                 if (paren > *PL_reglastparen)
4648                     *PL_reglastparen = paren;
4649                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4650             }
4651             ST.A = scan;
4652             ST.B = next;
4653             ST.alen = 0;
4654             ST.count = 0;
4655             ST.minmod = minmod;
4656             minmod = 0;
4657             ST.c1 = CHRTEST_UNINIT;
4658             REGCP_SET(ST.cp);
4659
4660             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4661                 goto curlym_do_B;
4662
4663           curlym_do_A: /* execute the A in /A{m,n}B/  */
4664             PL_reginput = locinput;
4665             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4666             /* NOTREACHED */
4667
4668         case CURLYM_A: /* we've just matched an A */
4669             locinput = st->locinput;
4670             nextchr = UCHARAT(locinput);
4671
4672             ST.count++;
4673             /* after first match, determine A's length: u.curlym.alen */
4674             if (ST.count == 1) {
4675                 if (PL_reg_match_utf8) {
4676                     char *s = locinput;
4677                     while (s < PL_reginput) {
4678                         ST.alen++;
4679                         s += UTF8SKIP(s);
4680                     }
4681                 }
4682                 else {
4683                     ST.alen = PL_reginput - locinput;
4684                 }
4685                 if (ST.alen == 0)
4686                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4687             }
4688             DEBUG_EXECUTE_r(
4689                 PerlIO_printf(Perl_debug_log,
4690                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4691                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4692                           (IV) ST.count, (IV)ST.alen)
4693             );
4694
4695             locinput = PL_reginput;
4696
4697             if (cur_eval && cur_eval->u.eval.close_paren &&
4698                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4699                 goto fake_end;
4700
4701             {
4702                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4703                 if ( max == REG_INFTY || ST.count < max )
4704                     goto curlym_do_A; /* try to match another A */
4705             }
4706             goto curlym_do_B; /* try to match B */
4707
4708         case CURLYM_A_fail: /* just failed to match an A */
4709             REGCP_UNWIND(ST.cp);
4710
4711             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4712                 || (cur_eval && cur_eval->u.eval.close_paren &&
4713                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4714                 sayNO;
4715
4716           curlym_do_B: /* execute the B in /A{m,n}B/  */
4717             PL_reginput = locinput;
4718             if (ST.c1 == CHRTEST_UNINIT) {
4719                 /* calculate c1 and c2 for possible match of 1st char
4720                  * following curly */
4721                 ST.c1 = ST.c2 = CHRTEST_VOID;
4722                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4723                     regnode *text_node = ST.B;
4724                     if (! HAS_TEXT(text_node))
4725                         FIND_NEXT_IMPT(text_node);
4726                     /* this used to be
4727
4728                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4729
4730                         But the former is redundant in light of the latter.
4731
4732                         if this changes back then the macro for
4733                         IS_TEXT and friends need to change.
4734                      */
4735                     if (PL_regkind[OP(text_node)] == EXACT)
4736                     {
4737
4738                         ST.c1 = (U8)*STRING(text_node);
4739                         ST.c2 =
4740                             (IS_TEXTF(text_node))
4741                             ? PL_fold[ST.c1]
4742                             : (IS_TEXTFL(text_node))
4743                                 ? PL_fold_locale[ST.c1]
4744                                 : ST.c1;
4745                     }
4746                 }
4747             }
4748
4749             DEBUG_EXECUTE_r(
4750                 PerlIO_printf(Perl_debug_log,
4751                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4752                     (int)(REPORT_CODE_OFF+(depth*2)),
4753                     "", (IV)ST.count)
4754                 );
4755             if (ST.c1 != CHRTEST_VOID
4756                     && UCHARAT(PL_reginput) != ST.c1
4757                     && UCHARAT(PL_reginput) != ST.c2)
4758             {
4759                 /* simulate B failing */
4760                 DEBUG_OPTIMISE_r(
4761                     PerlIO_printf(Perl_debug_log,
4762                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4763                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4764                         (IV)ST.c1,(IV)ST.c2
4765                 ));
4766                 state_num = CURLYM_B_fail;
4767                 goto reenter_switch;
4768             }
4769
4770             if (ST.me->flags) {
4771                 /* mark current A as captured */
4772                 I32 paren = ST.me->flags;
4773                 if (ST.count) {
4774                     PL_regoffs[paren].start
4775                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4776                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4777                     /*dmq: *PL_reglastcloseparen = paren; */
4778                 }
4779                 else
4780                     PL_regoffs[paren].end = -1;
4781                 if (cur_eval && cur_eval->u.eval.close_paren &&
4782                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4783                 {
4784                     if (ST.count)
4785                         goto fake_end;
4786                     else
4787                         sayNO;
4788                 }
4789             }
4790
4791             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4792             /* NOTREACHED */
4793
4794         case CURLYM_B_fail: /* just failed to match a B */
4795             REGCP_UNWIND(ST.cp);
4796             if (ST.minmod) {
4797                 I32 max = ARG2(ST.me);
4798                 if (max != REG_INFTY && ST.count == max)
4799                     sayNO;
4800                 goto curlym_do_A; /* try to match a further A */
4801             }
4802             /* backtrack one A */
4803             if (ST.count == ARG1(ST.me) /* min */)
4804                 sayNO;
4805             ST.count--;
4806             locinput = HOPc(locinput, -ST.alen);
4807             goto curlym_do_B; /* try to match B */
4808
4809 #undef ST
4810 #define ST st->u.curly
4811
4812 #define CURLY_SETPAREN(paren, success) \
4813     if (paren) { \
4814         if (success) { \
4815             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4816             PL_regoffs[paren].end = locinput - PL_bostr; \
4817             *PL_reglastcloseparen = paren; \
4818         } \
4819         else \
4820             PL_regoffs[paren].end = -1; \
4821     }
4822
4823         case STAR:              /*  /A*B/ where A is width 1 */
4824             ST.paren = 0;
4825             ST.min = 0;
4826             ST.max = REG_INFTY;
4827             scan = NEXTOPER(scan);
4828             goto repeat;
4829         case PLUS:              /*  /A+B/ where A is width 1 */
4830             ST.paren = 0;
4831             ST.min = 1;
4832             ST.max = REG_INFTY;
4833             scan = NEXTOPER(scan);
4834             goto repeat;
4835         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
4836             ST.paren = scan->flags;     /* Which paren to set */
4837             if (ST.paren > PL_regsize)
4838                 PL_regsize = ST.paren;
4839             if (ST.paren > *PL_reglastparen)
4840                 *PL_reglastparen = ST.paren;
4841             ST.min = ARG1(scan);  /* min to match */
4842             ST.max = ARG2(scan);  /* max to match */
4843             if (cur_eval && cur_eval->u.eval.close_paren &&
4844                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4845                 ST.min=1;
4846                 ST.max=1;
4847             }
4848             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4849             goto repeat;
4850         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
4851             ST.paren = 0;
4852             ST.min = ARG1(scan);  /* min to match */
4853             ST.max = ARG2(scan);  /* max to match */
4854             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4855           repeat:
4856             /*
4857             * Lookahead to avoid useless match attempts
4858             * when we know what character comes next.
4859             *
4860             * Used to only do .*x and .*?x, but now it allows
4861             * for )'s, ('s and (?{ ... })'s to be in the way
4862             * of the quantifier and the EXACT-like node.  -- japhy
4863             */
4864
4865             if (ST.min > ST.max) /* XXX make this a compile-time check? */
4866                 sayNO;
4867             if (HAS_TEXT(next) || JUMPABLE(next)) {
4868                 U8 *s;
4869                 regnode *text_node = next;
4870
4871                 if (! HAS_TEXT(text_node))
4872                     FIND_NEXT_IMPT(text_node);
4873
4874                 if (! HAS_TEXT(text_node))
4875                     ST.c1 = ST.c2 = CHRTEST_VOID;
4876                 else {
4877                     if ( PL_regkind[OP(text_node)] != EXACT ) {
4878                         ST.c1 = ST.c2 = CHRTEST_VOID;
4879                         goto assume_ok_easy;
4880                     }
4881                     else
4882                         s = (U8*)STRING(text_node);
4883
4884                     /*  Currently we only get here when
4885
4886                         PL_rekind[OP(text_node)] == EXACT
4887
4888                         if this changes back then the macro for IS_TEXT and
4889                         friends need to change. */
4890                     if (!UTF_PATTERN) {
4891                         ST.c2 = ST.c1 = *s;
4892                         if (IS_TEXTF(text_node))
4893                             ST.c2 = PL_fold[ST.c1];
4894                         else if (IS_TEXTFL(text_node))
4895                             ST.c2 = PL_fold_locale[ST.c1];
4896                     }
4897                     else { /* UTF_PATTERN */
4898                         if (IS_TEXTF(text_node)) {
4899                              STRLEN ulen1, ulen2;
4900                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4901                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4902
4903                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4904                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4905 #ifdef EBCDIC
4906                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4907                                                     ckWARN(WARN_UTF8) ?
4908                                                     0 : UTF8_ALLOW_ANY);
4909                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4910                                                     ckWARN(WARN_UTF8) ?
4911                                                     0 : UTF8_ALLOW_ANY);
4912 #else
4913                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4914                                                     uniflags);
4915                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4916                                                     uniflags);
4917 #endif
4918                         }
4919                         else {
4920                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4921                                                      uniflags);
4922                         }
4923                     }
4924                 }
4925             }
4926             else
4927                 ST.c1 = ST.c2 = CHRTEST_VOID;
4928         assume_ok_easy:
4929
4930             ST.A = scan;
4931             ST.B = next;
4932             PL_reginput = locinput;
4933             if (minmod) {
4934                 minmod = 0;
4935                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4936                     sayNO;
4937                 ST.count = ST.min;
4938                 locinput = PL_reginput;
4939                 REGCP_SET(ST.cp);
4940                 if (ST.c1 == CHRTEST_VOID)
4941                     goto curly_try_B_min;
4942
4943                 ST.oldloc = locinput;
4944
4945                 /* set ST.maxpos to the furthest point along the
4946                  * string that could possibly match */
4947                 if  (ST.max == REG_INFTY) {
4948                     ST.maxpos = PL_regeol - 1;
4949                     if (utf8_target)
4950                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4951                             ST.maxpos--;
4952                 }
4953                 else if (utf8_target) {
4954                     int m = ST.max - ST.min;
4955                     for (ST.maxpos = locinput;
4956                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4957                         ST.maxpos += UTF8SKIP(ST.maxpos);
4958                 }
4959                 else {
4960                     ST.maxpos = locinput + ST.max - ST.min;
4961                     if (ST.maxpos >= PL_regeol)
4962                         ST.maxpos = PL_regeol - 1;
4963                 }
4964                 goto curly_try_B_min_known;
4965
4966             }
4967             else {
4968                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4969                 locinput = PL_reginput;
4970                 if (ST.count < ST.min)
4971                     sayNO;
4972                 if ((ST.count > ST.min)
4973                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4974                 {
4975                     /* A{m,n} must come at the end of the string, there's
4976                      * no point in backing off ... */
4977                     ST.min = ST.count;
4978                     /* ...except that $ and \Z can match before *and* after
4979                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
4980                        We may back off by one in this case. */
4981                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4982                         ST.min--;
4983                 }
4984                 REGCP_SET(ST.cp);
4985                 goto curly_try_B_max;
4986             }
4987             /* NOTREACHED */
4988
4989
4990         case CURLY_B_min_known_fail:
4991             /* failed to find B in a non-greedy match where c1,c2 valid */
4992             if (ST.paren && ST.count)
4993                 PL_regoffs[ST.paren].end = -1;
4994
4995             PL_reginput = locinput;     /* Could be reset... */
4996             REGCP_UNWIND(ST.cp);
4997             /* Couldn't or didn't -- move forward. */
4998             ST.oldloc = locinput;
4999             if (utf8_target)
5000                 locinput += UTF8SKIP(locinput);
5001             else
5002                 locinput++;
5003             ST.count++;
5004           curly_try_B_min_known:
5005              /* find the next place where 'B' could work, then call B */
5006             {
5007                 int n;
5008                 if (utf8_target) {
5009                     n = (ST.oldloc == locinput) ? 0 : 1;
5010                     if (ST.c1 == ST.c2) {
5011                         STRLEN len;
5012                         /* set n to utf8_distance(oldloc, locinput) */
5013                         while (locinput <= ST.maxpos &&
5014                                utf8n_to_uvchr((U8*)locinput,
5015                                               UTF8_MAXBYTES, &len,
5016                                               uniflags) != (UV)ST.c1) {
5017                             locinput += len;
5018                             n++;
5019                         }
5020                     }
5021                     else {
5022                         /* set n to utf8_distance(oldloc, locinput) */
5023                         while (locinput <= ST.maxpos) {
5024                             STRLEN len;
5025                             const UV c = utf8n_to_uvchr((U8*)locinput,
5026                                                   UTF8_MAXBYTES, &len,
5027                                                   uniflags);
5028                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5029                                 break;
5030                             locinput += len;
5031                             n++;
5032                         }
5033                     }
5034                 }
5035                 else {
5036                     if (ST.c1 == ST.c2) {
5037                         while (locinput <= ST.maxpos &&
5038                                UCHARAT(locinput) != ST.c1)
5039                             locinput++;
5040                     }
5041                     else {
5042                         while (locinput <= ST.maxpos
5043                                && UCHARAT(locinput) != ST.c1
5044                                && UCHARAT(locinput) != ST.c2)
5045                             locinput++;
5046                     }
5047                     n = locinput - ST.oldloc;
5048                 }
5049                 if (locinput > ST.maxpos)
5050                     sayNO;
5051                 /* PL_reginput == oldloc now */
5052                 if (n) {
5053                     ST.count += n;
5054                     if (regrepeat(rex, ST.A, n, depth) < n)
5055                         sayNO;
5056                 }
5057                 PL_reginput = locinput;
5058                 CURLY_SETPAREN(ST.paren, ST.count);
5059                 if (cur_eval && cur_eval->u.eval.close_paren &&
5060                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5061                     goto fake_end;
5062                 }
5063                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5064             }
5065             /* NOTREACHED */
5066
5067
5068         case CURLY_B_min_fail:
5069             /* failed to find B in a non-greedy match where c1,c2 invalid */
5070             if (ST.paren && ST.count)
5071                 PL_regoffs[ST.paren].end = -1;
5072
5073             REGCP_UNWIND(ST.cp);
5074             /* failed -- move forward one */
5075             PL_reginput = locinput;
5076             if (regrepeat(rex, ST.A, 1, depth)) {
5077                 ST.count++;
5078                 locinput = PL_reginput;
5079                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5080                         ST.count > 0)) /* count overflow ? */
5081                 {
5082                   curly_try_B_min:
5083                     CURLY_SETPAREN(ST.paren, ST.count);
5084                     if (cur_eval && cur_eval->u.eval.close_paren &&
5085                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5086                         goto fake_end;
5087                     }
5088                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5089                 }
5090             }
5091             sayNO;
5092             /* NOTREACHED */
5093
5094
5095         curly_try_B_max:
5096             /* a successful greedy match: now try to match B */
5097             if (cur_eval && cur_eval->u.eval.close_paren &&
5098                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5099                 goto fake_end;
5100             }
5101             {
5102                 UV c = 0;
5103                 if (ST.c1 != CHRTEST_VOID)
5104                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5105                                            UTF8_MAXBYTES, 0, uniflags)
5106                                 : (UV) UCHARAT(PL_reginput);
5107                 /* If it could work, try it. */
5108                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5109                     CURLY_SETPAREN(ST.paren, ST.count);
5110                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5111                     /* NOTREACHED */
5112                 }
5113             }
5114             /* FALL THROUGH */
5115         case CURLY_B_max_fail:
5116             /* failed to find B in a greedy match */
5117             if (ST.paren && ST.count)
5118                 PL_regoffs[ST.paren].end = -1;
5119
5120             REGCP_UNWIND(ST.cp);
5121             /*  back up. */
5122             if (--ST.count < ST.min)
5123                 sayNO;
5124             PL_reginput = locinput = HOPc(locinput, -1);
5125             goto curly_try_B_max;
5126
5127 #undef ST
5128
5129         case END:
5130             fake_end:
5131             if (cur_eval) {
5132                 /* we've just finished A in /(??{A})B/; now continue with B */
5133                 I32 tmpix;
5134                 st->u.eval.toggle_reg_flags
5135                             = cur_eval->u.eval.toggle_reg_flags;
5136                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5137
5138                 st->u.eval.prev_rex = rex_sv;           /* inner */
5139                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5140                 rex = (struct regexp *)SvANY(rex_sv);
5141                 rexi = RXi_GET(rex);
5142                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5143                 ReREFCNT_inc(rex_sv);
5144                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5145
5146                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5147                 PL_reglastparen = &rex->lastparen;
5148                 PL_reglastcloseparen = &rex->lastcloseparen;
5149
5150                 REGCP_SET(st->u.eval.lastcp);
5151                 PL_reginput = locinput;
5152
5153                 /* Restore parens of the outer rex without popping the
5154                  * savestack */
5155                 tmpix = PL_savestack_ix;
5156                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5157                 regcppop(rex);
5158                 PL_savestack_ix = tmpix;
5159
5160                 st->u.eval.prev_eval = cur_eval;
5161                 cur_eval = cur_eval->u.eval.prev_eval;
5162                 DEBUG_EXECUTE_r(
5163                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5164                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5165                 if ( nochange_depth )
5166                     nochange_depth--;
5167
5168                 PUSH_YES_STATE_GOTO(EVAL_AB,
5169                         st->u.eval.prev_eval->u.eval.B); /* match B */
5170             }
5171
5172             if (locinput < reginfo->till) {
5173                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5174                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5175                                       PL_colors[4],
5176                                       (long)(locinput - PL_reg_starttry),
5177                                       (long)(reginfo->till - PL_reg_starttry),
5178                                       PL_colors[5]));
5179
5180                 sayNO_SILENT;           /* Cannot match: too short. */
5181             }
5182             PL_reginput = locinput;     /* put where regtry can find it */
5183             sayYES;                     /* Success! */
5184
5185         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5186             DEBUG_EXECUTE_r(
5187             PerlIO_printf(Perl_debug_log,
5188                 "%*s  %ssubpattern success...%s\n",
5189                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5190             PL_reginput = locinput;     /* put where regtry can find it */
5191             sayYES;                     /* Success! */
5192
5193 #undef  ST
5194 #define ST st->u.ifmatch
5195
5196         case SUSPEND:   /* (?>A) */
5197             ST.wanted = 1;
5198             PL_reginput = locinput;
5199             goto do_ifmatch;
5200
5201         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5202             ST.wanted = 0;
5203             goto ifmatch_trivial_fail_test;
5204
5205         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5206             ST.wanted = 1;
5207           ifmatch_trivial_fail_test:
5208             if (scan->flags) {
5209                 char * const s = HOPBACKc(locinput, scan->flags);
5210                 if (!s) {
5211                     /* trivial fail */
5212                     if (logical) {
5213                         logical = 0;
5214                         sw = 1 - cBOOL(ST.wanted);
5215                     }
5216                     else if (ST.wanted)
5217                         sayNO;
5218                     next = scan + ARG(scan);
5219                     if (next == scan)
5220                         next = NULL;
5221                     break;
5222                 }
5223                 PL_reginput = s;
5224             }
5225             else
5226                 PL_reginput = locinput;
5227
5228           do_ifmatch:
5229             ST.me = scan;
5230             ST.logical = logical;
5231             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5232
5233             /* execute body of (?...A) */
5234             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5235             /* NOTREACHED */
5236
5237         case IFMATCH_A_fail: /* body of (?...A) failed */
5238             ST.wanted = !ST.wanted;
5239             /* FALL THROUGH */
5240
5241         case IFMATCH_A: /* body of (?...A) succeeded */
5242             if (ST.logical) {
5243                 sw = cBOOL(ST.wanted);
5244             }
5245             else if (!ST.wanted)
5246                 sayNO;
5247
5248             if (OP(ST.me) == SUSPEND)
5249                 locinput = PL_reginput;
5250             else {
5251                 locinput = PL_reginput = st->locinput;
5252                 nextchr = UCHARAT(locinput);
5253             }
5254             scan = ST.me + ARG(ST.me);
5255             if (scan == ST.me)
5256                 scan = NULL;
5257             continue; /* execute B */
5258
5259 #undef ST
5260
5261         case LONGJMP:
5262             next = scan + ARG(scan);
5263             if (next == scan)
5264                 next = NULL;
5265             break;
5266         case COMMIT:
5267             reginfo->cutpoint = PL_regeol;
5268             /* FALLTHROUGH */
5269         case PRUNE:
5270             PL_reginput = locinput;
5271             if (!scan->flags)
5272                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5273             PUSH_STATE_GOTO(COMMIT_next,next);
5274             /* NOTREACHED */
5275         case COMMIT_next_fail:
5276             no_final = 1;
5277             /* FALLTHROUGH */
5278         case OPFAIL:
5279             sayNO;
5280             /* NOTREACHED */
5281
5282 #define ST st->u.mark
5283         case MARKPOINT:
5284             ST.prev_mark = mark_state;
5285             ST.mark_name = sv_commit = sv_yes_mark
5286                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5287             mark_state = st;
5288             ST.mark_loc = PL_reginput = locinput;
5289             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5290             /* NOTREACHED */
5291         case MARKPOINT_next:
5292             mark_state = ST.prev_mark;
5293             sayYES;
5294             /* NOTREACHED */
5295         case MARKPOINT_next_fail:
5296             if (popmark && sv_eq(ST.mark_name,popmark))
5297             {
5298                 if (ST.mark_loc > startpoint)
5299                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5300                 popmark = NULL; /* we found our mark */
5301                 sv_commit = ST.mark_name;
5302
5303                 DEBUG_EXECUTE_r({
5304                         PerlIO_printf(Perl_debug_log,
5305                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5306                             REPORT_CODE_OFF+depth*2, "",
5307                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5308                 });
5309             }
5310             mark_state = ST.prev_mark;
5311             sv_yes_mark = mark_state ?
5312                 mark_state->u.mark.mark_name : NULL;
5313             sayNO;
5314             /* NOTREACHED */
5315         case SKIP:
5316             PL_reginput = locinput;
5317             if (scan->flags) {
5318                 /* (*SKIP) : if we fail we cut here*/
5319                 ST.mark_name = NULL;
5320                 ST.mark_loc = locinput;
5321                 PUSH_STATE_GOTO(SKIP_next,next);
5322             } else {
5323                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5324                    otherwise do nothing.  Meaning we need to scan
5325                  */
5326                 regmatch_state *cur = mark_state;
5327                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5328
5329                 while (cur) {
5330                     if ( sv_eq( cur->u.mark.mark_name,
5331                                 find ) )
5332                     {
5333                         ST.mark_name = find;
5334                         PUSH_STATE_GOTO( SKIP_next, next );
5335                     }
5336                     cur = cur->u.mark.prev_mark;
5337                 }
5338             }
5339             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5340             break;
5341         case SKIP_next_fail:
5342             if (ST.mark_name) {
5343                 /* (*CUT:NAME) - Set up to search for the name as we
5344                    collapse the stack*/
5345                 popmark = ST.mark_name;
5346             } else {
5347                 /* (*CUT) - No name, we cut here.*/
5348                 if (ST.mark_loc > startpoint)
5349                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5350                 /* but we set sv_commit to latest mark_name if there
5351                    is one so they can test to see how things lead to this
5352                    cut */
5353                 if (mark_state)
5354                     sv_commit=mark_state->u.mark.mark_name;
5355             }
5356             no_final = 1;
5357             sayNO;
5358             /* NOTREACHED */
5359 #undef ST
5360         case FOLDCHAR:
5361             n = ARG(scan);
5362             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5363                 locinput += ln;
5364             } else if ( 0xDF == n && !utf8_target && !UTF_PATTERN ) {
5365                 sayNO;
5366             } else  {
5367                 U8 folded[UTF8_MAXBYTES_CASE+1];
5368                 STRLEN foldlen;
5369                 const char * const l = locinput;
5370                 char *e = PL_regeol;
5371                 to_uni_fold(n, folded, &foldlen);
5372
5373                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5374                                l, &e, 0,  utf8_target)) {
5375                         sayNO;
5376                 }
5377                 locinput = e;
5378             }
5379             nextchr = UCHARAT(locinput);
5380             break;
5381         case LNBREAK:
5382             if ((n=is_LNBREAK(locinput,utf8_target))) {
5383                 locinput += n;
5384                 nextchr = UCHARAT(locinput);
5385             } else
5386                 sayNO;
5387             break;
5388
5389 #define CASE_CLASS(nAmE)                              \
5390         case nAmE:                                    \
5391             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5392                 locinput += n;                        \
5393                 nextchr = UCHARAT(locinput);          \
5394             } else                                    \
5395                 sayNO;                                \
5396             break;                                    \
5397         case N##nAmE:                                 \
5398             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5399                 sayNO;                                \
5400             } else {                                  \
5401                 locinput += UTF8SKIP(locinput);       \
5402                 nextchr = UCHARAT(locinput);          \
5403             }                                         \
5404             break
5405
5406         CASE_CLASS(VERTWS);
5407         CASE_CLASS(HORIZWS);
5408 #undef CASE_CLASS
5409
5410         default:
5411             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5412                           PTR2UV(scan), OP(scan));
5413             Perl_croak(aTHX_ "regexp memory corruption");
5414
5415         } /* end switch */
5416
5417         /* switch break jumps here */
5418         scan = next; /* prepare to execute the next op and ... */
5419         continue;    /* ... jump back to the top, reusing st */
5420         /* NOTREACHED */
5421
5422       push_yes_state:
5423         /* push a state that backtracks on success */
5424         st->u.yes.prev_yes_state = yes_state;
5425         yes_state = st;
5426         /* FALL THROUGH */
5427       push_state:
5428         /* push a new regex state, then continue at scan  */
5429         {
5430             regmatch_state *newst;
5431
5432             DEBUG_STACK_r({
5433                 regmatch_state *cur = st;
5434                 regmatch_state *curyes = yes_state;
5435                 int curd = depth;
5436                 regmatch_slab *slab = PL_regmatch_slab;
5437                 for (;curd > -1;cur--,curd--) {
5438                     if (cur < SLAB_FIRST(slab)) {
5439                         slab = slab->prev;
5440                         cur = SLAB_LAST(slab);
5441                     }
5442                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5443                         REPORT_CODE_OFF + 2 + depth * 2,"",
5444                         curd, PL_reg_name[cur->resume_state],
5445                         (curyes == cur) ? "yes" : ""
5446                     );
5447                     if (curyes == cur)
5448                         curyes = cur->u.yes.prev_yes_state;
5449                 }
5450             } else
5451                 DEBUG_STATE_pp("push")
5452             );
5453             depth++;
5454             st->locinput = locinput;
5455             newst = st+1;
5456             if (newst >  SLAB_LAST(PL_regmatch_slab))
5457                 newst = S_push_slab(aTHX);
5458             PL_regmatch_state = newst;
5459
5460             locinput = PL_reginput;
5461             nextchr = UCHARAT(locinput);
5462             st = newst;
5463             continue;
5464             /* NOTREACHED */
5465         }
5466     }
5467
5468     /*
5469     * We get here only if there's trouble -- normally "case END" is
5470     * the terminating point.
5471     */
5472     Perl_croak(aTHX_ "corrupted regexp pointers");
5473     /*NOTREACHED*/
5474     sayNO;
5475
5476 yes:
5477     if (yes_state) {
5478         /* we have successfully completed a subexpression, but we must now
5479          * pop to the state marked by yes_state and continue from there */
5480         assert(st != yes_state);
5481 #ifdef DEBUGGING
5482         while (st != yes_state) {
5483             st--;
5484             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5485                 PL_regmatch_slab = PL_regmatch_slab->prev;
5486                 st = SLAB_LAST(PL_regmatch_slab);
5487             }
5488             DEBUG_STATE_r({
5489                 if (no_final) {
5490                     DEBUG_STATE_pp("pop (no final)");
5491                 } else {
5492                     DEBUG_STATE_pp("pop (yes)");
5493                 }
5494             });
5495             depth--;
5496         }
5497 #else
5498         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5499             || yes_state > SLAB_LAST(PL_regmatch_slab))
5500         {
5501             /* not in this slab, pop slab */
5502             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5503             PL_regmatch_slab = PL_regmatch_slab->prev;
5504             st = SLAB_LAST(PL_regmatch_slab);
5505         }
5506         depth -= (st - yes_state);
5507 #endif
5508         st = yes_state;
5509         yes_state = st->u.yes.prev_yes_state;
5510         PL_regmatch_state = st;
5511
5512         if (no_final) {
5513             locinput= st->locinput;
5514             nextchr = UCHARAT(locinput);
5515         }
5516         state_num = st->resume_state + no_final;
5517         goto reenter_switch;
5518     }
5519
5520     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5521                           PL_colors[4], PL_colors[5]));
5522
5523     if (PL_reg_eval_set) {
5524         /* each successfully executed (?{...}) block does the equivalent of
5525          *   local $^R = do {...}
5526          * When popping the save stack, all these locals would be undone;
5527          * bypass this by setting the outermost saved $^R to the latest
5528          * value */
5529         if (oreplsv != GvSV(PL_replgv))
5530             sv_setsv(oreplsv, GvSV(PL_replgv));
5531     }
5532     result = 1;
5533     goto final_exit;
5534
5535 no:
5536     DEBUG_EXECUTE_r(
5537         PerlIO_printf(Perl_debug_log,
5538             "%*s  %sfailed...%s\n",
5539             REPORT_CODE_OFF+depth*2, "",
5540             PL_colors[4], PL_colors[5])
5541         );
5542
5543 no_silent:
5544     if (no_final) {
5545         if (yes_state) {
5546             goto yes;
5547         } else {
5548             goto final_exit;
5549         }
5550     }
5551     if (depth) {
5552         /* there's a previous state to backtrack to */
5553         st--;
5554         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5555             PL_regmatch_slab = PL_regmatch_slab->prev;
5556             st = SLAB_LAST(PL_regmatch_slab);
5557         }
5558         PL_regmatch_state = st;
5559         locinput= st->locinput;
5560         nextchr = UCHARAT(locinput);
5561
5562         DEBUG_STATE_pp("pop");
5563         depth--;
5564         if (yes_state == st)
5565             yes_state = st->u.yes.prev_yes_state;
5566
5567         state_num = st->resume_state + 1; /* failure = success + 1 */
5568         goto reenter_switch;
5569     }
5570     result = 0;
5571
5572   final_exit:
5573     if (rex->intflags & PREGf_VERBARG_SEEN) {
5574         SV *sv_err = get_sv("REGERROR", 1);
5575         SV *sv_mrk = get_sv("REGMARK", 1);
5576         if (result) {
5577             sv_commit = &PL_sv_no;
5578             if (!sv_yes_mark)
5579                 sv_yes_mark = &PL_sv_yes;
5580         } else {
5581             if (!sv_commit)
5582                 sv_commit = &PL_sv_yes;
5583             sv_yes_mark = &PL_sv_no;
5584         }
5585         sv_setsv(sv_err, sv_commit);
5586         sv_setsv(sv_mrk, sv_yes_mark);
5587     }
5588
5589     /* clean up; in particular, free all slabs above current one */
5590     LEAVE_SCOPE(oldsave);
5591
5592     return result;
5593 }
5594
5595 /*
5596  - regrepeat - repeatedly match something simple, report how many
5597  */
5598 /*
5599  * [This routine now assumes that it will only match on things of length 1.
5600  * That was true before, but now we assume scan - reginput is the count,
5601  * rather than incrementing count on every character.  [Er, except utf8.]]
5602  */
5603 STATIC I32
5604 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5605 {
5606     dVAR;
5607     register char *scan;
5608     register I32 c;
5609     register char *loceol = PL_regeol;
5610     register I32 hardcount = 0;
5611     register bool utf8_target = PL_reg_match_utf8;
5612 #ifndef DEBUGGING
5613     PERL_UNUSED_ARG(depth);
5614 #endif
5615
5616     PERL_ARGS_ASSERT_REGREPEAT;
5617
5618     scan = PL_reginput;
5619     if (max == REG_INFTY)
5620         max = I32_MAX;
5621     else if (max < loceol - scan)
5622         loceol = scan + max;
5623     switch (OP(p)) {
5624     case REG_ANY:
5625         if (utf8_target) {
5626             loceol = PL_regeol;
5627             while (scan < loceol && hardcount < max && *scan != '\n') {
5628                 scan += UTF8SKIP(scan);
5629                 hardcount++;
5630             }
5631         } else {
5632             while (scan < loceol && *scan != '\n')
5633                 scan++;
5634         }
5635         break;
5636     case SANY:
5637         if (utf8_target) {
5638             loceol = PL_regeol;
5639             while (scan < loceol && hardcount < max) {
5640                 scan += UTF8SKIP(scan);
5641                 hardcount++;
5642             }
5643         }
5644         else
5645             scan = loceol;
5646         break;
5647     case CANY:
5648         scan = loceol;
5649         break;
5650     case EXACT:         /* length of string is 1 */
5651         c = (U8)*STRING(p);
5652         while (scan < loceol && UCHARAT(scan) == c)
5653             scan++;
5654         break;
5655     case EXACTF:        /* length of string is 1 */
5656         c = (U8)*STRING(p);
5657         while (scan < loceol &&
5658                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5659             scan++;
5660         break;
5661     case EXACTFL:       /* length of string is 1 */
5662         PL_reg_flags |= RF_tainted;
5663         c = (U8)*STRING(p);
5664         while (scan < loceol &&
5665                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5666             scan++;
5667         break;
5668     case ANYOF:
5669         if (utf8_target) {
5670             loceol = PL_regeol;
5671             while (hardcount < max && scan < loceol &&
5672                    reginclass(prog, p, (U8*)scan, 0, utf8_target)) {
5673                 scan += UTF8SKIP(scan);
5674                 hardcount++;
5675             }
5676         } else {
5677             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5678                 scan++;
5679         }
5680         break;
5681     case ALNUM:
5682         if (utf8_target) {
5683             loceol = PL_regeol;
5684             LOAD_UTF8_CHARCLASS_ALNUM();
5685             while (hardcount < max && scan < loceol &&
5686                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
5687                 scan += UTF8SKIP(scan);
5688                 hardcount++;
5689             }
5690         } else {
5691             while (scan < loceol && isALNUM(*scan))
5692                 scan++;
5693         }
5694         break;
5695     case ALNUML:
5696         PL_reg_flags |= RF_tainted;
5697         if (utf8_target) {
5698             loceol = PL_regeol;
5699             while (hardcount < max && scan < loceol &&
5700                    isALNUM_LC_utf8((U8*)scan)) {
5701                 scan += UTF8SKIP(scan);
5702                 hardcount++;
5703             }
5704         } else {
5705             while (scan < loceol && isALNUM_LC(*scan))
5706                 scan++;
5707         }
5708         break;
5709     case NALNUM:
5710         if (utf8_target) {
5711             loceol = PL_regeol;
5712             LOAD_UTF8_CHARCLASS_ALNUM();
5713             while (hardcount < max && scan < loceol &&
5714                    !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
5715                 scan += UTF8SKIP(scan);
5716                 hardcount++;
5717             }
5718         } else {
5719             while (scan < loceol && !isALNUM(*scan))
5720                 scan++;
5721         }
5722         break;
5723     case NALNUML:
5724         PL_reg_flags |= RF_tainted;
5725         if (utf8_target) {
5726             loceol = PL_regeol;
5727             while (hardcount < max && scan < loceol &&
5728                    !isALNUM_LC_utf8((U8*)scan)) {
5729                 scan += UTF8SKIP(scan);
5730                 hardcount++;
5731             }
5732         } else {
5733             while (scan < loceol && !isALNUM_LC(*scan))
5734                 scan++;
5735         }
5736         break;
5737     case SPACE:
5738         if (utf8_target) {
5739             loceol = PL_regeol;
5740             LOAD_UTF8_CHARCLASS_SPACE();
5741             while (hardcount < max && scan < loceol &&
5742                    (*scan == ' ' ||
5743                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
5744                 scan += UTF8SKIP(scan);
5745                 hardcount++;
5746             }
5747         } else {
5748             while (scan < loceol && isSPACE(*scan))
5749                 scan++;
5750         }
5751         break;
5752     case SPACEL:
5753         PL_reg_flags |= RF_tainted;
5754         if (utf8_target) {
5755             loceol = PL_regeol;
5756             while (hardcount < max && scan < loceol &&
5757                    (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5758                 scan += UTF8SKIP(scan);
5759                 hardcount++;
5760             }
5761         } else {
5762             while (scan < loceol && isSPACE_LC(*scan))
5763                 scan++;
5764         }
5765         break;
5766     case NSPACE:
5767         if (utf8_target) {
5768             loceol = PL_regeol;
5769             LOAD_UTF8_CHARCLASS_SPACE();
5770             while (hardcount < max && scan < loceol &&
5771                    !(*scan == ' ' ||
5772                      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
5773                 scan += UTF8SKIP(scan);
5774                 hardcount++;
5775             }
5776         } else {
5777             while (scan < loceol && !isSPACE(*scan))
5778                 scan++;
5779         }
5780         break;
5781     case NSPACEL:
5782         PL_reg_flags |= RF_tainted;
5783         if (utf8_target) {
5784             loceol = PL_regeol;
5785             while (hardcount < max && scan < loceol &&
5786                    !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5787                 scan += UTF8SKIP(scan);
5788                 hardcount++;
5789             }
5790         } else {
5791             while (scan < loceol && !isSPACE_LC(*scan))
5792                 scan++;
5793         }
5794         break;
5795     case DIGIT:
5796         if (utf8_target) {
5797             loceol = PL_regeol;
5798             LOAD_UTF8_CHARCLASS_DIGIT();
5799             while (hardcount < max && scan < loceol &&
5800                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
5801                 scan += UTF8SKIP(scan);
5802                 hardcount++;
5803             }
5804         } else {
5805             while (scan < loceol && isDIGIT(*scan))
5806                 scan++;
5807         }
5808         break;
5809     case NDIGIT:
5810         if (utf8_target) {
5811             loceol = PL_regeol;
5812             LOAD_UTF8_CHARCLASS_DIGIT();
5813             while (hardcount < max && scan < loceol &&
5814                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
5815                 scan += UTF8SKIP(scan);
5816                 hardcount++;
5817             }
5818         } else {
5819             while (scan < loceol && !isDIGIT(*scan))
5820                 scan++;
5821         }
5822     case LNBREAK:
5823         if (utf8_target) {
5824             loceol = PL_regeol;
5825             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5826                 scan += c;
5827                 hardcount++;
5828             }
5829         } else {
5830             /*
5831               LNBREAK can match two latin chars, which is ok,
5832               because we have a null terminated string, but we
5833               have to use hardcount in this situation
5834             */
5835             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
5836                 scan+=c;
5837                 hardcount++;
5838             }
5839         }
5840         break;
5841     case HORIZWS:
5842         if (utf8_target) {
5843             loceol = PL_regeol;
5844             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5845                 scan += c;
5846                 hardcount++;
5847             }
5848         } else {
5849             while (scan < loceol && is_HORIZWS_latin1(scan))
5850                 scan++;
5851         }
5852         break;
5853     case NHORIZWS:
5854         if (utf8_target) {
5855             loceol = PL_regeol;
5856             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5857                 scan += UTF8SKIP(scan);
5858                 hardcount++;
5859             }
5860         } else {
5861             while (scan < loceol && !is_HORIZWS_latin1(scan))
5862                 scan++;
5863
5864         }
5865         break;
5866     case VERTWS:
5867         if (utf8_target) {
5868             loceol = PL_regeol;
5869             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5870                 scan += c;
5871                 hardcount++;
5872             }
5873         } else {
5874             while (scan < loceol && is_VERTWS_latin1(scan))
5875                 scan++;
5876
5877         }
5878         break;
5879     case NVERTWS:
5880         if (utf8_target) {
5881             loceol = PL_regeol;
5882             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5883                 scan += UTF8SKIP(scan);
5884                 hardcount++;
5885             }
5886         } else {
5887             while (scan < loceol && !is_VERTWS_latin1(scan))
5888                 scan++;
5889
5890         }
5891         break;
5892
5893     default:            /* Called on something of 0 width. */
5894         break;          /* So match right here or not at all. */
5895     }
5896
5897     if (hardcount)
5898         c = hardcount;
5899     else
5900         c = scan - PL_reginput;
5901     PL_reginput = scan;
5902
5903     DEBUG_r({
5904         GET_RE_DEBUG_FLAGS_DECL;
5905         DEBUG_EXECUTE_r({
5906             SV * const prop = sv_newmortal();
5907             regprop(prog, prop, p);
5908             PerlIO_printf(Perl_debug_log,
5909                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
5910                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5911         });
5912     });
5913
5914     return(c);
5915 }
5916
5917
5918 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5919 /*
5920 - regclass_swash - prepare the utf8 swash
5921 */
5922
5923 SV *
5924 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5925 {
5926     dVAR;
5927     SV *sw  = NULL;
5928     SV *si  = NULL;
5929     SV *alt = NULL;
5930     RXi_GET_DECL(prog,progi);
5931     const struct reg_data * const data = prog ? progi->data : NULL;
5932
5933     PERL_ARGS_ASSERT_REGCLASS_SWASH;
5934
5935     if (data && data->count) {
5936         const U32 n = ARG(node);
5937
5938         if (data->what[n] == 's') {
5939             SV * const rv = MUTABLE_SV(data->data[n]);
5940             AV * const av = MUTABLE_AV(SvRV(rv));
5941             SV **const ary = AvARRAY(av);
5942             SV **a, **b;
5943
5944             /* See the end of regcomp.c:S_regclass() for
5945              * documentation of these array elements. */
5946
5947             si = *ary;
5948             a  = SvROK(ary[1]) ? &ary[1] : NULL;
5949             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5950
5951             if (a)
5952                 sw = *a;
5953             else if (si && doinit) {
5954                 sw = swash_init("utf8", "", si, 1, 0);
5955                 (void)av_store(av, 1, sw);
5956             }
5957             if (b)
5958                 alt = *b;
5959         }
5960     }
5961
5962     if (listsvp)
5963         *listsvp = si;
5964     if (altsvp)
5965         *altsvp  = alt;
5966
5967     return sw;
5968 }
5969 #endif
5970
5971 /*
5972  - reginclass - determine if a character falls into a character class
5973
5974   The n is the ANYOF regnode, the p is the target string, lenp
5975   is pointer to the maximum length of how far to go in the p
5976   (if the lenp is zero, UTF8SKIP(p) is used),
5977   utf8_target tells whether the target string is in UTF-8.
5978
5979  */
5980
5981 STATIC bool
5982 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool utf8_target)
5983 {
5984     dVAR;
5985     const char flags = ANYOF_FLAGS(n);
5986     bool match = FALSE;
5987     UV c = *p;
5988     STRLEN len = 0;
5989     STRLEN plen;
5990
5991     PERL_ARGS_ASSERT_REGINCLASS;
5992
5993     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
5994         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5995                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
5996                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
5997                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
5998                  * UTF8_ALLOW_FFFF */
5999         if (len == (STRLEN)-1)
6000             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6001     }
6002
6003     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
6004     if (utf8_target || (flags & ANYOF_UNICODE)) {
6005         if (lenp)
6006             *lenp = 0;
6007         if (utf8_target && !ANYOF_RUNTIME(n)) {
6008             if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
6009                 match = TRUE;
6010         }
6011         if (!match && utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256)
6012             match = TRUE;
6013         if (!match) {
6014             AV *av;
6015             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6016
6017             if (sw) {
6018                 U8 * utf8_p;
6019                 if (utf8_target) {
6020                     utf8_p = (U8 *) p;
6021                 } else {
6022                     STRLEN len = 1;
6023                     utf8_p = bytes_to_utf8(p, &len);
6024                 }
6025                 if (swash_fetch(sw, utf8_p, 1))
6026                     match = TRUE;
6027                 else if (flags & ANYOF_FOLD) {
6028                     if (!match && lenp && av) {
6029                         I32 i;
6030                         for (i = 0; i <= av_len(av); i++) {
6031                             SV* const sv = *av_fetch(av, i, FALSE);
6032                             STRLEN len;
6033                             const char * const s = SvPV_const(sv, len);
6034                             if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
6035                                 *lenp = len;
6036                                 match = TRUE;
6037                                 break;
6038                             }
6039                         }
6040                     }
6041                     if (!match) {
6042                         U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
6043
6044                         STRLEN tmplen;
6045                         to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6046                         if (swash_fetch(sw, tmpbuf, 1))
6047                             match = TRUE;
6048                     }
6049                 }
6050
6051                 /* If we allocated a string above, free it */
6052                 if (! utf8_target) Safefree(utf8_p);
6053             }
6054         }
6055         if (match && lenp && *lenp == 0)
6056             *lenp = UNISKIP(NATIVE_TO_UNI(c));
6057     }
6058     if (!match && c < 256) {
6059         if (ANYOF_BITMAP_TEST(n, c))
6060             match = TRUE;
6061         else if (flags & ANYOF_FOLD) {
6062             U8 f;
6063
6064             if (flags & ANYOF_LOCALE) {
6065                 PL_reg_flags |= RF_tainted;
6066                 f = PL_fold_locale[c];
6067             }
6068             else
6069                 f = PL_fold[c];
6070             if (f != c && ANYOF_BITMAP_TEST(n, f))
6071                 match = TRUE;
6072         }
6073
6074         if (!match && (flags & ANYOF_CLASS)) {
6075             PL_reg_flags |= RF_tainted;
6076             if (
6077                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6078                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6079                 (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6080                 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6081                 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6082                 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6083                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6084                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6085                 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6086                 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6087                 (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6088                 (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6089                 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6090                 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6091                 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6092                 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6093                 (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6094                 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6095                 (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6096                 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6097                 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6098                 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6099                 (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6100                 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6101                 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6102                 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6103                 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6104                 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6105                 (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6106                 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6107                 ) /* How's that for a conditional? */
6108             {
6109                 match = TRUE;
6110             }
6111         }
6112     }
6113
6114     return (flags & ANYOF_INVERT) ? !match : match;
6115 }
6116
6117 STATIC U8 *
6118 S_reghop3(U8 *s, I32 off, const U8* lim)
6119 {
6120     dVAR;
6121
6122     PERL_ARGS_ASSERT_REGHOP3;
6123
6124     if (off >= 0) {
6125         while (off-- && s < lim) {
6126             /* XXX could check well-formedness here */
6127             s += UTF8SKIP(s);
6128         }
6129     }
6130     else {
6131         while (off++ && s > lim) {
6132             s--;
6133             if (UTF8_IS_CONTINUED(*s)) {
6134                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6135                     s--;
6136             }
6137             /* XXX could check well-formedness here */
6138         }
6139     }
6140     return s;
6141 }
6142
6143 #ifdef XXX_dmq
6144 /* there are a bunch of places where we use two reghop3's that should
6145    be replaced with this routine. but since thats not done yet
6146    we ifdef it out - dmq
6147 */
6148 STATIC U8 *
6149 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6150 {
6151     dVAR;
6152
6153     PERL_ARGS_ASSERT_REGHOP4;
6154
6155     if (off >= 0) {
6156         while (off-- && s < rlim) {
6157             /* XXX could check well-formedness here */
6158             s += UTF8SKIP(s);
6159         }
6160     }
6161     else {
6162         while (off++ && s > llim) {
6163             s--;
6164             if (UTF8_IS_CONTINUED(*s)) {
6165                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6166                     s--;
6167             }
6168             /* XXX could check well-formedness here */
6169         }
6170     }
6171     return s;
6172 }
6173 #endif
6174
6175 STATIC U8 *
6176 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6177 {
6178     dVAR;
6179
6180     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6181
6182     if (off >= 0) {
6183         while (off-- && s < lim) {
6184             /* XXX could check well-formedness here */
6185             s += UTF8SKIP(s);
6186         }
6187         if (off >= 0)
6188             return NULL;
6189     }
6190     else {
6191         while (off++ && s > lim) {
6192             s--;
6193             if (UTF8_IS_CONTINUED(*s)) {
6194                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6195                     s--;
6196             }
6197             /* XXX could check well-formedness here */
6198         }
6199         if (off <= 0)
6200             return NULL;
6201     }
6202     return s;
6203 }
6204
6205 static void
6206 restore_pos(pTHX_ void *arg)
6207 {
6208     dVAR;
6209     regexp * const rex = (regexp *)arg;
6210     if (PL_reg_eval_set) {
6211         if (PL_reg_oldsaved) {
6212             rex->subbeg = PL_reg_oldsaved;
6213             rex->sublen = PL_reg_oldsavedlen;
6214 #ifdef PERL_OLD_COPY_ON_WRITE
6215             rex->saved_copy = PL_nrs;
6216 #endif
6217             RXp_MATCH_COPIED_on(rex);
6218         }
6219         PL_reg_magic->mg_len = PL_reg_oldpos;
6220         PL_reg_eval_set = 0;
6221         PL_curpm = PL_reg_oldcurpm;
6222     }
6223 }
6224
6225 STATIC void
6226 S_to_utf8_substr(pTHX_ register regexp *prog)
6227 {
6228     int i = 1;
6229
6230     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6231
6232     do {
6233         if (prog->substrs->data[i].substr
6234             && !prog->substrs->data[i].utf8_substr) {
6235             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6236             prog->substrs->data[i].utf8_substr = sv;
6237             sv_utf8_upgrade(sv);
6238             if (SvVALID(prog->substrs->data[i].substr)) {
6239                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6240                 if (flags & FBMcf_TAIL) {
6241                     /* Trim the trailing \n that fbm_compile added last
6242                        time.  */
6243                     SvCUR_set(sv, SvCUR(sv) - 1);
6244                     /* Whilst this makes the SV technically "invalid" (as its
6245                        buffer is no longer followed by "\0") when fbm_compile()
6246                        adds the "\n" back, a "\0" is restored.  */
6247                 }
6248                 fbm_compile(sv, flags);
6249             }
6250             if (prog->substrs->data[i].substr == prog->check_substr)
6251                 prog->check_utf8 = sv;
6252         }
6253     } while (i--);
6254 }
6255
6256 STATIC void
6257 S_to_byte_substr(pTHX_ register regexp *prog)
6258 {
6259     dVAR;
6260     int i = 1;
6261
6262     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6263
6264     do {
6265         if (prog->substrs->data[i].utf8_substr
6266             && !prog->substrs->data[i].substr) {
6267             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6268             if (sv_utf8_downgrade(sv, TRUE)) {
6269                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6270                     const U8 flags
6271                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6272                     if (flags & FBMcf_TAIL) {
6273                         /* Trim the trailing \n that fbm_compile added last
6274                            time.  */
6275                         SvCUR_set(sv, SvCUR(sv) - 1);
6276                     }
6277                     fbm_compile(sv, flags);
6278                 }
6279             } else {
6280                 SvREFCNT_dec(sv);
6281                 sv = &PL_sv_undef;
6282             }
6283             prog->substrs->data[i].substr = sv;
6284             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6285                 prog->check_substr = sv;
6286         }
6287     } while (i--);
6288 }
6289
6290 /*
6291  * Local variables:
6292  * c-indentation-style: bsd
6293  * c-basic-offset: 4
6294  * indent-tabs-mode: t
6295  * End:
6296  *
6297  * ex: set ts=8 sts=4 sw=4 noet:
6298  */