src/5012001/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1               /* tainted information used? */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
  89 #define UTF ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
  99
 100 /*
 101  * Forwards.
 102  */
 103
 104 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
 105 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 106
 107 #define HOPc(pos,off) \
 108         (char *)(PL_reg_match_utf8 \
 109             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 110             : (U8*)(pos + off))
 111 #define HOPBACKc(pos, off) \
 112         (char*)(PL_reg_match_utf8\
 113             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 114             : (pos - off >= PL_bostr)           \
 115                 ? (U8*)pos - off                \
 116                 : NULL)
 117
 118 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 119 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 120
 121 /* these are unrolled below in the CCC_TRY_XXX defined */
 122 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 123     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 124
 125 /* Doesn't do an assert to verify that is correct */
 126 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 127     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 128
 129 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 130 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 131 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 132
 133 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 134         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 135         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 136         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 137             * assert should likely and hopefully fail on an EBCDIC machine */ \
 138         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 139                                                                             \
 140         /* No asserts are done for these, in case called on an early        \
 141             * Unicode version in which they map to nothing */               \
 142         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 143         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 144         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 149
 150 /*
 151    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 152    so that it is possible to override the option here without having to
 153    rebuild the entire core. as we are required to do if we change regcomp.h
 154    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 155 */
 156 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 157 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 158 #endif
 159
 160 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 161 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 162 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 163 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 164 #define RE_utf8_perl_word   PL_utf8_alnum
 165 #define RE_utf8_perl_space  PL_utf8_space
 166 #define RE_utf8_posix_digit PL_utf8_digit
 167 #define perl_word  alnum
 168 #define perl_space space
 169 #define posix_digit digit
 170 #else
 171 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 172 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 173 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 174 #define RE_utf8_perl_word   PL_utf8_perl_word
 175 #define RE_utf8_perl_space  PL_utf8_perl_space
 176 #define RE_utf8_posix_digit PL_utf8_posix_digit
 177 #endif
 178
 179
 180 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                          \
 181         case NAMEL:                                                              \
 182             PL_reg_flags |= RF_tainted;                                                 \
 183             /* FALL THROUGH */                                                          \
 184         case NAME:                                                                     \
 185             if (!nextchr)                                                               \
 186                 sayNO;                                                                  \
 187             if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 188                 if (!CAT2(PL_utf8_,CLASS)) {                                            \
 189                     bool ok;                                                            \
 190                     ENTER;                                                              \
 191                     save_re_context();                                                  \
 192                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 193                     assert(ok);                                                         \
 194                     LEAVE;                                                              \
 195                 }                                                                       \
 196                 if (!(OP(scan) == NAME                                                  \
 197                     ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)   \
 198                     : LCFUNC_utf8((U8*)locinput)))                                      \
 199                 {                                                                       \
 200                     sayNO;                                                              \
 201                 }                                                                       \
 202                 locinput += PL_utf8skip[nextchr];                                       \
 203                 nextchr = UCHARAT(locinput);                                            \
 204                 break;                                                                  \
 205             }                                                                           \
 206             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                  \
 207                 sayNO;                                                                  \
 208             nextchr = UCHARAT(++locinput);                                              \
 209             break
 210
 211 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                        \
 212         case NAMEL:                                                              \
 213             PL_reg_flags |= RF_tainted;                                                 \
 214             /* FALL THROUGH */                                                          \
 215         case NAME :                                                                     \
 216             if (!nextchr && locinput >= PL_regeol)                                      \
 217                 sayNO;                                                                  \
 218             if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 219                 if (!CAT2(PL_utf8_,CLASS)) {                                            \
 220                     bool ok;                                                            \
 221                     ENTER;                                                              \
 222                     save_re_context();                                                  \
 223                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 224                     assert(ok);                                                         \
 225                     LEAVE;                                                              \
 226                 }                                                                       \
 227                 if ((OP(scan) == NAME                                                  \
 228                     ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)    \
 229                     : LCFUNC_utf8((U8*)locinput)))                                      \
 230                 {                                                                       \
 231                     sayNO;                                                              \
 232                 }                                                                       \
 233                 locinput += PL_utf8skip[nextchr];                                       \
 234                 nextchr = UCHARAT(locinput);                                            \
 235                 break;                                                                  \
 236             }                                                                           \
 237             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                   \
 238                 sayNO;                                                                  \
 239             nextchr = UCHARAT(++locinput);                                              \
 240             break
 241
 242
 243
 244
 245
 246 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 247
 248 /* for use after a quantifier and before an EXACT-like node -- japhy */
 249 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
 250 #define JUMPABLE(rn) (      \
 251     OP(rn) == OPEN ||       \
 252     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 253     OP(rn) == EVAL ||   \
 254     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 255     OP(rn) == PLUS || OP(rn) == MINMOD || \
 256     OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
 257     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 258 )
 259 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 260
 261 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 262
 263 #if 0
 264 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 265    we don't need this definition. */
 266 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 267 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
 268 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 269
 270 #else
 271 /* ... so we use this as its faster. */
 272 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 273 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 274 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 275
 276 #endif
 277
 278 /*
 279   Search for mandatory following text node; for lookahead, the text must
 280   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 281 */
 282 #define FIND_NEXT_IMPT(rn) STMT_START { \
 283     while (JUMPABLE(rn)) { \
 284         const OPCODE type = OP(rn); \
 285         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 286             rn = NEXTOPER(NEXTOPER(rn)); \
 287         else if (type == PLUS) \
 288             rn = NEXTOPER(rn); \
 289         else if (type == IFMATCH) \
 290             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 291         else rn += NEXT_OFF(rn); \
 292     } \
 293 } STMT_END
 294
 295
 296 static void restore_pos(pTHX_ void *arg);
 297
 298 STATIC CHECKPOINT
 299 S_regcppush(pTHX_ I32 parenfloor)
 300 {
 301     dVAR;
 302     const int retval = PL_savestack_ix;
 303 #define REGCP_PAREN_ELEMS 4
 304     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 305     int p;
 306     GET_RE_DEBUG_FLAGS_DECL;
 307
 308     if (paren_elems_to_push < 0)
 309         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 310
 311 #define REGCP_OTHER_ELEMS 7
 312     SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
 313
 314     for (p = PL_regsize; p > parenfloor; p--) {
 315 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 316         SSPUSHINT(PL_regoffs[p].end);
 317         SSPUSHINT(PL_regoffs[p].start);
 318         SSPUSHPTR(PL_reg_start_tmp[p]);
 319         SSPUSHINT(p);
 320         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 321           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 322                       (UV)p, (IV)PL_regoffs[p].start,
 323                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 324                       (IV)PL_regoffs[p].end
 325         ));
 326     }
 327 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 328     SSPUSHPTR(PL_regoffs);
 329     SSPUSHINT(PL_regsize);
 330     SSPUSHINT(*PL_reglastparen);
 331     SSPUSHINT(*PL_reglastcloseparen);
 332     SSPUSHPTR(PL_reginput);
 333 #define REGCP_FRAME_ELEMS 2
 334 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
 335  * are needed for the regexp context stack bookkeeping. */
 336     SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
 337     SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
 338
 339     return retval;
 340 }
 341
 342 /* These are needed since we do not localize EVAL nodes: */
 343 #define REGCP_SET(cp)                                           \
 344     DEBUG_STATE_r(                                              \
 345             PerlIO_printf(Perl_debug_log,                       \
 346                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 347                 (IV)PL_savestack_ix));                          \
 348     cp = PL_savestack_ix
 349
 350 #define REGCP_UNWIND(cp)                                        \
 351     DEBUG_STATE_r(                                              \
 352         if (cp != PL_savestack_ix)                              \
 353             PerlIO_printf(Perl_debug_log,                       \
 354                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 355                 (IV)(cp), (IV)PL_savestack_ix));                \
 356     regcpblow(cp)
 357
 358 STATIC char *
 359 S_regcppop(pTHX_ const regexp *rex)
 360 {
 361     dVAR;
 362     U32 i;
 363     char *input;
 364     GET_RE_DEBUG_FLAGS_DECL;
 365
 366     PERL_ARGS_ASSERT_REGCPPOP;
 367
 368     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 369     i = SSPOPINT;
 370     assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 371     i = SSPOPINT; /* Parentheses elements to pop. */
 372     input = (char *) SSPOPPTR;
 373     *PL_reglastcloseparen = SSPOPINT;
 374     *PL_reglastparen = SSPOPINT;
 375     PL_regsize = SSPOPINT;
 376     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 377
 378
 379     /* Now restore the parentheses context. */
 380     for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
 381          i > 0; i -= REGCP_PAREN_ELEMS) {
 382         I32 tmps;
 383         U32 paren = (U32)SSPOPINT;
 384         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 385         PL_regoffs[paren].start = SSPOPINT;
 386         tmps = SSPOPINT;
 387         if (paren <= *PL_reglastparen)
 388             PL_regoffs[paren].end = tmps;
 389         DEBUG_BUFFERS_r(
 390             PerlIO_printf(Perl_debug_log,
 391                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 392                           (UV)paren, (IV)PL_regoffs[paren].start,
 393                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 394                           (IV)PL_regoffs[paren].end,
 395                           (paren > *PL_reglastparen ? "(no)" : ""));
 396         );
 397     }
 398     DEBUG_BUFFERS_r(
 399         if (*PL_reglastparen + 1 <= rex->nparens) {
 400             PerlIO_printf(Perl_debug_log,
 401                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 402                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 403         }
 404     );
 405 #if 1
 406     /* It would seem that the similar code in regtry()
 407      * already takes care of this, and in fact it is in
 408      * a better location to since this code can #if 0-ed out
 409      * but the code in regtry() is needed or otherwise tests
 410      * requiring null fields (pat.t#187 and split.t#{13,14}
 411      * (as of patchlevel 7877)  will fail.  Then again,
 412      * this code seems to be necessary or otherwise
 413      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 414      * --jhi updated by dapm */
 415     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 416         if (i > PL_regsize)
 417             PL_regoffs[i].start = -1;
 418         PL_regoffs[i].end = -1;
 419     }
 420 #endif
 421     return input;
 422 }
 423
 424 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 425
 426 /*
 427  * pregexec and friends
 428  */
 429
 430 #ifndef PERL_IN_XSUB_RE
 431 /*
 432  - pregexec - match a regexp against a string
 433  */
 434 I32
 435 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 436          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 437 /* strend: pointer to null at end of string */
 438 /* strbeg: real beginning of string */
 439 /* minend: end of match must be >=minend after stringarg. */
 440 /* nosave: For optimizations. */
 441 {
 442     PERL_ARGS_ASSERT_PREGEXEC;
 443
 444     return
 445         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 446                       nosave ? 0 : REXEC_COPY_STR);
 447 }
 448 #endif
 449
 450 /*
 451  * Need to implement the following flags for reg_anch:
 452  *
 453  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 454  * USE_INTUIT_ML
 455  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 456  * INTUIT_AUTORITATIVE_ML
 457  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 458  * INTUIT_ONCE_ML
 459  *
 460  * Another flag for this function: SECOND_TIME (so that float substrs
 461  * with giant delta may be not rechecked).
 462  */
 463
 464 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 465
 466 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 467    Otherwise, only SvCUR(sv) is used to get strbeg. */
 468
 469 /* XXXX We assume that strpos is strbeg unless sv. */
 470
 471 /* XXXX Some places assume that there is a fixed substring.
 472         An update may be needed if optimizer marks as "INTUITable"
 473         RExen without fixed substrings.  Similarly, it is assumed that
 474         lengths of all the strings are no more than minlen, thus they
 475         cannot come from lookahead.
 476         (Or minlen should take into account lookahead.)
 477   NOTE: Some of this comment is not correct. minlen does now take account
 478   of lookahead/behind. Further research is required. -- demerphq
 479
 480 */
 481
 482 /* A failure to find a constant substring means that there is no need to make
 483    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 484    finding a substring too deep into the string means that less calls to
 485    regtry() should be needed.
 486
 487    REx compiler's optimizer found 4 possible hints:
 488         a) Anchored substring;
 489         b) Fixed substring;
 490         c) Whether we are anchored (beginning-of-line or \G);
 491         d) First node (of those at offset 0) which may distingush positions;
 492    We use a)b)d) and multiline-part of c), and try to find a position in the
 493    string which does not contradict any of them.
 494  */
 495
 496 /* Most of decisions we do here should have been done at compile time.
 497    The nodes of the REx which we used for the search should have been
 498    deleted from the finite automaton. */
 499
 500 char *
 501 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 502                      char *strend, const U32 flags, re_scream_pos_data *data)
 503 {
 504     dVAR;
 505     struct regexp *const prog = (struct regexp *)SvANY(rx);
 506     register I32 start_shift = 0;
 507     /* Should be nonnegative! */
 508     register I32 end_shift   = 0;
 509     register char *s;
 510     register SV *check;
 511     char *strbeg;
 512     char *t;
 513     const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 514     I32 ml_anch;
 515     register char *other_last = NULL;   /* other substr checked before this */
 516     char *check_at = NULL;              /* check substr found at this pos */
 517     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 518     RXi_GET_DECL(prog,progi);
 519 #ifdef DEBUGGING
 520     const char * const i_strpos = strpos;
 521 #endif
 522     GET_RE_DEBUG_FLAGS_DECL;
 523
 524     PERL_ARGS_ASSERT_RE_INTUIT_START;
 525
 526     RX_MATCH_UTF8_set(rx,do_utf8);
 527
 528     if (RX_UTF8(rx)) {
 529         PL_reg_flags |= RF_utf8;
 530     }
 531     DEBUG_EXECUTE_r(
 532         debug_start_match(rx, do_utf8, strpos, strend,
 533             sv ? "Guessing start of match in sv for"
 534                : "Guessing start of match in string for");
 535               );
 536
 537     /* CHR_DIST() would be more correct here but it makes things slow. */
 538     if (prog->minlen > strend - strpos) {
 539         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 540                               "String too short... [re_intuit_start]\n"));
 541         goto fail;
 542     }
 543
 544     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 545     PL_regeol = strend;
 546     if (do_utf8) {
 547         if (!prog->check_utf8 && prog->check_substr)
 548             to_utf8_substr(prog);
 549         check = prog->check_utf8;
 550     } else {
 551         if (!prog->check_substr && prog->check_utf8)
 552             to_byte_substr(prog);
 553         check = prog->check_substr;
 554     }
 555     if (check == &PL_sv_undef) {
 556         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 557                 "Non-utf8 string cannot match utf8 check string\n"));
 558         goto fail;
 559     }
 560     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 561         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 562                      || ( (prog->extflags & RXf_ANCH_BOL)
 563                           && !multiline ) );    /* Check after \n? */
 564
 565         if (!ml_anch) {
 566           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 567                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 568                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 569                && sv && !SvROK(sv)
 570                && (strpos != strbeg)) {
 571               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 572               goto fail;
 573           }
 574           if (prog->check_offset_min == prog->check_offset_max &&
 575               !(prog->extflags & RXf_CANY_SEEN)) {
 576             /* Substring at constant offset from beg-of-str... */
 577             I32 slen;
 578
 579             s = HOP3c(strpos, prog->check_offset_min, strend);
 580
 581             if (SvTAIL(check)) {
 582                 slen = SvCUR(check);    /* >= 1 */
 583
 584                 if ( strend - s > slen || strend - s < slen - 1
 585                      || (strend - s == slen && strend[-1] != '\n')) {
 586                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 587                     goto fail_finish;
 588                 }
 589                 /* Now should match s[0..slen-2] */
 590                 slen--;
 591                 if (slen && (*SvPVX_const(check) != *s
 592                              || (slen > 1
 593                                  && memNE(SvPVX_const(check), s, slen)))) {
 594                   report_neq:
 595                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 596                     goto fail_finish;
 597                 }
 598             }
 599             else if (*SvPVX_const(check) != *s
 600                      || ((slen = SvCUR(check)) > 1
 601                          && memNE(SvPVX_const(check), s, slen)))
 602                 goto report_neq;
 603             check_at = s;
 604             goto success_at_start;
 605           }
 606         }
 607         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 608         s = strpos;
 609         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 610         end_shift = prog->check_end_shift;
 611
 612         if (!ml_anch) {
 613             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 614                                          - (SvTAIL(check) != 0);
 615             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 616
 617             if (end_shift < eshift)
 618                 end_shift = eshift;
 619         }
 620     }
 621     else {                              /* Can match at random position */
 622         ml_anch = 0;
 623         s = strpos;
 624         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 625         end_shift = prog->check_end_shift;
 626
 627         /* end shift should be non negative here */
 628     }
 629
 630 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 631     if (end_shift < 0)
 632         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 633                    (IV)end_shift, RX_PRECOMP(prog));
 634 #endif
 635
 636   restart:
 637     /* Find a possible match in the region s..strend by looking for
 638        the "check" substring in the region corrected by start/end_shift. */
 639
 640     {
 641         I32 srch_start_shift = start_shift;
 642         I32 srch_end_shift = end_shift;
 643         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 644             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 645             srch_start_shift = strbeg - s;
 646         }
 647     DEBUG_OPTIMISE_MORE_r({
 648         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 649             (IV)prog->check_offset_min,
 650             (IV)srch_start_shift,
 651             (IV)srch_end_shift,
 652             (IV)prog->check_end_shift);
 653     });
 654
 655     if (flags & REXEC_SCREAM) {
 656         I32 p = -1;                     /* Internal iterator of scream. */
 657         I32 * const pp = data ? data->scream_pos : &p;
 658
 659         if (PL_screamfirst[BmRARE(check)] >= 0
 660             || ( BmRARE(check) == '\n'
 661                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 662                  && SvTAIL(check) ))
 663             s = screaminstr(sv, check,
 664                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 665         else
 666             goto fail_finish;
 667         /* we may be pointing at the wrong string */
 668         if (s && RXp_MATCH_COPIED(prog))
 669             s = strbeg + (s - SvPVX_const(sv));
 670         if (data)
 671             *data->scream_olds = s;
 672     }
 673     else {
 674         U8* start_point;
 675         U8* end_point;
 676         if (prog->extflags & RXf_CANY_SEEN) {
 677             start_point= (U8*)(s + srch_start_shift);
 678             end_point= (U8*)(strend - srch_end_shift);
 679         } else {
 680             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 681             end_point= HOP3(strend, -srch_end_shift, strbeg);
 682         }
 683         DEBUG_OPTIMISE_MORE_r({
 684             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 685                 (int)(end_point - start_point),
 686                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 687                 start_point);
 688         });
 689
 690         s = fbm_instr( start_point, end_point,
 691                       check, multiline ? FBMrf_MULTILINE : 0);
 692     }
 693     }
 694     /* Update the count-of-usability, remove useless subpatterns,
 695         unshift s.  */
 696
 697     DEBUG_EXECUTE_r({
 698         RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 699             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 700         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 701                           (s ? "Found" : "Did not find"),
 702             (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
 703                 ? "anchored" : "floating"),
 704             quoted,
 705             RE_SV_TAIL(check),
 706             (s ? " at offset " : "...\n") );
 707     });
 708
 709     if (!s)
 710         goto fail_finish;
 711     /* Finish the diagnostic message */
 712     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 713
 714     /* XXX dmq: first branch is for positive lookbehind...
 715        Our check string is offset from the beginning of the pattern.
 716        So we need to do any stclass tests offset forward from that
 717        point. I think. :-(
 718      */
 719
 720
 721
 722     check_at=s;
 723
 724
 725     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 726        Start with the other substr.
 727        XXXX no SCREAM optimization yet - and a very coarse implementation
 728        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 729                 *always* match.  Probably should be marked during compile...
 730        Probably it is right to do no SCREAM here...
 731      */
 732
 733     if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
 734                 : (prog->float_substr && prog->anchored_substr))
 735     {
 736         /* Take into account the "other" substring. */
 737         /* XXXX May be hopelessly wrong for UTF... */
 738         if (!other_last)
 739             other_last = strpos;
 740         if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
 741           do_other_anchored:
 742             {
 743                 char * const last = HOP3c(s, -start_shift, strbeg);
 744                 char *last1, *last2;
 745                 char * const saved_s = s;
 746                 SV* must;
 747
 748                 t = s - prog->check_offset_max;
 749                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 750                     && (!do_utf8
 751                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 752                             && t > strpos)))
 753                     NOOP;
 754                 else
 755                     t = strpos;
 756                 t = HOP3c(t, prog->anchored_offset, strend);
 757                 if (t < other_last)     /* These positions already checked */
 758                     t = other_last;
 759                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 760                 if (last < last1)
 761                     last1 = last;
 762                 /* XXXX It is not documented what units *_offsets are in.
 763                    We assume bytes, but this is clearly wrong.
 764                    Meaning this code needs to be carefully reviewed for errors.
 765                    dmq.
 766                   */
 767
 768                 /* On end-of-str: see comment below. */
 769                 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
 770                 if (must == &PL_sv_undef) {
 771                     s = (char*)NULL;
 772                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 773                 }
 774                 else
 775                     s = fbm_instr(
 776                         (unsigned char*)t,
 777                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 778                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 779                         must,
 780                         multiline ? FBMrf_MULTILINE : 0
 781                     );
 782                 DEBUG_EXECUTE_r({
 783                     RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 784                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 785                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 786                         (s ? "Found" : "Contradicts"),
 787                         quoted, RE_SV_TAIL(must));
 788                 });
 789
 790
 791                 if (!s) {
 792                     if (last1 >= last2) {
 793                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 794                                                 ", giving up...\n"));
 795                         goto fail_finish;
 796                     }
 797                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 798                         ", trying floating at offset %ld...\n",
 799                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 800                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 801                     s = HOP3c(last, 1, strend);
 802                     goto restart;
 803                 }
 804                 else {
 805                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 806                           (long)(s - i_strpos)));
 807                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 808                     other_last = HOP3c(s, 1, strend);
 809                     s = saved_s;
 810                     if (t == strpos)
 811                         goto try_at_start;
 812                     goto try_at_offset;
 813                 }
 814             }
 815         }
 816         else {          /* Take into account the floating substring. */
 817             char *last, *last1;
 818             char * const saved_s = s;
 819             SV* must;
 820
 821             t = HOP3c(s, -start_shift, strbeg);
 822             last1 = last =
 823                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 824             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 825                 last = HOP3c(t, prog->float_max_offset, strend);
 826             s = HOP3c(t, prog->float_min_offset, strend);
 827             if (s < other_last)
 828                 s = other_last;
 829  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 830             must = do_utf8 ? prog->float_utf8 : prog->float_substr;
 831             /* fbm_instr() takes into account exact value of end-of-str
 832                if the check is SvTAIL(ed).  Since false positives are OK,
 833                and end-of-str is not later than strend we are OK. */
 834             if (must == &PL_sv_undef) {
 835                 s = (char*)NULL;
 836                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 837             }
 838             else
 839                 s = fbm_instr((unsigned char*)s,
 840                               (unsigned char*)last + SvCUR(must)
 841                                   - (SvTAIL(must)!=0),
 842                               must, multiline ? FBMrf_MULTILINE : 0);
 843             DEBUG_EXECUTE_r({
 844                 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 845                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 846                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 847                     (s ? "Found" : "Contradicts"),
 848                     quoted, RE_SV_TAIL(must));
 849             });
 850             if (!s) {
 851                 if (last1 == last) {
 852                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 853                                             ", giving up...\n"));
 854                     goto fail_finish;
 855                 }
 856                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 857                     ", trying anchored starting at offset %ld...\n",
 858                     (long)(saved_s + 1 - i_strpos)));
 859                 other_last = last;
 860                 s = HOP3c(t, 1, strend);
 861                 goto restart;
 862             }
 863             else {
 864                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 865                       (long)(s - i_strpos)));
 866                 other_last = s; /* Fix this later. --Hugo */
 867                 s = saved_s;
 868                 if (t == strpos)
 869                     goto try_at_start;
 870                 goto try_at_offset;
 871             }
 872         }
 873     }
 874
 875
 876     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 877
 878     DEBUG_OPTIMISE_MORE_r(
 879         PerlIO_printf(Perl_debug_log,
 880             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 881             (IV)prog->check_offset_min,
 882             (IV)prog->check_offset_max,
 883             (IV)(s-strpos),
 884             (IV)(t-strpos),
 885             (IV)(t-s),
 886             (IV)(strend-strpos)
 887         )
 888     );
 889
 890     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 891         && (!do_utf8
 892             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 893                  && t > strpos)))
 894     {
 895         /* Fixed substring is found far enough so that the match
 896            cannot start at strpos. */
 897       try_at_offset:
 898         if (ml_anch && t[-1] != '\n') {
 899             /* Eventually fbm_*() should handle this, but often
 900                anchored_offset is not 0, so this check will not be wasted. */
 901             /* XXXX In the code below we prefer to look for "^" even in
 902                presence of anchored substrings.  And we search even
 903                beyond the found float position.  These pessimizations
 904                are historical artefacts only.  */
 905           find_anchor:
 906             while (t < strend - prog->minlen) {
 907                 if (*t == '\n') {
 908                     if (t < check_at - prog->check_offset_min) {
 909                         if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
 910                             /* Since we moved from the found position,
 911                                we definitely contradict the found anchored
 912                                substr.  Due to the above check we do not
 913                                contradict "check" substr.
 914                                Thus we can arrive here only if check substr
 915                                is float.  Redo checking for "other"=="fixed".
 916                              */
 917                             strpos = t + 1;
 918                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 919                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 920                             goto do_other_anchored;
 921                         }
 922                         /* We don't contradict the found floating substring. */
 923                         /* XXXX Why not check for STCLASS? */
 924                         s = t + 1;
 925                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 926                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 927                         goto set_useful;
 928                     }
 929                     /* Position contradicts check-string */
 930                     /* XXXX probably better to look for check-string
 931                        than for "\n", so one should lower the limit for t? */
 932                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 933                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 934                     other_last = strpos = s = t + 1;
 935                     goto restart;
 936                 }
 937                 t++;
 938             }
 939             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 940                         PL_colors[0], PL_colors[1]));
 941             goto fail_finish;
 942         }
 943         else {
 944             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 945                         PL_colors[0], PL_colors[1]));
 946         }
 947         s = t;
 948       set_useful:
 949         ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr);    /* hooray/5 */
 950     }
 951     else {
 952         /* The found string does not prohibit matching at strpos,
 953            - no optimization of calling REx engine can be performed,
 954            unless it was an MBOL and we are not after MBOL,
 955            or a future STCLASS check will fail this. */
 956       try_at_start:
 957         /* Even in this situation we may use MBOL flag if strpos is offset
 958            wrt the start of the string. */
 959         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 960             && (strpos != strbeg) && strpos[-1] != '\n'
 961             /* May be due to an implicit anchor of m{.*foo}  */
 962             && !(prog->intflags & PREGf_IMPLICIT))
 963         {
 964             t = strpos;
 965             goto find_anchor;
 966         }
 967         DEBUG_EXECUTE_r( if (ml_anch)
 968             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
 969                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
 970         );
 971       success_at_start:
 972         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
 973             && (do_utf8 ? (
 974                 prog->check_utf8                /* Could be deleted already */
 975                 && --BmUSEFUL(prog->check_utf8) < 0
 976                 && (prog->check_utf8 == prog->float_utf8)
 977             ) : (
 978                 prog->check_substr              /* Could be deleted already */
 979                 && --BmUSEFUL(prog->check_substr) < 0
 980                 && (prog->check_substr == prog->float_substr)
 981             )))
 982         {
 983             /* If flags & SOMETHING - do not do it many times on the same match */
 984             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
 985             /* XXX Does the destruction order has to change with do_utf8? */
 986             SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
 987             SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
 988             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
 989             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
 990             check = NULL;                       /* abort */
 991             s = strpos;
 992             /* XXXX This is a remnant of the old implementation.  It
 993                     looks wasteful, since now INTUIT can use many
 994                     other heuristics. */
 995             prog->extflags &= ~RXf_USE_INTUIT;
 996         }
 997         else
 998             s = strpos;
 999     }
1000
1001     /* Last resort... */
1002     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1003     /* trie stclasses are too expensive to use here, we are better off to
1004        leave it to regmatch itself */
1005     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1006         /* minlen == 0 is possible if regstclass is \b or \B,
1007            and the fixed substr is ''$.
1008            Since minlen is already taken into account, s+1 is before strend;
1009            accidentally, minlen >= 1 guaranties no false positives at s + 1
1010            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1011            regstclass does not come from lookahead...  */
1012         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1013            This leaves EXACTF only, which is dealt with in find_byclass().  */
1014         const U8* const str = (U8*)STRING(progi->regstclass);
1015         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1016                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1017                     : 1);
1018         char * endpos;
1019         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1020             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1021         else if (prog->float_substr || prog->float_utf8)
1022             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1023         else
1024             endpos= strend;
1025
1026         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1027                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1028
1029         t = s;
1030         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1031         if (!s) {
1032 #ifdef DEBUGGING
1033             const char *what = NULL;
1034 #endif
1035             if (endpos == strend) {
1036                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1037                                 "Could not match STCLASS...\n") );
1038                 goto fail;
1039             }
1040             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1041                                    "This position contradicts STCLASS...\n") );
1042             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1043                 goto fail;
1044             /* Contradict one of substrings */
1045             if (prog->anchored_substr || prog->anchored_utf8) {
1046                 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1047                     DEBUG_EXECUTE_r( what = "anchored" );
1048                   hop_and_restart:
1049                     s = HOP3c(t, 1, strend);
1050                     if (s + start_shift + end_shift > strend) {
1051                         /* XXXX Should be taken into account earlier? */
1052                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1053                                                "Could not match STCLASS...\n") );
1054                         goto fail;
1055                     }
1056                     if (!check)
1057                         goto giveup;
1058                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1059                                 "Looking for %s substr starting at offset %ld...\n",
1060                                  what, (long)(s + start_shift - i_strpos)) );
1061                     goto restart;
1062                 }
1063                 /* Have both, check_string is floating */
1064                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1065                     goto retry_floating_check;
1066                 /* Recheck anchored substring, but not floating... */
1067                 s = check_at;
1068                 if (!check)
1069                     goto giveup;
1070                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1071                           "Looking for anchored substr starting at offset %ld...\n",
1072                           (long)(other_last - i_strpos)) );
1073                 goto do_other_anchored;
1074             }
1075             /* Another way we could have checked stclass at the
1076                current position only: */
1077             if (ml_anch) {
1078                 s = t = t + 1;
1079                 if (!check)
1080                     goto giveup;
1081                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1082                           "Looking for /%s^%s/m starting at offset %ld...\n",
1083                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1084                 goto try_at_offset;
1085             }
1086             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))     /* Could have been deleted */
1087                 goto fail;
1088             /* Check is floating subtring. */
1089           retry_floating_check:
1090             t = check_at - start_shift;
1091             DEBUG_EXECUTE_r( what = "floating" );
1092             goto hop_and_restart;
1093         }
1094         if (t != s) {
1095             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1096                         "By STCLASS: moving %ld --> %ld\n",
1097                                   (long)(t - i_strpos), (long)(s - i_strpos))
1098                    );
1099         }
1100         else {
1101             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1102                                   "Does not contradict STCLASS...\n");
1103                    );
1104         }
1105     }
1106   giveup:
1107     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1108                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1109                           PL_colors[5], (long)(s - i_strpos)) );
1110     return s;
1111
1112   fail_finish:                          /* Substring not found */
1113     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1114         BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1115   fail:
1116     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1117                           PL_colors[4], PL_colors[5]));
1118     return NULL;
1119 }
1120
1121 #define DECL_TRIE_TYPE(scan) \
1122     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1123                     trie_type = (scan->flags != EXACT) \
1124                               ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1125                               : (do_utf8 ? trie_utf8 : trie_plain)
1126
1127 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1128 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1129     switch (trie_type) {                                                    \
1130     case trie_utf8_fold:                                                    \
1131         if ( foldlen>0 ) {                                                  \
1132             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1133             foldlen -= len;                                                 \
1134             uscan += len;                                                   \
1135             len=0;                                                          \
1136         } else {                                                            \
1137             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1138             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1139             foldlen -= UNISKIP( uvc );                                      \
1140             uscan = foldbuf + UNISKIP( uvc );                               \
1141         }                                                                   \
1142         break;                                                              \
1143     case trie_latin_utf8_fold:                                              \
1144         if ( foldlen>0 ) {                                                  \
1145             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1146             foldlen -= len;                                                 \
1147             uscan += len;                                                   \
1148             len=0;                                                          \
1149         } else {                                                            \
1150             len = 1;                                                        \
1151             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1152             foldlen -= UNISKIP( uvc );                                      \
1153             uscan = foldbuf + UNISKIP( uvc );                               \
1154         }                                                                   \
1155         break;                                                              \
1156     case trie_utf8:                                                         \
1157         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1158         break;                                                              \
1159     case trie_plain:                                                        \
1160         uvc = (UV)*uc;                                                      \
1161         len = 1;                                                            \
1162     }                                                                       \
1163     if (uvc < 256) {                                                        \
1164         charid = trie->charmap[ uvc ];                                      \
1165     }                                                                       \
1166     else {                                                                  \
1167         charid = 0;                                                         \
1168         if (widecharmap) {                                                  \
1169             SV** const svpp = hv_fetch(widecharmap,                         \
1170                         (char*)&uvc, sizeof(UV), 0);                        \
1171             if (svpp)                                                       \
1172                 charid = (U16)SvIV(*svpp);                                  \
1173         }                                                                   \
1174     }                                                                       \
1175 } STMT_END
1176
1177 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1178 {                                                      \
1179     char *my_strend= (char *)strend;                   \
1180     if ( (CoNd)                                        \
1181          && (ln == len ||                              \
1182              !ibcmp_utf8(s, &my_strend, 0,  do_utf8,   \
1183                         m, NULL, ln, (bool)UTF))       \
1184          && (!reginfo || regtry(reginfo, &s)) )        \
1185         goto got_it;                                   \
1186     else {                                             \
1187          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1188          uvchr_to_utf8(tmpbuf, c);                     \
1189          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1190          if ( f != c                                   \
1191               && (f == c1 || f == c2)                  \
1192               && (ln == len ||                         \
1193                 !ibcmp_utf8(s, &my_strend, 0,  do_utf8,\
1194                               m, NULL, ln, (bool)UTF)) \
1195               && (!reginfo || regtry(reginfo, &s)) )   \
1196               goto got_it;                             \
1197     }                                                  \
1198 }                                                      \
1199 s += len
1200
1201 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1202 STMT_START {                                              \
1203     while (s <= e) {                                      \
1204         if ( (CoNd)                                       \
1205              && (ln == 1 || !(OP(c) == EXACTF             \
1206                               ? ibcmp(s, m, ln)           \
1207                               : ibcmp_locale(s, m, ln)))  \
1208              && (!reginfo || regtry(reginfo, &s)) )        \
1209             goto got_it;                                  \
1210         s++;                                              \
1211     }                                                     \
1212 } STMT_END
1213
1214 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1215 STMT_START {                                          \
1216     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1217         CoDe                                          \
1218         s += uskip;                                   \
1219     }                                                 \
1220 } STMT_END
1221
1222 #define REXEC_FBC_SCAN(CoDe)                          \
1223 STMT_START {                                          \
1224     while (s < strend) {                              \
1225         CoDe                                          \
1226         s++;                                          \
1227     }                                                 \
1228 } STMT_END
1229
1230 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1231 REXEC_FBC_UTF8_SCAN(                                  \
1232     if (CoNd) {                                       \
1233         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1234             goto got_it;                              \
1235         else                                          \
1236             tmp = doevery;                            \
1237     }                                                 \
1238     else                                              \
1239         tmp = 1;                                      \
1240 )
1241
1242 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1243 REXEC_FBC_SCAN(                                       \
1244     if (CoNd) {                                       \
1245         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1246             goto got_it;                              \
1247         else                                          \
1248             tmp = doevery;                            \
1249     }                                                 \
1250     else                                              \
1251         tmp = 1;                                      \
1252 )
1253
1254 #define REXEC_FBC_TRYIT               \
1255 if ((!reginfo || regtry(reginfo, &s))) \
1256     goto got_it
1257
1258 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1259     if (do_utf8) {                                             \
1260         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1261     }                                                          \
1262     else {                                                     \
1263         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1264     }                                                          \
1265     break
1266
1267 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1268     if (do_utf8) {                                             \
1269         UtFpReLoAd;                                            \
1270         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1271     }                                                          \
1272     else {                                                     \
1273         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1274     }                                                          \
1275     break
1276
1277 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1278     PL_reg_flags |= RF_tainted;                                \
1279     if (do_utf8) {                                             \
1280         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1281     }                                                          \
1282     else {                                                     \
1283         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1284     }                                                          \
1285     break
1286
1287 #define DUMP_EXEC_POS(li,s,doutf8) \
1288     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1289
1290 /* We know what class REx starts with.  Try to find this position... */
1291 /* if reginfo is NULL, its a dryrun */
1292 /* annoyingly all the vars in this routine have different names from their counterparts
1293    in regmatch. /grrr */
1294
1295 STATIC char *
1296 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1297     const char *strend, regmatch_info *reginfo)
1298 {
1299         dVAR;
1300         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1301         char *m;
1302         STRLEN ln;
1303         STRLEN lnc;
1304         register STRLEN uskip;
1305         unsigned int c1;
1306         unsigned int c2;
1307         char *e;
1308         register I32 tmp = 1;   /* Scratch variable? */
1309         register const bool do_utf8 = PL_reg_match_utf8;
1310         RXi_GET_DECL(prog,progi);
1311
1312         PERL_ARGS_ASSERT_FIND_BYCLASS;
1313
1314         /* We know what class it must start with. */
1315         switch (OP(c)) {
1316         case ANYOF:
1317             if (do_utf8) {
1318                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1319                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1320                           reginclass(prog, c, (U8*)s, 0, do_utf8) :
1321                           REGINCLASS(prog, c, (U8*)s));
1322             }
1323             else {
1324                  while (s < strend) {
1325                       STRLEN skip = 1;
1326
1327                       if (REGINCLASS(prog, c, (U8*)s) ||
1328                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1329                            /* The assignment of 2 is intentional:
1330                             * for the folded sharp s, the skip is 2. */
1331                            (skip = SHARP_S_SKIP))) {
1332                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1333                                 goto got_it;
1334                            else
1335                                 tmp = doevery;
1336                       }
1337                       else
1338                            tmp = 1;
1339                       s += skip;
1340                  }
1341             }
1342             break;
1343         case CANY:
1344             REXEC_FBC_SCAN(
1345                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1346                     goto got_it;
1347                 else
1348                     tmp = doevery;
1349             );
1350             break;
1351         case EXACTF:
1352             m   = STRING(c);
1353             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1354             lnc = (I32) ln;     /* length to match in characters */
1355             if (UTF) {
1356                 STRLEN ulen1, ulen2;
1357                 U8 *sm = (U8 *) m;
1358                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1359                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1360                 /* used by commented-out code below */
1361                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1362
1363                 /* XXX: Since the node will be case folded at compile
1364                    time this logic is a little odd, although im not
1365                    sure that its actually wrong. --dmq */
1366
1367                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1368                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1369
1370                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1371                    codepoint of the first character in the converted
1372                    form, yet originally we did the extra step.
1373                    No tests fail by commenting this code out however
1374                    so Ive left it out. -- dmq.
1375
1376                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1377                                     0, uniflags);
1378                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1379                                     0, uniflags);
1380                 */
1381
1382                 lnc = 0;
1383                 while (sm < ((U8 *) m + ln)) {
1384                     lnc++;
1385                     sm += UTF8SKIP(sm);
1386                 }
1387             }
1388             else {
1389                 c1 = *(U8*)m;
1390                 c2 = PL_fold[c1];
1391             }
1392             goto do_exactf;
1393         case EXACTFL:
1394             m   = STRING(c);
1395             ln  = STR_LEN(c);
1396             lnc = (I32) ln;
1397             c1 = *(U8*)m;
1398             c2 = PL_fold_locale[c1];
1399           do_exactf:
1400             e = HOP3c(strend, -((I32)lnc), s);
1401
1402             if (!reginfo && e < s)
1403                 e = s;                  /* Due to minlen logic of intuit() */
1404
1405             /* The idea in the EXACTF* cases is to first find the
1406              * first character of the EXACTF* node and then, if
1407              * necessary, case-insensitively compare the full
1408              * text of the node.  The c1 and c2 are the first
1409              * characters (though in Unicode it gets a bit
1410              * more complicated because there are more cases
1411              * than just upper and lower: one needs to use
1412              * the so-called folding case for case-insensitive
1413              * matching (called "loose matching" in Unicode).
1414              * ibcmp_utf8() will do just that. */
1415
1416             if (do_utf8 || UTF) {
1417                 UV c, f;
1418                 U8 tmpbuf [UTF8_MAXBYTES+1];
1419                 STRLEN len = 1;
1420                 STRLEN foldlen;
1421                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1422                 if (c1 == c2) {
1423                     /* Upper and lower of 1st char are equal -
1424                      * probably not a "letter". */
1425                     while (s <= e) {
1426                         if (do_utf8) {
1427                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1428                                            uniflags);
1429                         } else {
1430                             c = *((U8*)s);
1431                         }
1432                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1433                     }
1434                 }
1435                 else {
1436                     while (s <= e) {
1437                         if (do_utf8) {
1438                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1439                                            uniflags);
1440                         } else {
1441                             c = *((U8*)s);
1442                         }
1443
1444                         /* Handle some of the three Greek sigmas cases.
1445                          * Note that not all the possible combinations
1446                          * are handled here: some of them are handled
1447                          * by the standard folding rules, and some of
1448                          * them (the character class or ANYOF cases)
1449                          * are handled during compiletime in
1450                          * regexec.c:S_regclass(). */
1451                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1452                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1453                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1454
1455                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1456                     }
1457                 }
1458             }
1459             else {
1460                 /* Neither pattern nor string are UTF8 */
1461                 if (c1 == c2)
1462                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1463                 else
1464                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1465             }
1466             break;
1467         case BOUNDL:
1468             PL_reg_flags |= RF_tainted;
1469             /* FALL THROUGH */
1470         case BOUND:
1471             if (do_utf8) {
1472                 if (s == PL_bostr)
1473                     tmp = '\n';
1474                 else {
1475                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1476                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1477                 }
1478                 tmp = ((OP(c) == BOUND ?
1479                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1480                 LOAD_UTF8_CHARCLASS_ALNUM();
1481                 REXEC_FBC_UTF8_SCAN(
1482                     if (tmp == !(OP(c) == BOUND ?
1483                                  (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1484                                  isALNUM_LC_utf8((U8*)s)))
1485                     {
1486                         tmp = !tmp;
1487                         REXEC_FBC_TRYIT;
1488                 }
1489                 );
1490             }
1491             else {
1492                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1493                 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1494                 REXEC_FBC_SCAN(
1495                     if (tmp ==
1496                         !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1497                         tmp = !tmp;
1498                         REXEC_FBC_TRYIT;
1499                 }
1500                 );
1501             }
1502             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1503                 goto got_it;
1504             break;
1505         case NBOUNDL:
1506             PL_reg_flags |= RF_tainted;
1507             /* FALL THROUGH */
1508         case NBOUND:
1509             if (do_utf8) {
1510                 if (s == PL_bostr)
1511                     tmp = '\n';
1512                 else {
1513                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1514                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1515                 }
1516                 tmp = ((OP(c) == NBOUND ?
1517                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1518                 LOAD_UTF8_CHARCLASS_ALNUM();
1519                 REXEC_FBC_UTF8_SCAN(
1520                     if (tmp == !(OP(c) == NBOUND ?
1521                                  (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1522                                  isALNUM_LC_utf8((U8*)s)))
1523                         tmp = !tmp;
1524                     else REXEC_FBC_TRYIT;
1525                 );
1526             }
1527             else {
1528                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1529                 tmp = ((OP(c) == NBOUND ?
1530                         isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1531                 REXEC_FBC_SCAN(
1532                     if (tmp ==
1533                         !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1534                         tmp = !tmp;
1535                     else REXEC_FBC_TRYIT;
1536                 );
1537             }
1538             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1539                 goto got_it;
1540             break;
1541         case ALNUM:
1542             REXEC_FBC_CSCAN_PRELOAD(
1543                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1544                 swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1545                 isALNUM(*s)
1546             );
1547         case ALNUML:
1548             REXEC_FBC_CSCAN_TAINT(
1549                 isALNUM_LC_utf8((U8*)s),
1550                 isALNUM_LC(*s)
1551             );
1552         case NALNUM:
1553             REXEC_FBC_CSCAN_PRELOAD(
1554                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1555                 !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1556                 !isALNUM(*s)
1557             );
1558         case NALNUML:
1559             REXEC_FBC_CSCAN_TAINT(
1560                 !isALNUM_LC_utf8((U8*)s),
1561                 !isALNUM_LC(*s)
1562             );
1563         case SPACE:
1564             REXEC_FBC_CSCAN_PRELOAD(
1565                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1566                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
1567                 isSPACE(*s)
1568             );
1569         case SPACEL:
1570             REXEC_FBC_CSCAN_TAINT(
1571                 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1572                 isSPACE_LC(*s)
1573             );
1574         case NSPACE:
1575             REXEC_FBC_CSCAN_PRELOAD(
1576                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1577                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
1578                 !isSPACE(*s)
1579             );
1580         case NSPACEL:
1581             REXEC_FBC_CSCAN_TAINT(
1582                 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1583                 !isSPACE_LC(*s)
1584             );
1585         case DIGIT:
1586             REXEC_FBC_CSCAN_PRELOAD(
1587                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1588                 swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1589                 isDIGIT(*s)
1590             );
1591         case DIGITL:
1592             REXEC_FBC_CSCAN_TAINT(
1593                 isDIGIT_LC_utf8((U8*)s),
1594                 isDIGIT_LC(*s)
1595             );
1596         case NDIGIT:
1597             REXEC_FBC_CSCAN_PRELOAD(
1598                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1599                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1600                 !isDIGIT(*s)
1601             );
1602         case NDIGITL:
1603             REXEC_FBC_CSCAN_TAINT(
1604                 !isDIGIT_LC_utf8((U8*)s),
1605                 !isDIGIT_LC(*s)
1606             );
1607         case LNBREAK:
1608             REXEC_FBC_CSCAN(
1609                 is_LNBREAK_utf8(s),
1610                 is_LNBREAK_latin1(s)
1611             );
1612         case VERTWS:
1613             REXEC_FBC_CSCAN(
1614                 is_VERTWS_utf8(s),
1615                 is_VERTWS_latin1(s)
1616             );
1617         case NVERTWS:
1618             REXEC_FBC_CSCAN(
1619                 !is_VERTWS_utf8(s),
1620                 !is_VERTWS_latin1(s)
1621             );
1622         case HORIZWS:
1623             REXEC_FBC_CSCAN(
1624                 is_HORIZWS_utf8(s),
1625                 is_HORIZWS_latin1(s)
1626             );
1627         case NHORIZWS:
1628             REXEC_FBC_CSCAN(
1629                 !is_HORIZWS_utf8(s),
1630                 !is_HORIZWS_latin1(s)
1631             );
1632         case AHOCORASICKC:
1633         case AHOCORASICK:
1634             {
1635                 DECL_TRIE_TYPE(c);
1636                 /* what trie are we using right now */
1637                 reg_ac_data *aho
1638                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1639                 reg_trie_data *trie
1640                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1641                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1642
1643                 const char *last_start = strend - trie->minlen;
1644 #ifdef DEBUGGING
1645                 const char *real_start = s;
1646 #endif
1647                 STRLEN maxlen = trie->maxlen;
1648                 SV *sv_points;
1649                 U8 **points; /* map of where we were in the input string
1650                                 when reading a given char. For ASCII this
1651                                 is unnecessary overhead as the relationship
1652                                 is always 1:1, but for Unicode, especially
1653                                 case folded Unicode this is not true. */
1654                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1655                 U8 *bitmap=NULL;
1656
1657
1658                 GET_RE_DEBUG_FLAGS_DECL;
1659
1660                 /* We can't just allocate points here. We need to wrap it in
1661                  * an SV so it gets freed properly if there is a croak while
1662                  * running the match */
1663                 ENTER;
1664                 SAVETMPS;
1665                 sv_points=newSV(maxlen * sizeof(U8 *));
1666                 SvCUR_set(sv_points,
1667                     maxlen * sizeof(U8 *));
1668                 SvPOK_on(sv_points);
1669                 sv_2mortal(sv_points);
1670                 points=(U8**)SvPV_nolen(sv_points );
1671                 if ( trie_type != trie_utf8_fold
1672                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1673                 {
1674                     if (trie->bitmap)
1675                         bitmap=(U8*)trie->bitmap;
1676                     else
1677                         bitmap=(U8*)ANYOF_BITMAP(c);
1678                 }
1679                 /* this is the Aho-Corasick algorithm modified a touch
1680                    to include special handling for long "unknown char"
1681                    sequences. The basic idea being that we use AC as long
1682                    as we are dealing with a possible matching char, when
1683                    we encounter an unknown char (and we have not encountered
1684                    an accepting state) we scan forward until we find a legal
1685                    starting char.
1686                    AC matching is basically that of trie matching, except
1687                    that when we encounter a failing transition, we fall back
1688                    to the current states "fail state", and try the current char
1689                    again, a process we repeat until we reach the root state,
1690                    state 1, or a legal transition. If we fail on the root state
1691                    then we can either terminate if we have reached an accepting
1692                    state previously, or restart the entire process from the beginning
1693                    if we have not.
1694
1695                  */
1696                 while (s <= last_start) {
1697                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1698                     U8 *uc = (U8*)s;
1699                     U16 charid = 0;
1700                     U32 base = 1;
1701                     U32 state = 1;
1702                     UV uvc = 0;
1703                     STRLEN len = 0;
1704                     STRLEN foldlen = 0;
1705                     U8 *uscan = (U8*)NULL;
1706                     U8 *leftmost = NULL;
1707 #ifdef DEBUGGING
1708                     U32 accepted_word= 0;
1709 #endif
1710                     U32 pointpos = 0;
1711
1712                     while ( state && uc <= (U8*)strend ) {
1713                         int failed=0;
1714                         U32 word = aho->states[ state ].wordnum;
1715
1716                         if( state==1 ) {
1717                             if ( bitmap ) {
1718                                 DEBUG_TRIE_EXECUTE_r(
1719                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1720                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1721                                             (char *)uc, do_utf8 );
1722                                         PerlIO_printf( Perl_debug_log,
1723                                             " Scanning for legal start char...\n");
1724                                     }
1725                                 );
1726                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1727                                     uc++;
1728                                 }
1729                                 s= (char *)uc;
1730                             }
1731                             if (uc >(U8*)last_start) break;
1732                         }
1733
1734                         if ( word ) {
1735                             U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1736                             if (!leftmost || lpos < leftmost) {
1737                                 DEBUG_r(accepted_word=word);
1738                                 leftmost= lpos;
1739                             }
1740                             if (base==0) break;
1741
1742                         }
1743                         points[pointpos++ % maxlen]= uc;
1744                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1745                                              uscan, len, uvc, charid, foldlen,
1746                                              foldbuf, uniflags);
1747                         DEBUG_TRIE_EXECUTE_r({
1748                             dump_exec_pos( (char *)uc, c, strend, real_start,
1749                                 s,   do_utf8 );
1750                             PerlIO_printf(Perl_debug_log,
1751                                 " Charid:%3u CP:%4"UVxf" ",
1752                                  charid, uvc);
1753                         });
1754
1755                         do {
1756 #ifdef DEBUGGING
1757                             word = aho->states[ state ].wordnum;
1758 #endif
1759                             base = aho->states[ state ].trans.base;
1760
1761                             DEBUG_TRIE_EXECUTE_r({
1762                                 if (failed)
1763                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1764                                         s,   do_utf8 );
1765                                 PerlIO_printf( Perl_debug_log,
1766                                     "%sState: %4"UVxf", word=%"UVxf,
1767                                     failed ? " Fail transition to " : "",
1768                                     (UV)state, (UV)word);
1769                             });
1770                             if ( base ) {
1771                                 U32 tmp;
1772                                 if (charid &&
1773                                      (base + charid > trie->uniquecharcount )
1774                                      && (base + charid - 1 - trie->uniquecharcount
1775                                             < trie->lasttrans)
1776                                      && trie->trans[base + charid - 1 -
1777                                             trie->uniquecharcount].check == state
1778                                      && (tmp=trie->trans[base + charid - 1 -
1779                                         trie->uniquecharcount ].next))
1780                                 {
1781                                     DEBUG_TRIE_EXECUTE_r(
1782                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1783                                     state = tmp;
1784                                     break;
1785                                 }
1786                                 else {
1787                                     DEBUG_TRIE_EXECUTE_r(
1788                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1789                                     failed = 1;
1790                                     state = aho->fail[state];
1791                                 }
1792                             }
1793                             else {
1794                                 /* we must be accepting here */
1795                                 DEBUG_TRIE_EXECUTE_r(
1796                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1797                                 failed = 1;
1798                                 break;
1799                             }
1800                         } while(state);
1801                         uc += len;
1802                         if (failed) {
1803                             if (leftmost)
1804                                 break;
1805                             if (!state) state = 1;
1806                         }
1807                     }
1808                     if ( aho->states[ state ].wordnum ) {
1809                         U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1810                         if (!leftmost || lpos < leftmost) {
1811                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1812                             leftmost = lpos;
1813                         }
1814                     }
1815                     if (leftmost) {
1816                         s = (char*)leftmost;
1817                         DEBUG_TRIE_EXECUTE_r({
1818                             PerlIO_printf(
1819                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1820                                 (UV)accepted_word, (IV)(s - real_start)
1821                             );
1822                         });
1823                         if (!reginfo || regtry(reginfo, &s)) {
1824                             FREETMPS;
1825                             LEAVE;
1826                             goto got_it;
1827                         }
1828                         s = HOPc(s,1);
1829                         DEBUG_TRIE_EXECUTE_r({
1830                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1831                         });
1832                     } else {
1833                         DEBUG_TRIE_EXECUTE_r(
1834                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1835                         break;
1836                     }
1837                 }
1838                 FREETMPS;
1839                 LEAVE;
1840             }
1841             break;
1842         default:
1843             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1844             break;
1845         }
1846         return 0;
1847       got_it:
1848         return s;
1849 }
1850
1851
1852 /*
1853  - regexec_flags - match a regexp against a string
1854  */
1855 I32
1856 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1857               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1858 /* strend: pointer to null at end of string */
1859 /* strbeg: real beginning of string */
1860 /* minend: end of match must be >=minend after stringarg. */
1861 /* data: May be used for some additional optimizations.
1862          Currently its only used, with a U32 cast, for transmitting
1863          the ganch offset when doing a /g match. This will change */
1864 /* nosave: For optimizations. */
1865 {
1866     dVAR;
1867     struct regexp *const prog = (struct regexp *)SvANY(rx);
1868     /*register*/ char *s;
1869     register regnode *c;
1870     /*register*/ char *startpos = stringarg;
1871     I32 minlen;         /* must match at least this many chars */
1872     I32 dontbother = 0; /* how many characters not to try at end */
1873     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1874     I32 scream_pos = -1;                /* Internal iterator of scream. */
1875     char *scream_olds = NULL;
1876     const bool do_utf8 = (bool)DO_UTF8(sv);
1877     I32 multiline;
1878     RXi_GET_DECL(prog,progi);
1879     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1880     regexp_paren_pair *swap = NULL;
1881     GET_RE_DEBUG_FLAGS_DECL;
1882
1883     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1884     PERL_UNUSED_ARG(data);
1885
1886     /* Be paranoid... */
1887     if (prog == NULL || startpos == NULL) {
1888         Perl_croak(aTHX_ "NULL regexp parameter");
1889         return 0;
1890     }
1891
1892     multiline = prog->extflags & RXf_PMf_MULTILINE;
1893     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1894
1895     RX_MATCH_UTF8_set(rx, do_utf8);
1896     DEBUG_EXECUTE_r(
1897         debug_start_match(rx, do_utf8, startpos, strend,
1898         "Matching");
1899     );
1900
1901     minlen = prog->minlen;
1902
1903     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1904         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1905                               "String too short [regexec_flags]...\n"));
1906         goto phooey;
1907     }
1908
1909
1910     /* Check validity of program. */
1911     if (UCHARAT(progi->program) != REG_MAGIC) {
1912         Perl_croak(aTHX_ "corrupted regexp program");
1913     }
1914
1915     PL_reg_flags = 0;
1916     PL_reg_eval_set = 0;
1917     PL_reg_maxiter = 0;
1918
1919     if (RX_UTF8(rx))
1920         PL_reg_flags |= RF_utf8;
1921
1922     /* Mark beginning of line for ^ and lookbehind. */
1923     reginfo.bol = startpos; /* XXX not used ??? */
1924     PL_bostr  = strbeg;
1925     reginfo.sv = sv;
1926
1927     /* Mark end of line for $ (and such) */
1928     PL_regeol = strend;
1929
1930     /* see how far we have to get to not match where we matched before */
1931     reginfo.till = startpos+minend;
1932
1933     /* If there is a "must appear" string, look for it. */
1934     s = startpos;
1935
1936     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1937         MAGIC *mg;
1938         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
1939             reginfo.ganch = startpos + prog->gofs;
1940             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1941               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1942         } else if (sv && SvTYPE(sv) >= SVt_PVMG
1943                   && SvMAGIC(sv)
1944                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1945                   && mg->mg_len >= 0) {
1946             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
1947             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1948                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
1949
1950             if (prog->extflags & RXf_ANCH_GPOS) {
1951                 if (s > reginfo.ganch)
1952                     goto phooey;
1953                 s = reginfo.ganch - prog->gofs;
1954                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1955                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
1956                 if (s < strbeg)
1957                     goto phooey;
1958             }
1959         }
1960         else if (data) {
1961             reginfo.ganch = strbeg + PTR2UV(data);
1962             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1963                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1964
1965         } else {                                /* pos() not defined */
1966             reginfo.ganch = strbeg;
1967             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1968                  "GPOS: reginfo.ganch = strbeg\n"));
1969         }
1970     }
1971     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1972         /* We have to be careful. If the previous successful match
1973            was from this regex we don't want a subsequent partially
1974            successful match to clobber the old results.
1975            So when we detect this possibility we add a swap buffer
1976            to the re, and switch the buffer each match. If we fail
1977            we switch it back, otherwise we leave it swapped.
1978         */
1979         swap = prog->offs;
1980         /* do we need a save destructor here for eval dies? */
1981         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
1982     }
1983     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1984         re_scream_pos_data d;
1985
1986         d.scream_olds = &scream_olds;
1987         d.scream_pos = &scream_pos;
1988         s = re_intuit_start(rx, sv, s, strend, flags, &d);
1989         if (!s) {
1990             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1991             goto phooey;        /* not present */
1992         }
1993     }
1994
1995
1996
1997     /* Simplest case:  anchored match need be tried only once. */
1998     /*  [unless only anchor is BOL and multiline is set] */
1999     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2000         if (s == startpos && regtry(&reginfo, &startpos))
2001             goto got_it;
2002         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2003                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2004         {
2005             char *end;
2006
2007             if (minlen)
2008                 dontbother = minlen - 1;
2009             end = HOP3c(strend, -dontbother, strbeg) - 1;
2010             /* for multiline we only have to try after newlines */
2011             if (prog->check_substr || prog->check_utf8) {
2012                 if (s == startpos)
2013                     goto after_try;
2014                 while (1) {
2015                     if (regtry(&reginfo, &s))
2016                         goto got_it;
2017                   after_try:
2018                     if (s > end)
2019                         goto phooey;
2020                     if (prog->extflags & RXf_USE_INTUIT) {
2021                         s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2022                         if (!s)
2023                             goto phooey;
2024                     }
2025                     else
2026                         s++;
2027                 }
2028             } else {
2029                 if (s > startpos)
2030                     s--;
2031                 while (s < end) {
2032                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2033                         if (regtry(&reginfo, &s))
2034                             goto got_it;
2035                     }
2036                 }
2037             }
2038         }
2039         goto phooey;
2040     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2041     {
2042         /* the warning about reginfo.ganch being used without intialization
2043            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2044            and we only enter this block when the same bit is set. */
2045         char *tmp_s = reginfo.ganch - prog->gofs;
2046
2047         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2048             goto got_it;
2049         goto phooey;
2050     }
2051
2052     /* Messy cases:  unanchored match. */
2053     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2054         /* we have /x+whatever/ */
2055         /* it must be a one character string (XXXX Except UTF?) */
2056         char ch;
2057 #ifdef DEBUGGING
2058         int did_match = 0;
2059 #endif
2060         if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2061             do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2062         ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
2063
2064         if (do_utf8) {
2065             REXEC_FBC_SCAN(
2066                 if (*s == ch) {
2067                     DEBUG_EXECUTE_r( did_match = 1 );
2068                     if (regtry(&reginfo, &s)) goto got_it;
2069                     s += UTF8SKIP(s);
2070                     while (s < strend && *s == ch)
2071                         s += UTF8SKIP(s);
2072                 }
2073             );
2074         }
2075         else {
2076             REXEC_FBC_SCAN(
2077                 if (*s == ch) {
2078                     DEBUG_EXECUTE_r( did_match = 1 );
2079                     if (regtry(&reginfo, &s)) goto got_it;
2080                     s++;
2081                     while (s < strend && *s == ch)
2082                         s++;
2083                 }
2084             );
2085         }
2086         DEBUG_EXECUTE_r(if (!did_match)
2087                 PerlIO_printf(Perl_debug_log,
2088                                   "Did not find anchored character...\n")
2089                );
2090     }
2091     else if (prog->anchored_substr != NULL
2092               || prog->anchored_utf8 != NULL
2093               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2094                   && prog->float_max_offset < strend - s)) {
2095         SV *must;
2096         I32 back_max;
2097         I32 back_min;
2098         char *last;
2099         char *last1;            /* Last position checked before */
2100 #ifdef DEBUGGING
2101         int did_match = 0;
2102 #endif
2103         if (prog->anchored_substr || prog->anchored_utf8) {
2104             if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2105                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2106             must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2107             back_max = back_min = prog->anchored_offset;
2108         } else {
2109             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2110                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2111             must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2112             back_max = prog->float_max_offset;
2113             back_min = prog->float_min_offset;
2114         }
2115
2116
2117         if (must == &PL_sv_undef)
2118             /* could not downgrade utf8 check substring, so must fail */
2119             goto phooey;
2120
2121         if (back_min<0) {
2122             last = strend;
2123         } else {
2124             last = HOP3c(strend,        /* Cannot start after this */
2125                   -(I32)(CHR_SVLEN(must)
2126                          - (SvTAIL(must) != 0) + back_min), strbeg);
2127         }
2128         if (s > PL_bostr)
2129             last1 = HOPc(s, -1);
2130         else
2131             last1 = s - 1;      /* bogus */
2132
2133         /* XXXX check_substr already used to find "s", can optimize if
2134            check_substr==must. */
2135         scream_pos = -1;
2136         dontbother = end_shift;
2137         strend = HOPc(strend, -dontbother);
2138         while ( (s <= last) &&
2139                 ((flags & REXEC_SCREAM)
2140                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2141                                     end_shift, &scream_pos, 0))
2142                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2143                                   (unsigned char*)strend, must,
2144                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2145             /* we may be pointing at the wrong string */
2146             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2147                 s = strbeg + (s - SvPVX_const(sv));
2148             DEBUG_EXECUTE_r( did_match = 1 );
2149             if (HOPc(s, -back_max) > last1) {
2150                 last1 = HOPc(s, -back_min);
2151                 s = HOPc(s, -back_max);
2152             }
2153             else {
2154                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2155
2156                 last1 = HOPc(s, -back_min);
2157                 s = t;
2158             }
2159             if (do_utf8) {
2160                 while (s <= last1) {
2161                     if (regtry(&reginfo, &s))
2162                         goto got_it;
2163                     s += UTF8SKIP(s);
2164                 }
2165             }
2166             else {
2167                 while (s <= last1) {
2168                     if (regtry(&reginfo, &s))
2169                         goto got_it;
2170                     s++;
2171                 }
2172             }
2173         }
2174         DEBUG_EXECUTE_r(if (!did_match) {
2175             RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2176                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2177             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2178                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2179                                ? "anchored" : "floating"),
2180                 quoted, RE_SV_TAIL(must));
2181         });
2182         goto phooey;
2183     }
2184     else if ( (c = progi->regstclass) ) {
2185         if (minlen) {
2186             const OPCODE op = OP(progi->regstclass);
2187             /* don't bother with what can't match */
2188             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2189                 strend = HOPc(strend, -(minlen - 1));
2190         }
2191         DEBUG_EXECUTE_r({
2192             SV * const prop = sv_newmortal();
2193             regprop(prog, prop, c);
2194             {
2195                 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2196                     s,strend-s,60);
2197                 PerlIO_printf(Perl_debug_log,
2198                     "Matching stclass %.*s against %s (%d chars)\n",
2199                     (int)SvCUR(prop), SvPVX_const(prop),
2200                      quoted, (int)(strend - s));
2201             }
2202         });
2203         if (find_byclass(prog, c, s, strend, &reginfo))
2204             goto got_it;
2205         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2206     }
2207     else {
2208         dontbother = 0;
2209         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2210             /* Trim the end. */
2211             char *last;
2212             SV* float_real;
2213
2214             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2215                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2216             float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2217
2218             if (flags & REXEC_SCREAM) {
2219                 last = screaminstr(sv, float_real, s - strbeg,
2220                                    end_shift, &scream_pos, 1); /* last one */
2221                 if (!last)
2222                     last = scream_olds; /* Only one occurrence. */
2223                 /* we may be pointing at the wrong string */
2224                 else if (RXp_MATCH_COPIED(prog))
2225                     s = strbeg + (s - SvPVX_const(sv));
2226             }
2227             else {
2228                 STRLEN len;
2229                 const char * const little = SvPV_const(float_real, len);
2230
2231                 if (SvTAIL(float_real)) {
2232                     if (memEQ(strend - len + 1, little, len - 1))
2233                         last = strend - len + 1;
2234                     else if (!multiline)
2235                         last = memEQ(strend - len, little, len)
2236                             ? strend - len : NULL;
2237                     else
2238                         goto find_last;
2239                 } else {
2240                   find_last:
2241                     if (len)
2242                         last = rninstr(s, strend, little, little + len);
2243                     else
2244                         last = strend;  /* matching "$" */
2245                 }
2246             }
2247             if (last == NULL) {
2248                 DEBUG_EXECUTE_r(
2249                     PerlIO_printf(Perl_debug_log,
2250                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2251                         PL_colors[4], PL_colors[5]));
2252                 goto phooey; /* Should not happen! */
2253             }
2254             dontbother = strend - last + prog->float_min_offset;
2255         }
2256         if (minlen && (dontbother < minlen))
2257             dontbother = minlen - 1;
2258         strend -= dontbother;              /* this one's always in bytes! */
2259         /* We don't know much -- general case. */
2260         if (do_utf8) {
2261             for (;;) {
2262                 if (regtry(&reginfo, &s))
2263                     goto got_it;
2264                 if (s >= strend)
2265                     break;
2266                 s += UTF8SKIP(s);
2267             };
2268         }
2269         else {
2270             do {
2271                 if (regtry(&reginfo, &s))
2272                     goto got_it;
2273             } while (s++ < strend);
2274         }
2275     }
2276
2277     /* Failure. */
2278     goto phooey;
2279
2280 got_it:
2281     Safefree(swap);
2282     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2283
2284     if (PL_reg_eval_set)
2285         restore_pos(aTHX_ prog);
2286     if (RXp_PAREN_NAMES(prog))
2287         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2288
2289     /* make sure $`, $&, $', and $digit will work later */
2290     if ( !(flags & REXEC_NOT_FIRST) ) {
2291         RX_MATCH_COPY_FREE(rx);
2292         if (flags & REXEC_COPY_STR) {
2293             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2294 #ifdef PERL_OLD_COPY_ON_WRITE
2295             if ((SvIsCOW(sv)
2296                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2297                 if (DEBUG_C_TEST) {
2298                     PerlIO_printf(Perl_debug_log,
2299                                   "Copy on write: regexp capture, type %d\n",
2300                                   (int) SvTYPE(sv));
2301                 }
2302                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2303                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2304                 assert (SvPOKp(prog->saved_copy));
2305             } else
2306 #endif
2307             {
2308                 RX_MATCH_COPIED_on(rx);
2309                 s = savepvn(strbeg, i);
2310                 prog->subbeg = s;
2311             }
2312             prog->sublen = i;
2313         }
2314         else {
2315             prog->subbeg = strbeg;
2316             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2317         }
2318     }
2319
2320     return 1;
2321
2322 phooey:
2323     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2324                           PL_colors[4], PL_colors[5]));
2325     if (PL_reg_eval_set)
2326         restore_pos(aTHX_ prog);
2327     if (swap) {
2328         /* we failed :-( roll it back */
2329         Safefree(prog->offs);
2330         prog->offs = swap;
2331     }
2332
2333     return 0;
2334 }
2335
2336
2337 /*
2338  - regtry - try match at specific point
2339  */
2340 STATIC I32                      /* 0 failure, 1 success */
2341 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2342 {
2343     dVAR;
2344     CHECKPOINT lastcp;
2345     REGEXP *const rx = reginfo->prog;
2346     regexp *const prog = (struct regexp *)SvANY(rx);
2347     RXi_GET_DECL(prog,progi);
2348     GET_RE_DEBUG_FLAGS_DECL;
2349
2350     PERL_ARGS_ASSERT_REGTRY;
2351
2352     reginfo->cutpoint=NULL;
2353
2354     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2355         MAGIC *mg;
2356
2357         PL_reg_eval_set = RS_init;
2358         DEBUG_EXECUTE_r(DEBUG_s(
2359             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2360                           (IV)(PL_stack_sp - PL_stack_base));
2361             ));
2362         SAVESTACK_CXPOS();
2363         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2364         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2365         SAVETMPS;
2366         /* Apparently this is not needed, judging by wantarray. */
2367         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2368            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2369
2370         if (reginfo->sv) {
2371             /* Make $_ available to executed code. */
2372             if (reginfo->sv != DEFSV) {
2373                 SAVE_DEFSV;
2374                 DEFSV_set(reginfo->sv);
2375             }
2376
2377             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2378                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2379                 /* prepare for quick setting of pos */
2380 #ifdef PERL_OLD_COPY_ON_WRITE
2381                 if (SvIsCOW(reginfo->sv))
2382                     sv_force_normal_flags(reginfo->sv, 0);
2383 #endif
2384                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2385                                  &PL_vtbl_mglob, NULL, 0);
2386                 mg->mg_len = -1;
2387             }
2388             PL_reg_magic    = mg;
2389             PL_reg_oldpos   = mg->mg_len;
2390             SAVEDESTRUCTOR_X(restore_pos, prog);
2391         }
2392         if (!PL_reg_curpm) {
2393             Newxz(PL_reg_curpm, 1, PMOP);
2394 #ifdef USE_ITHREADS
2395             {
2396                 SV* const repointer = &PL_sv_undef;
2397                 /* this regexp is also owned by the new PL_reg_curpm, which
2398                    will try to free it.  */
2399                 av_push(PL_regex_padav, repointer);
2400                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2401                 PL_regex_pad = AvARRAY(PL_regex_padav);
2402             }
2403 #endif
2404         }
2405 #ifdef USE_ITHREADS
2406         /* It seems that non-ithreads works both with and without this code.
2407            So for efficiency reasons it seems best not to have the code
2408            compiled when it is not needed.  */
2409         /* This is safe against NULLs: */
2410         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2411         /* PM_reg_curpm owns a reference to this regexp.  */
2412         ReREFCNT_inc(rx);
2413 #endif
2414         PM_SETRE(PL_reg_curpm, rx);
2415         PL_reg_oldcurpm = PL_curpm;
2416         PL_curpm = PL_reg_curpm;
2417         if (RXp_MATCH_COPIED(prog)) {
2418             /*  Here is a serious problem: we cannot rewrite subbeg,
2419                 since it may be needed if this match fails.  Thus
2420                 $` inside (?{}) could fail... */
2421             PL_reg_oldsaved = prog->subbeg;
2422             PL_reg_oldsavedlen = prog->sublen;
2423 #ifdef PERL_OLD_COPY_ON_WRITE
2424             PL_nrs = prog->saved_copy;
2425 #endif
2426             RXp_MATCH_COPIED_off(prog);
2427         }
2428         else
2429             PL_reg_oldsaved = NULL;
2430         prog->subbeg = PL_bostr;
2431         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2432     }
2433     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2434     prog->offs[0].start = *startpos - PL_bostr;
2435     PL_reginput = *startpos;
2436     PL_reglastparen = &prog->lastparen;
2437     PL_reglastcloseparen = &prog->lastcloseparen;
2438     prog->lastparen = 0;
2439     prog->lastcloseparen = 0;
2440     PL_regsize = 0;
2441     PL_regoffs = prog->offs;
2442     if (PL_reg_start_tmpl <= prog->nparens) {
2443         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2444         if(PL_reg_start_tmp)
2445             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2446         else
2447             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2448     }
2449
2450     /* XXXX What this code is doing here?!!!  There should be no need
2451        to do this again and again, PL_reglastparen should take care of
2452        this!  --ilya*/
2453
2454     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2455      * Actually, the code in regcppop() (which Ilya may be meaning by
2456      * PL_reglastparen), is not needed at all by the test suite
2457      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2458      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2459      * Meanwhile, this code *is* needed for the
2460      * above-mentioned test suite tests to succeed.  The common theme
2461      * on those tests seems to be returning null fields from matches.
2462      * --jhi updated by dapm */
2463 #if 1
2464     if (prog->nparens) {
2465         regexp_paren_pair *pp = PL_regoffs;
2466         register I32 i;
2467         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2468             ++pp;
2469             pp->start = -1;
2470             pp->end = -1;
2471         }
2472     }
2473 #endif
2474     REGCP_SET(lastcp);
2475     if (regmatch(reginfo, progi->program + 1)) {
2476         PL_regoffs[0].end = PL_reginput - PL_bostr;
2477         return 1;
2478     }
2479     if (reginfo->cutpoint)
2480         *startpos= reginfo->cutpoint;
2481     REGCP_UNWIND(lastcp);
2482     return 0;
2483 }
2484
2485
2486 #define sayYES goto yes
2487 #define sayNO goto no
2488 #define sayNO_SILENT goto no_silent
2489
2490 /* we dont use STMT_START/END here because it leads to
2491    "unreachable code" warnings, which are bogus, but distracting. */
2492 #define CACHEsayNO \
2493     if (ST.cache_mask) \
2494        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2495     sayNO
2496
2497 /* this is used to determine how far from the left messages like
2498    'failed...' are printed. It should be set such that messages
2499    are inline with the regop output that created them.
2500 */
2501 #define REPORT_CODE_OFF 32
2502
2503
2504 /* Make sure there is a test for this +1 options in re_tests */
2505 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2506
2507 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2508 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2509
2510 #define SLAB_FIRST(s) (&(s)->states[0])
2511 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2512
2513 /* grab a new slab and return the first slot in it */
2514
2515 STATIC regmatch_state *
2516 S_push_slab(pTHX)
2517 {
2518 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2519     dMY_CXT;
2520 #endif
2521     regmatch_slab *s = PL_regmatch_slab->next;
2522     if (!s) {
2523         Newx(s, 1, regmatch_slab);
2524         s->prev = PL_regmatch_slab;
2525         s->next = NULL;
2526         PL_regmatch_slab->next = s;
2527     }
2528     PL_regmatch_slab = s;
2529     return SLAB_FIRST(s);
2530 }
2531
2532
2533 /* push a new state then goto it */
2534
2535 #define PUSH_STATE_GOTO(state, node) \
2536     scan = node; \
2537     st->resume_state = state; \
2538     goto push_state;
2539
2540 /* push a new state with success backtracking, then goto it */
2541
2542 #define PUSH_YES_STATE_GOTO(state, node) \
2543     scan = node; \
2544     st->resume_state = state; \
2545     goto push_yes_state;
2546
2547
2548
2549 /*
2550
2551 regmatch() - main matching routine
2552
2553 This is basically one big switch statement in a loop. We execute an op,
2554 set 'next' to point the next op, and continue. If we come to a point which
2555 we may need to backtrack to on failure such as (A|B|C), we push a
2556 backtrack state onto the backtrack stack. On failure, we pop the top
2557 state, and re-enter the loop at the state indicated. If there are no more
2558 states to pop, we return failure.
2559
2560 Sometimes we also need to backtrack on success; for example /A+/, where
2561 after successfully matching one A, we need to go back and try to
2562 match another one; similarly for lookahead assertions: if the assertion
2563 completes successfully, we backtrack to the state just before the assertion
2564 and then carry on.  In these cases, the pushed state is marked as
2565 'backtrack on success too'. This marking is in fact done by a chain of
2566 pointers, each pointing to the previous 'yes' state. On success, we pop to
2567 the nearest yes state, discarding any intermediate failure-only states.
2568 Sometimes a yes state is pushed just to force some cleanup code to be
2569 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2570 it to free the inner regex.
2571
2572 Note that failure backtracking rewinds the cursor position, while
2573 success backtracking leaves it alone.
2574
2575 A pattern is complete when the END op is executed, while a subpattern
2576 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2577 ops trigger the "pop to last yes state if any, otherwise return true"
2578 behaviour.
2579
2580 A common convention in this function is to use A and B to refer to the two
2581 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2582 the subpattern to be matched possibly multiple times, while B is the entire
2583 rest of the pattern. Variable and state names reflect this convention.
2584
2585 The states in the main switch are the union of ops and failure/success of
2586 substates associated with with that op.  For example, IFMATCH is the op
2587 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2588 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2589 successfully matched A and IFMATCH_A_fail is a state saying that we have
2590 just failed to match A. Resume states always come in pairs. The backtrack
2591 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2592 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2593 on success or failure.
2594
2595 The struct that holds a backtracking state is actually a big union, with
2596 one variant for each major type of op. The variable st points to the
2597 top-most backtrack struct. To make the code clearer, within each
2598 block of code we #define ST to alias the relevant union.
2599
2600 Here's a concrete example of a (vastly oversimplified) IFMATCH
2601 implementation:
2602
2603     switch (state) {
2604     ....
2605
2606 #define ST st->u.ifmatch
2607
2608     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2609         ST.foo = ...; // some state we wish to save
2610         ...
2611         // push a yes backtrack state with a resume value of
2612         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2613         // first node of A:
2614         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2615         // NOTREACHED
2616
2617     case IFMATCH_A: // we have successfully executed A; now continue with B
2618         next = B;
2619         bar = ST.foo; // do something with the preserved value
2620         break;
2621
2622     case IFMATCH_A_fail: // A failed, so the assertion failed
2623         ...;   // do some housekeeping, then ...
2624         sayNO; // propagate the failure
2625
2626 #undef ST
2627
2628     ...
2629     }
2630
2631 For any old-timers reading this who are familiar with the old recursive
2632 approach, the code above is equivalent to:
2633
2634     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2635     {
2636         int foo = ...
2637         ...
2638         if (regmatch(A)) {
2639             next = B;
2640             bar = foo;
2641             break;
2642         }
2643         ...;   // do some housekeeping, then ...
2644         sayNO; // propagate the failure
2645     }
2646
2647 The topmost backtrack state, pointed to by st, is usually free. If you
2648 want to claim it, populate any ST.foo fields in it with values you wish to
2649 save, then do one of
2650
2651         PUSH_STATE_GOTO(resume_state, node);
2652         PUSH_YES_STATE_GOTO(resume_state, node);
2653
2654 which sets that backtrack state's resume value to 'resume_state', pushes a
2655 new free entry to the top of the backtrack stack, then goes to 'node'.
2656 On backtracking, the free slot is popped, and the saved state becomes the
2657 new free state. An ST.foo field in this new top state can be temporarily
2658 accessed to retrieve values, but once the main loop is re-entered, it
2659 becomes available for reuse.
2660
2661 Note that the depth of the backtrack stack constantly increases during the
2662 left-to-right execution of the pattern, rather than going up and down with
2663 the pattern nesting. For example the stack is at its maximum at Z at the
2664 end of the pattern, rather than at X in the following:
2665
2666     /(((X)+)+)+....(Y)+....Z/
2667
2668 The only exceptions to this are lookahead/behind assertions and the cut,
2669 (?>A), which pop all the backtrack states associated with A before
2670 continuing.
2671
2672 Bascktrack state structs are allocated in slabs of about 4K in size.
2673 PL_regmatch_state and st always point to the currently active state,
2674 and PL_regmatch_slab points to the slab currently containing
2675 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2676 allocated, and is never freed until interpreter destruction. When the slab
2677 is full, a new one is allocated and chained to the end. At exit from
2678 regmatch(), slabs allocated since entry are freed.
2679
2680 */
2681
2682
2683 #define DEBUG_STATE_pp(pp)                                  \
2684     DEBUG_STATE_r({                                         \
2685         DUMP_EXEC_POS(locinput, scan, do_utf8);             \
2686         PerlIO_printf(Perl_debug_log,                       \
2687             "    %*s"pp" %s%s%s%s%s\n",                     \
2688             depth*2, "",                                    \
2689             PL_reg_name[st->resume_state],                     \
2690             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2691             ((st==yes_state) ? "Y" : ""),                   \
2692             ((st==mark_state) ? "M" : ""),                  \
2693             ((st==yes_state||st==mark_state) ? "]" : "")    \
2694         );                                                  \
2695     });
2696
2697
2698 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2699
2700 #ifdef DEBUGGING
2701
2702 STATIC void
2703 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
2704     const char *start, const char *end, const char *blurb)
2705 {
2706     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2707
2708     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2709
2710     if (!PL_colorset)
2711             reginitcolors();
2712     {
2713         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2714             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2715
2716         RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2717             start, end - start, 60);
2718
2719         PerlIO_printf(Perl_debug_log,
2720             "%s%s REx%s %s against %s\n",
2721                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2722
2723         if (do_utf8||utf8_pat)
2724             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2725                 utf8_pat ? "pattern" : "",
2726                 utf8_pat && do_utf8 ? " and " : "",
2727                 do_utf8 ? "string" : ""
2728             );
2729     }
2730 }
2731
2732 STATIC void
2733 S_dump_exec_pos(pTHX_ const char *locinput,
2734                       const regnode *scan,
2735                       const char *loc_regeol,
2736                       const char *loc_bostr,
2737                       const char *loc_reg_starttry,
2738                       const bool do_utf8)
2739 {
2740     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2741     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2742     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2743     /* The part of the string before starttry has one color
2744        (pref0_len chars), between starttry and current
2745        position another one (pref_len - pref0_len chars),
2746        after the current position the third one.
2747        We assume that pref0_len <= pref_len, otherwise we
2748        decrease pref0_len.  */
2749     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2750         ? (5 + taill) - l : locinput - loc_bostr;
2751     int pref0_len;
2752
2753     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2754
2755     while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2756         pref_len++;
2757     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2758     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2759         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2760               ? (5 + taill) - pref_len : loc_regeol - locinput);
2761     while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2762         l--;
2763     if (pref0_len < 0)
2764         pref0_len = 0;
2765     if (pref0_len > pref_len)
2766         pref0_len = pref_len;
2767     {
2768         const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2769
2770         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2771             (locinput - pref_len),pref0_len, 60, 4, 5);
2772
2773         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2774                     (locinput - pref_len + pref0_len),
2775                     pref_len - pref0_len, 60, 2, 3);
2776
2777         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2778                     locinput, loc_regeol - locinput, 10, 0, 1);
2779
2780         const STRLEN tlen=len0+len1+len2;
2781         PerlIO_printf(Perl_debug_log,
2782                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2783                     (IV)(locinput - loc_bostr),
2784                     len0, s0,
2785                     len1, s1,
2786                     (docolor ? "" : "> <"),
2787                     len2, s2,
2788                     (int)(tlen > 19 ? 0 :  19 - tlen),
2789                     "");
2790     }
2791 }
2792
2793 #endif
2794
2795 /* reg_check_named_buff_matched()
2796  * Checks to see if a named buffer has matched. The data array of
2797  * buffer numbers corresponding to the buffer is expected to reside
2798  * in the regexp->data->data array in the slot stored in the ARG() of
2799  * node involved. Note that this routine doesn't actually care about the
2800  * name, that information is not preserved from compilation to execution.
2801  * Returns the index of the leftmost defined buffer with the given name
2802  * or 0 if non of the buffers matched.
2803  */
2804 STATIC I32
2805 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2806 {
2807     I32 n;
2808     RXi_GET_DECL(rex,rexi);
2809     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2810     I32 *nums=(I32*)SvPVX(sv_dat);
2811
2812     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2813
2814     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2815         if ((I32)*PL_reglastparen >= nums[n] &&
2816             PL_regoffs[nums[n]].end != -1)
2817         {
2818             return nums[n];
2819         }
2820     }
2821     return 0;
2822 }
2823
2824
2825 /* free all slabs above current one  - called during LEAVE_SCOPE */
2826
2827 STATIC void
2828 S_clear_backtrack_stack(pTHX_ void *p)
2829 {
2830     regmatch_slab *s = PL_regmatch_slab->next;
2831     PERL_UNUSED_ARG(p);
2832
2833     if (!s)
2834         return;
2835     PL_regmatch_slab->next = NULL;
2836     while (s) {
2837         regmatch_slab * const osl = s;
2838         s = s->next;
2839         Safefree(osl);
2840     }
2841 }
2842
2843
2844 #define SETREX(Re1,Re2) \
2845     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2846     Re1 = (Re2)
2847
2848 STATIC I32                      /* 0 failure, 1 success */
2849 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2850 {
2851 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2852     dMY_CXT;
2853 #endif
2854     dVAR;
2855     register const bool do_utf8 = PL_reg_match_utf8;
2856     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2857     REGEXP *rex_sv = reginfo->prog;
2858     regexp *rex = (struct regexp *)SvANY(rex_sv);
2859     RXi_GET_DECL(rex,rexi);
2860     I32 oldsave;
2861     /* the current state. This is a cached copy of PL_regmatch_state */
2862     register regmatch_state *st;
2863     /* cache heavy used fields of st in registers */
2864     register regnode *scan;
2865     register regnode *next;
2866     register U32 n = 0; /* general value; init to avoid compiler warning */
2867     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2868     register char *locinput = PL_reginput;
2869     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2870
2871     bool result = 0;        /* return value of S_regmatch */
2872     int depth = 0;          /* depth of backtrack stack */
2873     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2874     const U32 max_nochange_depth =
2875         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2876         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2877     regmatch_state *yes_state = NULL; /* state to pop to on success of
2878                                                             subpattern */
2879     /* mark_state piggy backs on the yes_state logic so that when we unwind
2880        the stack on success we can update the mark_state as we go */
2881     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2882     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2883     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2884     U32 state_num;
2885     bool no_final = 0;      /* prevent failure from backtracking? */
2886     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
2887     char *startpoint = PL_reginput;
2888     SV *popmark = NULL;     /* are we looking for a mark? */
2889     SV *sv_commit = NULL;   /* last mark name seen in failure */
2890     SV *sv_yes_mark = NULL; /* last mark name we have seen
2891                                during a successfull match */
2892     U32 lastopen = 0;       /* last open we saw */
2893     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2894     SV* const oreplsv = GvSV(PL_replgv);
2895     /* these three flags are set by various ops to signal information to
2896      * the very next op. They have a useful lifetime of exactly one loop
2897      * iteration, and are not preserved or restored by state pushes/pops
2898      */
2899     bool sw = 0;            /* the condition value in (?(cond)a|b) */
2900     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
2901     int logical = 0;        /* the following EVAL is:
2902                                 0: (?{...})
2903                                 1: (?(?{...})X|Y)
2904                                 2: (??{...})
2905                                or the following IFMATCH/UNLESSM is:
2906                                 false: plain (?=foo)
2907                                 true:  used as a condition: (?(?=foo))
2908                             */
2909 #ifdef DEBUGGING
2910     GET_RE_DEBUG_FLAGS_DECL;
2911 #endif
2912
2913     PERL_ARGS_ASSERT_REGMATCH;
2914
2915     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2916             PerlIO_printf(Perl_debug_log,"regmatch start\n");
2917     }));
2918     /* on first ever call to regmatch, allocate first slab */
2919     if (!PL_regmatch_slab) {
2920         Newx(PL_regmatch_slab, 1, regmatch_slab);
2921         PL_regmatch_slab->prev = NULL;
2922         PL_regmatch_slab->next = NULL;
2923         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2924     }
2925
2926     oldsave = PL_savestack_ix;
2927     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2928     SAVEVPTR(PL_regmatch_slab);
2929     SAVEVPTR(PL_regmatch_state);
2930
2931     /* grab next free state slot */
2932     st = ++PL_regmatch_state;
2933     if (st >  SLAB_LAST(PL_regmatch_slab))
2934         st = PL_regmatch_state = S_push_slab(aTHX);
2935
2936     /* Note that nextchr is a byte even in UTF */
2937     nextchr = UCHARAT(locinput);
2938     scan = prog;
2939     while (scan != NULL) {
2940
2941         DEBUG_EXECUTE_r( {
2942             SV * const prop = sv_newmortal();
2943             regnode *rnext=regnext(scan);
2944             DUMP_EXEC_POS( locinput, scan, do_utf8 );
2945             regprop(rex, prop, scan);
2946
2947             PerlIO_printf(Perl_debug_log,
2948                     "%3"IVdf":%*s%s(%"IVdf")\n",
2949                     (IV)(scan - rexi->program), depth*2, "",
2950                     SvPVX_const(prop),
2951                     (PL_regkind[OP(scan)] == END || !rnext) ?
2952                         0 : (IV)(rnext - rexi->program));
2953         });
2954
2955         next = scan + NEXT_OFF(scan);
2956         if (next == scan)
2957             next = NULL;
2958         state_num = OP(scan);
2959
2960         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
2961       reenter_switch:
2962
2963         assert(PL_reglastparen == &rex->lastparen);
2964         assert(PL_reglastcloseparen == &rex->lastcloseparen);
2965         assert(PL_regoffs == rex->offs);
2966
2967         switch (state_num) {
2968         case BOL:
2969             if (locinput == PL_bostr)
2970             {
2971                 /* reginfo->till = reginfo->bol; */
2972                 break;
2973             }
2974             sayNO;
2975         case MBOL:
2976             if (locinput == PL_bostr ||
2977                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2978             {
2979                 break;
2980             }
2981             sayNO;
2982         case SBOL:
2983             if (locinput == PL_bostr)
2984                 break;
2985             sayNO;
2986         case GPOS:
2987             if (locinput == reginfo->ganch)
2988                 break;
2989             sayNO;
2990
2991         case KEEPS:
2992             /* update the startpoint */
2993             st->u.keeper.val = PL_regoffs[0].start;
2994             PL_reginput = locinput;
2995             PL_regoffs[0].start = locinput - PL_bostr;
2996             PUSH_STATE_GOTO(KEEPS_next, next);
2997             /*NOT-REACHED*/
2998         case KEEPS_next_fail:
2999             /* rollback the start point change */
3000             PL_regoffs[0].start = st->u.keeper.val;
3001             sayNO_SILENT;
3002             /*NOT-REACHED*/
3003         case EOL:
3004                 goto seol;
3005         case MEOL:
3006             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3007                 sayNO;
3008             break;
3009         case SEOL:
3010           seol:
3011             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3012                 sayNO;
3013             if (PL_regeol - locinput > 1)
3014                 sayNO;
3015             break;
3016         case EOS:
3017             if (PL_regeol != locinput)
3018                 sayNO;
3019             break;
3020         case SANY:
3021             if (!nextchr && locinput >= PL_regeol)
3022                 sayNO;
3023             if (do_utf8) {
3024                 locinput += PL_utf8skip[nextchr];
3025                 if (locinput > PL_regeol)
3026                     sayNO;
3027                 nextchr = UCHARAT(locinput);
3028             }
3029             else
3030                 nextchr = UCHARAT(++locinput);
3031             break;
3032         case CANY:
3033             if (!nextchr && locinput >= PL_regeol)
3034                 sayNO;
3035             nextchr = UCHARAT(++locinput);
3036             break;
3037         case REG_ANY:
3038             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3039                 sayNO;
3040             if (do_utf8) {
3041                 locinput += PL_utf8skip[nextchr];
3042                 if (locinput > PL_regeol)
3043                     sayNO;
3044                 nextchr = UCHARAT(locinput);
3045             }
3046             else
3047                 nextchr = UCHARAT(++locinput);
3048             break;
3049
3050 #undef  ST
3051 #define ST st->u.trie
3052         case TRIEC:
3053             /* In this case the charclass data is available inline so
3054                we can fail fast without a lot of extra overhead.
3055              */
3056             if (scan->flags == EXACT || !do_utf8) {
3057                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3058                     DEBUG_EXECUTE_r(
3059                         PerlIO_printf(Perl_debug_log,
3060                                   "%*s  %sfailed to match trie start class...%s\n",
3061                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3062                     );
3063                     sayNO_SILENT;
3064                     /* NOTREACHED */
3065                 }
3066             }
3067             /* FALL THROUGH */
3068         case TRIE:
3069             {
3070                 /* what type of TRIE am I? (utf8 makes this contextual) */
3071                 DECL_TRIE_TYPE(scan);
3072
3073                 /* what trie are we using right now */
3074                 reg_trie_data * const trie
3075                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3076                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3077                 U32 state = trie->startstate;
3078
3079                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3080                     !TRIE_BITMAP_TEST(trie,*locinput)
3081                 ) {
3082                     if (trie->states[ state ].wordnum) {
3083                          DEBUG_EXECUTE_r(
3084                             PerlIO_printf(Perl_debug_log,
3085                                           "%*s  %smatched empty string...%s\n",
3086                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3087                         );
3088                         break;
3089                     } else {
3090                         DEBUG_EXECUTE_r(
3091                             PerlIO_printf(Perl_debug_log,
3092                                           "%*s  %sfailed to match trie start class...%s\n",
3093                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3094                         );
3095                         sayNO_SILENT;
3096                    }
3097                 }
3098
3099             {
3100                 U8 *uc = ( U8* )locinput;
3101
3102                 STRLEN len = 0;
3103                 STRLEN foldlen = 0;
3104                 U8 *uscan = (U8*)NULL;
3105                 STRLEN bufflen=0;
3106                 SV *sv_accept_buff = NULL;
3107                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3108
3109                 ST.accepted = 0; /* how many accepting states we have seen */
3110                 ST.B = next;
3111                 ST.jump = trie->jump;
3112                 ST.me = scan;
3113                 /*
3114                    traverse the TRIE keeping track of all accepting states
3115                    we transition through until we get to a failing node.
3116                 */
3117
3118                 while ( state && uc <= (U8*)PL_regeol ) {
3119                     U32 base = trie->states[ state ].trans.base;
3120                     UV uvc = 0;
3121                     U16 charid;
3122                     /* We use charid to hold the wordnum as we don't use it
3123                        for charid until after we have done the wordnum logic.
3124                        We define an alias just so that the wordnum logic reads
3125                        more naturally. */
3126
3127 #define got_wordnum charid
3128                     got_wordnum = trie->states[ state ].wordnum;
3129
3130                     if ( got_wordnum ) {
3131                         if ( ! ST.accepted ) {
3132                             ENTER;
3133                             SAVETMPS; /* XXX is this necessary? dmq */
3134                             bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3135                             sv_accept_buff=newSV(bufflen *
3136                                             sizeof(reg_trie_accepted) - 1);
3137                             SvCUR_set(sv_accept_buff, 0);
3138                             SvPOK_on(sv_accept_buff);
3139                             sv_2mortal(sv_accept_buff);
3140                             SAVETMPS;
3141                             ST.accept_buff =
3142                                 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3143                         }
3144                         do {
3145                             if (ST.accepted >= bufflen) {
3146                                 bufflen *= 2;
3147                                 ST.accept_buff =(reg_trie_accepted*)
3148                                     SvGROW(sv_accept_buff,
3149                                         bufflen * sizeof(reg_trie_accepted));
3150                             }
3151                             SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3152                                 + sizeof(reg_trie_accepted));
3153
3154
3155                             ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3156                             ST.accept_buff[ST.accepted].endpos = uc;
3157                             ++ST.accepted;
3158                         } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3159                     }
3160 #undef got_wordnum
3161
3162                     DEBUG_TRIE_EXECUTE_r({
3163                                 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
3164                                 PerlIO_printf( Perl_debug_log,
3165                                     "%*s  %sState: %4"UVxf" Accepted: %4"UVxf" ",
3166                                     2+depth * 2, "", PL_colors[4],
3167                                     (UV)state, (UV)ST.accepted );
3168                     });
3169
3170                     if ( base ) {
3171                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3172                                              uscan, len, uvc, charid, foldlen,
3173                                              foldbuf, uniflags);
3174
3175                         if (charid &&
3176                              (base + charid > trie->uniquecharcount )
3177                              && (base + charid - 1 - trie->uniquecharcount
3178                                     < trie->lasttrans)
3179                              && trie->trans[base + charid - 1 -
3180                                     trie->uniquecharcount].check == state)
3181                         {
3182                             state = trie->trans[base + charid - 1 -
3183                                 trie->uniquecharcount ].next;
3184                         }
3185                         else {
3186                             state = 0;
3187                         }
3188                         uc += len;
3189
3190                     }
3191                     else {
3192                         state = 0;
3193                     }
3194                     DEBUG_TRIE_EXECUTE_r(
3195                         PerlIO_printf( Perl_debug_log,
3196                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3197                             charid, uvc, (UV)state, PL_colors[5] );
3198                     );
3199                 }
3200                 if (!ST.accepted )
3201                    sayNO;
3202
3203                 DEBUG_EXECUTE_r(
3204                     PerlIO_printf( Perl_debug_log,
3205                         "%*s  %sgot %"IVdf" possible matches%s\n",
3206                         REPORT_CODE_OFF + depth * 2, "",
3207                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3208                 );
3209             }}
3210             goto trie_first_try; /* jump into the fail handler */
3211             /* NOTREACHED */
3212         case TRIE_next_fail: /* we failed - try next alterative */
3213             if ( ST.jump) {
3214                 REGCP_UNWIND(ST.cp);
3215                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3216                     PL_regoffs[n].end = -1;
3217                 *PL_reglastparen = n;
3218             }
3219           trie_first_try:
3220             if (do_cutgroup) {
3221                 do_cutgroup = 0;
3222                 no_final = 0;
3223             }
3224
3225             if ( ST.jump) {
3226                 ST.lastparen = *PL_reglastparen;
3227                 REGCP_SET(ST.cp);
3228             }
3229             if ( ST.accepted == 1 ) {
3230                 /* only one choice left - just continue */
3231                 DEBUG_EXECUTE_r({
3232                     AV *const trie_words
3233                         = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3234                     SV ** const tmp = av_fetch( trie_words,
3235                         ST.accept_buff[ 0 ].wordnum-1, 0 );
3236                     SV *sv= tmp ? sv_newmortal() : NULL;
3237
3238                     PerlIO_printf( Perl_debug_log,
3239                         "%*s  %sonly one match left: #%d <%s>%s\n",
3240                         REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3241                         ST.accept_buff[ 0 ].wordnum,
3242                         tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3243                                 PL_colors[0], PL_colors[1],
3244                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3245                             )
3246                         : "not compiled under -Dr",
3247                         PL_colors[5] );
3248                 });
3249                 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3250                 /* in this case we free tmps/leave before we call regmatch
3251                    as we wont be using accept_buff again. */
3252
3253                 locinput = PL_reginput;
3254                 nextchr = UCHARAT(locinput);
3255                 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3256                     scan = ST.B;
3257                 else
3258                     scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3259                 if (!has_cutgroup) {
3260                     FREETMPS;
3261                     LEAVE;
3262                 } else {
3263                     ST.accepted--;
3264                     PUSH_YES_STATE_GOTO(TRIE_next, scan);
3265                 }
3266
3267                 continue; /* execute rest of RE */
3268             }
3269
3270             if ( !ST.accepted-- ) {
3271                 DEBUG_EXECUTE_r({
3272                     PerlIO_printf( Perl_debug_log,
3273                         "%*s  %sTRIE failed...%s\n",
3274                         REPORT_CODE_OFF+depth*2, "",
3275                         PL_colors[4],
3276                         PL_colors[5] );
3277                 });
3278                 FREETMPS;
3279                 LEAVE;
3280                 sayNO_SILENT;
3281                 /*NOTREACHED*/
3282             }
3283
3284             /*
3285                There are at least two accepting states left.  Presumably
3286                the number of accepting states is going to be low,
3287                typically two. So we simply scan through to find the one
3288                with lowest wordnum.  Once we find it, we swap the last
3289                state into its place and decrement the size. We then try to
3290                match the rest of the pattern at the point where the word
3291                ends. If we succeed, control just continues along the
3292                regex; if we fail we return here to try the next accepting
3293                state
3294              */
3295
3296             {
3297                 U32 best = 0;
3298                 U32 cur;
3299                 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3300                     DEBUG_TRIE_EXECUTE_r(
3301                         PerlIO_printf( Perl_debug_log,
3302                             "%*s  %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3303                             REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3304                             (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3305                             ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3306                     );
3307
3308                     if (ST.accept_buff[cur].wordnum <
3309                             ST.accept_buff[best].wordnum)
3310                         best = cur;
3311                 }
3312
3313                 DEBUG_EXECUTE_r({
3314                     AV *const trie_words
3315                         = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3316                     SV ** const tmp = av_fetch( trie_words,
3317                         ST.accept_buff[ best ].wordnum - 1, 0 );
3318                     regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3319                                     ST.B :
3320                                     ST.me + ST.jump[ST.accept_buff[best].wordnum];
3321                     SV *sv= tmp ? sv_newmortal() : NULL;
3322
3323                     PerlIO_printf( Perl_debug_log,
3324                         "%*s  %strying alternation #%d <%s> at node #%d %s\n",
3325                         REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3326                         ST.accept_buff[best].wordnum,
3327                         tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3328                                 PL_colors[0], PL_colors[1],
3329                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3330                             ) : "not compiled under -Dr",
3331                             REG_NODE_NUM(nextop),
3332                         PL_colors[5] );
3333                 });
3334
3335                 if ( best<ST.accepted ) {
3336                     reg_trie_accepted tmp = ST.accept_buff[ best ];
3337                     ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3338                     ST.accept_buff[ ST.accepted ] = tmp;
3339                     best = ST.accepted;
3340                 }
3341                 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3342                 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3343                     scan = ST.B;
3344                 } else {
3345                     scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3346                 }
3347                 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3348                 /* NOTREACHED */
3349             }
3350             /* NOTREACHED */
3351         case TRIE_next:
3352             /* we dont want to throw this away, see bug 57042*/
3353             if (oreplsv != GvSV(PL_replgv))
3354                 sv_setsv(oreplsv, GvSV(PL_replgv));
3355             FREETMPS;
3356             LEAVE;
3357             sayYES;
3358 #undef  ST
3359
3360         case EXACT: {
3361             char *s = STRING(scan);
3362             ln = STR_LEN(scan);
3363             if (do_utf8 != UTF) {
3364                 /* The target and the pattern have differing utf8ness. */
3365                 char *l = locinput;
3366                 const char * const e = s + ln;
3367
3368                 if (do_utf8) {
3369                     /* The target is utf8, the pattern is not utf8. */
3370                     while (s < e) {
3371                         STRLEN ulen;
3372                         if (l >= PL_regeol)
3373                              sayNO;
3374                         if (NATIVE_TO_UNI(*(U8*)s) !=
3375                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3376                                             uniflags))
3377                              sayNO;
3378                         l += ulen;
3379                         s ++;
3380                     }
3381                 }
3382                 else {
3383                     /* The target is not utf8, the pattern is utf8. */
3384                     while (s < e) {
3385                         STRLEN ulen;
3386                         if (l >= PL_regeol)
3387                             sayNO;
3388                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3389                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3390                                            uniflags))
3391                             sayNO;
3392                         s += ulen;
3393                         l ++;
3394                     }
3395                 }
3396                 locinput = l;
3397                 nextchr = UCHARAT(locinput);
3398                 break;
3399             }
3400             /* The target and the pattern have the same utf8ness. */
3401             /* Inline the first character, for speed. */
3402             if (UCHARAT(s) != nextchr)
3403                 sayNO;
3404             if (PL_regeol - locinput < ln)
3405                 sayNO;
3406             if (ln > 1 && memNE(s, locinput, ln))
3407                 sayNO;
3408             locinput += ln;
3409             nextchr = UCHARAT(locinput);
3410             break;
3411             }
3412         case EXACTFL:
3413             PL_reg_flags |= RF_tainted;
3414             /* FALL THROUGH */
3415         case EXACTF: {
3416             char * const s = STRING(scan);
3417             ln = STR_LEN(scan);
3418
3419             if (do_utf8 || UTF) {
3420               /* Either target or the pattern are utf8. */
3421                 const char * const l = locinput;
3422                 char *e = PL_regeol;
3423
3424                 if (ibcmp_utf8(s, 0,  ln, (bool)UTF,
3425                                l, &e, 0,  do_utf8)) {
3426                      /* One more case for the sharp s:
3427                       * pack("U0U*", 0xDF) =~ /ss/i,
3428                       * the 0xC3 0x9F are the UTF-8
3429                       * byte sequence for the U+00DF. */
3430
3431                      if (!(do_utf8 &&
3432                            toLOWER(s[0]) == 's' &&
3433                            ln >= 2 &&
3434                            toLOWER(s[1]) == 's' &&
3435                            (U8)l[0] == 0xC3 &&
3436                            e - l >= 2 &&
3437                            (U8)l[1] == 0x9F))
3438                           sayNO;
3439                 }
3440                 locinput = e;
3441                 nextchr = UCHARAT(locinput);
3442                 break;
3443             }
3444
3445             /* Neither the target and the pattern are utf8. */
3446
3447             /* Inline the first character, for speed. */
3448             if (UCHARAT(s) != nextchr &&
3449                 UCHARAT(s) != ((OP(scan) == EXACTF)
3450                                ? PL_fold : PL_fold_locale)[nextchr])
3451                 sayNO;
3452             if (PL_regeol - locinput < ln)
3453                 sayNO;
3454             if (ln > 1 && (OP(scan) == EXACTF
3455                            ? ibcmp(s, locinput, ln)
3456                            : ibcmp_locale(s, locinput, ln)))
3457                 sayNO;
3458             locinput += ln;
3459             nextchr = UCHARAT(locinput);
3460             break;
3461             }
3462         case BOUNDL:
3463         case NBOUNDL:
3464             PL_reg_flags |= RF_tainted;
3465             /* FALL THROUGH */
3466         case BOUND:
3467         case NBOUND:
3468             /* was last char in word? */
3469             if (do_utf8) {
3470                 if (locinput == PL_bostr)
3471                     ln = '\n';
3472                 else {
3473                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3474
3475                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3476                 }
3477                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3478                     ln = isALNUM_uni(ln);
3479                     LOAD_UTF8_CHARCLASS_ALNUM();
3480                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3481                 }
3482                 else {
3483                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3484                     n = isALNUM_LC_utf8((U8*)locinput);
3485                 }
3486             }
3487             else {
3488                 ln = (locinput != PL_bostr) ?
3489                     UCHARAT(locinput - 1) : '\n';
3490                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3491                     ln = isALNUM(ln);
3492                     n = isALNUM(nextchr);
3493                 }
3494                 else {
3495                     ln = isALNUM_LC(ln);
3496                     n = isALNUM_LC(nextchr);
3497                 }
3498             }
3499             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3500                                     OP(scan) == BOUNDL))
3501                     sayNO;
3502             break;
3503         case ANYOF:
3504             if (do_utf8) {
3505                 STRLEN inclasslen = PL_regeol - locinput;
3506
3507                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3508                     goto anyof_fail;
3509                 if (locinput >= PL_regeol)
3510                     sayNO;
3511                 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3512                 nextchr = UCHARAT(locinput);
3513                 break;
3514             }
3515             else {
3516                 if (nextchr < 0)
3517                     nextchr = UCHARAT(locinput);
3518                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3519                     goto anyof_fail;
3520                 if (!nextchr && locinput >= PL_regeol)
3521                     sayNO;
3522                 nextchr = UCHARAT(++locinput);
3523                 break;
3524             }
3525         anyof_fail:
3526             /* If we might have the case of the German sharp s
3527              * in a casefolding Unicode character class. */
3528
3529             if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3530                  locinput += SHARP_S_SKIP;
3531                  nextchr = UCHARAT(locinput);
3532             }
3533             else
3534                  sayNO;
3535             break;
3536         /* Special char classes - The defines start on line 129 or so */
3537         CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3538         CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3539
3540         CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3541         CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3542
3543         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3544         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3545
3546         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3547                        a Unicode extended Grapheme Cluster */
3548             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3549               extended Grapheme Cluster is:
3550
3551                CR LF
3552                | Prepend* Begin Extend*
3553                | .
3554
3555                Begin is (Hangul-syllable | ! Control)
3556                Extend is (Grapheme_Extend | Spacing_Mark)
3557                Control is [ GCB_Control CR LF ]
3558
3559                The discussion below shows how the code for CLUMP is derived
3560                from this regex.  Note that most of these concepts are from
3561                property values of the Grapheme Cluster Boundary (GCB) property.
3562                No code point can have multiple property values for a given
3563                property.  Thus a code point in Prepend can't be in Control, but
3564                it must be in !Control.  This is why Control above includes
3565                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3566                property separately, and so can't be in GCB_Control, even though
3567                they logically are controls.  Control is not the same as gc=cc,
3568                but includes format and other characters as well.
3569
3570                The Unicode definition of Hangul-syllable is:
3571                    L+
3572                    | (L* ( ( V | LV ) V* | LVT ) T*)
3573                    | T+
3574                   )
3575                Each of these is a value for the GCB property, and hence must be
3576                disjoint, so the order they are tested is immaterial, so the
3577                above can safely be changed to
3578                    T+
3579                    | L+
3580                    | (L* ( LVT | ( V | LV ) V*) T*)
3581
3582                The last two terms can be combined like this:
3583                    L* ( L
3584                         | (( LVT | ( V | LV ) V*) T*))
3585
3586                And refactored into this:
3587                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3588
3589                That means that if we have seen any L's at all we can quit
3590                there, but if the next character is a LVT, a V or and LV we
3591                should keep going.
3592
3593                There is a subtlety with Prepend* which showed up in testing.
3594                Note that the Begin, and only the Begin is required in:
3595                 | Prepend* Begin Extend*
3596                Also, Begin contains '! Control'.  A Prepend must be a '!
3597                Control', which means it must be a Begin.  What it comes down to
3598                is that if we match Prepend* and then find no suitable Begin
3599                afterwards, that if we backtrack the last Prepend, that one will
3600                be a suitable Begin.
3601             */
3602
3603             if (locinput >= PL_regeol)
3604                 sayNO;
3605             if  (! do_utf8) {
3606
3607                 /* Match either CR LF  or '.', as all the other possibilities
3608                  * require utf8 */
3609                 locinput++;         /* Match the . or CR */
3610                 if (nextchr == '\r'
3611                     && locinput < PL_regeol
3612                     && UCHARAT(locinput) == '\n') locinput++;
3613             }
3614             else {
3615
3616                 /* Utf8: See if is ( CR LF ); already know that locinput <
3617                  * PL_regeol, so locinput+1 is in bounds */
3618                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3619                     locinput += 2;
3620                 }
3621                 else {
3622                     /* In case have to backtrack to beginning, then match '.' */
3623                     char *starting = locinput;
3624
3625                     /* In case have to backtrack the last prepend */
3626                     char *previous_prepend = 0;
3627
3628                     LOAD_UTF8_CHARCLASS_GCB();
3629
3630                     /* Match (prepend)* */
3631                     while (locinput < PL_regeol
3632                            && swash_fetch(PL_utf8_X_prepend,
3633                                           (U8*)locinput, do_utf8))
3634                     {
3635                         previous_prepend = locinput;
3636                         locinput += UTF8SKIP(locinput);
3637                     }
3638
3639                     /* As noted above, if we matched a prepend character, but
3640                      * the next thing won't match, back off the last prepend we
3641                      * matched, as it is guaranteed to match the begin */
3642                     if (previous_prepend
3643                         && (locinput >=  PL_regeol
3644                             || ! swash_fetch(PL_utf8_X_begin,
3645                                              (U8*)locinput, do_utf8)))
3646                     {
3647                         locinput = previous_prepend;
3648                     }
3649
3650                     /* Note that here we know PL_regeol > locinput, as we
3651                      * tested that upon input to this switch case, and if we
3652                      * moved locinput forward, we tested the result just above
3653                      * and it either passed, or we backed off so that it will
3654                      * now pass */
3655                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) {
3656
3657                         /* Here did not match the required 'Begin' in the
3658                          * second term.  So just match the very first
3659                          * character, the '.' of the final term of the regex */
3660                         locinput = starting + UTF8SKIP(starting);
3661                     } else {
3662
3663                         /* Here is the beginning of a character that can have
3664                          * an extender.  It is either a hangul syllable, or a
3665                          * non-control */
3666                         if (swash_fetch(PL_utf8_X_non_hangul,
3667                                         (U8*)locinput, do_utf8))
3668                         {
3669
3670                             /* Here not a Hangul syllable, must be a
3671                              * ('!  * Control') */
3672                             locinput += UTF8SKIP(locinput);
3673                         } else {
3674
3675                             /* Here is a Hangul syllable.  It can be composed
3676                              * of several individual characters.  One
3677                              * possibility is T+ */
3678                             if (swash_fetch(PL_utf8_X_T,
3679                                             (U8*)locinput, do_utf8))
3680                             {
3681                                 while (locinput < PL_regeol
3682                                         && swash_fetch(PL_utf8_X_T,
3683                                                         (U8*)locinput, do_utf8))
3684                                 {
3685                                     locinput += UTF8SKIP(locinput);
3686                                 }
3687                             } else {
3688
3689                                 /* Here, not T+, but is a Hangul.  That means
3690                                  * it is one of the others: L, LV, LVT or V,
3691                                  * and matches:
3692                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3693
3694                                 /* Match L*           */
3695                                 while (locinput < PL_regeol
3696                                         && swash_fetch(PL_utf8_X_L,
3697                                                         (U8*)locinput, do_utf8))
3698                                 {
3699                                     locinput += UTF8SKIP(locinput);
3700                                 }
3701
3702                                 /* Here, have exhausted L*.  If the next
3703                                  * character is not an LV, LVT nor V, it means
3704                                  * we had to have at least one L, so matches L+
3705                                  * in the original equation, we have a complete
3706                                  * hangul syllable.  Are done. */
3707
3708                                 if (locinput < PL_regeol
3709                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3710                                                     (U8*)locinput, do_utf8))
3711                                 {
3712
3713                                     /* Otherwise keep going.  Must be LV, LVT
3714                                      * or V.  See if LVT */
3715                                     if (swash_fetch(PL_utf8_X_LVT,
3716                                                     (U8*)locinput, do_utf8))
3717                                     {
3718                                         locinput += UTF8SKIP(locinput);
3719                                     } else {
3720
3721                                         /* Must be  V or LV.  Take it, then
3722                                          * match V*     */
3723                                         locinput += UTF8SKIP(locinput);
3724                                         while (locinput < PL_regeol
3725                                                 && swash_fetch(PL_utf8_X_V,
3726                                                          (U8*)locinput, do_utf8))
3727                                         {
3728                                             locinput += UTF8SKIP(locinput);
3729                                         }
3730                                     }
3731
3732                                     /* And any of LV, LVT, or V can be followed
3733                                      * by T*            */
3734                                     while (locinput < PL_regeol
3735                                            && swash_fetch(PL_utf8_X_T,
3736                                                            (U8*)locinput,
3737                                                            do_utf8))
3738                                     {
3739                                         locinput += UTF8SKIP(locinput);
3740                                     }
3741                                 }
3742                             }
3743                         }
3744
3745                         /* Match any extender */
3746                         while (locinput < PL_regeol
3747                                 && swash_fetch(PL_utf8_X_extend,
3748                                                 (U8*)locinput, do_utf8))
3749                         {
3750                             locinput += UTF8SKIP(locinput);
3751                         }
3752                     }
3753                 }
3754                 if (locinput > PL_regeol) sayNO;
3755             }
3756             nextchr = UCHARAT(locinput);
3757             break;
3758
3759         case NREFFL:
3760         {
3761             char *s;
3762             char type;
3763             PL_reg_flags |= RF_tainted;
3764             /* FALL THROUGH */
3765         case NREF:
3766         case NREFF:
3767             type = OP(scan);
3768             n = reg_check_named_buff_matched(rex,scan);
3769
3770             if ( n ) {
3771                 type = REF + ( type - NREF );
3772                 goto do_ref;
3773             } else {
3774                 sayNO;
3775             }
3776             /* unreached */
3777         case REFFL:
3778             PL_reg_flags |= RF_tainted;
3779             /* FALL THROUGH */
3780         case REF:
3781         case REFF:
3782             n = ARG(scan);  /* which paren pair */
3783             type = OP(scan);
3784           do_ref:
3785             ln = PL_regoffs[n].start;
3786             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
3787             if (*PL_reglastparen < n || ln == -1)
3788                 sayNO;                  /* Do not match unless seen CLOSEn. */
3789             if (ln == PL_regoffs[n].end)
3790                 break;
3791
3792             s = PL_bostr + ln;
3793             if (do_utf8 && type != REF) {       /* REF can do byte comparison */
3794                 char *l = locinput;
3795                 const char *e = PL_bostr + PL_regoffs[n].end;
3796                 /*
3797                  * Note that we can't do the "other character" lookup trick as
3798                  * in the 8-bit case (no pun intended) because in Unicode we
3799                  * have to map both upper and title case to lower case.
3800                  */
3801                 if (type == REFF) {
3802                     while (s < e) {
3803                         STRLEN ulen1, ulen2;
3804                         U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3805                         U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3806
3807                         if (l >= PL_regeol)
3808                             sayNO;
3809                         toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3810                         toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3811                         if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3812                             sayNO;
3813                         s += ulen1;
3814                         l += ulen2;
3815                     }
3816                 }
3817                 locinput = l;
3818                 nextchr = UCHARAT(locinput);
3819                 break;
3820             }
3821
3822             /* Inline the first character, for speed. */
3823             if (UCHARAT(s) != nextchr &&
3824                 (type == REF ||
3825                  (UCHARAT(s) != (type == REFF
3826                                   ? PL_fold : PL_fold_locale)[nextchr])))
3827                 sayNO;
3828             ln = PL_regoffs[n].end - ln;
3829             if (locinput + ln > PL_regeol)
3830                 sayNO;
3831             if (ln > 1 && (type == REF
3832                            ? memNE(s, locinput, ln)
3833                            : (type == REFF
3834                               ? ibcmp(s, locinput, ln)
3835                               : ibcmp_locale(s, locinput, ln))))
3836                 sayNO;
3837             locinput += ln;
3838             nextchr = UCHARAT(locinput);
3839             break;
3840         }
3841         case NOTHING:
3842         case TAIL:
3843             break;
3844         case BACK:
3845             break;
3846
3847 #undef  ST
3848 #define ST st->u.eval
3849         {
3850             SV *ret;
3851             REGEXP *re_sv;
3852             regexp *re;
3853             regexp_internal *rei;
3854             regnode *startpoint;
3855
3856         case GOSTART:
3857         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
3858             if (cur_eval && cur_eval->locinput==locinput) {
3859                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3860                     Perl_croak(aTHX_ "Infinite recursion in regex");
3861                 if ( ++nochange_depth > max_nochange_depth )
3862                     Perl_croak(aTHX_
3863                         "Pattern subroutine nesting without pos change"
3864                         " exceeded limit in regex");
3865             } else {
3866                 nochange_depth = 0;
3867             }
3868             re_sv = rex_sv;
3869             re = rex;
3870             rei = rexi;
3871             (void)ReREFCNT_inc(rex_sv);
3872             if (OP(scan)==GOSUB) {
3873                 startpoint = scan + ARG2L(scan);
3874                 ST.close_paren = ARG(scan);
3875             } else {
3876                 startpoint = rei->program+1;
3877                 ST.close_paren = 0;
3878             }
3879             goto eval_recurse_doit;
3880             /* NOTREACHED */
3881         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
3882             if (cur_eval && cur_eval->locinput==locinput) {
3883                 if ( ++nochange_depth > max_nochange_depth )
3884                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3885             } else {
3886                 nochange_depth = 0;
3887             }
3888             {
3889                 /* execute the code in the {...} */
3890                 dSP;
3891                 SV ** const before = SP;
3892                 OP_4tree * const oop = PL_op;
3893                 COP * const ocurcop = PL_curcop;
3894                 PAD *old_comppad;
3895                 char *saved_regeol = PL_regeol;
3896
3897                 n = ARG(scan);
3898                 PL_op = (OP_4tree*)rexi->data->data[n];
3899                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3900                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3901                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3902                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3903
3904                 if (sv_yes_mark) {
3905                     SV *sv_mrk = get_sv("REGMARK", 1);
3906                     sv_setsv(sv_mrk, sv_yes_mark);
3907                 }
3908
3909                 CALLRUNOPS(aTHX);                       /* Scalar context. */
3910                 SPAGAIN;
3911                 if (SP == before)
3912                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
3913                 else {
3914                     ret = POPs;
3915                     PUTBACK;
3916                 }
3917
3918                 PL_op = oop;
3919                 PAD_RESTORE_LOCAL(old_comppad);
3920                 PL_curcop = ocurcop;
3921                 PL_regeol = saved_regeol;
3922                 if (!logical) {
3923                     /* /(?{...})/ */
3924                     sv_setsv(save_scalar(PL_replgv), ret);
3925                     break;
3926                 }
3927             }
3928             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3929                 logical = 0;
3930                 {
3931                     /* extract RE object from returned value; compiling if
3932                      * necessary */
3933                     MAGIC *mg = NULL;
3934                     REGEXP *rx = NULL;
3935
3936                     if (SvROK(ret)) {
3937                         SV *const sv = SvRV(ret);
3938
3939                         if (SvTYPE(sv) == SVt_REGEXP) {
3940                             rx = (REGEXP*) sv;
3941                         } else if (SvSMAGICAL(sv)) {
3942                             mg = mg_find(sv, PERL_MAGIC_qr);
3943                             assert(mg);
3944                         }
3945                     } else if (SvTYPE(ret) == SVt_REGEXP) {
3946                         rx = (REGEXP*) ret;
3947                     } else if (SvSMAGICAL(ret)) {
3948                         if (SvGMAGICAL(ret)) {
3949                             /* I don't believe that there is ever qr magic
3950                                here.  */
3951                             assert(!mg_find(ret, PERL_MAGIC_qr));
3952                             sv_unmagic(ret, PERL_MAGIC_qr);
3953                         }
3954                         else {
3955                             mg = mg_find(ret, PERL_MAGIC_qr);
3956                             /* testing suggests mg only ends up non-NULL for
3957                                scalars who were upgraded and compiled in the
3958                                else block below. In turn, this is only
3959                                triggered in the "postponed utf8 string" tests
3960                                in t/op/pat.t  */
3961                         }
3962                     }
3963
3964                     if (mg) {
3965                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
3966                         assert(rx);
3967                     }
3968                     if (rx) {
3969                         rx = reg_temp_copy(NULL, rx);
3970                     }
3971                     else {
3972                         U32 pm_flags = 0;
3973                         const I32 osize = PL_regsize;
3974
3975                         if (DO_UTF8(ret)) {
3976                             assert (SvUTF8(ret));
3977                         } else if (SvUTF8(ret)) {
3978                             /* Not doing UTF-8, despite what the SV says. Is
3979                                this only if we're trapped in use 'bytes'?  */
3980                             /* Make a copy of the octet sequence, but without
3981                                the flag on, as the compiler now honours the
3982                                SvUTF8 flag on ret.  */
3983                             STRLEN len;
3984                             const char *const p = SvPV(ret, len);
3985                             ret = newSVpvn_flags(p, len, SVs_TEMP);
3986                         }
3987                         rx = CALLREGCOMP(ret, pm_flags);
3988                         if (!(SvFLAGS(ret)
3989                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3990                                  | SVs_GMG))) {
3991                             /* This isn't a first class regexp. Instead, it's
3992                                caching a regexp onto an existing, Perl visible
3993                                scalar.  */
3994                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3995                         }
3996                         PL_regsize = osize;
3997                     }
3998                     re_sv = rx;
3999                     re = (struct regexp *)SvANY(rx);
4000                 }
4001                 RXp_MATCH_COPIED_off(re);
4002                 re->subbeg = rex->subbeg;
4003                 re->sublen = rex->sublen;
4004                 rei = RXi_GET(re);
4005                 DEBUG_EXECUTE_r(
4006                     debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
4007                         "Matching embedded");
4008                 );
4009                 startpoint = rei->program + 1;
4010                 ST.close_paren = 0; /* only used for GOSUB */
4011                 /* borrowed from regtry */
4012                 if (PL_reg_start_tmpl <= re->nparens) {
4013                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4014                     if(PL_reg_start_tmp)
4015                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4016                     else
4017                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4018                 }
4019
4020         eval_recurse_doit: /* Share code with GOSUB below this line */
4021                 /* run the pattern returned from (??{...}) */
4022                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4023                 REGCP_SET(ST.lastcp);
4024
4025                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4026
4027                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4028                 PL_reglastparen = &re->lastparen;
4029                 PL_reglastcloseparen = &re->lastcloseparen;
4030                 re->lastparen = 0;
4031                 re->lastcloseparen = 0;
4032
4033                 PL_reginput = locinput;
4034                 PL_regsize = 0;
4035
4036                 /* XXXX This is too dramatic a measure... */
4037                 PL_reg_maxiter = 0;
4038
4039                 ST.toggle_reg_flags = PL_reg_flags;
4040                 if (RX_UTF8(re_sv))
4041                     PL_reg_flags |= RF_utf8;
4042                 else
4043                     PL_reg_flags &= ~RF_utf8;
4044                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4045
4046                 ST.prev_rex = rex_sv;
4047                 ST.prev_curlyx = cur_curlyx;
4048                 SETREX(rex_sv,re_sv);
4049                 rex = re;
4050                 rexi = rei;
4051                 cur_curlyx = NULL;
4052                 ST.B = next;
4053                 ST.prev_eval = cur_eval;
4054                 cur_eval = st;
4055                 /* now continue from first node in postoned RE */
4056                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4057                 /* NOTREACHED */
4058             }
4059             /* logical is 1,   /(?(?{...})X|Y)/ */
4060             sw = (bool)SvTRUE(ret);
4061             logical = 0;
4062             break;
4063         }
4064
4065         case EVAL_AB: /* cleanup after a successful (??{A})B */
4066             /* note: this is called twice; first after popping B, then A */
4067             PL_reg_flags ^= ST.toggle_reg_flags;
4068             ReREFCNT_dec(rex_sv);
4069             SETREX(rex_sv,ST.prev_rex);
4070             rex = (struct regexp *)SvANY(rex_sv);
4071             rexi = RXi_GET(rex);
4072             regcpblow(ST.cp);
4073             cur_eval = ST.prev_eval;
4074             cur_curlyx = ST.prev_curlyx;
4075
4076             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4077             PL_reglastparen = &rex->lastparen;
4078             PL_reglastcloseparen = &rex->lastcloseparen;
4079             /* also update PL_regoffs */
4080             PL_regoffs = rex->offs;
4081
4082             /* XXXX This is too dramatic a measure... */
4083             PL_reg_maxiter = 0;
4084             if ( nochange_depth )
4085                 nochange_depth--;
4086             sayYES;
4087
4088
4089         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4090             /* note: this is called twice; first after popping B, then A */
4091             PL_reg_flags ^= ST.toggle_reg_flags;
4092             ReREFCNT_dec(rex_sv);
4093             SETREX(rex_sv,ST.prev_rex);
4094             rex = (struct regexp *)SvANY(rex_sv);
4095             rexi = RXi_GET(rex);
4096             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4097             PL_reglastparen = &rex->lastparen;
4098             PL_reglastcloseparen = &rex->lastcloseparen;
4099
4100             PL_reginput = locinput;
4101             REGCP_UNWIND(ST.lastcp);
4102             regcppop(rex);
4103             cur_eval = ST.prev_eval;
4104             cur_curlyx = ST.prev_curlyx;
4105             /* XXXX This is too dramatic a measure... */
4106             PL_reg_maxiter = 0;
4107             if ( nochange_depth )
4108                 nochange_depth--;
4109             sayNO_SILENT;
4110 #undef ST
4111
4112         case OPEN:
4113             n = ARG(scan);  /* which paren pair */
4114             PL_reg_start_tmp[n] = locinput;
4115             if (n > PL_regsize)
4116                 PL_regsize = n;
4117             lastopen = n;
4118             break;
4119         case CLOSE:
4120             n = ARG(scan);  /* which paren pair */
4121             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4122             PL_regoffs[n].end = locinput - PL_bostr;
4123             /*if (n > PL_regsize)
4124                 PL_regsize = n;*/
4125             if (n > *PL_reglastparen)
4126                 *PL_reglastparen = n;
4127             *PL_reglastcloseparen = n;
4128             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4129                 goto fake_end;
4130             }
4131             break;
4132         case ACCEPT:
4133             if (ARG(scan)){
4134                 regnode *cursor;
4135                 for (cursor=scan;
4136                      cursor && OP(cursor)!=END;
4137                      cursor=regnext(cursor))
4138                 {
4139                     if ( OP(cursor)==CLOSE ){
4140                         n = ARG(cursor);
4141                         if ( n <= lastopen ) {
4142                             PL_regoffs[n].start
4143                                 = PL_reg_start_tmp[n] - PL_bostr;
4144                             PL_regoffs[n].end = locinput - PL_bostr;
4145                             /*if (n > PL_regsize)
4146                             PL_regsize = n;*/
4147                             if (n > *PL_reglastparen)
4148                                 *PL_reglastparen = n;
4149                             *PL_reglastcloseparen = n;
4150                             if ( n == ARG(scan) || (cur_eval &&
4151                                 cur_eval->u.eval.close_paren == n))
4152                                 break;
4153                         }
4154                     }
4155                 }
4156             }
4157             goto fake_end;
4158             /*NOTREACHED*/
4159         case GROUPP:
4160             n = ARG(scan);  /* which paren pair */
4161             sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4162             break;
4163         case NGROUPP:
4164             /* reg_check_named_buff_matched returns 0 for no match */
4165             sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
4166             break;
4167         case INSUBP:
4168             n = ARG(scan);
4169             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4170             break;
4171         case DEFINEP:
4172             sw = 0;
4173             break;
4174         case IFTHEN:
4175             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4176             if (sw)
4177                 next = NEXTOPER(NEXTOPER(scan));
4178             else {
4179                 next = scan + ARG(scan);
4180                 if (OP(next) == IFTHEN) /* Fake one. */
4181                     next = NEXTOPER(NEXTOPER(next));
4182             }
4183             break;
4184         case LOGICAL:
4185             logical = scan->flags;
4186             break;
4187
4188 /*******************************************************************
4189
4190 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4191 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4192 STAR/PLUS/CURLY/CURLYN are used instead.)
4193
4194 A*B is compiled as <CURLYX><A><WHILEM><B>
4195
4196 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4197 state, which contains the current count, initialised to -1. It also sets
4198 cur_curlyx to point to this state, with any previous value saved in the
4199 state block.
4200
4201 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4202 since the pattern may possibly match zero times (i.e. it's a while {} loop
4203 rather than a do {} while loop).
4204
4205 Each entry to WHILEM represents a successful match of A. The count in the
4206 CURLYX block is incremented, another WHILEM state is pushed, and execution
4207 passes to A or B depending on greediness and the current count.
4208
4209 For example, if matching against the string a1a2a3b (where the aN are
4210 substrings that match /A/), then the match progresses as follows: (the
4211 pushed states are interspersed with the bits of strings matched so far):
4212
4213     <CURLYX cnt=-1>
4214     <CURLYX cnt=0><WHILEM>
4215     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4216     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4217     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4218     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4219
4220 (Contrast this with something like CURLYM, which maintains only a single
4221 backtrack state:
4222
4223     <CURLYM cnt=0> a1
4224     a1 <CURLYM cnt=1> a2
4225     a1 a2 <CURLYM cnt=2> a3
4226     a1 a2 a3 <CURLYM cnt=3> b
4227 )
4228
4229 Each WHILEM state block marks a point to backtrack to upon partial failure
4230 of A or B, and also contains some minor state data related to that
4231 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4232 overall state, such as the count, and pointers to the A and B ops.
4233
4234 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4235 must always point to the *current* CURLYX block, the rules are:
4236
4237 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4238 and set cur_curlyx to point the new block.
4239
4240 When popping the CURLYX block after a successful or unsuccessful match,
4241 restore the previous cur_curlyx.
4242
4243 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4244 to the outer one saved in the CURLYX block.
4245
4246 When popping the WHILEM block after a successful or unsuccessful B match,
4247 restore the previous cur_curlyx.
4248
4249 Here's an example for the pattern (AI* BI)*BO
4250 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4251
4252 cur_
4253 curlyx backtrack stack
4254 ------ ---------------
4255 NULL
4256 CO     <CO prev=NULL> <WO>
4257 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4258 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4259 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4260
4261 At this point the pattern succeeds, and we work back down the stack to
4262 clean up, restoring as we go:
4263
4264 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4265 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4266 CO     <CO prev=NULL> <WO>
4267 NULL
4268
4269 *******************************************************************/
4270
4271 #define ST st->u.curlyx
4272
4273         case CURLYX:    /* start of /A*B/  (for complex A) */
4274         {
4275             /* No need to save/restore up to this paren */
4276             I32 parenfloor = scan->flags;
4277
4278             assert(next); /* keep Coverity happy */
4279             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4280                 next += ARG(next);
4281
4282             /* XXXX Probably it is better to teach regpush to support
4283                parenfloor > PL_regsize... */
4284             if (parenfloor > (I32)*PL_reglastparen)
4285                 parenfloor = *PL_reglastparen; /* Pessimization... */
4286
4287             ST.prev_curlyx= cur_curlyx;
4288             cur_curlyx = st;
4289             ST.cp = PL_savestack_ix;
4290
4291             /* these fields contain the state of the current curly.
4292              * they are accessed by subsequent WHILEMs */
4293             ST.parenfloor = parenfloor;
4294             ST.min = ARG1(scan);
4295             ST.max = ARG2(scan);
4296             ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4297             ST.B = next;
4298             ST.minmod = minmod;
4299             minmod = 0;
4300             ST.count = -1;      /* this will be updated by WHILEM */
4301             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4302
4303             PL_reginput = locinput;
4304             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4305             /* NOTREACHED */
4306         }
4307
4308         case CURLYX_end: /* just finished matching all of A*B */
4309             cur_curlyx = ST.prev_curlyx;
4310             sayYES;
4311             /* NOTREACHED */
4312
4313         case CURLYX_end_fail: /* just failed to match all of A*B */
4314             regcpblow(ST.cp);
4315             cur_curlyx = ST.prev_curlyx;
4316             sayNO;
4317             /* NOTREACHED */
4318
4319
4320 #undef ST
4321 #define ST st->u.whilem
4322
4323         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4324         {
4325             /* see the discussion above about CURLYX/WHILEM */
4326             I32 n;
4327             assert(cur_curlyx); /* keep Coverity happy */
4328             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4329             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4330             ST.cache_offset = 0;
4331             ST.cache_mask = 0;
4332
4333             PL_reginput = locinput;
4334
4335             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4336                   "%*s  whilem: matched %ld out of %ld..%ld\n",
4337                   REPORT_CODE_OFF+depth*2, "", (long)n,
4338                   (long)cur_curlyx->u.curlyx.min,
4339                   (long)cur_curlyx->u.curlyx.max)
4340             );
4341
4342             /* First just match a string of min A's. */
4343
4344             if (n < cur_curlyx->u.curlyx.min) {
4345                 cur_curlyx->u.curlyx.lastloc = locinput;
4346                 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4347                 /* NOTREACHED */
4348             }
4349
4350             /* If degenerate A matches "", assume A done. */
4351
4352             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4353                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4354                    "%*s  whilem: empty match detected, trying continuation...\n",
4355                    REPORT_CODE_OFF+depth*2, "")
4356                 );
4357                 goto do_whilem_B_max;
4358             }
4359
4360             /* super-linear cache processing */
4361
4362             if (scan->flags) {
4363
4364                 if (!PL_reg_maxiter) {
4365                     /* start the countdown: Postpone detection until we
4366                      * know the match is not *that* much linear. */
4367                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4368                     /* possible overflow for long strings and many CURLYX's */
4369                     if (PL_reg_maxiter < 0)
4370                         PL_reg_maxiter = I32_MAX;
4371                     PL_reg_leftiter = PL_reg_maxiter;
4372                 }
4373
4374                 if (PL_reg_leftiter-- == 0) {
4375                     /* initialise cache */
4376                     const I32 size = (PL_reg_maxiter + 7)/8;
4377                     if (PL_reg_poscache) {
4378                         if ((I32)PL_reg_poscache_size < size) {
4379                             Renew(PL_reg_poscache, size, char);
4380                             PL_reg_poscache_size = size;
4381                         }
4382                         Zero(PL_reg_poscache, size, char);
4383                     }
4384                     else {
4385                         PL_reg_poscache_size = size;
4386                         Newxz(PL_reg_poscache, size, char);
4387                     }
4388                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4389       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4390                               PL_colors[4], PL_colors[5])
4391                     );
4392                 }
4393
4394                 if (PL_reg_leftiter < 0) {
4395                     /* have we already failed at this position? */
4396                     I32 offset, mask;
4397                     offset  = (scan->flags & 0xf) - 1
4398                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4399                     mask    = 1 << (offset % 8);
4400                     offset /= 8;
4401                     if (PL_reg_poscache[offset] & mask) {
4402                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4403                             "%*s  whilem: (cache) already tried at this position...\n",
4404                             REPORT_CODE_OFF+depth*2, "")
4405                         );
4406                         sayNO; /* cache records failure */
4407                     }
4408                     ST.cache_offset = offset;
4409                     ST.cache_mask   = mask;
4410                 }
4411             }
4412
4413             /* Prefer B over A for minimal matching. */
4414
4415             if (cur_curlyx->u.curlyx.minmod) {
4416                 ST.save_curlyx = cur_curlyx;
4417                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4418                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4419                 REGCP_SET(ST.lastcp);
4420                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4421                 /* NOTREACHED */
4422             }
4423
4424             /* Prefer A over B for maximal matching. */
4425
4426             if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4427                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4428                 cur_curlyx->u.curlyx.lastloc = locinput;
4429                 REGCP_SET(ST.lastcp);
4430                 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4431                 /* NOTREACHED */
4432             }
4433             goto do_whilem_B_max;
4434         }
4435         /* NOTREACHED */
4436
4437         case WHILEM_B_min: /* just matched B in a minimal match */
4438         case WHILEM_B_max: /* just matched B in a maximal match */
4439             cur_curlyx = ST.save_curlyx;
4440             sayYES;
4441             /* NOTREACHED */
4442
4443         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4444             cur_curlyx = ST.save_curlyx;
4445             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4446             cur_curlyx->u.curlyx.count--;
4447             CACHEsayNO;
4448             /* NOTREACHED */
4449
4450         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4451             REGCP_UNWIND(ST.lastcp);
4452             regcppop(rex);
4453             /* FALL THROUGH */
4454         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4455             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4456             cur_curlyx->u.curlyx.count--;
4457             CACHEsayNO;
4458             /* NOTREACHED */
4459
4460         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4461             REGCP_UNWIND(ST.lastcp);
4462             regcppop(rex);      /* Restore some previous $<digit>s? */
4463             PL_reginput = locinput;
4464             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4465                 "%*s  whilem: failed, trying continuation...\n",
4466                 REPORT_CODE_OFF+depth*2, "")
4467             );
4468           do_whilem_B_max:
4469             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4470                 && ckWARN(WARN_REGEXP)
4471                 && !(PL_reg_flags & RF_warned))
4472             {
4473                 PL_reg_flags |= RF_warned;
4474                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4475                      "Complex regular subexpression recursion",
4476                      REG_INFTY - 1);
4477             }
4478
4479             /* now try B */
4480             ST.save_curlyx = cur_curlyx;
4481             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4482             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4483             /* NOTREACHED */
4484
4485         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4486             cur_curlyx = ST.save_curlyx;
4487             REGCP_UNWIND(ST.lastcp);
4488             regcppop(rex);
4489
4490             if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4491                 /* Maximum greed exceeded */
4492                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4493                     && ckWARN(WARN_REGEXP)
4494                     && !(PL_reg_flags & RF_warned))
4495                 {
4496                     PL_reg_flags |= RF_warned;
4497                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4498                         "%s limit (%d) exceeded",
4499                         "Complex regular subexpression recursion",
4500                         REG_INFTY - 1);
4501                 }
4502                 cur_curlyx->u.curlyx.count--;
4503                 CACHEsayNO;
4504             }
4505
4506             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4507                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4508             );
4509             /* Try grabbing another A and see if it helps. */
4510             PL_reginput = locinput;
4511             cur_curlyx->u.curlyx.lastloc = locinput;
4512             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4513             REGCP_SET(ST.lastcp);
4514             PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4515             /* NOTREACHED */
4516
4517 #undef  ST
4518 #define ST st->u.branch
4519
4520         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4521             next = scan + ARG(scan);
4522             if (next == scan)
4523                 next = NULL;
4524             scan = NEXTOPER(scan);
4525             /* FALL THROUGH */
4526
4527         case BRANCH:        /*  /(...|A|...)/ */
4528             scan = NEXTOPER(scan); /* scan now points to inner node */
4529             ST.lastparen = *PL_reglastparen;
4530             ST.next_branch = next;
4531             REGCP_SET(ST.cp);
4532             PL_reginput = locinput;
4533
4534             /* Now go into the branch */
4535             if (has_cutgroup) {
4536                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4537             } else {
4538                 PUSH_STATE_GOTO(BRANCH_next, scan);
4539             }
4540             /* NOTREACHED */
4541         case CUTGROUP:
4542             PL_reginput = locinput;
4543             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4544                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4545             PUSH_STATE_GOTO(CUTGROUP_next,next);
4546             /* NOTREACHED */
4547         case CUTGROUP_next_fail:
4548             do_cutgroup = 1;
4549             no_final = 1;
4550             if (st->u.mark.mark_name)
4551                 sv_commit = st->u.mark.mark_name;
4552             sayNO;
4553             /* NOTREACHED */
4554         case BRANCH_next:
4555             sayYES;
4556             /* NOTREACHED */
4557         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4558             if (do_cutgroup) {
4559                 do_cutgroup = 0;
4560                 no_final = 0;
4561             }
4562             REGCP_UNWIND(ST.cp);
4563             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4564                 PL_regoffs[n].end = -1;
4565             *PL_reglastparen = n;
4566             /*dmq: *PL_reglastcloseparen = n; */
4567             scan = ST.next_branch;
4568             /* no more branches? */
4569             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4570                 DEBUG_EXECUTE_r({
4571                     PerlIO_printf( Perl_debug_log,
4572                         "%*s  %sBRANCH failed...%s\n",
4573                         REPORT_CODE_OFF+depth*2, "",
4574                         PL_colors[4],
4575                         PL_colors[5] );
4576                 });
4577                 sayNO_SILENT;
4578             }
4579             continue; /* execute next BRANCH[J] op */
4580             /* NOTREACHED */
4581
4582         case MINMOD:
4583             minmod = 1;
4584             break;
4585
4586 #undef  ST
4587 #define ST st->u.curlym
4588
4589         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4590
4591             /* This is an optimisation of CURLYX that enables us to push
4592              * only a single backtracking state, no matter how many matches
4593              * there are in {m,n}. It relies on the pattern being constant
4594              * length, with no parens to influence future backrefs
4595              */
4596
4597             ST.me = scan;
4598             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4599
4600             /* if paren positive, emulate an OPEN/CLOSE around A */
4601             if (ST.me->flags) {
4602                 U32 paren = ST.me->flags;
4603                 if (paren > PL_regsize)
4604                     PL_regsize = paren;
4605                 if (paren > *PL_reglastparen)
4606                     *PL_reglastparen = paren;
4607                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4608             }
4609             ST.A = scan;
4610             ST.B = next;
4611             ST.alen = 0;
4612             ST.count = 0;
4613             ST.minmod = minmod;
4614             minmod = 0;
4615             ST.c1 = CHRTEST_UNINIT;
4616             REGCP_SET(ST.cp);
4617
4618             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4619                 goto curlym_do_B;
4620
4621           curlym_do_A: /* execute the A in /A{m,n}B/  */
4622             PL_reginput = locinput;
4623             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4624             /* NOTREACHED */
4625
4626         case CURLYM_A: /* we've just matched an A */
4627             locinput = st->locinput;
4628             nextchr = UCHARAT(locinput);
4629
4630             ST.count++;
4631             /* after first match, determine A's length: u.curlym.alen */
4632             if (ST.count == 1) {
4633                 if (PL_reg_match_utf8) {
4634                     char *s = locinput;
4635                     while (s < PL_reginput) {
4636                         ST.alen++;
4637                         s += UTF8SKIP(s);
4638                     }
4639                 }
4640                 else {
4641                     ST.alen = PL_reginput - locinput;
4642                 }
4643                 if (ST.alen == 0)
4644                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4645             }
4646             DEBUG_EXECUTE_r(
4647                 PerlIO_printf(Perl_debug_log,
4648                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4649                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4650                           (IV) ST.count, (IV)ST.alen)
4651             );
4652
4653             locinput = PL_reginput;
4654
4655             if (cur_eval && cur_eval->u.eval.close_paren &&
4656                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4657                 goto fake_end;
4658
4659             {
4660                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4661                 if ( max == REG_INFTY || ST.count < max )
4662                     goto curlym_do_A; /* try to match another A */
4663             }
4664             goto curlym_do_B; /* try to match B */
4665
4666         case CURLYM_A_fail: /* just failed to match an A */
4667             REGCP_UNWIND(ST.cp);
4668
4669             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4670                 || (cur_eval && cur_eval->u.eval.close_paren &&
4671                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4672                 sayNO;
4673
4674           curlym_do_B: /* execute the B in /A{m,n}B/  */
4675             PL_reginput = locinput;
4676             if (ST.c1 == CHRTEST_UNINIT) {
4677                 /* calculate c1 and c2 for possible match of 1st char
4678                  * following curly */
4679                 ST.c1 = ST.c2 = CHRTEST_VOID;
4680                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4681                     regnode *text_node = ST.B;
4682                     if (! HAS_TEXT(text_node))
4683                         FIND_NEXT_IMPT(text_node);
4684                     /* this used to be
4685
4686                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4687
4688                         But the former is redundant in light of the latter.
4689
4690                         if this changes back then the macro for
4691                         IS_TEXT and friends need to change.
4692                      */
4693                     if (PL_regkind[OP(text_node)] == EXACT)
4694                     {
4695
4696                         ST.c1 = (U8)*STRING(text_node);
4697                         ST.c2 =
4698                             (IS_TEXTF(text_node))
4699                             ? PL_fold[ST.c1]
4700                             : (IS_TEXTFL(text_node))
4701                                 ? PL_fold_locale[ST.c1]
4702                                 : ST.c1;
4703                     }
4704                 }
4705             }
4706
4707             DEBUG_EXECUTE_r(
4708                 PerlIO_printf(Perl_debug_log,
4709                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4710                     (int)(REPORT_CODE_OFF+(depth*2)),
4711                     "", (IV)ST.count)
4712                 );
4713             if (ST.c1 != CHRTEST_VOID
4714                     && UCHARAT(PL_reginput) != ST.c1
4715                     && UCHARAT(PL_reginput) != ST.c2)
4716             {
4717                 /* simulate B failing */
4718                 DEBUG_OPTIMISE_r(
4719                     PerlIO_printf(Perl_debug_log,
4720                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4721                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4722                         (IV)ST.c1,(IV)ST.c2
4723                 ));
4724                 state_num = CURLYM_B_fail;
4725                 goto reenter_switch;
4726             }
4727
4728             if (ST.me->flags) {
4729                 /* mark current A as captured */
4730                 I32 paren = ST.me->flags;
4731                 if (ST.count) {
4732                     PL_regoffs[paren].start
4733                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4734                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4735                     /*dmq: *PL_reglastcloseparen = paren; */
4736                 }
4737                 else
4738                     PL_regoffs[paren].end = -1;
4739                 if (cur_eval && cur_eval->u.eval.close_paren &&
4740                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4741                 {
4742                     if (ST.count)
4743                         goto fake_end;
4744                     else
4745                         sayNO;
4746                 }
4747             }
4748
4749             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4750             /* NOTREACHED */
4751
4752         case CURLYM_B_fail: /* just failed to match a B */
4753             REGCP_UNWIND(ST.cp);
4754             if (ST.minmod) {
4755                 I32 max = ARG2(ST.me);
4756                 if (max != REG_INFTY && ST.count == max)
4757                     sayNO;
4758                 goto curlym_do_A; /* try to match a further A */
4759             }
4760             /* backtrack one A */
4761             if (ST.count == ARG1(ST.me) /* min */)
4762                 sayNO;
4763             ST.count--;
4764             locinput = HOPc(locinput, -ST.alen);
4765             goto curlym_do_B; /* try to match B */
4766
4767 #undef ST
4768 #define ST st->u.curly
4769
4770 #define CURLY_SETPAREN(paren, success) \
4771     if (paren) { \
4772         if (success) { \
4773             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4774             PL_regoffs[paren].end = locinput - PL_bostr; \
4775             *PL_reglastcloseparen = paren; \
4776         } \
4777         else \
4778             PL_regoffs[paren].end = -1; \
4779     }
4780
4781         case STAR:              /*  /A*B/ where A is width 1 */
4782             ST.paren = 0;
4783             ST.min = 0;
4784             ST.max = REG_INFTY;
4785             scan = NEXTOPER(scan);
4786             goto repeat;
4787         case PLUS:              /*  /A+B/ where A is width 1 */
4788             ST.paren = 0;
4789             ST.min = 1;
4790             ST.max = REG_INFTY;
4791             scan = NEXTOPER(scan);
4792             goto repeat;
4793         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
4794             ST.paren = scan->flags;     /* Which paren to set */
4795             if (ST.paren > PL_regsize)
4796                 PL_regsize = ST.paren;
4797             if (ST.paren > *PL_reglastparen)
4798                 *PL_reglastparen = ST.paren;
4799             ST.min = ARG1(scan);  /* min to match */
4800             ST.max = ARG2(scan);  /* max to match */
4801             if (cur_eval && cur_eval->u.eval.close_paren &&
4802                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4803                 ST.min=1;
4804                 ST.max=1;
4805             }
4806             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4807             goto repeat;
4808         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
4809             ST.paren = 0;
4810             ST.min = ARG1(scan);  /* min to match */
4811             ST.max = ARG2(scan);  /* max to match */
4812             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4813           repeat:
4814             /*
4815             * Lookahead to avoid useless match attempts
4816             * when we know what character comes next.
4817             *
4818             * Used to only do .*x and .*?x, but now it allows
4819             * for )'s, ('s and (?{ ... })'s to be in the way
4820             * of the quantifier and the EXACT-like node.  -- japhy
4821             */
4822
4823             if (ST.min > ST.max) /* XXX make this a compile-time check? */
4824                 sayNO;
4825             if (HAS_TEXT(next) || JUMPABLE(next)) {
4826                 U8 *s;
4827                 regnode *text_node = next;
4828
4829                 if (! HAS_TEXT(text_node))
4830                     FIND_NEXT_IMPT(text_node);
4831
4832                 if (! HAS_TEXT(text_node))
4833                     ST.c1 = ST.c2 = CHRTEST_VOID;
4834                 else {
4835                     if ( PL_regkind[OP(text_node)] != EXACT ) {
4836                         ST.c1 = ST.c2 = CHRTEST_VOID;
4837                         goto assume_ok_easy;
4838                     }
4839                     else
4840                         s = (U8*)STRING(text_node);
4841
4842                     /*  Currently we only get here when
4843
4844                         PL_rekind[OP(text_node)] == EXACT
4845
4846                         if this changes back then the macro for IS_TEXT and
4847                         friends need to change. */
4848                     if (!UTF) {
4849                         ST.c2 = ST.c1 = *s;
4850                         if (IS_TEXTF(text_node))
4851                             ST.c2 = PL_fold[ST.c1];
4852                         else if (IS_TEXTFL(text_node))
4853                             ST.c2 = PL_fold_locale[ST.c1];
4854                     }
4855                     else { /* UTF */
4856                         if (IS_TEXTF(text_node)) {
4857                              STRLEN ulen1, ulen2;
4858                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4859                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4860
4861                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4862                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4863 #ifdef EBCDIC
4864                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4865                                                     ckWARN(WARN_UTF8) ?
4866                                                     0 : UTF8_ALLOW_ANY);
4867                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4868                                                     ckWARN(WARN_UTF8) ?
4869                                                     0 : UTF8_ALLOW_ANY);
4870 #else
4871                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4872                                                     uniflags);
4873                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4874                                                     uniflags);
4875 #endif
4876                         }
4877                         else {
4878                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4879                                                      uniflags);
4880                         }
4881                     }
4882                 }
4883             }
4884             else
4885                 ST.c1 = ST.c2 = CHRTEST_VOID;
4886         assume_ok_easy:
4887
4888             ST.A = scan;
4889             ST.B = next;
4890             PL_reginput = locinput;
4891             if (minmod) {
4892                 minmod = 0;
4893                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4894                     sayNO;
4895                 ST.count = ST.min;
4896                 locinput = PL_reginput;
4897                 REGCP_SET(ST.cp);
4898                 if (ST.c1 == CHRTEST_VOID)
4899                     goto curly_try_B_min;
4900
4901                 ST.oldloc = locinput;
4902
4903                 /* set ST.maxpos to the furthest point along the
4904                  * string that could possibly match */
4905                 if  (ST.max == REG_INFTY) {
4906                     ST.maxpos = PL_regeol - 1;
4907                     if (do_utf8)
4908                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4909                             ST.maxpos--;
4910                 }
4911                 else if (do_utf8) {
4912                     int m = ST.max - ST.min;
4913                     for (ST.maxpos = locinput;
4914                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4915                         ST.maxpos += UTF8SKIP(ST.maxpos);
4916                 }
4917                 else {
4918                     ST.maxpos = locinput + ST.max - ST.min;
4919                     if (ST.maxpos >= PL_regeol)
4920                         ST.maxpos = PL_regeol - 1;
4921                 }
4922                 goto curly_try_B_min_known;
4923
4924             }
4925             else {
4926                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4927                 locinput = PL_reginput;
4928                 if (ST.count < ST.min)
4929                     sayNO;
4930                 if ((ST.count > ST.min)
4931                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4932                 {
4933                     /* A{m,n} must come at the end of the string, there's
4934                      * no point in backing off ... */
4935                     ST.min = ST.count;
4936                     /* ...except that $ and \Z can match before *and* after
4937                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
4938                        We may back off by one in this case. */
4939                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4940                         ST.min--;
4941                 }
4942                 REGCP_SET(ST.cp);
4943                 goto curly_try_B_max;
4944             }
4945             /* NOTREACHED */
4946
4947
4948         case CURLY_B_min_known_fail:
4949             /* failed to find B in a non-greedy match where c1,c2 valid */
4950             if (ST.paren && ST.count)
4951                 PL_regoffs[ST.paren].end = -1;
4952
4953             PL_reginput = locinput;     /* Could be reset... */
4954             REGCP_UNWIND(ST.cp);
4955             /* Couldn't or didn't -- move forward. */
4956             ST.oldloc = locinput;
4957             if (do_utf8)
4958                 locinput += UTF8SKIP(locinput);
4959             else
4960                 locinput++;
4961             ST.count++;
4962           curly_try_B_min_known:
4963              /* find the next place where 'B' could work, then call B */
4964             {
4965                 int n;
4966                 if (do_utf8) {
4967                     n = (ST.oldloc == locinput) ? 0 : 1;
4968                     if (ST.c1 == ST.c2) {
4969                         STRLEN len;
4970                         /* set n to utf8_distance(oldloc, locinput) */
4971                         while (locinput <= ST.maxpos &&
4972                                utf8n_to_uvchr((U8*)locinput,
4973                                               UTF8_MAXBYTES, &len,
4974                                               uniflags) != (UV)ST.c1) {
4975                             locinput += len;
4976                             n++;
4977                         }
4978                     }
4979                     else {
4980                         /* set n to utf8_distance(oldloc, locinput) */
4981                         while (locinput <= ST.maxpos) {
4982                             STRLEN len;
4983                             const UV c = utf8n_to_uvchr((U8*)locinput,
4984                                                   UTF8_MAXBYTES, &len,
4985                                                   uniflags);
4986                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
4987                                 break;
4988                             locinput += len;
4989                             n++;
4990                         }
4991                     }
4992                 }
4993                 else {
4994                     if (ST.c1 == ST.c2) {
4995                         while (locinput <= ST.maxpos &&
4996                                UCHARAT(locinput) != ST.c1)
4997                             locinput++;
4998                     }
4999                     else {
5000                         while (locinput <= ST.maxpos
5001                                && UCHARAT(locinput) != ST.c1
5002                                && UCHARAT(locinput) != ST.c2)
5003                             locinput++;
5004                     }
5005                     n = locinput - ST.oldloc;
5006                 }
5007                 if (locinput > ST.maxpos)
5008                     sayNO;
5009                 /* PL_reginput == oldloc now */
5010                 if (n) {
5011                     ST.count += n;
5012                     if (regrepeat(rex, ST.A, n, depth) < n)
5013                         sayNO;
5014                 }
5015                 PL_reginput = locinput;
5016                 CURLY_SETPAREN(ST.paren, ST.count);
5017                 if (cur_eval && cur_eval->u.eval.close_paren &&
5018                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5019                     goto fake_end;
5020                 }
5021                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5022             }
5023             /* NOTREACHED */
5024
5025
5026         case CURLY_B_min_fail:
5027             /* failed to find B in a non-greedy match where c1,c2 invalid */
5028             if (ST.paren && ST.count)
5029                 PL_regoffs[ST.paren].end = -1;
5030
5031             REGCP_UNWIND(ST.cp);
5032             /* failed -- move forward one */
5033             PL_reginput = locinput;
5034             if (regrepeat(rex, ST.A, 1, depth)) {
5035                 ST.count++;
5036                 locinput = PL_reginput;
5037                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5038                         ST.count > 0)) /* count overflow ? */
5039                 {
5040                   curly_try_B_min:
5041                     CURLY_SETPAREN(ST.paren, ST.count);
5042                     if (cur_eval && cur_eval->u.eval.close_paren &&
5043                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5044                         goto fake_end;
5045                     }
5046                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5047                 }
5048             }
5049             sayNO;
5050             /* NOTREACHED */
5051
5052
5053         curly_try_B_max:
5054             /* a successful greedy match: now try to match B */
5055             if (cur_eval && cur_eval->u.eval.close_paren &&
5056                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5057                 goto fake_end;
5058             }
5059             {
5060                 UV c = 0;
5061                 if (ST.c1 != CHRTEST_VOID)
5062                     c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
5063                                            UTF8_MAXBYTES, 0, uniflags)
5064                                 : (UV) UCHARAT(PL_reginput);
5065                 /* If it could work, try it. */
5066                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5067                     CURLY_SETPAREN(ST.paren, ST.count);
5068                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5069                     /* NOTREACHED */
5070                 }
5071             }
5072             /* FALL THROUGH */
5073         case CURLY_B_max_fail:
5074             /* failed to find B in a greedy match */
5075             if (ST.paren && ST.count)
5076                 PL_regoffs[ST.paren].end = -1;
5077
5078             REGCP_UNWIND(ST.cp);
5079             /*  back up. */
5080             if (--ST.count < ST.min)
5081                 sayNO;
5082             PL_reginput = locinput = HOPc(locinput, -1);
5083             goto curly_try_B_max;
5084
5085 #undef ST
5086
5087         case END:
5088             fake_end:
5089             if (cur_eval) {
5090                 /* we've just finished A in /(??{A})B/; now continue with B */
5091                 I32 tmpix;
5092                 st->u.eval.toggle_reg_flags
5093                             = cur_eval->u.eval.toggle_reg_flags;
5094                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5095
5096                 st->u.eval.prev_rex = rex_sv;           /* inner */
5097                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5098                 rex = (struct regexp *)SvANY(rex_sv);
5099                 rexi = RXi_GET(rex);
5100                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5101                 ReREFCNT_inc(rex_sv);
5102                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5103
5104                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5105                 PL_reglastparen = &rex->lastparen;
5106                 PL_reglastcloseparen = &rex->lastcloseparen;
5107
5108                 REGCP_SET(st->u.eval.lastcp);
5109                 PL_reginput = locinput;
5110
5111                 /* Restore parens of the outer rex without popping the
5112                  * savestack */
5113                 tmpix = PL_savestack_ix;
5114                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5115                 regcppop(rex);
5116                 PL_savestack_ix = tmpix;
5117
5118                 st->u.eval.prev_eval = cur_eval;
5119                 cur_eval = cur_eval->u.eval.prev_eval;
5120                 DEBUG_EXECUTE_r(
5121                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5122                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5123                 if ( nochange_depth )
5124                     nochange_depth--;
5125
5126                 PUSH_YES_STATE_GOTO(EVAL_AB,
5127                         st->u.eval.prev_eval->u.eval.B); /* match B */
5128             }
5129
5130             if (locinput < reginfo->till) {
5131                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5132                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5133                                       PL_colors[4],
5134                                       (long)(locinput - PL_reg_starttry),
5135                                       (long)(reginfo->till - PL_reg_starttry),
5136                                       PL_colors[5]));
5137
5138                 sayNO_SILENT;           /* Cannot match: too short. */
5139             }
5140             PL_reginput = locinput;     /* put where regtry can find it */
5141             sayYES;                     /* Success! */
5142
5143         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5144             DEBUG_EXECUTE_r(
5145             PerlIO_printf(Perl_debug_log,
5146                 "%*s  %ssubpattern success...%s\n",
5147                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5148             PL_reginput = locinput;     /* put where regtry can find it */
5149             sayYES;                     /* Success! */
5150
5151 #undef  ST
5152 #define ST st->u.ifmatch
5153
5154         case SUSPEND:   /* (?>A) */
5155             ST.wanted = 1;
5156             PL_reginput = locinput;
5157             goto do_ifmatch;
5158
5159         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5160             ST.wanted = 0;
5161             goto ifmatch_trivial_fail_test;
5162
5163         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5164             ST.wanted = 1;
5165           ifmatch_trivial_fail_test:
5166             if (scan->flags) {
5167                 char * const s = HOPBACKc(locinput, scan->flags);
5168                 if (!s) {
5169                     /* trivial fail */
5170                     if (logical) {
5171                         logical = 0;
5172                         sw = 1 - (bool)ST.wanted;
5173                     }
5174                     else if (ST.wanted)
5175                         sayNO;
5176                     next = scan + ARG(scan);
5177                     if (next == scan)
5178                         next = NULL;
5179                     break;
5180                 }
5181                 PL_reginput = s;
5182             }
5183             else
5184                 PL_reginput = locinput;
5185
5186           do_ifmatch:
5187             ST.me = scan;
5188             ST.logical = logical;
5189             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5190
5191             /* execute body of (?...A) */
5192             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5193             /* NOTREACHED */
5194
5195         case IFMATCH_A_fail: /* body of (?...A) failed */
5196             ST.wanted = !ST.wanted;
5197             /* FALL THROUGH */
5198
5199         case IFMATCH_A: /* body of (?...A) succeeded */
5200             if (ST.logical) {
5201                 sw = (bool)ST.wanted;
5202             }
5203             else if (!ST.wanted)
5204                 sayNO;
5205
5206             if (OP(ST.me) == SUSPEND)
5207                 locinput = PL_reginput;
5208             else {
5209                 locinput = PL_reginput = st->locinput;
5210                 nextchr = UCHARAT(locinput);
5211             }
5212             scan = ST.me + ARG(ST.me);
5213             if (scan == ST.me)
5214                 scan = NULL;
5215             continue; /* execute B */
5216
5217 #undef ST
5218
5219         case LONGJMP:
5220             next = scan + ARG(scan);
5221             if (next == scan)
5222                 next = NULL;
5223             break;
5224         case COMMIT:
5225             reginfo->cutpoint = PL_regeol;
5226             /* FALLTHROUGH */
5227         case PRUNE:
5228             PL_reginput = locinput;
5229             if (!scan->flags)
5230                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5231             PUSH_STATE_GOTO(COMMIT_next,next);
5232             /* NOTREACHED */
5233         case COMMIT_next_fail:
5234             no_final = 1;
5235             /* FALLTHROUGH */
5236         case OPFAIL:
5237             sayNO;
5238             /* NOTREACHED */
5239
5240 #define ST st->u.mark
5241         case MARKPOINT:
5242             ST.prev_mark = mark_state;
5243             ST.mark_name = sv_commit = sv_yes_mark
5244                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5245             mark_state = st;
5246             ST.mark_loc = PL_reginput = locinput;
5247             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5248             /* NOTREACHED */
5249         case MARKPOINT_next:
5250             mark_state = ST.prev_mark;
5251             sayYES;
5252             /* NOTREACHED */
5253         case MARKPOINT_next_fail:
5254             if (popmark && sv_eq(ST.mark_name,popmark))
5255             {
5256                 if (ST.mark_loc > startpoint)
5257                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5258                 popmark = NULL; /* we found our mark */
5259                 sv_commit = ST.mark_name;
5260
5261                 DEBUG_EXECUTE_r({
5262                         PerlIO_printf(Perl_debug_log,
5263                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5264                             REPORT_CODE_OFF+depth*2, "",
5265                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5266                 });
5267             }
5268             mark_state = ST.prev_mark;
5269             sv_yes_mark = mark_state ?
5270                 mark_state->u.mark.mark_name : NULL;
5271             sayNO;
5272             /* NOTREACHED */
5273         case SKIP:
5274             PL_reginput = locinput;
5275             if (scan->flags) {
5276                 /* (*SKIP) : if we fail we cut here*/
5277                 ST.mark_name = NULL;
5278                 ST.mark_loc = locinput;
5279                 PUSH_STATE_GOTO(SKIP_next,next);
5280             } else {
5281                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5282                    otherwise do nothing.  Meaning we need to scan
5283                  */
5284                 regmatch_state *cur = mark_state;
5285                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5286
5287                 while (cur) {
5288                     if ( sv_eq( cur->u.mark.mark_name,
5289                                 find ) )
5290                     {
5291                         ST.mark_name = find;
5292                         PUSH_STATE_GOTO( SKIP_next, next );
5293                     }
5294                     cur = cur->u.mark.prev_mark;
5295                 }
5296             }
5297             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5298             break;
5299         case SKIP_next_fail:
5300             if (ST.mark_name) {
5301                 /* (*CUT:NAME) - Set up to search for the name as we
5302                    collapse the stack*/
5303                 popmark = ST.mark_name;
5304             } else {
5305                 /* (*CUT) - No name, we cut here.*/
5306                 if (ST.mark_loc > startpoint)
5307                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5308                 /* but we set sv_commit to latest mark_name if there
5309                    is one so they can test to see how things lead to this
5310                    cut */
5311                 if (mark_state)
5312                     sv_commit=mark_state->u.mark.mark_name;
5313             }
5314             no_final = 1;
5315             sayNO;
5316             /* NOTREACHED */
5317 #undef ST
5318         case FOLDCHAR:
5319             n = ARG(scan);
5320             if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5321                 locinput += ln;
5322             } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5323                 sayNO;
5324             } else  {
5325                 U8 folded[UTF8_MAXBYTES_CASE+1];
5326                 STRLEN foldlen;
5327                 const char * const l = locinput;
5328                 char *e = PL_regeol;
5329                 to_uni_fold(n, folded, &foldlen);
5330
5331                 if (ibcmp_utf8((const char*) folded, 0,  foldlen, 1,
5332                                l, &e, 0,  do_utf8)) {
5333                         sayNO;
5334                 }
5335                 locinput = e;
5336             }
5337             nextchr = UCHARAT(locinput);
5338             break;
5339         case LNBREAK:
5340             if ((n=is_LNBREAK(locinput,do_utf8))) {
5341                 locinput += n;
5342                 nextchr = UCHARAT(locinput);
5343             } else
5344                 sayNO;
5345             break;
5346
5347 #define CASE_CLASS(nAmE)                              \
5348         case nAmE:                                    \
5349             if ((n=is_##nAmE(locinput,do_utf8))) {    \
5350                 locinput += n;                        \
5351                 nextchr = UCHARAT(locinput);          \
5352             } else                                    \
5353                 sayNO;                                \
5354             break;                                    \
5355         case N##nAmE:                                 \
5356             if ((n=is_##nAmE(locinput,do_utf8))) {    \
5357                 sayNO;                                \
5358             } else {                                  \
5359                 locinput += UTF8SKIP(locinput);       \
5360                 nextchr = UCHARAT(locinput);          \
5361             }                                         \
5362             break
5363
5364         CASE_CLASS(VERTWS);
5365         CASE_CLASS(HORIZWS);
5366 #undef CASE_CLASS
5367
5368         default:
5369             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5370                           PTR2UV(scan), OP(scan));
5371             Perl_croak(aTHX_ "regexp memory corruption");
5372
5373         } /* end switch */
5374
5375         /* switch break jumps here */
5376         scan = next; /* prepare to execute the next op and ... */
5377         continue;    /* ... jump back to the top, reusing st */
5378         /* NOTREACHED */
5379
5380       push_yes_state:
5381         /* push a state that backtracks on success */
5382         st->u.yes.prev_yes_state = yes_state;
5383         yes_state = st;
5384         /* FALL THROUGH */
5385       push_state:
5386         /* push a new regex state, then continue at scan  */
5387         {
5388             regmatch_state *newst;
5389
5390             DEBUG_STACK_r({
5391                 regmatch_state *cur = st;
5392                 regmatch_state *curyes = yes_state;
5393                 int curd = depth;
5394                 regmatch_slab *slab = PL_regmatch_slab;
5395                 for (;curd > -1;cur--,curd--) {
5396                     if (cur < SLAB_FIRST(slab)) {
5397                         slab = slab->prev;
5398                         cur = SLAB_LAST(slab);
5399                     }
5400                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5401                         REPORT_CODE_OFF + 2 + depth * 2,"",
5402                         curd, PL_reg_name[cur->resume_state],
5403                         (curyes == cur) ? "yes" : ""
5404                     );
5405                     if (curyes == cur)
5406                         curyes = cur->u.yes.prev_yes_state;
5407                 }
5408             } else
5409                 DEBUG_STATE_pp("push")
5410             );
5411             depth++;
5412             st->locinput = locinput;
5413             newst = st+1;
5414             if (newst >  SLAB_LAST(PL_regmatch_slab))
5415                 newst = S_push_slab(aTHX);
5416             PL_regmatch_state = newst;
5417
5418             locinput = PL_reginput;
5419             nextchr = UCHARAT(locinput);
5420             st = newst;
5421             continue;
5422             /* NOTREACHED */
5423         }
5424     }
5425
5426     /*
5427     * We get here only if there's trouble -- normally "case END" is
5428     * the terminating point.
5429     */
5430     Perl_croak(aTHX_ "corrupted regexp pointers");
5431     /*NOTREACHED*/
5432     sayNO;
5433
5434 yes:
5435     if (yes_state) {
5436         /* we have successfully completed a subexpression, but we must now
5437          * pop to the state marked by yes_state and continue from there */
5438         assert(st != yes_state);
5439 #ifdef DEBUGGING
5440         while (st != yes_state) {
5441             st--;
5442             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5443                 PL_regmatch_slab = PL_regmatch_slab->prev;
5444                 st = SLAB_LAST(PL_regmatch_slab);
5445             }
5446             DEBUG_STATE_r({
5447                 if (no_final) {
5448                     DEBUG_STATE_pp("pop (no final)");
5449                 } else {
5450                     DEBUG_STATE_pp("pop (yes)");
5451                 }
5452             });
5453             depth--;
5454         }
5455 #else
5456         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5457             || yes_state > SLAB_LAST(PL_regmatch_slab))
5458         {
5459             /* not in this slab, pop slab */
5460             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5461             PL_regmatch_slab = PL_regmatch_slab->prev;
5462             st = SLAB_LAST(PL_regmatch_slab);
5463         }
5464         depth -= (st - yes_state);
5465 #endif
5466         st = yes_state;
5467         yes_state = st->u.yes.prev_yes_state;
5468         PL_regmatch_state = st;
5469
5470         if (no_final) {
5471             locinput= st->locinput;
5472             nextchr = UCHARAT(locinput);
5473         }
5474         state_num = st->resume_state + no_final;
5475         goto reenter_switch;
5476     }
5477
5478     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5479                           PL_colors[4], PL_colors[5]));
5480
5481     if (PL_reg_eval_set) {
5482         /* each successfully executed (?{...}) block does the equivalent of
5483          *   local $^R = do {...}
5484          * When popping the save stack, all these locals would be undone;
5485          * bypass this by setting the outermost saved $^R to the latest
5486          * value */
5487         if (oreplsv != GvSV(PL_replgv))
5488             sv_setsv(oreplsv, GvSV(PL_replgv));
5489     }
5490     result = 1;
5491     goto final_exit;
5492
5493 no:
5494     DEBUG_EXECUTE_r(
5495         PerlIO_printf(Perl_debug_log,
5496             "%*s  %sfailed...%s\n",
5497             REPORT_CODE_OFF+depth*2, "",
5498             PL_colors[4], PL_colors[5])
5499         );
5500
5501 no_silent:
5502     if (no_final) {
5503         if (yes_state) {
5504             goto yes;
5505         } else {
5506             goto final_exit;
5507         }
5508     }
5509     if (depth) {
5510         /* there's a previous state to backtrack to */
5511         st--;
5512         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5513             PL_regmatch_slab = PL_regmatch_slab->prev;
5514             st = SLAB_LAST(PL_regmatch_slab);
5515         }
5516         PL_regmatch_state = st;
5517         locinput= st->locinput;
5518         nextchr = UCHARAT(locinput);
5519
5520         DEBUG_STATE_pp("pop");
5521         depth--;
5522         if (yes_state == st)
5523             yes_state = st->u.yes.prev_yes_state;
5524
5525         state_num = st->resume_state + 1; /* failure = success + 1 */
5526         goto reenter_switch;
5527     }
5528     result = 0;
5529
5530   final_exit:
5531     if (rex->intflags & PREGf_VERBARG_SEEN) {
5532         SV *sv_err = get_sv("REGERROR", 1);
5533         SV *sv_mrk = get_sv("REGMARK", 1);
5534         if (result) {
5535             sv_commit = &PL_sv_no;
5536             if (!sv_yes_mark)
5537                 sv_yes_mark = &PL_sv_yes;
5538         } else {
5539             if (!sv_commit)
5540                 sv_commit = &PL_sv_yes;
5541             sv_yes_mark = &PL_sv_no;
5542         }
5543         sv_setsv(sv_err, sv_commit);
5544         sv_setsv(sv_mrk, sv_yes_mark);
5545     }
5546
5547     /* clean up; in particular, free all slabs above current one */
5548     LEAVE_SCOPE(oldsave);
5549
5550     return result;
5551 }
5552
5553 /*
5554  - regrepeat - repeatedly match something simple, report how many
5555  */
5556 /*
5557  * [This routine now assumes that it will only match on things of length 1.
5558  * That was true before, but now we assume scan - reginput is the count,
5559  * rather than incrementing count on every character.  [Er, except utf8.]]
5560  */
5561 STATIC I32
5562 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5563 {
5564     dVAR;
5565     register char *scan;
5566     register I32 c;
5567     register char *loceol = PL_regeol;
5568     register I32 hardcount = 0;
5569     register bool do_utf8 = PL_reg_match_utf8;
5570 #ifndef DEBUGGING
5571     PERL_UNUSED_ARG(depth);
5572 #endif
5573
5574     PERL_ARGS_ASSERT_REGREPEAT;
5575
5576     scan = PL_reginput;
5577     if (max == REG_INFTY)
5578         max = I32_MAX;
5579     else if (max < loceol - scan)
5580         loceol = scan + max;
5581     switch (OP(p)) {
5582     case REG_ANY:
5583         if (do_utf8) {
5584             loceol = PL_regeol;
5585             while (scan < loceol && hardcount < max && *scan != '\n') {
5586                 scan += UTF8SKIP(scan);
5587                 hardcount++;
5588             }
5589         } else {
5590             while (scan < loceol && *scan != '\n')
5591                 scan++;
5592         }
5593         break;
5594     case SANY:
5595         if (do_utf8) {
5596             loceol = PL_regeol;
5597             while (scan < loceol && hardcount < max) {
5598                 scan += UTF8SKIP(scan);
5599                 hardcount++;
5600             }
5601         }
5602         else
5603             scan = loceol;
5604         break;
5605     case CANY:
5606         scan = loceol;
5607         break;
5608     case EXACT:         /* length of string is 1 */
5609         c = (U8)*STRING(p);
5610         while (scan < loceol && UCHARAT(scan) == c)
5611             scan++;
5612         break;
5613     case EXACTF:        /* length of string is 1 */
5614         c = (U8)*STRING(p);
5615         while (scan < loceol &&
5616                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5617             scan++;
5618         break;
5619     case EXACTFL:       /* length of string is 1 */
5620         PL_reg_flags |= RF_tainted;
5621         c = (U8)*STRING(p);
5622         while (scan < loceol &&
5623                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5624             scan++;
5625         break;
5626     case ANYOF:
5627         if (do_utf8) {
5628             loceol = PL_regeol;
5629             while (hardcount < max && scan < loceol &&
5630                    reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5631                 scan += UTF8SKIP(scan);
5632                 hardcount++;
5633             }
5634         } else {
5635             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5636                 scan++;
5637         }
5638         break;
5639     case ALNUM:
5640         if (do_utf8) {
5641             loceol = PL_regeol;
5642             LOAD_UTF8_CHARCLASS_ALNUM();
5643             while (hardcount < max && scan < loceol &&
5644                    swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5645                 scan += UTF8SKIP(scan);
5646                 hardcount++;
5647             }
5648         } else {
5649             while (scan < loceol && isALNUM(*scan))
5650                 scan++;
5651         }
5652         break;
5653     case ALNUML:
5654         PL_reg_flags |= RF_tainted;
5655         if (do_utf8) {
5656             loceol = PL_regeol;
5657             while (hardcount < max && scan < loceol &&
5658                    isALNUM_LC_utf8((U8*)scan)) {
5659                 scan += UTF8SKIP(scan);
5660                 hardcount++;
5661             }
5662         } else {
5663             while (scan < loceol && isALNUM_LC(*scan))
5664                 scan++;
5665         }
5666         break;
5667     case NALNUM:
5668         if (do_utf8) {
5669             loceol = PL_regeol;
5670             LOAD_UTF8_CHARCLASS_ALNUM();
5671             while (hardcount < max && scan < loceol &&
5672                    !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5673                 scan += UTF8SKIP(scan);
5674                 hardcount++;
5675             }
5676         } else {
5677             while (scan < loceol && !isALNUM(*scan))
5678                 scan++;
5679         }
5680         break;
5681     case NALNUML:
5682         PL_reg_flags |= RF_tainted;
5683         if (do_utf8) {
5684             loceol = PL_regeol;
5685             while (hardcount < max && scan < loceol &&
5686                    !isALNUM_LC_utf8((U8*)scan)) {
5687                 scan += UTF8SKIP(scan);
5688                 hardcount++;
5689             }
5690         } else {
5691             while (scan < loceol && !isALNUM_LC(*scan))
5692                 scan++;
5693         }
5694         break;
5695     case SPACE:
5696         if (do_utf8) {
5697             loceol = PL_regeol;
5698             LOAD_UTF8_CHARCLASS_SPACE();
5699             while (hardcount < max && scan < loceol &&
5700                    (*scan == ' ' ||
5701                     swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5702                 scan += UTF8SKIP(scan);
5703                 hardcount++;
5704             }
5705         } else {
5706             while (scan < loceol && isSPACE(*scan))
5707                 scan++;
5708         }
5709         break;
5710     case SPACEL:
5711         PL_reg_flags |= RF_tainted;
5712         if (do_utf8) {
5713             loceol = PL_regeol;
5714             while (hardcount < max && scan < loceol &&
5715                    (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5716                 scan += UTF8SKIP(scan);
5717                 hardcount++;
5718             }
5719         } else {
5720             while (scan < loceol && isSPACE_LC(*scan))
5721                 scan++;
5722         }
5723         break;
5724     case NSPACE:
5725         if (do_utf8) {
5726             loceol = PL_regeol;
5727             LOAD_UTF8_CHARCLASS_SPACE();
5728             while (hardcount < max && scan < loceol &&
5729                    !(*scan == ' ' ||
5730                      swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5731                 scan += UTF8SKIP(scan);
5732                 hardcount++;
5733             }
5734         } else {
5735             while (scan < loceol && !isSPACE(*scan))
5736                 scan++;
5737         }
5738         break;
5739     case NSPACEL:
5740         PL_reg_flags |= RF_tainted;
5741         if (do_utf8) {
5742             loceol = PL_regeol;
5743             while (hardcount < max && scan < loceol &&
5744                    !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5745                 scan += UTF8SKIP(scan);
5746                 hardcount++;
5747             }
5748         } else {
5749             while (scan < loceol && !isSPACE_LC(*scan))
5750                 scan++;
5751         }
5752         break;
5753     case DIGIT:
5754         if (do_utf8) {
5755             loceol = PL_regeol;
5756             LOAD_UTF8_CHARCLASS_DIGIT();
5757             while (hardcount < max && scan < loceol &&
5758                    swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5759                 scan += UTF8SKIP(scan);
5760                 hardcount++;
5761             }
5762         } else {
5763             while (scan < loceol && isDIGIT(*scan))
5764                 scan++;
5765         }
5766         break;
5767     case NDIGIT:
5768         if (do_utf8) {
5769             loceol = PL_regeol;
5770             LOAD_UTF8_CHARCLASS_DIGIT();
5771             while (hardcount < max && scan < loceol &&
5772                    !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5773                 scan += UTF8SKIP(scan);
5774                 hardcount++;
5775             }
5776         } else {
5777             while (scan < loceol && !isDIGIT(*scan))
5778                 scan++;
5779         }
5780     case LNBREAK:
5781         if (do_utf8) {
5782             loceol = PL_regeol;
5783             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5784                 scan += c;
5785                 hardcount++;
5786             }
5787         } else {
5788             /*
5789               LNBREAK can match two latin chars, which is ok,
5790               because we have a null terminated string, but we
5791               have to use hardcount in this situation
5792             */
5793             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
5794                 scan+=c;
5795                 hardcount++;
5796             }
5797         }
5798         break;
5799     case HORIZWS:
5800         if (do_utf8) {
5801             loceol = PL_regeol;
5802             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5803                 scan += c;
5804                 hardcount++;
5805             }
5806         } else {
5807             while (scan < loceol && is_HORIZWS_latin1(scan))
5808                 scan++;
5809         }
5810         break;
5811     case NHORIZWS:
5812         if (do_utf8) {
5813             loceol = PL_regeol;
5814             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5815                 scan += UTF8SKIP(scan);
5816                 hardcount++;
5817             }
5818         } else {
5819             while (scan < loceol && !is_HORIZWS_latin1(scan))
5820                 scan++;
5821
5822         }
5823         break;
5824     case VERTWS:
5825         if (do_utf8) {
5826             loceol = PL_regeol;
5827             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5828                 scan += c;
5829                 hardcount++;
5830             }
5831         } else {
5832             while (scan < loceol && is_VERTWS_latin1(scan))
5833                 scan++;
5834
5835         }
5836         break;
5837     case NVERTWS:
5838         if (do_utf8) {
5839             loceol = PL_regeol;
5840             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5841                 scan += UTF8SKIP(scan);
5842                 hardcount++;
5843             }
5844         } else {
5845             while (scan < loceol && !is_VERTWS_latin1(scan))
5846                 scan++;
5847
5848         }
5849         break;
5850
5851     default:            /* Called on something of 0 width. */
5852         break;          /* So match right here or not at all. */
5853     }
5854
5855     if (hardcount)
5856         c = hardcount;
5857     else
5858         c = scan - PL_reginput;
5859     PL_reginput = scan;
5860
5861     DEBUG_r({
5862         GET_RE_DEBUG_FLAGS_DECL;
5863         DEBUG_EXECUTE_r({
5864             SV * const prop = sv_newmortal();
5865             regprop(prog, prop, p);
5866             PerlIO_printf(Perl_debug_log,
5867                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
5868                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5869         });
5870     });
5871
5872     return(c);
5873 }
5874
5875
5876 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5877 /*
5878 - regclass_swash - prepare the utf8 swash
5879 */
5880
5881 SV *
5882 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5883 {
5884     dVAR;
5885     SV *sw  = NULL;
5886     SV *si  = NULL;
5887     SV *alt = NULL;
5888     RXi_GET_DECL(prog,progi);
5889     const struct reg_data * const data = prog ? progi->data : NULL;
5890
5891     PERL_ARGS_ASSERT_REGCLASS_SWASH;
5892
5893     if (data && data->count) {
5894         const U32 n = ARG(node);
5895
5896         if (data->what[n] == 's') {
5897             SV * const rv = MUTABLE_SV(data->data[n]);
5898             AV * const av = MUTABLE_AV(SvRV(rv));
5899             SV **const ary = AvARRAY(av);
5900             SV **a, **b;
5901
5902             /* See the end of regcomp.c:S_regclass() for
5903              * documentation of these array elements. */
5904
5905             si = *ary;
5906             a  = SvROK(ary[1]) ? &ary[1] : NULL;
5907             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5908
5909             if (a)
5910                 sw = *a;
5911             else if (si && doinit) {
5912                 sw = swash_init("utf8", "", si, 1, 0);
5913                 (void)av_store(av, 1, sw);
5914             }
5915             if (b)
5916                 alt = *b;
5917         }
5918     }
5919
5920     if (listsvp)
5921         *listsvp = si;
5922     if (altsvp)
5923         *altsvp  = alt;
5924
5925     return sw;
5926 }
5927 #endif
5928
5929 /*
5930  - reginclass - determine if a character falls into a character class
5931
5932   The n is the ANYOF regnode, the p is the target string, lenp
5933   is pointer to the maximum length of how far to go in the p
5934   (if the lenp is zero, UTF8SKIP(p) is used),
5935   do_utf8 tells whether the target string is in UTF-8.
5936
5937  */
5938
5939 STATIC bool
5940 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5941 {
5942     dVAR;
5943     const char flags = ANYOF_FLAGS(n);
5944     bool match = FALSE;
5945     UV c = *p;
5946     STRLEN len = 0;
5947     STRLEN plen;
5948
5949     PERL_ARGS_ASSERT_REGINCLASS;
5950
5951     if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5952         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5953                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
5954                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
5955                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
5956                  * UTF8_ALLOW_FFFF */
5957         if (len == (STRLEN)-1)
5958             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5959     }
5960
5961     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5962     if (do_utf8 || (flags & ANYOF_UNICODE)) {
5963         if (lenp)
5964             *lenp = 0;
5965         if (do_utf8 && !ANYOF_RUNTIME(n)) {
5966             if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5967                 match = TRUE;
5968         }
5969         if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5970             match = TRUE;
5971         if (!match) {
5972             AV *av;
5973             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5974
5975             if (sw) {
5976                 U8 * utf8_p;
5977                 if (do_utf8) {
5978                     utf8_p = (U8 *) p;
5979                 } else {
5980                     STRLEN len = 1;
5981                     utf8_p = bytes_to_utf8(p, &len);
5982                 }
5983                 if (swash_fetch(sw, utf8_p, 1))
5984                     match = TRUE;
5985                 else if (flags & ANYOF_FOLD) {
5986                     if (!match && lenp && av) {
5987                         I32 i;
5988                         for (i = 0; i <= av_len(av); i++) {
5989                             SV* const sv = *av_fetch(av, i, FALSE);
5990                             STRLEN len;
5991                             const char * const s = SvPV_const(sv, len);
5992                             if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
5993                                 *lenp = len;
5994                                 match = TRUE;
5995                                 break;
5996                             }
5997                         }
5998                     }
5999                     if (!match) {
6000                         U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
6001
6002                         STRLEN tmplen;
6003                         to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6004                         if (swash_fetch(sw, tmpbuf, 1))
6005                             match = TRUE;
6006                     }
6007                 }
6008
6009                 /* If we allocated a string above, free it */
6010                 if (! do_utf8) Safefree(utf8_p);
6011             }
6012         }
6013         if (match && lenp && *lenp == 0)
6014             *lenp = UNISKIP(NATIVE_TO_UNI(c));
6015     }
6016     if (!match && c < 256) {
6017         if (ANYOF_BITMAP_TEST(n, c))
6018             match = TRUE;
6019         else if (flags & ANYOF_FOLD) {
6020             U8 f;
6021
6022             if (flags & ANYOF_LOCALE) {
6023                 PL_reg_flags |= RF_tainted;
6024                 f = PL_fold_locale[c];
6025             }
6026             else
6027                 f = PL_fold[c];
6028             if (f != c && ANYOF_BITMAP_TEST(n, f))
6029                 match = TRUE;
6030         }
6031
6032         if (!match && (flags & ANYOF_CLASS)) {
6033             PL_reg_flags |= RF_tainted;
6034             if (
6035                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6036                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6037                 (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6038                 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6039                 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6040                 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6041                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6042                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6043                 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6044                 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6045                 (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6046                 (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6047                 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6048                 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6049                 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6050                 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6051                 (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6052                 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6053                 (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6054                 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6055                 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6056                 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6057                 (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6058                 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6059                 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6060                 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6061                 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6062                 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6063                 (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6064                 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6065                 ) /* How's that for a conditional? */
6066             {
6067                 match = TRUE;
6068             }
6069         }
6070     }
6071
6072     return (flags & ANYOF_INVERT) ? !match : match;
6073 }
6074
6075 STATIC U8 *
6076 S_reghop3(U8 *s, I32 off, const U8* lim)
6077 {
6078     dVAR;
6079
6080     PERL_ARGS_ASSERT_REGHOP3;
6081
6082     if (off >= 0) {
6083         while (off-- && s < lim) {
6084             /* XXX could check well-formedness here */
6085             s += UTF8SKIP(s);
6086         }
6087     }
6088     else {
6089         while (off++ && s > lim) {
6090             s--;
6091             if (UTF8_IS_CONTINUED(*s)) {
6092                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6093                     s--;
6094             }
6095             /* XXX could check well-formedness here */
6096         }
6097     }
6098     return s;
6099 }
6100
6101 #ifdef XXX_dmq
6102 /* there are a bunch of places where we use two reghop3's that should
6103    be replaced with this routine. but since thats not done yet
6104    we ifdef it out - dmq
6105 */
6106 STATIC U8 *
6107 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6108 {
6109     dVAR;
6110
6111     PERL_ARGS_ASSERT_REGHOP4;
6112
6113     if (off >= 0) {
6114         while (off-- && s < rlim) {
6115             /* XXX could check well-formedness here */
6116             s += UTF8SKIP(s);
6117         }
6118     }
6119     else {
6120         while (off++ && s > llim) {
6121             s--;
6122             if (UTF8_IS_CONTINUED(*s)) {
6123                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6124                     s--;
6125             }
6126             /* XXX could check well-formedness here */
6127         }
6128     }
6129     return s;
6130 }
6131 #endif
6132
6133 STATIC U8 *
6134 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6135 {
6136     dVAR;
6137
6138     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6139
6140     if (off >= 0) {
6141         while (off-- && s < lim) {
6142             /* XXX could check well-formedness here */
6143             s += UTF8SKIP(s);
6144         }
6145         if (off >= 0)
6146             return NULL;
6147     }
6148     else {
6149         while (off++ && s > lim) {
6150             s--;
6151             if (UTF8_IS_CONTINUED(*s)) {
6152                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6153                     s--;
6154             }
6155             /* XXX could check well-formedness here */
6156         }
6157         if (off <= 0)
6158             return NULL;
6159     }
6160     return s;
6161 }
6162
6163 static void
6164 restore_pos(pTHX_ void *arg)
6165 {
6166     dVAR;
6167     regexp * const rex = (regexp *)arg;
6168     if (PL_reg_eval_set) {
6169         if (PL_reg_oldsaved) {
6170             rex->subbeg = PL_reg_oldsaved;
6171             rex->sublen = PL_reg_oldsavedlen;
6172 #ifdef PERL_OLD_COPY_ON_WRITE
6173             rex->saved_copy = PL_nrs;
6174 #endif
6175             RXp_MATCH_COPIED_on(rex);
6176         }
6177         PL_reg_magic->mg_len = PL_reg_oldpos;
6178         PL_reg_eval_set = 0;
6179         PL_curpm = PL_reg_oldcurpm;
6180     }
6181 }
6182
6183 STATIC void
6184 S_to_utf8_substr(pTHX_ register regexp *prog)
6185 {
6186     int i = 1;
6187
6188     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6189
6190     do {
6191         if (prog->substrs->data[i].substr
6192             && !prog->substrs->data[i].utf8_substr) {
6193             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6194             prog->substrs->data[i].utf8_substr = sv;
6195             sv_utf8_upgrade(sv);
6196             if (SvVALID(prog->substrs->data[i].substr)) {
6197                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6198                 if (flags & FBMcf_TAIL) {
6199                     /* Trim the trailing \n that fbm_compile added last
6200                        time.  */
6201                     SvCUR_set(sv, SvCUR(sv) - 1);
6202                     /* Whilst this makes the SV technically "invalid" (as its
6203                        buffer is no longer followed by "\0") when fbm_compile()
6204                        adds the "\n" back, a "\0" is restored.  */
6205                 }
6206                 fbm_compile(sv, flags);
6207             }
6208             if (prog->substrs->data[i].substr == prog->check_substr)
6209                 prog->check_utf8 = sv;
6210         }
6211     } while (i--);
6212 }
6213
6214 STATIC void
6215 S_to_byte_substr(pTHX_ register regexp *prog)
6216 {
6217     dVAR;
6218     int i = 1;
6219
6220     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6221
6222     do {
6223         if (prog->substrs->data[i].utf8_substr
6224             && !prog->substrs->data[i].substr) {
6225             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6226             if (sv_utf8_downgrade(sv, TRUE)) {
6227                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6228                     const U8 flags
6229                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6230                     if (flags & FBMcf_TAIL) {
6231                         /* Trim the trailing \n that fbm_compile added last
6232                            time.  */
6233                         SvCUR_set(sv, SvCUR(sv) - 1);
6234                     }
6235                     fbm_compile(sv, flags);
6236                 }
6237             } else {
6238                 SvREFCNT_dec(sv);
6239                 sv = &PL_sv_undef;
6240             }
6241             prog->substrs->data[i].substr = sv;
6242             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6243                 prog->check_substr = sv;
6244         }
6245     } while (i--);
6246 }
6247
6248 /*
6249  * Local variables:
6250  * c-indentation-style: bsd
6251  * c-basic-offset: 4
6252  * indent-tabs-mode: t
6253  * End:
6254  *
6255  * ex: set ts=8 sts=4 sw=4 noet:
6256  */