src/5012003/orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1               /* tainted information used? */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
  98
  99 /*
 100  * Forwards.
 101  */
 102
 103 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
 104 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 105
 106 #define HOPc(pos,off) \
 107         (char *)(PL_reg_match_utf8 \
 108             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 109             : (U8*)(pos + off))
 110 #define HOPBACKc(pos, off) \
 111         (char*)(PL_reg_match_utf8\
 112             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 113             : (pos - off >= PL_bostr)           \
 114                 ? (U8*)pos - off                \
 115                 : NULL)
 116
 117 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 118 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 119
 120 /* these are unrolled below in the CCC_TRY_XXX defined */
 121 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 122     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 123
 124 /* Doesn't do an assert to verify that is correct */
 125 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 127
 128 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 129 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 130 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 131
 132 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 133         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 134         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 135         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 136             * assert should likely and hopefully fail on an EBCDIC machine */ \
 137         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 138                                                                             \
 139         /* No asserts are done for these, in case called on an early        \
 140             * Unicode version in which they map to nothing */               \
 141         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 142         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 143         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 144         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 148
 149 /*
 150    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 151    so that it is possible to override the option here without having to
 152    rebuild the entire core. as we are required to do if we change regcomp.h
 153    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 154 */
 155 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 156 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 157 #endif
 158
 159 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 160 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 161 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 162 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 163 #define RE_utf8_perl_word   PL_utf8_alnum
 164 #define RE_utf8_perl_space  PL_utf8_space
 165 #define RE_utf8_posix_digit PL_utf8_digit
 166 #define perl_word  alnum
 167 #define perl_space space
 168 #define posix_digit digit
 169 #else
 170 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 171 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 172 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 173 #define RE_utf8_perl_word   PL_utf8_perl_word
 174 #define RE_utf8_perl_space  PL_utf8_perl_space
 175 #define RE_utf8_posix_digit PL_utf8_posix_digit
 176 #endif
 177
 178
 179 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                          \
 180         case NAMEL:                                                              \
 181             PL_reg_flags |= RF_tainted;                                                 \
 182             /* FALL THROUGH */                                                          \
 183         case NAME:                                                                     \
 184             if (!nextchr)                                                               \
 185                 sayNO;                                                                  \
 186             if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 187                 if (!CAT2(PL_utf8_,CLASS)) {                                            \
 188                     bool ok;                                                            \
 189                     ENTER;                                                              \
 190                     save_re_context();                                                  \
 191                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 192                     assert(ok);                                                         \
 193                     LEAVE;                                                              \
 194                 }                                                                       \
 195                 if (!(OP(scan) == NAME                                                  \
 196                     ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)   \
 197                     : LCFUNC_utf8((U8*)locinput)))                                      \
 198                 {                                                                       \
 199                     sayNO;                                                              \
 200                 }                                                                       \
 201                 locinput += PL_utf8skip[nextchr];                                       \
 202                 nextchr = UCHARAT(locinput);                                            \
 203                 break;                                                                  \
 204             }                                                                           \
 205             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                  \
 206                 sayNO;                                                                  \
 207             nextchr = UCHARAT(++locinput);                                              \
 208             break
 209
 210 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                        \
 211         case NAMEL:                                                              \
 212             PL_reg_flags |= RF_tainted;                                                 \
 213             /* FALL THROUGH */                                                          \
 214         case NAME :                                                                     \
 215             if (!nextchr && locinput >= PL_regeol)                                      \
 216                 sayNO;                                                                  \
 217             if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 218                 if (!CAT2(PL_utf8_,CLASS)) {                                            \
 219                     bool ok;                                                            \
 220                     ENTER;                                                              \
 221                     save_re_context();                                                  \
 222                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 223                     assert(ok);                                                         \
 224                     LEAVE;                                                              \
 225                 }                                                                       \
 226                 if ((OP(scan) == NAME                                                  \
 227                     ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)    \
 228                     : LCFUNC_utf8((U8*)locinput)))                                      \
 229                 {                                                                       \
 230                     sayNO;                                                              \
 231                 }                                                                       \
 232                 locinput += PL_utf8skip[nextchr];                                       \
 233                 nextchr = UCHARAT(locinput);                                            \
 234                 break;                                                                  \
 235             }                                                                           \
 236             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                   \
 237                 sayNO;                                                                  \
 238             nextchr = UCHARAT(++locinput);                                              \
 239             break
 240
 241
 242
 243
 244
 245 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 246
 247 /* for use after a quantifier and before an EXACT-like node -- japhy */
 248 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
 249 #define JUMPABLE(rn) (      \
 250     OP(rn) == OPEN ||       \
 251     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 252     OP(rn) == EVAL ||   \
 253     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 254     OP(rn) == PLUS || OP(rn) == MINMOD || \
 255     OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
 256     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 257 )
 258 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 259
 260 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 261
 262 #if 0
 263 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 264    we don't need this definition. */
 265 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 266 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
 267 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 268
 269 #else
 270 /* ... so we use this as its faster. */
 271 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 272 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 273 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 274
 275 #endif
 276
 277 /*
 278   Search for mandatory following text node; for lookahead, the text must
 279   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 280 */
 281 #define FIND_NEXT_IMPT(rn) STMT_START { \
 282     while (JUMPABLE(rn)) { \
 283         const OPCODE type = OP(rn); \
 284         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 285             rn = NEXTOPER(NEXTOPER(rn)); \
 286         else if (type == PLUS) \
 287             rn = NEXTOPER(rn); \
 288         else if (type == IFMATCH) \
 289             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 290         else rn += NEXT_OFF(rn); \
 291     } \
 292 } STMT_END
 293
 294
 295 static void restore_pos(pTHX_ void *arg);
 296
 297 STATIC CHECKPOINT
 298 S_regcppush(pTHX_ I32 parenfloor)
 299 {
 300     dVAR;
 301     const int retval = PL_savestack_ix;
 302 #define REGCP_PAREN_ELEMS 4
 303     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 304     int p;
 305     GET_RE_DEBUG_FLAGS_DECL;
 306
 307     if (paren_elems_to_push < 0)
 308         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 309
 310 #define REGCP_OTHER_ELEMS 7
 311     SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
 312
 313     for (p = PL_regsize; p > parenfloor; p--) {
 314 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 315         SSPUSHINT(PL_regoffs[p].end);
 316         SSPUSHINT(PL_regoffs[p].start);
 317         SSPUSHPTR(PL_reg_start_tmp[p]);
 318         SSPUSHINT(p);
 319         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 320           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 321                       (UV)p, (IV)PL_regoffs[p].start,
 322                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 323                       (IV)PL_regoffs[p].end
 324         ));
 325     }
 326 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 327     SSPUSHPTR(PL_regoffs);
 328     SSPUSHINT(PL_regsize);
 329     SSPUSHINT(*PL_reglastparen);
 330     SSPUSHINT(*PL_reglastcloseparen);
 331     SSPUSHPTR(PL_reginput);
 332 #define REGCP_FRAME_ELEMS 2
 333 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
 334  * are needed for the regexp context stack bookkeeping. */
 335     SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
 336     SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
 337
 338     return retval;
 339 }
 340
 341 /* These are needed since we do not localize EVAL nodes: */
 342 #define REGCP_SET(cp)                                           \
 343     DEBUG_STATE_r(                                              \
 344             PerlIO_printf(Perl_debug_log,                       \
 345                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 346                 (IV)PL_savestack_ix));                          \
 347     cp = PL_savestack_ix
 348
 349 #define REGCP_UNWIND(cp)                                        \
 350     DEBUG_STATE_r(                                              \
 351         if (cp != PL_savestack_ix)                              \
 352             PerlIO_printf(Perl_debug_log,                       \
 353                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 354                 (IV)(cp), (IV)PL_savestack_ix));                \
 355     regcpblow(cp)
 356
 357 STATIC char *
 358 S_regcppop(pTHX_ const regexp *rex)
 359 {
 360     dVAR;
 361     U32 i;
 362     char *input;
 363     GET_RE_DEBUG_FLAGS_DECL;
 364
 365     PERL_ARGS_ASSERT_REGCPPOP;
 366
 367     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 368     i = SSPOPINT;
 369     assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 370     i = SSPOPINT; /* Parentheses elements to pop. */
 371     input = (char *) SSPOPPTR;
 372     *PL_reglastcloseparen = SSPOPINT;
 373     *PL_reglastparen = SSPOPINT;
 374     PL_regsize = SSPOPINT;
 375     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 376
 377
 378     /* Now restore the parentheses context. */
 379     for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
 380          i > 0; i -= REGCP_PAREN_ELEMS) {
 381         I32 tmps;
 382         U32 paren = (U32)SSPOPINT;
 383         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 384         PL_regoffs[paren].start = SSPOPINT;
 385         tmps = SSPOPINT;
 386         if (paren <= *PL_reglastparen)
 387             PL_regoffs[paren].end = tmps;
 388         DEBUG_BUFFERS_r(
 389             PerlIO_printf(Perl_debug_log,
 390                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 391                           (UV)paren, (IV)PL_regoffs[paren].start,
 392                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 393                           (IV)PL_regoffs[paren].end,
 394                           (paren > *PL_reglastparen ? "(no)" : ""));
 395         );
 396     }
 397     DEBUG_BUFFERS_r(
 398         if (*PL_reglastparen + 1 <= rex->nparens) {
 399             PerlIO_printf(Perl_debug_log,
 400                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 401                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 402         }
 403     );
 404 #if 1
 405     /* It would seem that the similar code in regtry()
 406      * already takes care of this, and in fact it is in
 407      * a better location to since this code can #if 0-ed out
 408      * but the code in regtry() is needed or otherwise tests
 409      * requiring null fields (pat.t#187 and split.t#{13,14}
 410      * (as of patchlevel 7877)  will fail.  Then again,
 411      * this code seems to be necessary or otherwise
 412      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 413      * --jhi updated by dapm */
 414     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 415         if (i > PL_regsize)
 416             PL_regoffs[i].start = -1;
 417         PL_regoffs[i].end = -1;
 418     }
 419 #endif
 420     return input;
 421 }
 422
 423 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 424
 425 /*
 426  * pregexec and friends
 427  */
 428
 429 #ifndef PERL_IN_XSUB_RE
 430 /*
 431  - pregexec - match a regexp against a string
 432  */
 433 I32
 434 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 435          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 436 /* strend: pointer to null at end of string */
 437 /* strbeg: real beginning of string */
 438 /* minend: end of match must be >=minend after stringarg. */
 439 /* nosave: For optimizations. */
 440 {
 441     PERL_ARGS_ASSERT_PREGEXEC;
 442
 443     return
 444         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 445                       nosave ? 0 : REXEC_COPY_STR);
 446 }
 447 #endif
 448
 449 /*
 450  * Need to implement the following flags for reg_anch:
 451  *
 452  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 453  * USE_INTUIT_ML
 454  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 455  * INTUIT_AUTORITATIVE_ML
 456  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 457  * INTUIT_ONCE_ML
 458  *
 459  * Another flag for this function: SECOND_TIME (so that float substrs
 460  * with giant delta may be not rechecked).
 461  */
 462
 463 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 464
 465 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 466    Otherwise, only SvCUR(sv) is used to get strbeg. */
 467
 468 /* XXXX We assume that strpos is strbeg unless sv. */
 469
 470 /* XXXX Some places assume that there is a fixed substring.
 471         An update may be needed if optimizer marks as "INTUITable"
 472         RExen without fixed substrings.  Similarly, it is assumed that
 473         lengths of all the strings are no more than minlen, thus they
 474         cannot come from lookahead.
 475         (Or minlen should take into account lookahead.)
 476   NOTE: Some of this comment is not correct. minlen does now take account
 477   of lookahead/behind. Further research is required. -- demerphq
 478
 479 */
 480
 481 /* A failure to find a constant substring means that there is no need to make
 482    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 483    finding a substring too deep into the string means that less calls to
 484    regtry() should be needed.
 485
 486    REx compiler's optimizer found 4 possible hints:
 487         a) Anchored substring;
 488         b) Fixed substring;
 489         c) Whether we are anchored (beginning-of-line or \G);
 490         d) First node (of those at offset 0) which may distingush positions;
 491    We use a)b)d) and multiline-part of c), and try to find a position in the
 492    string which does not contradict any of them.
 493  */
 494
 495 /* Most of decisions we do here should have been done at compile time.
 496    The nodes of the REx which we used for the search should have been
 497    deleted from the finite automaton. */
 498
 499 char *
 500 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 501                      char *strend, const U32 flags, re_scream_pos_data *data)
 502 {
 503     dVAR;
 504     struct regexp *const prog = (struct regexp *)SvANY(rx);
 505     register I32 start_shift = 0;
 506     /* Should be nonnegative! */
 507     register I32 end_shift   = 0;
 508     register char *s;
 509     register SV *check;
 510     char *strbeg;
 511     char *t;
 512     const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 513     I32 ml_anch;
 514     register char *other_last = NULL;   /* other substr checked before this */
 515     char *check_at = NULL;              /* check substr found at this pos */
 516     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 517     RXi_GET_DECL(prog,progi);
 518 #ifdef DEBUGGING
 519     const char * const i_strpos = strpos;
 520 #endif
 521     GET_RE_DEBUG_FLAGS_DECL;
 522
 523     PERL_ARGS_ASSERT_RE_INTUIT_START;
 524
 525     RX_MATCH_UTF8_set(rx,do_utf8);
 526
 527     if (RX_UTF8(rx)) {
 528         PL_reg_flags |= RF_utf8;
 529     }
 530     DEBUG_EXECUTE_r(
 531         debug_start_match(rx, do_utf8, strpos, strend,
 532             sv ? "Guessing start of match in sv for"
 533                : "Guessing start of match in string for");
 534               );
 535
 536     /* CHR_DIST() would be more correct here but it makes things slow. */
 537     if (prog->minlen > strend - strpos) {
 538         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 539                               "String too short... [re_intuit_start]\n"));
 540         goto fail;
 541     }
 542
 543     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 544     PL_regeol = strend;
 545     if (do_utf8) {
 546         if (!prog->check_utf8 && prog->check_substr)
 547             to_utf8_substr(prog);
 548         check = prog->check_utf8;
 549     } else {
 550         if (!prog->check_substr && prog->check_utf8)
 551             to_byte_substr(prog);
 552         check = prog->check_substr;
 553     }
 554     if (check == &PL_sv_undef) {
 555         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 556                 "Non-utf8 string cannot match utf8 check string\n"));
 557         goto fail;
 558     }
 559     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 560         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 561                      || ( (prog->extflags & RXf_ANCH_BOL)
 562                           && !multiline ) );    /* Check after \n? */
 563
 564         if (!ml_anch) {
 565           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 566                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 567                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 568                && sv && !SvROK(sv)
 569                && (strpos != strbeg)) {
 570               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 571               goto fail;
 572           }
 573           if (prog->check_offset_min == prog->check_offset_max &&
 574               !(prog->extflags & RXf_CANY_SEEN)) {
 575             /* Substring at constant offset from beg-of-str... */
 576             I32 slen;
 577
 578             s = HOP3c(strpos, prog->check_offset_min, strend);
 579
 580             if (SvTAIL(check)) {
 581                 slen = SvCUR(check);    /* >= 1 */
 582
 583                 if ( strend - s > slen || strend - s < slen - 1
 584                      || (strend - s == slen && strend[-1] != '\n')) {
 585                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 586                     goto fail_finish;
 587                 }
 588                 /* Now should match s[0..slen-2] */
 589                 slen--;
 590                 if (slen && (*SvPVX_const(check) != *s
 591                              || (slen > 1
 592                                  && memNE(SvPVX_const(check), s, slen)))) {
 593                   report_neq:
 594                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 595                     goto fail_finish;
 596                 }
 597             }
 598             else if (*SvPVX_const(check) != *s
 599                      || ((slen = SvCUR(check)) > 1
 600                          && memNE(SvPVX_const(check), s, slen)))
 601                 goto report_neq;
 602             check_at = s;
 603             goto success_at_start;
 604           }
 605         }
 606         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 607         s = strpos;
 608         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 609         end_shift = prog->check_end_shift;
 610
 611         if (!ml_anch) {
 612             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 613                                          - (SvTAIL(check) != 0);
 614             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 615
 616             if (end_shift < eshift)
 617                 end_shift = eshift;
 618         }
 619     }
 620     else {                              /* Can match at random position */
 621         ml_anch = 0;
 622         s = strpos;
 623         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 624         end_shift = prog->check_end_shift;
 625
 626         /* end shift should be non negative here */
 627     }
 628
 629 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 630     if (end_shift < 0)
 631         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 632                    (IV)end_shift, RX_PRECOMP(prog));
 633 #endif
 634
 635   restart:
 636     /* Find a possible match in the region s..strend by looking for
 637        the "check" substring in the region corrected by start/end_shift. */
 638
 639     {
 640         I32 srch_start_shift = start_shift;
 641         I32 srch_end_shift = end_shift;
 642         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 643             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 644             srch_start_shift = strbeg - s;
 645         }
 646     DEBUG_OPTIMISE_MORE_r({
 647         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 648             (IV)prog->check_offset_min,
 649             (IV)srch_start_shift,
 650             (IV)srch_end_shift,
 651             (IV)prog->check_end_shift);
 652     });
 653
 654     if (flags & REXEC_SCREAM) {
 655         I32 p = -1;                     /* Internal iterator of scream. */
 656         I32 * const pp = data ? data->scream_pos : &p;
 657
 658         if (PL_screamfirst[BmRARE(check)] >= 0
 659             || ( BmRARE(check) == '\n'
 660                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 661                  && SvTAIL(check) ))
 662             s = screaminstr(sv, check,
 663                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 664         else
 665             goto fail_finish;
 666         /* we may be pointing at the wrong string */
 667         if (s && RXp_MATCH_COPIED(prog))
 668             s = strbeg + (s - SvPVX_const(sv));
 669         if (data)
 670             *data->scream_olds = s;
 671     }
 672     else {
 673         U8* start_point;
 674         U8* end_point;
 675         if (prog->extflags & RXf_CANY_SEEN) {
 676             start_point= (U8*)(s + srch_start_shift);
 677             end_point= (U8*)(strend - srch_end_shift);
 678         } else {
 679             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 680             end_point= HOP3(strend, -srch_end_shift, strbeg);
 681         }
 682         DEBUG_OPTIMISE_MORE_r({
 683             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 684                 (int)(end_point - start_point),
 685                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 686                 start_point);
 687         });
 688
 689         s = fbm_instr( start_point, end_point,
 690                       check, multiline ? FBMrf_MULTILINE : 0);
 691     }
 692     }
 693     /* Update the count-of-usability, remove useless subpatterns,
 694         unshift s.  */
 695
 696     DEBUG_EXECUTE_r({
 697         RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 698             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 699         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 700                           (s ? "Found" : "Did not find"),
 701             (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
 702                 ? "anchored" : "floating"),
 703             quoted,
 704             RE_SV_TAIL(check),
 705             (s ? " at offset " : "...\n") );
 706     });
 707
 708     if (!s)
 709         goto fail_finish;
 710     /* Finish the diagnostic message */
 711     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 712
 713     /* XXX dmq: first branch is for positive lookbehind...
 714        Our check string is offset from the beginning of the pattern.
 715        So we need to do any stclass tests offset forward from that
 716        point. I think. :-(
 717      */
 718
 719
 720
 721     check_at=s;
 722
 723
 724     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 725        Start with the other substr.
 726        XXXX no SCREAM optimization yet - and a very coarse implementation
 727        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 728                 *always* match.  Probably should be marked during compile...
 729        Probably it is right to do no SCREAM here...
 730      */
 731
 732     if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
 733                 : (prog->float_substr && prog->anchored_substr))
 734     {
 735         /* Take into account the "other" substring. */
 736         /* XXXX May be hopelessly wrong for UTF... */
 737         if (!other_last)
 738             other_last = strpos;
 739         if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
 740           do_other_anchored:
 741             {
 742                 char * const last = HOP3c(s, -start_shift, strbeg);
 743                 char *last1, *last2;
 744                 char * const saved_s = s;
 745                 SV* must;
 746
 747                 t = s - prog->check_offset_max;
 748                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 749                     && (!do_utf8
 750                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 751                             && t > strpos)))
 752                     NOOP;
 753                 else
 754                     t = strpos;
 755                 t = HOP3c(t, prog->anchored_offset, strend);
 756                 if (t < other_last)     /* These positions already checked */
 757                     t = other_last;
 758                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 759                 if (last < last1)
 760                     last1 = last;
 761                 /* XXXX It is not documented what units *_offsets are in.
 762                    We assume bytes, but this is clearly wrong.
 763                    Meaning this code needs to be carefully reviewed for errors.
 764                    dmq.
 765                   */
 766
 767                 /* On end-of-str: see comment below. */
 768                 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
 769                 if (must == &PL_sv_undef) {
 770                     s = (char*)NULL;
 771                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 772                 }
 773                 else
 774                     s = fbm_instr(
 775                         (unsigned char*)t,
 776                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 777                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 778                         must,
 779                         multiline ? FBMrf_MULTILINE : 0
 780                     );
 781                 DEBUG_EXECUTE_r({
 782                     RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 783                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 784                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 785                         (s ? "Found" : "Contradicts"),
 786                         quoted, RE_SV_TAIL(must));
 787                 });
 788
 789
 790                 if (!s) {
 791                     if (last1 >= last2) {
 792                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 793                                                 ", giving up...\n"));
 794                         goto fail_finish;
 795                     }
 796                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 797                         ", trying floating at offset %ld...\n",
 798                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 799                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 800                     s = HOP3c(last, 1, strend);
 801                     goto restart;
 802                 }
 803                 else {
 804                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 805                           (long)(s - i_strpos)));
 806                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 807                     other_last = HOP3c(s, 1, strend);
 808                     s = saved_s;
 809                     if (t == strpos)
 810                         goto try_at_start;
 811                     goto try_at_offset;
 812                 }
 813             }
 814         }
 815         else {          /* Take into account the floating substring. */
 816             char *last, *last1;
 817             char * const saved_s = s;
 818             SV* must;
 819
 820             t = HOP3c(s, -start_shift, strbeg);
 821             last1 = last =
 822                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 823             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 824                 last = HOP3c(t, prog->float_max_offset, strend);
 825             s = HOP3c(t, prog->float_min_offset, strend);
 826             if (s < other_last)
 827                 s = other_last;
 828  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 829             must = do_utf8 ? prog->float_utf8 : prog->float_substr;
 830             /* fbm_instr() takes into account exact value of end-of-str
 831                if the check is SvTAIL(ed).  Since false positives are OK,
 832                and end-of-str is not later than strend we are OK. */
 833             if (must == &PL_sv_undef) {
 834                 s = (char*)NULL;
 835                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 836             }
 837             else
 838                 s = fbm_instr((unsigned char*)s,
 839                               (unsigned char*)last + SvCUR(must)
 840                                   - (SvTAIL(must)!=0),
 841                               must, multiline ? FBMrf_MULTILINE : 0);
 842             DEBUG_EXECUTE_r({
 843                 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 844                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 845                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 846                     (s ? "Found" : "Contradicts"),
 847                     quoted, RE_SV_TAIL(must));
 848             });
 849             if (!s) {
 850                 if (last1 == last) {
 851                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 852                                             ", giving up...\n"));
 853                     goto fail_finish;
 854                 }
 855                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 856                     ", trying anchored starting at offset %ld...\n",
 857                     (long)(saved_s + 1 - i_strpos)));
 858                 other_last = last;
 859                 s = HOP3c(t, 1, strend);
 860                 goto restart;
 861             }
 862             else {
 863                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 864                       (long)(s - i_strpos)));
 865                 other_last = s; /* Fix this later. --Hugo */
 866                 s = saved_s;
 867                 if (t == strpos)
 868                     goto try_at_start;
 869                 goto try_at_offset;
 870             }
 871         }
 872     }
 873
 874
 875     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 876
 877     DEBUG_OPTIMISE_MORE_r(
 878         PerlIO_printf(Perl_debug_log,
 879             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 880             (IV)prog->check_offset_min,
 881             (IV)prog->check_offset_max,
 882             (IV)(s-strpos),
 883             (IV)(t-strpos),
 884             (IV)(t-s),
 885             (IV)(strend-strpos)
 886         )
 887     );
 888
 889     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 890         && (!do_utf8
 891             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 892                  && t > strpos)))
 893     {
 894         /* Fixed substring is found far enough so that the match
 895            cannot start at strpos. */
 896       try_at_offset:
 897         if (ml_anch && t[-1] != '\n') {
 898             /* Eventually fbm_*() should handle this, but often
 899                anchored_offset is not 0, so this check will not be wasted. */
 900             /* XXXX In the code below we prefer to look for "^" even in
 901                presence of anchored substrings.  And we search even
 902                beyond the found float position.  These pessimizations
 903                are historical artefacts only.  */
 904           find_anchor:
 905             while (t < strend - prog->minlen) {
 906                 if (*t == '\n') {
 907                     if (t < check_at - prog->check_offset_min) {
 908                         if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
 909                             /* Since we moved from the found position,
 910                                we definitely contradict the found anchored
 911                                substr.  Due to the above check we do not
 912                                contradict "check" substr.
 913                                Thus we can arrive here only if check substr
 914                                is float.  Redo checking for "other"=="fixed".
 915                              */
 916                             strpos = t + 1;
 917                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 918                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 919                             goto do_other_anchored;
 920                         }
 921                         /* We don't contradict the found floating substring. */
 922                         /* XXXX Why not check for STCLASS? */
 923                         s = t + 1;
 924                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 925                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 926                         goto set_useful;
 927                     }
 928                     /* Position contradicts check-string */
 929                     /* XXXX probably better to look for check-string
 930                        than for "\n", so one should lower the limit for t? */
 931                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 932                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 933                     other_last = strpos = s = t + 1;
 934                     goto restart;
 935                 }
 936                 t++;
 937             }
 938             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 939                         PL_colors[0], PL_colors[1]));
 940             goto fail_finish;
 941         }
 942         else {
 943             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 944                         PL_colors[0], PL_colors[1]));
 945         }
 946         s = t;
 947       set_useful:
 948         ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr);    /* hooray/5 */
 949     }
 950     else {
 951         /* The found string does not prohibit matching at strpos,
 952            - no optimization of calling REx engine can be performed,
 953            unless it was an MBOL and we are not after MBOL,
 954            or a future STCLASS check will fail this. */
 955       try_at_start:
 956         /* Even in this situation we may use MBOL flag if strpos is offset
 957            wrt the start of the string. */
 958         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 959             && (strpos != strbeg) && strpos[-1] != '\n'
 960             /* May be due to an implicit anchor of m{.*foo}  */
 961             && !(prog->intflags & PREGf_IMPLICIT))
 962         {
 963             t = strpos;
 964             goto find_anchor;
 965         }
 966         DEBUG_EXECUTE_r( if (ml_anch)
 967             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
 968                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
 969         );
 970       success_at_start:
 971         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
 972             && (do_utf8 ? (
 973                 prog->check_utf8                /* Could be deleted already */
 974                 && --BmUSEFUL(prog->check_utf8) < 0
 975                 && (prog->check_utf8 == prog->float_utf8)
 976             ) : (
 977                 prog->check_substr              /* Could be deleted already */
 978                 && --BmUSEFUL(prog->check_substr) < 0
 979                 && (prog->check_substr == prog->float_substr)
 980             )))
 981         {
 982             /* If flags & SOMETHING - do not do it many times on the same match */
 983             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
 984             /* XXX Does the destruction order has to change with do_utf8? */
 985             SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
 986             SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
 987             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
 988             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
 989             check = NULL;                       /* abort */
 990             s = strpos;
 991             /* XXXX This is a remnant of the old implementation.  It
 992                     looks wasteful, since now INTUIT can use many
 993                     other heuristics. */
 994             prog->extflags &= ~RXf_USE_INTUIT;
 995         }
 996         else
 997             s = strpos;
 998     }
 999
1000     /* Last resort... */
1001     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1002     /* trie stclasses are too expensive to use here, we are better off to
1003        leave it to regmatch itself */
1004     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1005         /* minlen == 0 is possible if regstclass is \b or \B,
1006            and the fixed substr is ''$.
1007            Since minlen is already taken into account, s+1 is before strend;
1008            accidentally, minlen >= 1 guaranties no false positives at s + 1
1009            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1010            regstclass does not come from lookahead...  */
1011         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1012            This leaves EXACTF only, which is dealt with in find_byclass().  */
1013         const U8* const str = (U8*)STRING(progi->regstclass);
1014         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1015                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1016                     : 1);
1017         char * endpos;
1018         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1019             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1020         else if (prog->float_substr || prog->float_utf8)
1021             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1022         else
1023             endpos= strend;
1024
1025         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1026                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1027
1028         t = s;
1029         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1030         if (!s) {
1031 #ifdef DEBUGGING
1032             const char *what = NULL;
1033 #endif
1034             if (endpos == strend) {
1035                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1036                                 "Could not match STCLASS...\n") );
1037                 goto fail;
1038             }
1039             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1040                                    "This position contradicts STCLASS...\n") );
1041             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1042                 goto fail;
1043             /* Contradict one of substrings */
1044             if (prog->anchored_substr || prog->anchored_utf8) {
1045                 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1046                     DEBUG_EXECUTE_r( what = "anchored" );
1047                   hop_and_restart:
1048                     s = HOP3c(t, 1, strend);
1049                     if (s + start_shift + end_shift > strend) {
1050                         /* XXXX Should be taken into account earlier? */
1051                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1052                                                "Could not match STCLASS...\n") );
1053                         goto fail;
1054                     }
1055                     if (!check)
1056                         goto giveup;
1057                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1058                                 "Looking for %s substr starting at offset %ld...\n",
1059                                  what, (long)(s + start_shift - i_strpos)) );
1060                     goto restart;
1061                 }
1062                 /* Have both, check_string is floating */
1063                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1064                     goto retry_floating_check;
1065                 /* Recheck anchored substring, but not floating... */
1066                 s = check_at;
1067                 if (!check)
1068                     goto giveup;
1069                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1070                           "Looking for anchored substr starting at offset %ld...\n",
1071                           (long)(other_last - i_strpos)) );
1072                 goto do_other_anchored;
1073             }
1074             /* Another way we could have checked stclass at the
1075                current position only: */
1076             if (ml_anch) {
1077                 s = t = t + 1;
1078                 if (!check)
1079                     goto giveup;
1080                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1081                           "Looking for /%s^%s/m starting at offset %ld...\n",
1082                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1083                 goto try_at_offset;
1084             }
1085             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))     /* Could have been deleted */
1086                 goto fail;
1087             /* Check is floating subtring. */
1088           retry_floating_check:
1089             t = check_at - start_shift;
1090             DEBUG_EXECUTE_r( what = "floating" );
1091             goto hop_and_restart;
1092         }
1093         if (t != s) {
1094             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1095                         "By STCLASS: moving %ld --> %ld\n",
1096                                   (long)(t - i_strpos), (long)(s - i_strpos))
1097                    );
1098         }
1099         else {
1100             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1101                                   "Does not contradict STCLASS...\n");
1102                    );
1103         }
1104     }
1105   giveup:
1106     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1107                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1108                           PL_colors[5], (long)(s - i_strpos)) );
1109     return s;
1110
1111   fail_finish:                          /* Substring not found */
1112     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1113         BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1114   fail:
1115     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1116                           PL_colors[4], PL_colors[5]));
1117     return NULL;
1118 }
1119
1120 #define DECL_TRIE_TYPE(scan) \
1121     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1122                     trie_type = (scan->flags != EXACT) \
1123                               ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1124                               : (do_utf8 ? trie_utf8 : trie_plain)
1125
1126 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1127 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1128     switch (trie_type) {                                                    \
1129     case trie_utf8_fold:                                                    \
1130         if ( foldlen>0 ) {                                                  \
1131             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1132             foldlen -= len;                                                 \
1133             uscan += len;                                                   \
1134             len=0;                                                          \
1135         } else {                                                            \
1136             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1137             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1138             foldlen -= UNISKIP( uvc );                                      \
1139             uscan = foldbuf + UNISKIP( uvc );                               \
1140         }                                                                   \
1141         break;                                                              \
1142     case trie_latin_utf8_fold:                                              \
1143         if ( foldlen>0 ) {                                                  \
1144             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1145             foldlen -= len;                                                 \
1146             uscan += len;                                                   \
1147             len=0;                                                          \
1148         } else {                                                            \
1149             len = 1;                                                        \
1150             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1151             foldlen -= UNISKIP( uvc );                                      \
1152             uscan = foldbuf + UNISKIP( uvc );                               \
1153         }                                                                   \
1154         break;                                                              \
1155     case trie_utf8:                                                         \
1156         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1157         break;                                                              \
1158     case trie_plain:                                                        \
1159         uvc = (UV)*uc;                                                      \
1160         len = 1;                                                            \
1161     }                                                                       \
1162     if (uvc < 256) {                                                        \
1163         charid = trie->charmap[ uvc ];                                      \
1164     }                                                                       \
1165     else {                                                                  \
1166         charid = 0;                                                         \
1167         if (widecharmap) {                                                  \
1168             SV** const svpp = hv_fetch(widecharmap,                         \
1169                         (char*)&uvc, sizeof(UV), 0);                        \
1170             if (svpp)                                                       \
1171                 charid = (U16)SvIV(*svpp);                                  \
1172         }                                                                   \
1173     }                                                                       \
1174 } STMT_END
1175
1176 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1177 {                                                      \
1178     char *my_strend= (char *)strend;                   \
1179     if ( (CoNd)                                        \
1180          && (ln == len ||                              \
1181              !ibcmp_utf8(s, &my_strend, 0,  do_utf8,   \
1182                         m, NULL, ln, (bool)UTF))       \
1183          && (!reginfo || regtry(reginfo, &s)) )        \
1184         goto got_it;                                   \
1185     else {                                             \
1186          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1187          uvchr_to_utf8(tmpbuf, c);                     \
1188          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1189          if ( f != c                                   \
1190               && (f == c1 || f == c2)                  \
1191               && (ln == len ||                         \
1192                 !ibcmp_utf8(s, &my_strend, 0,  do_utf8,\
1193                               m, NULL, ln, (bool)UTF)) \
1194               && (!reginfo || regtry(reginfo, &s)) )   \
1195               goto got_it;                             \
1196     }                                                  \
1197 }                                                      \
1198 s += len
1199
1200 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1201 STMT_START {                                              \
1202     while (s <= e) {                                      \
1203         if ( (CoNd)                                       \
1204              && (ln == 1 || !(OP(c) == EXACTF             \
1205                               ? ibcmp(s, m, ln)           \
1206                               : ibcmp_locale(s, m, ln)))  \
1207              && (!reginfo || regtry(reginfo, &s)) )        \
1208             goto got_it;                                  \
1209         s++;                                              \
1210     }                                                     \
1211 } STMT_END
1212
1213 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1214 STMT_START {                                          \
1215     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1216         CoDe                                          \
1217         s += uskip;                                   \
1218     }                                                 \
1219 } STMT_END
1220
1221 #define REXEC_FBC_SCAN(CoDe)                          \
1222 STMT_START {                                          \
1223     while (s < strend) {                              \
1224         CoDe                                          \
1225         s++;                                          \
1226     }                                                 \
1227 } STMT_END
1228
1229 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1230 REXEC_FBC_UTF8_SCAN(                                  \
1231     if (CoNd) {                                       \
1232         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1233             goto got_it;                              \
1234         else                                          \
1235             tmp = doevery;                            \
1236     }                                                 \
1237     else                                              \
1238         tmp = 1;                                      \
1239 )
1240
1241 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1242 REXEC_FBC_SCAN(                                       \
1243     if (CoNd) {                                       \
1244         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1245             goto got_it;                              \
1246         else                                          \
1247             tmp = doevery;                            \
1248     }                                                 \
1249     else                                              \
1250         tmp = 1;                                      \
1251 )
1252
1253 #define REXEC_FBC_TRYIT               \
1254 if ((!reginfo || regtry(reginfo, &s))) \
1255     goto got_it
1256
1257 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1258     if (do_utf8) {                                             \
1259         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1260     }                                                          \
1261     else {                                                     \
1262         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1263     }                                                          \
1264     break
1265
1266 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1267     if (do_utf8) {                                             \
1268         UtFpReLoAd;                                            \
1269         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1270     }                                                          \
1271     else {                                                     \
1272         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1273     }                                                          \
1274     break
1275
1276 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1277     PL_reg_flags |= RF_tainted;                                \
1278     if (do_utf8) {                                             \
1279         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1280     }                                                          \
1281     else {                                                     \
1282         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1283     }                                                          \
1284     break
1285
1286 #define DUMP_EXEC_POS(li,s,doutf8) \
1287     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1288
1289 /* We know what class REx starts with.  Try to find this position... */
1290 /* if reginfo is NULL, its a dryrun */
1291 /* annoyingly all the vars in this routine have different names from their counterparts
1292    in regmatch. /grrr */
1293
1294 STATIC char *
1295 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1296     const char *strend, regmatch_info *reginfo)
1297 {
1298         dVAR;
1299         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1300         char *m;
1301         STRLEN ln;
1302         STRLEN lnc;
1303         register STRLEN uskip;
1304         unsigned int c1;
1305         unsigned int c2;
1306         char *e;
1307         register I32 tmp = 1;   /* Scratch variable? */
1308         register const bool do_utf8 = PL_reg_match_utf8;
1309         RXi_GET_DECL(prog,progi);
1310
1311         PERL_ARGS_ASSERT_FIND_BYCLASS;
1312
1313         /* We know what class it must start with. */
1314         switch (OP(c)) {
1315         case ANYOF:
1316             if (do_utf8) {
1317                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1318                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1319                           reginclass(prog, c, (U8*)s, 0, do_utf8) :
1320                           REGINCLASS(prog, c, (U8*)s));
1321             }
1322             else {
1323                  while (s < strend) {
1324                       STRLEN skip = 1;
1325
1326                       if (REGINCLASS(prog, c, (U8*)s) ||
1327                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1328                            /* The assignment of 2 is intentional:
1329                             * for the folded sharp s, the skip is 2. */
1330                            (skip = SHARP_S_SKIP))) {
1331                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1332                                 goto got_it;
1333                            else
1334                                 tmp = doevery;
1335                       }
1336                       else
1337                            tmp = 1;
1338                       s += skip;
1339                  }
1340             }
1341             break;
1342         case CANY:
1343             REXEC_FBC_SCAN(
1344                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1345                     goto got_it;
1346                 else
1347                     tmp = doevery;
1348             );
1349             break;
1350         case EXACTF:
1351             m   = STRING(c);
1352             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1353             lnc = (I32) ln;     /* length to match in characters */
1354             if (UTF) {
1355                 STRLEN ulen1, ulen2;
1356                 U8 *sm = (U8 *) m;
1357                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1358                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1359                 /* used by commented-out code below */
1360                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1361
1362                 /* XXX: Since the node will be case folded at compile
1363                    time this logic is a little odd, although im not
1364                    sure that its actually wrong. --dmq */
1365
1366                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1367                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1368
1369                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1370                    codepoint of the first character in the converted
1371                    form, yet originally we did the extra step.
1372                    No tests fail by commenting this code out however
1373                    so Ive left it out. -- dmq.
1374
1375                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1376                                     0, uniflags);
1377                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1378                                     0, uniflags);
1379                 */
1380
1381                 lnc = 0;
1382                 while (sm < ((U8 *) m + ln)) {
1383                     lnc++;
1384                     sm += UTF8SKIP(sm);
1385                 }
1386             }
1387             else {
1388                 c1 = *(U8*)m;
1389                 c2 = PL_fold[c1];
1390             }
1391             goto do_exactf;
1392         case EXACTFL:
1393             m   = STRING(c);
1394             ln  = STR_LEN(c);
1395             lnc = (I32) ln;
1396             c1 = *(U8*)m;
1397             c2 = PL_fold_locale[c1];
1398           do_exactf:
1399             e = HOP3c(strend, -((I32)lnc), s);
1400
1401             if (!reginfo && e < s)
1402                 e = s;                  /* Due to minlen logic of intuit() */
1403
1404             /* The idea in the EXACTF* cases is to first find the
1405              * first character of the EXACTF* node and then, if
1406              * necessary, case-insensitively compare the full
1407              * text of the node.  The c1 and c2 are the first
1408              * characters (though in Unicode it gets a bit
1409              * more complicated because there are more cases
1410              * than just upper and lower: one needs to use
1411              * the so-called folding case for case-insensitive
1412              * matching (called "loose matching" in Unicode).
1413              * ibcmp_utf8() will do just that. */
1414
1415             if (do_utf8 || UTF) {
1416                 UV c, f;
1417                 U8 tmpbuf [UTF8_MAXBYTES+1];
1418                 STRLEN len = 1;
1419                 STRLEN foldlen;
1420                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1421                 if (c1 == c2) {
1422                     /* Upper and lower of 1st char are equal -
1423                      * probably not a "letter". */
1424                     while (s <= e) {
1425                         if (do_utf8) {
1426                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1427                                            uniflags);
1428                         } else {
1429                             c = *((U8*)s);
1430                         }
1431                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1432                     }
1433                 }
1434                 else {
1435                     while (s <= e) {
1436                         if (do_utf8) {
1437                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1438                                            uniflags);
1439                         } else {
1440                             c = *((U8*)s);
1441                         }
1442
1443                         /* Handle some of the three Greek sigmas cases.
1444                          * Note that not all the possible combinations
1445                          * are handled here: some of them are handled
1446                          * by the standard folding rules, and some of
1447                          * them (the character class or ANYOF cases)
1448                          * are handled during compiletime in
1449                          * regexec.c:S_regclass(). */
1450                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1451                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1452                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1453
1454                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1455                     }
1456                 }
1457             }
1458             else {
1459                 /* Neither pattern nor string are UTF8 */
1460                 if (c1 == c2)
1461                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1462                 else
1463                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1464             }
1465             break;
1466         case BOUNDL:
1467             PL_reg_flags |= RF_tainted;
1468             /* FALL THROUGH */
1469         case BOUND:
1470             if (do_utf8) {
1471                 if (s == PL_bostr)
1472                     tmp = '\n';
1473                 else {
1474                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1475                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1476                 }
1477                 tmp = ((OP(c) == BOUND ?
1478                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1479                 LOAD_UTF8_CHARCLASS_ALNUM();
1480                 REXEC_FBC_UTF8_SCAN(
1481                     if (tmp == !(OP(c) == BOUND ?
1482                                  (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1483                                  isALNUM_LC_utf8((U8*)s)))
1484                     {
1485                         tmp = !tmp;
1486                         REXEC_FBC_TRYIT;
1487                 }
1488                 );
1489             }
1490             else {
1491                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1492                 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1493                 REXEC_FBC_SCAN(
1494                     if (tmp ==
1495                         !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1496                         tmp = !tmp;
1497                         REXEC_FBC_TRYIT;
1498                 }
1499                 );
1500             }
1501             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1502                 goto got_it;
1503             break;
1504         case NBOUNDL:
1505             PL_reg_flags |= RF_tainted;
1506             /* FALL THROUGH */
1507         case NBOUND:
1508             if (do_utf8) {
1509                 if (s == PL_bostr)
1510                     tmp = '\n';
1511                 else {
1512                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1513                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1514                 }
1515                 tmp = ((OP(c) == NBOUND ?
1516                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1517                 LOAD_UTF8_CHARCLASS_ALNUM();
1518                 REXEC_FBC_UTF8_SCAN(
1519                     if (tmp == !(OP(c) == NBOUND ?
1520                                  (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1521                                  isALNUM_LC_utf8((U8*)s)))
1522                         tmp = !tmp;
1523                     else REXEC_FBC_TRYIT;
1524                 );
1525             }
1526             else {
1527                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1528                 tmp = ((OP(c) == NBOUND ?
1529                         isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1530                 REXEC_FBC_SCAN(
1531                     if (tmp ==
1532                         !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1533                         tmp = !tmp;
1534                     else REXEC_FBC_TRYIT;
1535                 );
1536             }
1537             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1538                 goto got_it;
1539             break;
1540         case ALNUM:
1541             REXEC_FBC_CSCAN_PRELOAD(
1542                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1543                 swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1544                 isALNUM(*s)
1545             );
1546         case ALNUML:
1547             REXEC_FBC_CSCAN_TAINT(
1548                 isALNUM_LC_utf8((U8*)s),
1549                 isALNUM_LC(*s)
1550             );
1551         case NALNUM:
1552             REXEC_FBC_CSCAN_PRELOAD(
1553                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1554                 !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1555                 !isALNUM(*s)
1556             );
1557         case NALNUML:
1558             REXEC_FBC_CSCAN_TAINT(
1559                 !isALNUM_LC_utf8((U8*)s),
1560                 !isALNUM_LC(*s)
1561             );
1562         case SPACE:
1563             REXEC_FBC_CSCAN_PRELOAD(
1564                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1565                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
1566                 isSPACE(*s)
1567             );
1568         case SPACEL:
1569             REXEC_FBC_CSCAN_TAINT(
1570                 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1571                 isSPACE_LC(*s)
1572             );
1573         case NSPACE:
1574             REXEC_FBC_CSCAN_PRELOAD(
1575                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1576                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
1577                 !isSPACE(*s)
1578             );
1579         case NSPACEL:
1580             REXEC_FBC_CSCAN_TAINT(
1581                 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1582                 !isSPACE_LC(*s)
1583             );
1584         case DIGIT:
1585             REXEC_FBC_CSCAN_PRELOAD(
1586                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1587                 swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1588                 isDIGIT(*s)
1589             );
1590         case DIGITL:
1591             REXEC_FBC_CSCAN_TAINT(
1592                 isDIGIT_LC_utf8((U8*)s),
1593                 isDIGIT_LC(*s)
1594             );
1595         case NDIGIT:
1596             REXEC_FBC_CSCAN_PRELOAD(
1597                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1598                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1599                 !isDIGIT(*s)
1600             );
1601         case NDIGITL:
1602             REXEC_FBC_CSCAN_TAINT(
1603                 !isDIGIT_LC_utf8((U8*)s),
1604                 !isDIGIT_LC(*s)
1605             );
1606         case LNBREAK:
1607             REXEC_FBC_CSCAN(
1608                 is_LNBREAK_utf8(s),
1609                 is_LNBREAK_latin1(s)
1610             );
1611         case VERTWS:
1612             REXEC_FBC_CSCAN(
1613                 is_VERTWS_utf8(s),
1614                 is_VERTWS_latin1(s)
1615             );
1616         case NVERTWS:
1617             REXEC_FBC_CSCAN(
1618                 !is_VERTWS_utf8(s),
1619                 !is_VERTWS_latin1(s)
1620             );
1621         case HORIZWS:
1622             REXEC_FBC_CSCAN(
1623                 is_HORIZWS_utf8(s),
1624                 is_HORIZWS_latin1(s)
1625             );
1626         case NHORIZWS:
1627             REXEC_FBC_CSCAN(
1628                 !is_HORIZWS_utf8(s),
1629                 !is_HORIZWS_latin1(s)
1630             );
1631         case AHOCORASICKC:
1632         case AHOCORASICK:
1633             {
1634                 DECL_TRIE_TYPE(c);
1635                 /* what trie are we using right now */
1636                 reg_ac_data *aho
1637                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1638                 reg_trie_data *trie
1639                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1640                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1641
1642                 const char *last_start = strend - trie->minlen;
1643 #ifdef DEBUGGING
1644                 const char *real_start = s;
1645 #endif
1646                 STRLEN maxlen = trie->maxlen;
1647                 SV *sv_points;
1648                 U8 **points; /* map of where we were in the input string
1649                                 when reading a given char. For ASCII this
1650                                 is unnecessary overhead as the relationship
1651                                 is always 1:1, but for Unicode, especially
1652                                 case folded Unicode this is not true. */
1653                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1654                 U8 *bitmap=NULL;
1655
1656
1657                 GET_RE_DEBUG_FLAGS_DECL;
1658
1659                 /* We can't just allocate points here. We need to wrap it in
1660                  * an SV so it gets freed properly if there is a croak while
1661                  * running the match */
1662                 ENTER;
1663                 SAVETMPS;
1664                 sv_points=newSV(maxlen * sizeof(U8 *));
1665                 SvCUR_set(sv_points,
1666                     maxlen * sizeof(U8 *));
1667                 SvPOK_on(sv_points);
1668                 sv_2mortal(sv_points);
1669                 points=(U8**)SvPV_nolen(sv_points );
1670                 if ( trie_type != trie_utf8_fold
1671                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1672                 {
1673                     if (trie->bitmap)
1674                         bitmap=(U8*)trie->bitmap;
1675                     else
1676                         bitmap=(U8*)ANYOF_BITMAP(c);
1677                 }
1678                 /* this is the Aho-Corasick algorithm modified a touch
1679                    to include special handling for long "unknown char"
1680                    sequences. The basic idea being that we use AC as long
1681                    as we are dealing with a possible matching char, when
1682                    we encounter an unknown char (and we have not encountered
1683                    an accepting state) we scan forward until we find a legal
1684                    starting char.
1685                    AC matching is basically that of trie matching, except
1686                    that when we encounter a failing transition, we fall back
1687                    to the current states "fail state", and try the current char
1688                    again, a process we repeat until we reach the root state,
1689                    state 1, or a legal transition. If we fail on the root state
1690                    then we can either terminate if we have reached an accepting
1691                    state previously, or restart the entire process from the beginning
1692                    if we have not.
1693
1694                  */
1695                 while (s <= last_start) {
1696                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1697                     U8 *uc = (U8*)s;
1698                     U16 charid = 0;
1699                     U32 base = 1;
1700                     U32 state = 1;
1701                     UV uvc = 0;
1702                     STRLEN len = 0;
1703                     STRLEN foldlen = 0;
1704                     U8 *uscan = (U8*)NULL;
1705                     U8 *leftmost = NULL;
1706 #ifdef DEBUGGING
1707                     U32 accepted_word= 0;
1708 #endif
1709                     U32 pointpos = 0;
1710
1711                     while ( state && uc <= (U8*)strend ) {
1712                         int failed=0;
1713                         U32 word = aho->states[ state ].wordnum;
1714
1715                         if( state==1 ) {
1716                             if ( bitmap ) {
1717                                 DEBUG_TRIE_EXECUTE_r(
1718                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1719                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1720                                             (char *)uc, do_utf8 );
1721                                         PerlIO_printf( Perl_debug_log,
1722                                             " Scanning for legal start char...\n");
1723                                     }
1724                                 );
1725                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1726                                     uc++;
1727                                 }
1728                                 s= (char *)uc;
1729                             }
1730                             if (uc >(U8*)last_start) break;
1731                         }
1732
1733                         if ( word ) {
1734                             U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1735                             if (!leftmost || lpos < leftmost) {
1736                                 DEBUG_r(accepted_word=word);
1737                                 leftmost= lpos;
1738                             }
1739                             if (base==0) break;
1740
1741                         }
1742                         points[pointpos++ % maxlen]= uc;
1743                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1744                                              uscan, len, uvc, charid, foldlen,
1745                                              foldbuf, uniflags);
1746                         DEBUG_TRIE_EXECUTE_r({
1747                             dump_exec_pos( (char *)uc, c, strend, real_start,
1748                                 s,   do_utf8 );
1749                             PerlIO_printf(Perl_debug_log,
1750                                 " Charid:%3u CP:%4"UVxf" ",
1751                                  charid, uvc);
1752                         });
1753
1754                         do {
1755 #ifdef DEBUGGING
1756                             word = aho->states[ state ].wordnum;
1757 #endif
1758                             base = aho->states[ state ].trans.base;
1759
1760                             DEBUG_TRIE_EXECUTE_r({
1761                                 if (failed)
1762                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1763                                         s,   do_utf8 );
1764                                 PerlIO_printf( Perl_debug_log,
1765                                     "%sState: %4"UVxf", word=%"UVxf,
1766                                     failed ? " Fail transition to " : "",
1767                                     (UV)state, (UV)word);
1768                             });
1769                             if ( base ) {
1770                                 U32 tmp;
1771                                 if (charid &&
1772                                      (base + charid > trie->uniquecharcount )
1773                                      && (base + charid - 1 - trie->uniquecharcount
1774                                             < trie->lasttrans)
1775                                      && trie->trans[base + charid - 1 -
1776                                             trie->uniquecharcount].check == state
1777                                      && (tmp=trie->trans[base + charid - 1 -
1778                                         trie->uniquecharcount ].next))
1779                                 {
1780                                     DEBUG_TRIE_EXECUTE_r(
1781                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1782                                     state = tmp;
1783                                     break;
1784                                 }
1785                                 else {
1786                                     DEBUG_TRIE_EXECUTE_r(
1787                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1788                                     failed = 1;
1789                                     state = aho->fail[state];
1790                                 }
1791                             }
1792                             else {
1793                                 /* we must be accepting here */
1794                                 DEBUG_TRIE_EXECUTE_r(
1795                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1796                                 failed = 1;
1797                                 break;
1798                             }
1799                         } while(state);
1800                         uc += len;
1801                         if (failed) {
1802                             if (leftmost)
1803                                 break;
1804                             if (!state) state = 1;
1805                         }
1806                     }
1807                     if ( aho->states[ state ].wordnum ) {
1808                         U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1809                         if (!leftmost || lpos < leftmost) {
1810                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1811                             leftmost = lpos;
1812                         }
1813                     }
1814                     if (leftmost) {
1815                         s = (char*)leftmost;
1816                         DEBUG_TRIE_EXECUTE_r({
1817                             PerlIO_printf(
1818                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1819                                 (UV)accepted_word, (IV)(s - real_start)
1820                             );
1821                         });
1822                         if (!reginfo || regtry(reginfo, &s)) {
1823                             FREETMPS;
1824                             LEAVE;
1825                             goto got_it;
1826                         }
1827                         s = HOPc(s,1);
1828                         DEBUG_TRIE_EXECUTE_r({
1829                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1830                         });
1831                     } else {
1832                         DEBUG_TRIE_EXECUTE_r(
1833                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1834                         break;
1835                     }
1836                 }
1837                 FREETMPS;
1838                 LEAVE;
1839             }
1840             break;
1841         default:
1842             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1843             break;
1844         }
1845         return 0;
1846       got_it:
1847         return s;
1848 }
1849
1850
1851 /*
1852  - regexec_flags - match a regexp against a string
1853  */
1854 I32
1855 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1856               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1857 /* strend: pointer to null at end of string */
1858 /* strbeg: real beginning of string */
1859 /* minend: end of match must be >=minend after stringarg. */
1860 /* data: May be used for some additional optimizations.
1861          Currently its only used, with a U32 cast, for transmitting
1862          the ganch offset when doing a /g match. This will change */
1863 /* nosave: For optimizations. */
1864 {
1865     dVAR;
1866     struct regexp *const prog = (struct regexp *)SvANY(rx);
1867     /*register*/ char *s;
1868     register regnode *c;
1869     /*register*/ char *startpos = stringarg;
1870     I32 minlen;         /* must match at least this many chars */
1871     I32 dontbother = 0; /* how many characters not to try at end */
1872     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1873     I32 scream_pos = -1;                /* Internal iterator of scream. */
1874     char *scream_olds = NULL;
1875     const bool do_utf8 = (bool)DO_UTF8(sv);
1876     I32 multiline;
1877     RXi_GET_DECL(prog,progi);
1878     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1879     regexp_paren_pair *swap = NULL;
1880     GET_RE_DEBUG_FLAGS_DECL;
1881
1882     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1883     PERL_UNUSED_ARG(data);
1884
1885     /* Be paranoid... */
1886     if (prog == NULL || startpos == NULL) {
1887         Perl_croak(aTHX_ "NULL regexp parameter");
1888         return 0;
1889     }
1890
1891     multiline = prog->extflags & RXf_PMf_MULTILINE;
1892     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1893
1894     RX_MATCH_UTF8_set(rx, do_utf8);
1895     DEBUG_EXECUTE_r(
1896         debug_start_match(rx, do_utf8, startpos, strend,
1897         "Matching");
1898     );
1899
1900     minlen = prog->minlen;
1901
1902     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1903         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1904                               "String too short [regexec_flags]...\n"));
1905         goto phooey;
1906     }
1907
1908
1909     /* Check validity of program. */
1910     if (UCHARAT(progi->program) != REG_MAGIC) {
1911         Perl_croak(aTHX_ "corrupted regexp program");
1912     }
1913
1914     PL_reg_flags = 0;
1915     PL_reg_eval_set = 0;
1916     PL_reg_maxiter = 0;
1917
1918     if (RX_UTF8(rx))
1919         PL_reg_flags |= RF_utf8;
1920
1921     /* Mark beginning of line for ^ and lookbehind. */
1922     reginfo.bol = startpos; /* XXX not used ??? */
1923     PL_bostr  = strbeg;
1924     reginfo.sv = sv;
1925
1926     /* Mark end of line for $ (and such) */
1927     PL_regeol = strend;
1928
1929     /* see how far we have to get to not match where we matched before */
1930     reginfo.till = startpos+minend;
1931
1932     /* If there is a "must appear" string, look for it. */
1933     s = startpos;
1934
1935     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1936         MAGIC *mg;
1937         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
1938             reginfo.ganch = startpos + prog->gofs;
1939             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1940               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1941         } else if (sv && SvTYPE(sv) >= SVt_PVMG
1942                   && SvMAGIC(sv)
1943                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1944                   && mg->mg_len >= 0) {
1945             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
1946             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1947                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
1948
1949             if (prog->extflags & RXf_ANCH_GPOS) {
1950                 if (s > reginfo.ganch)
1951                     goto phooey;
1952                 s = reginfo.ganch - prog->gofs;
1953                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1954                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
1955                 if (s < strbeg)
1956                     goto phooey;
1957             }
1958         }
1959         else if (data) {
1960             reginfo.ganch = strbeg + PTR2UV(data);
1961             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1962                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1963
1964         } else {                                /* pos() not defined */
1965             reginfo.ganch = strbeg;
1966             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1967                  "GPOS: reginfo.ganch = strbeg\n"));
1968         }
1969     }
1970     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1971         /* We have to be careful. If the previous successful match
1972            was from this regex we don't want a subsequent partially
1973            successful match to clobber the old results.
1974            So when we detect this possibility we add a swap buffer
1975            to the re, and switch the buffer each match. If we fail
1976            we switch it back, otherwise we leave it swapped.
1977         */
1978         swap = prog->offs;
1979         /* do we need a save destructor here for eval dies? */
1980         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
1981     }
1982     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1983         re_scream_pos_data d;
1984
1985         d.scream_olds = &scream_olds;
1986         d.scream_pos = &scream_pos;
1987         s = re_intuit_start(rx, sv, s, strend, flags, &d);
1988         if (!s) {
1989             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1990             goto phooey;        /* not present */
1991         }
1992     }
1993
1994
1995
1996     /* Simplest case:  anchored match need be tried only once. */
1997     /*  [unless only anchor is BOL and multiline is set] */
1998     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1999         if (s == startpos && regtry(&reginfo, &startpos))
2000             goto got_it;
2001         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2002                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2003         {
2004             char *end;
2005
2006             if (minlen)
2007                 dontbother = minlen - 1;
2008             end = HOP3c(strend, -dontbother, strbeg) - 1;
2009             /* for multiline we only have to try after newlines */
2010             if (prog->check_substr || prog->check_utf8) {
2011                 /* because of the goto we can not easily reuse the macros for bifurcating the
2012                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2013                 if (do_utf8) {
2014                     if (s == startpos)
2015                         goto after_try_utf8;
2016                     while (1) {
2017                         if (regtry(&reginfo, &s)) {
2018                             goto got_it;
2019                         }
2020                       after_try_utf8:
2021                         if (s > end) {
2022                             goto phooey;
2023                         }
2024                         if (prog->extflags & RXf_USE_INTUIT) {
2025                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2026                             if (!s) {
2027                                 goto phooey;
2028                             }
2029                         }
2030                         else {
2031                             s += UTF8SKIP(s);
2032                         }
2033                     }
2034                 } /* end search for check string in unicode */
2035                 else {
2036                     if (s == startpos) {
2037                         goto after_try_latin;
2038                     }
2039                     while (1) {
2040                         if (regtry(&reginfo, &s)) {
2041                             goto got_it;
2042                         }
2043                       after_try_latin:
2044                         if (s > end) {
2045                             goto phooey;
2046                         }
2047                         if (prog->extflags & RXf_USE_INTUIT) {
2048                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2049                             if (!s) {
2050                                 goto phooey;
2051                             }
2052                         }
2053                         else {
2054                             s++;
2055                         }
2056                     }
2057                 } /* end search for check string in latin*/
2058             } /* end search for check string */
2059             else { /* search for newline */
2060                 if (s > startpos) {
2061                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2062                     s--;
2063                 }
2064                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2065                 while (s < end) {
2066                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2067                         if (regtry(&reginfo, &s))
2068                             goto got_it;
2069                     }
2070                 }
2071             } /* end search for newline */
2072         } /* end anchored/multiline check string search */
2073         goto phooey;
2074     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2075     {
2076         /* the warning about reginfo.ganch being used without intialization
2077            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2078            and we only enter this block when the same bit is set. */
2079         char *tmp_s = reginfo.ganch - prog->gofs;
2080
2081         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2082             goto got_it;
2083         goto phooey;
2084     }
2085
2086     /* Messy cases:  unanchored match. */
2087     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2088         /* we have /x+whatever/ */
2089         /* it must be a one character string (XXXX Except UTF?) */
2090         char ch;
2091 #ifdef DEBUGGING
2092         int did_match = 0;
2093 #endif
2094         if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2095             do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2096         ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
2097
2098         if (do_utf8) {
2099             REXEC_FBC_SCAN(
2100                 if (*s == ch) {
2101                     DEBUG_EXECUTE_r( did_match = 1 );
2102                     if (regtry(&reginfo, &s)) goto got_it;
2103                     s += UTF8SKIP(s);
2104                     while (s < strend && *s == ch)
2105                         s += UTF8SKIP(s);
2106                 }
2107             );
2108         }
2109         else {
2110             REXEC_FBC_SCAN(
2111                 if (*s == ch) {
2112                     DEBUG_EXECUTE_r( did_match = 1 );
2113                     if (regtry(&reginfo, &s)) goto got_it;
2114                     s++;
2115                     while (s < strend && *s == ch)
2116                         s++;
2117                 }
2118             );
2119         }
2120         DEBUG_EXECUTE_r(if (!did_match)
2121                 PerlIO_printf(Perl_debug_log,
2122                                   "Did not find anchored character...\n")
2123                );
2124     }
2125     else if (prog->anchored_substr != NULL
2126               || prog->anchored_utf8 != NULL
2127               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2128                   && prog->float_max_offset < strend - s)) {
2129         SV *must;
2130         I32 back_max;
2131         I32 back_min;
2132         char *last;
2133         char *last1;            /* Last position checked before */
2134 #ifdef DEBUGGING
2135         int did_match = 0;
2136 #endif
2137         if (prog->anchored_substr || prog->anchored_utf8) {
2138             if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2139                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2140             must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2141             back_max = back_min = prog->anchored_offset;
2142         } else {
2143             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2144                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2145             must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2146             back_max = prog->float_max_offset;
2147             back_min = prog->float_min_offset;
2148         }
2149
2150
2151         if (must == &PL_sv_undef)
2152             /* could not downgrade utf8 check substring, so must fail */
2153             goto phooey;
2154
2155         if (back_min<0) {
2156             last = strend;
2157         } else {
2158             last = HOP3c(strend,        /* Cannot start after this */
2159                   -(I32)(CHR_SVLEN(must)
2160                          - (SvTAIL(must) != 0) + back_min), strbeg);
2161         }
2162         if (s > PL_bostr)
2163             last1 = HOPc(s, -1);
2164         else
2165             last1 = s - 1;      /* bogus */
2166
2167         /* XXXX check_substr already used to find "s", can optimize if
2168            check_substr==must. */
2169         scream_pos = -1;
2170         dontbother = end_shift;
2171         strend = HOPc(strend, -dontbother);
2172         while ( (s <= last) &&
2173                 ((flags & REXEC_SCREAM)
2174                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2175                                     end_shift, &scream_pos, 0))
2176                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2177                                   (unsigned char*)strend, must,
2178                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2179             /* we may be pointing at the wrong string */
2180             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2181                 s = strbeg + (s - SvPVX_const(sv));
2182             DEBUG_EXECUTE_r( did_match = 1 );
2183             if (HOPc(s, -back_max) > last1) {
2184                 last1 = HOPc(s, -back_min);
2185                 s = HOPc(s, -back_max);
2186             }
2187             else {
2188                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2189
2190                 last1 = HOPc(s, -back_min);
2191                 s = t;
2192             }
2193             if (do_utf8) {
2194                 while (s <= last1) {
2195                     if (regtry(&reginfo, &s))
2196                         goto got_it;
2197                     s += UTF8SKIP(s);
2198                 }
2199             }
2200             else {
2201                 while (s <= last1) {
2202                     if (regtry(&reginfo, &s))
2203                         goto got_it;
2204                     s++;
2205                 }
2206             }
2207         }
2208         DEBUG_EXECUTE_r(if (!did_match) {
2209             RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2210                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2211             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2212                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2213                                ? "anchored" : "floating"),
2214                 quoted, RE_SV_TAIL(must));
2215         });
2216         goto phooey;
2217     }
2218     else if ( (c = progi->regstclass) ) {
2219         if (minlen) {
2220             const OPCODE op = OP(progi->regstclass);
2221             /* don't bother with what can't match */
2222             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2223                 strend = HOPc(strend, -(minlen - 1));
2224         }
2225         DEBUG_EXECUTE_r({
2226             SV * const prop = sv_newmortal();
2227             regprop(prog, prop, c);
2228             {
2229                 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2230                     s,strend-s,60);
2231                 PerlIO_printf(Perl_debug_log,
2232                     "Matching stclass %.*s against %s (%d chars)\n",
2233                     (int)SvCUR(prop), SvPVX_const(prop),
2234                      quoted, (int)(strend - s));
2235             }
2236         });
2237         if (find_byclass(prog, c, s, strend, &reginfo))
2238             goto got_it;
2239         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2240     }
2241     else {
2242         dontbother = 0;
2243         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2244             /* Trim the end. */
2245             char *last;
2246             SV* float_real;
2247
2248             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2249                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2250             float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2251
2252             if (flags & REXEC_SCREAM) {
2253                 last = screaminstr(sv, float_real, s - strbeg,
2254                                    end_shift, &scream_pos, 1); /* last one */
2255                 if (!last)
2256                     last = scream_olds; /* Only one occurrence. */
2257                 /* we may be pointing at the wrong string */
2258                 else if (RXp_MATCH_COPIED(prog))
2259                     s = strbeg + (s - SvPVX_const(sv));
2260             }
2261             else {
2262                 STRLEN len;
2263                 const char * const little = SvPV_const(float_real, len);
2264
2265                 if (SvTAIL(float_real)) {
2266                     if (memEQ(strend - len + 1, little, len - 1))
2267                         last = strend - len + 1;
2268                     else if (!multiline)
2269                         last = memEQ(strend - len, little, len)
2270                             ? strend - len : NULL;
2271                     else
2272                         goto find_last;
2273                 } else {
2274                   find_last:
2275                     if (len)
2276                         last = rninstr(s, strend, little, little + len);
2277                     else
2278                         last = strend;  /* matching "$" */
2279                 }
2280             }
2281             if (last == NULL) {
2282                 DEBUG_EXECUTE_r(
2283                     PerlIO_printf(Perl_debug_log,
2284                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2285                         PL_colors[4], PL_colors[5]));
2286                 goto phooey; /* Should not happen! */
2287             }
2288             dontbother = strend - last + prog->float_min_offset;
2289         }
2290         if (minlen && (dontbother < minlen))
2291             dontbother = minlen - 1;
2292         strend -= dontbother;              /* this one's always in bytes! */
2293         /* We don't know much -- general case. */
2294         if (do_utf8) {
2295             for (;;) {
2296                 if (regtry(&reginfo, &s))
2297                     goto got_it;
2298                 if (s >= strend)
2299                     break;
2300                 s += UTF8SKIP(s);
2301             };
2302         }
2303         else {
2304             do {
2305                 if (regtry(&reginfo, &s))
2306                     goto got_it;
2307             } while (s++ < strend);
2308         }
2309     }
2310
2311     /* Failure. */
2312     goto phooey;
2313
2314 got_it:
2315     Safefree(swap);
2316     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2317
2318     if (PL_reg_eval_set)
2319         restore_pos(aTHX_ prog);
2320     if (RXp_PAREN_NAMES(prog))
2321         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2322
2323     /* make sure $`, $&, $', and $digit will work later */
2324     if ( !(flags & REXEC_NOT_FIRST) ) {
2325         RX_MATCH_COPY_FREE(rx);
2326         if (flags & REXEC_COPY_STR) {
2327             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2328 #ifdef PERL_OLD_COPY_ON_WRITE
2329             if ((SvIsCOW(sv)
2330                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2331                 if (DEBUG_C_TEST) {
2332                     PerlIO_printf(Perl_debug_log,
2333                                   "Copy on write: regexp capture, type %d\n",
2334                                   (int) SvTYPE(sv));
2335                 }
2336                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2337                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2338                 assert (SvPOKp(prog->saved_copy));
2339             } else
2340 #endif
2341             {
2342                 RX_MATCH_COPIED_on(rx);
2343                 s = savepvn(strbeg, i);
2344                 prog->subbeg = s;
2345             }
2346             prog->sublen = i;
2347         }
2348         else {
2349             prog->subbeg = strbeg;
2350             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2351         }
2352     }
2353
2354     return 1;
2355
2356 phooey:
2357     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2358                           PL_colors[4], PL_colors[5]));
2359     if (PL_reg_eval_set)
2360         restore_pos(aTHX_ prog);
2361     if (swap) {
2362         /* we failed :-( roll it back */
2363         Safefree(prog->offs);
2364         prog->offs = swap;
2365     }
2366
2367     return 0;
2368 }
2369
2370
2371 /*
2372  - regtry - try match at specific point
2373  */
2374 STATIC I32                      /* 0 failure, 1 success */
2375 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2376 {
2377     dVAR;
2378     CHECKPOINT lastcp;
2379     REGEXP *const rx = reginfo->prog;
2380     regexp *const prog = (struct regexp *)SvANY(rx);
2381     RXi_GET_DECL(prog,progi);
2382     GET_RE_DEBUG_FLAGS_DECL;
2383
2384     PERL_ARGS_ASSERT_REGTRY;
2385
2386     reginfo->cutpoint=NULL;
2387
2388     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2389         MAGIC *mg;
2390
2391         PL_reg_eval_set = RS_init;
2392         DEBUG_EXECUTE_r(DEBUG_s(
2393             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2394                           (IV)(PL_stack_sp - PL_stack_base));
2395             ));
2396         SAVESTACK_CXPOS();
2397         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2398         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2399         SAVETMPS;
2400         /* Apparently this is not needed, judging by wantarray. */
2401         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2402            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2403
2404         if (reginfo->sv) {
2405             /* Make $_ available to executed code. */
2406             if (reginfo->sv != DEFSV) {
2407                 SAVE_DEFSV;
2408                 DEFSV_set(reginfo->sv);
2409             }
2410
2411             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2412                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2413                 /* prepare for quick setting of pos */
2414 #ifdef PERL_OLD_COPY_ON_WRITE
2415                 if (SvIsCOW(reginfo->sv))
2416                     sv_force_normal_flags(reginfo->sv, 0);
2417 #endif
2418                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2419                                  &PL_vtbl_mglob, NULL, 0);
2420                 mg->mg_len = -1;
2421             }
2422             PL_reg_magic    = mg;
2423             PL_reg_oldpos   = mg->mg_len;
2424             SAVEDESTRUCTOR_X(restore_pos, prog);
2425         }
2426         if (!PL_reg_curpm) {
2427             Newxz(PL_reg_curpm, 1, PMOP);
2428 #ifdef USE_ITHREADS
2429             {
2430                 SV* const repointer = &PL_sv_undef;
2431                 /* this regexp is also owned by the new PL_reg_curpm, which
2432                    will try to free it.  */
2433                 av_push(PL_regex_padav, repointer);
2434                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2435                 PL_regex_pad = AvARRAY(PL_regex_padav);
2436             }
2437 #endif
2438         }
2439 #ifdef USE_ITHREADS
2440         /* It seems that non-ithreads works both with and without this code.
2441            So for efficiency reasons it seems best not to have the code
2442            compiled when it is not needed.  */
2443         /* This is safe against NULLs: */
2444         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2445         /* PM_reg_curpm owns a reference to this regexp.  */
2446         ReREFCNT_inc(rx);
2447 #endif
2448         PM_SETRE(PL_reg_curpm, rx);
2449         PL_reg_oldcurpm = PL_curpm;
2450         PL_curpm = PL_reg_curpm;
2451         if (RXp_MATCH_COPIED(prog)) {
2452             /*  Here is a serious problem: we cannot rewrite subbeg,
2453                 since it may be needed if this match fails.  Thus
2454                 $` inside (?{}) could fail... */
2455             PL_reg_oldsaved = prog->subbeg;
2456             PL_reg_oldsavedlen = prog->sublen;
2457 #ifdef PERL_OLD_COPY_ON_WRITE
2458             PL_nrs = prog->saved_copy;
2459 #endif
2460             RXp_MATCH_COPIED_off(prog);
2461         }
2462         else
2463             PL_reg_oldsaved = NULL;
2464         prog->subbeg = PL_bostr;
2465         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2466     }
2467     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2468     prog->offs[0].start = *startpos - PL_bostr;
2469     PL_reginput = *startpos;
2470     PL_reglastparen = &prog->lastparen;
2471     PL_reglastcloseparen = &prog->lastcloseparen;
2472     prog->lastparen = 0;
2473     prog->lastcloseparen = 0;
2474     PL_regsize = 0;
2475     PL_regoffs = prog->offs;
2476     if (PL_reg_start_tmpl <= prog->nparens) {
2477         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2478         if(PL_reg_start_tmp)
2479             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2480         else
2481             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2482     }
2483
2484     /* XXXX What this code is doing here?!!!  There should be no need
2485        to do this again and again, PL_reglastparen should take care of
2486        this!  --ilya*/
2487
2488     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2489      * Actually, the code in regcppop() (which Ilya may be meaning by
2490      * PL_reglastparen), is not needed at all by the test suite
2491      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2492      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2493      * Meanwhile, this code *is* needed for the
2494      * above-mentioned test suite tests to succeed.  The common theme
2495      * on those tests seems to be returning null fields from matches.
2496      * --jhi updated by dapm */
2497 #if 1
2498     if (prog->nparens) {
2499         regexp_paren_pair *pp = PL_regoffs;
2500         register I32 i;
2501         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2502             ++pp;
2503             pp->start = -1;
2504             pp->end = -1;
2505         }
2506     }
2507 #endif
2508     REGCP_SET(lastcp);
2509     if (regmatch(reginfo, progi->program + 1)) {
2510         PL_regoffs[0].end = PL_reginput - PL_bostr;
2511         return 1;
2512     }
2513     if (reginfo->cutpoint)
2514         *startpos= reginfo->cutpoint;
2515     REGCP_UNWIND(lastcp);
2516     return 0;
2517 }
2518
2519
2520 #define sayYES goto yes
2521 #define sayNO goto no
2522 #define sayNO_SILENT goto no_silent
2523
2524 /* we dont use STMT_START/END here because it leads to
2525    "unreachable code" warnings, which are bogus, but distracting. */
2526 #define CACHEsayNO \
2527     if (ST.cache_mask) \
2528        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2529     sayNO
2530
2531 /* this is used to determine how far from the left messages like
2532    'failed...' are printed. It should be set such that messages
2533    are inline with the regop output that created them.
2534 */
2535 #define REPORT_CODE_OFF 32
2536
2537
2538 /* Make sure there is a test for this +1 options in re_tests */
2539 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2540
2541 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2542 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2543
2544 #define SLAB_FIRST(s) (&(s)->states[0])
2545 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2546
2547 /* grab a new slab and return the first slot in it */
2548
2549 STATIC regmatch_state *
2550 S_push_slab(pTHX)
2551 {
2552 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2553     dMY_CXT;
2554 #endif
2555     regmatch_slab *s = PL_regmatch_slab->next;
2556     if (!s) {
2557         Newx(s, 1, regmatch_slab);
2558         s->prev = PL_regmatch_slab;
2559         s->next = NULL;
2560         PL_regmatch_slab->next = s;
2561     }
2562     PL_regmatch_slab = s;
2563     return SLAB_FIRST(s);
2564 }
2565
2566
2567 /* push a new state then goto it */
2568
2569 #define PUSH_STATE_GOTO(state, node) \
2570     scan = node; \
2571     st->resume_state = state; \
2572     goto push_state;
2573
2574 /* push a new state with success backtracking, then goto it */
2575
2576 #define PUSH_YES_STATE_GOTO(state, node) \
2577     scan = node; \
2578     st->resume_state = state; \
2579     goto push_yes_state;
2580
2581
2582
2583 /*
2584
2585 regmatch() - main matching routine
2586
2587 This is basically one big switch statement in a loop. We execute an op,
2588 set 'next' to point the next op, and continue. If we come to a point which
2589 we may need to backtrack to on failure such as (A|B|C), we push a
2590 backtrack state onto the backtrack stack. On failure, we pop the top
2591 state, and re-enter the loop at the state indicated. If there are no more
2592 states to pop, we return failure.
2593
2594 Sometimes we also need to backtrack on success; for example /A+/, where
2595 after successfully matching one A, we need to go back and try to
2596 match another one; similarly for lookahead assertions: if the assertion
2597 completes successfully, we backtrack to the state just before the assertion
2598 and then carry on.  In these cases, the pushed state is marked as
2599 'backtrack on success too'. This marking is in fact done by a chain of
2600 pointers, each pointing to the previous 'yes' state. On success, we pop to
2601 the nearest yes state, discarding any intermediate failure-only states.
2602 Sometimes a yes state is pushed just to force some cleanup code to be
2603 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2604 it to free the inner regex.
2605
2606 Note that failure backtracking rewinds the cursor position, while
2607 success backtracking leaves it alone.
2608
2609 A pattern is complete when the END op is executed, while a subpattern
2610 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2611 ops trigger the "pop to last yes state if any, otherwise return true"
2612 behaviour.
2613
2614 A common convention in this function is to use A and B to refer to the two
2615 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2616 the subpattern to be matched possibly multiple times, while B is the entire
2617 rest of the pattern. Variable and state names reflect this convention.
2618
2619 The states in the main switch are the union of ops and failure/success of
2620 substates associated with with that op.  For example, IFMATCH is the op
2621 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2622 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2623 successfully matched A and IFMATCH_A_fail is a state saying that we have
2624 just failed to match A. Resume states always come in pairs. The backtrack
2625 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2626 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2627 on success or failure.
2628
2629 The struct that holds a backtracking state is actually a big union, with
2630 one variant for each major type of op. The variable st points to the
2631 top-most backtrack struct. To make the code clearer, within each
2632 block of code we #define ST to alias the relevant union.
2633
2634 Here's a concrete example of a (vastly oversimplified) IFMATCH
2635 implementation:
2636
2637     switch (state) {
2638     ....
2639
2640 #define ST st->u.ifmatch
2641
2642     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2643         ST.foo = ...; // some state we wish to save
2644         ...
2645         // push a yes backtrack state with a resume value of
2646         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2647         // first node of A:
2648         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2649         // NOTREACHED
2650
2651     case IFMATCH_A: // we have successfully executed A; now continue with B
2652         next = B;
2653         bar = ST.foo; // do something with the preserved value
2654         break;
2655
2656     case IFMATCH_A_fail: // A failed, so the assertion failed
2657         ...;   // do some housekeeping, then ...
2658         sayNO; // propagate the failure
2659
2660 #undef ST
2661
2662     ...
2663     }
2664
2665 For any old-timers reading this who are familiar with the old recursive
2666 approach, the code above is equivalent to:
2667
2668     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2669     {
2670         int foo = ...
2671         ...
2672         if (regmatch(A)) {
2673             next = B;
2674             bar = foo;
2675             break;
2676         }
2677         ...;   // do some housekeeping, then ...
2678         sayNO; // propagate the failure
2679     }
2680
2681 The topmost backtrack state, pointed to by st, is usually free. If you
2682 want to claim it, populate any ST.foo fields in it with values you wish to
2683 save, then do one of
2684
2685         PUSH_STATE_GOTO(resume_state, node);
2686         PUSH_YES_STATE_GOTO(resume_state, node);
2687
2688 which sets that backtrack state's resume value to 'resume_state', pushes a
2689 new free entry to the top of the backtrack stack, then goes to 'node'.
2690 On backtracking, the free slot is popped, and the saved state becomes the
2691 new free state. An ST.foo field in this new top state can be temporarily
2692 accessed to retrieve values, but once the main loop is re-entered, it
2693 becomes available for reuse.
2694
2695 Note that the depth of the backtrack stack constantly increases during the
2696 left-to-right execution of the pattern, rather than going up and down with
2697 the pattern nesting. For example the stack is at its maximum at Z at the
2698 end of the pattern, rather than at X in the following:
2699
2700     /(((X)+)+)+....(Y)+....Z/
2701
2702 The only exceptions to this are lookahead/behind assertions and the cut,
2703 (?>A), which pop all the backtrack states associated with A before
2704 continuing.
2705
2706 Bascktrack state structs are allocated in slabs of about 4K in size.
2707 PL_regmatch_state and st always point to the currently active state,
2708 and PL_regmatch_slab points to the slab currently containing
2709 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2710 allocated, and is never freed until interpreter destruction. When the slab
2711 is full, a new one is allocated and chained to the end. At exit from
2712 regmatch(), slabs allocated since entry are freed.
2713
2714 */
2715
2716
2717 #define DEBUG_STATE_pp(pp)                                  \
2718     DEBUG_STATE_r({                                         \
2719         DUMP_EXEC_POS(locinput, scan, do_utf8);             \
2720         PerlIO_printf(Perl_debug_log,                       \
2721             "    %*s"pp" %s%s%s%s%s\n",                     \
2722             depth*2, "",                                    \
2723             PL_reg_name[st->resume_state],                     \
2724             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2725             ((st==yes_state) ? "Y" : ""),                   \
2726             ((st==mark_state) ? "M" : ""),                  \
2727             ((st==yes_state||st==mark_state) ? "]" : "")    \
2728         );                                                  \
2729     });
2730
2731
2732 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2733
2734 #ifdef DEBUGGING
2735
2736 STATIC void
2737 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
2738     const char *start, const char *end, const char *blurb)
2739 {
2740     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2741
2742     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2743
2744     if (!PL_colorset)
2745             reginitcolors();
2746     {
2747         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2748             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2749
2750         RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2751             start, end - start, 60);
2752
2753         PerlIO_printf(Perl_debug_log,
2754             "%s%s REx%s %s against %s\n",
2755                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2756
2757         if (do_utf8||utf8_pat)
2758             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2759                 utf8_pat ? "pattern" : "",
2760                 utf8_pat && do_utf8 ? " and " : "",
2761                 do_utf8 ? "string" : ""
2762             );
2763     }
2764 }
2765
2766 STATIC void
2767 S_dump_exec_pos(pTHX_ const char *locinput,
2768                       const regnode *scan,
2769                       const char *loc_regeol,
2770                       const char *loc_bostr,
2771                       const char *loc_reg_starttry,
2772                       const bool do_utf8)
2773 {
2774     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2775     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2776     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2777     /* The part of the string before starttry has one color
2778        (pref0_len chars), between starttry and current
2779        position another one (pref_len - pref0_len chars),
2780        after the current position the third one.
2781        We assume that pref0_len <= pref_len, otherwise we
2782        decrease pref0_len.  */
2783     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2784         ? (5 + taill) - l : locinput - loc_bostr;
2785     int pref0_len;
2786
2787     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2788
2789     while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2790         pref_len++;
2791     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2792     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2793         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2794               ? (5 + taill) - pref_len : loc_regeol - locinput);
2795     while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2796         l--;
2797     if (pref0_len < 0)
2798         pref0_len = 0;
2799     if (pref0_len > pref_len)
2800         pref0_len = pref_len;
2801     {
2802         const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2803
2804         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2805             (locinput - pref_len),pref0_len, 60, 4, 5);
2806
2807         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2808                     (locinput - pref_len + pref0_len),
2809                     pref_len - pref0_len, 60, 2, 3);
2810
2811         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2812                     locinput, loc_regeol - locinput, 10, 0, 1);
2813
2814         const STRLEN tlen=len0+len1+len2;
2815         PerlIO_printf(Perl_debug_log,
2816                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2817                     (IV)(locinput - loc_bostr),
2818                     len0, s0,
2819                     len1, s1,
2820                     (docolor ? "" : "> <"),
2821                     len2, s2,
2822                     (int)(tlen > 19 ? 0 :  19 - tlen),
2823                     "");
2824     }
2825 }
2826
2827 #endif
2828
2829 /* reg_check_named_buff_matched()
2830  * Checks to see if a named buffer has matched. The data array of
2831  * buffer numbers corresponding to the buffer is expected to reside
2832  * in the regexp->data->data array in the slot stored in the ARG() of
2833  * node involved. Note that this routine doesn't actually care about the
2834  * name, that information is not preserved from compilation to execution.
2835  * Returns the index of the leftmost defined buffer with the given name
2836  * or 0 if non of the buffers matched.
2837  */
2838 STATIC I32
2839 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2840 {
2841     I32 n;
2842     RXi_GET_DECL(rex,rexi);
2843     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2844     I32 *nums=(I32*)SvPVX(sv_dat);
2845
2846     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2847
2848     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2849         if ((I32)*PL_reglastparen >= nums[n] &&
2850             PL_regoffs[nums[n]].end != -1)
2851         {
2852             return nums[n];
2853         }
2854     }
2855     return 0;
2856 }
2857
2858
2859 /* free all slabs above current one  - called during LEAVE_SCOPE */
2860
2861 STATIC void
2862 S_clear_backtrack_stack(pTHX_ void *p)
2863 {
2864     regmatch_slab *s = PL_regmatch_slab->next;
2865     PERL_UNUSED_ARG(p);
2866
2867     if (!s)
2868         return;
2869     PL_regmatch_slab->next = NULL;
2870     while (s) {
2871         regmatch_slab * const osl = s;
2872         s = s->next;
2873         Safefree(osl);
2874     }
2875 }
2876
2877
2878 #define SETREX(Re1,Re2) \
2879     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2880     Re1 = (Re2)
2881
2882 STATIC I32                      /* 0 failure, 1 success */
2883 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2884 {
2885 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2886     dMY_CXT;
2887 #endif
2888     dVAR;
2889     register const bool do_utf8 = PL_reg_match_utf8;
2890     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2891     REGEXP *rex_sv = reginfo->prog;
2892     regexp *rex = (struct regexp *)SvANY(rex_sv);
2893     RXi_GET_DECL(rex,rexi);
2894     I32 oldsave;
2895     /* the current state. This is a cached copy of PL_regmatch_state */
2896     register regmatch_state *st;
2897     /* cache heavy used fields of st in registers */
2898     register regnode *scan;
2899     register regnode *next;
2900     register U32 n = 0; /* general value; init to avoid compiler warning */
2901     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2902     register char *locinput = PL_reginput;
2903     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2904
2905     bool result = 0;        /* return value of S_regmatch */
2906     int depth = 0;          /* depth of backtrack stack */
2907     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2908     const U32 max_nochange_depth =
2909         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2910         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2911     regmatch_state *yes_state = NULL; /* state to pop to on success of
2912                                                             subpattern */
2913     /* mark_state piggy backs on the yes_state logic so that when we unwind
2914        the stack on success we can update the mark_state as we go */
2915     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2916     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2917     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2918     U32 state_num;
2919     bool no_final = 0;      /* prevent failure from backtracking? */
2920     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
2921     char *startpoint = PL_reginput;
2922     SV *popmark = NULL;     /* are we looking for a mark? */
2923     SV *sv_commit = NULL;   /* last mark name seen in failure */
2924     SV *sv_yes_mark = NULL; /* last mark name we have seen
2925                                during a successfull match */
2926     U32 lastopen = 0;       /* last open we saw */
2927     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2928     SV* const oreplsv = GvSV(PL_replgv);
2929     /* these three flags are set by various ops to signal information to
2930      * the very next op. They have a useful lifetime of exactly one loop
2931      * iteration, and are not preserved or restored by state pushes/pops
2932      */
2933     bool sw = 0;            /* the condition value in (?(cond)a|b) */
2934     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
2935     int logical = 0;        /* the following EVAL is:
2936                                 0: (?{...})
2937                                 1: (?(?{...})X|Y)
2938                                 2: (??{...})
2939                                or the following IFMATCH/UNLESSM is:
2940                                 false: plain (?=foo)
2941                                 true:  used as a condition: (?(?=foo))
2942                             */
2943 #ifdef DEBUGGING
2944     GET_RE_DEBUG_FLAGS_DECL;
2945 #endif
2946
2947     PERL_ARGS_ASSERT_REGMATCH;
2948
2949     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2950             PerlIO_printf(Perl_debug_log,"regmatch start\n");
2951     }));
2952     /* on first ever call to regmatch, allocate first slab */
2953     if (!PL_regmatch_slab) {
2954         Newx(PL_regmatch_slab, 1, regmatch_slab);
2955         PL_regmatch_slab->prev = NULL;
2956         PL_regmatch_slab->next = NULL;
2957         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2958     }
2959
2960     oldsave = PL_savestack_ix;
2961     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2962     SAVEVPTR(PL_regmatch_slab);
2963     SAVEVPTR(PL_regmatch_state);
2964
2965     /* grab next free state slot */
2966     st = ++PL_regmatch_state;
2967     if (st >  SLAB_LAST(PL_regmatch_slab))
2968         st = PL_regmatch_state = S_push_slab(aTHX);
2969
2970     /* Note that nextchr is a byte even in UTF */
2971     nextchr = UCHARAT(locinput);
2972     scan = prog;
2973     while (scan != NULL) {
2974
2975         DEBUG_EXECUTE_r( {
2976             SV * const prop = sv_newmortal();
2977             regnode *rnext=regnext(scan);
2978             DUMP_EXEC_POS( locinput, scan, do_utf8 );
2979             regprop(rex, prop, scan);
2980
2981             PerlIO_printf(Perl_debug_log,
2982                     "%3"IVdf":%*s%s(%"IVdf")\n",
2983                     (IV)(scan - rexi->program), depth*2, "",
2984                     SvPVX_const(prop),
2985                     (PL_regkind[OP(scan)] == END || !rnext) ?
2986                         0 : (IV)(rnext - rexi->program));
2987         });
2988
2989         next = scan + NEXT_OFF(scan);
2990         if (next == scan)
2991             next = NULL;
2992         state_num = OP(scan);
2993
2994       reenter_switch:
2995
2996         assert(PL_reglastparen == &rex->lastparen);
2997         assert(PL_reglastcloseparen == &rex->lastcloseparen);
2998         assert(PL_regoffs == rex->offs);
2999
3000         switch (state_num) {
3001         case BOL:
3002             if (locinput == PL_bostr)
3003             {
3004                 /* reginfo->till = reginfo->bol; */
3005                 break;
3006             }
3007             sayNO;
3008         case MBOL:
3009             if (locinput == PL_bostr ||
3010                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3011             {
3012                 break;
3013             }
3014             sayNO;
3015         case SBOL:
3016             if (locinput == PL_bostr)
3017                 break;
3018             sayNO;
3019         case GPOS:
3020             if (locinput == reginfo->ganch)
3021                 break;
3022             sayNO;
3023
3024         case KEEPS:
3025             /* update the startpoint */
3026             st->u.keeper.val = PL_regoffs[0].start;
3027             PL_reginput = locinput;
3028             PL_regoffs[0].start = locinput - PL_bostr;
3029             PUSH_STATE_GOTO(KEEPS_next, next);
3030             /*NOT-REACHED*/
3031         case KEEPS_next_fail:
3032             /* rollback the start point change */
3033             PL_regoffs[0].start = st->u.keeper.val;
3034             sayNO_SILENT;
3035             /*NOT-REACHED*/
3036         case EOL:
3037                 goto seol;
3038         case MEOL:
3039             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3040                 sayNO;
3041             break;
3042         case SEOL:
3043           seol:
3044             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3045                 sayNO;
3046             if (PL_regeol - locinput > 1)
3047                 sayNO;
3048             break;
3049         case EOS:
3050             if (PL_regeol != locinput)
3051                 sayNO;
3052             break;
3053         case SANY:
3054             if (!nextchr && locinput >= PL_regeol)
3055                 sayNO;
3056             if (do_utf8) {
3057                 locinput += PL_utf8skip[nextchr];
3058                 if (locinput > PL_regeol)
3059                     sayNO;
3060                 nextchr = UCHARAT(locinput);
3061             }
3062             else
3063                 nextchr = UCHARAT(++locinput);
3064             break;
3065         case CANY:
3066             if (!nextchr && locinput >= PL_regeol)
3067                 sayNO;
3068             nextchr = UCHARAT(++locinput);
3069             break;
3070         case REG_ANY:
3071             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3072                 sayNO;
3073             if (do_utf8) {
3074                 locinput += PL_utf8skip[nextchr];
3075                 if (locinput > PL_regeol)
3076                     sayNO;
3077                 nextchr = UCHARAT(locinput);
3078             }
3079             else
3080                 nextchr = UCHARAT(++locinput);
3081             break;
3082
3083 #undef  ST
3084 #define ST st->u.trie
3085         case TRIEC:
3086             /* In this case the charclass data is available inline so
3087                we can fail fast without a lot of extra overhead.
3088              */
3089             if (scan->flags == EXACT || !do_utf8) {
3090                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3091                     DEBUG_EXECUTE_r(
3092                         PerlIO_printf(Perl_debug_log,
3093                                   "%*s  %sfailed to match trie start class...%s\n",
3094                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3095                     );
3096                     sayNO_SILENT;
3097                     /* NOTREACHED */
3098                 }
3099             }
3100             /* FALL THROUGH */
3101         case TRIE:
3102             {
3103                 /* what type of TRIE am I? (utf8 makes this contextual) */
3104                 DECL_TRIE_TYPE(scan);
3105
3106                 /* what trie are we using right now */
3107                 reg_trie_data * const trie
3108                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3109                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3110                 U32 state = trie->startstate;
3111
3112                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3113                     !TRIE_BITMAP_TEST(trie,*locinput)
3114                 ) {
3115                     if (trie->states[ state ].wordnum) {
3116                          DEBUG_EXECUTE_r(
3117                             PerlIO_printf(Perl_debug_log,
3118                                           "%*s  %smatched empty string...%s\n",
3119                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3120                         );
3121                         break;
3122                     } else {
3123                         DEBUG_EXECUTE_r(
3124                             PerlIO_printf(Perl_debug_log,
3125                                           "%*s  %sfailed to match trie start class...%s\n",
3126                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3127                         );
3128                         sayNO_SILENT;
3129                    }
3130                 }
3131
3132             {
3133                 U8 *uc = ( U8* )locinput;
3134
3135                 STRLEN len = 0;
3136                 STRLEN foldlen = 0;
3137                 U8 *uscan = (U8*)NULL;
3138                 STRLEN bufflen=0;
3139                 SV *sv_accept_buff = NULL;
3140                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3141
3142                 ST.accepted = 0; /* how many accepting states we have seen */
3143                 ST.B = next;
3144                 ST.jump = trie->jump;
3145                 ST.me = scan;
3146                 /*
3147                    traverse the TRIE keeping track of all accepting states
3148                    we transition through until we get to a failing node.
3149                 */
3150
3151                 while ( state && uc <= (U8*)PL_regeol ) {
3152                     U32 base = trie->states[ state ].trans.base;
3153                     UV uvc = 0;
3154                     U16 charid;
3155                     /* We use charid to hold the wordnum as we don't use it
3156                        for charid until after we have done the wordnum logic.
3157                        We define an alias just so that the wordnum logic reads
3158                        more naturally. */
3159
3160 #define got_wordnum charid
3161                     got_wordnum = trie->states[ state ].wordnum;
3162
3163                     if ( got_wordnum ) {
3164                         if ( ! ST.accepted ) {
3165                             ENTER;
3166                             SAVETMPS; /* XXX is this necessary? dmq */
3167                             bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3168                             sv_accept_buff=newSV(bufflen *
3169                                             sizeof(reg_trie_accepted) - 1);
3170                             SvCUR_set(sv_accept_buff, 0);
3171                             SvPOK_on(sv_accept_buff);
3172                             sv_2mortal(sv_accept_buff);
3173                             SAVETMPS;
3174                             ST.accept_buff =
3175                                 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3176                         }
3177                         do {
3178                             if (ST.accepted >= bufflen) {
3179                                 bufflen *= 2;
3180                                 ST.accept_buff =(reg_trie_accepted*)
3181                                     SvGROW(sv_accept_buff,
3182                                         bufflen * sizeof(reg_trie_accepted));
3183                             }
3184                             SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3185                                 + sizeof(reg_trie_accepted));
3186
3187
3188                             ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3189                             ST.accept_buff[ST.accepted].endpos = uc;
3190                             ++ST.accepted;
3191                         } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3192                     }
3193 #undef got_wordnum
3194
3195                     DEBUG_TRIE_EXECUTE_r({
3196                                 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
3197                                 PerlIO_printf( Perl_debug_log,
3198                                     "%*s  %sState: %4"UVxf" Accepted: %4"UVxf" ",
3199                                     2+depth * 2, "", PL_colors[4],
3200                                     (UV)state, (UV)ST.accepted );
3201                     });
3202
3203                     if ( base ) {
3204                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3205                                              uscan, len, uvc, charid, foldlen,
3206                                              foldbuf, uniflags);
3207
3208                         if (charid &&
3209                              (base + charid > trie->uniquecharcount )
3210                              && (base + charid - 1 - trie->uniquecharcount
3211                                     < trie->lasttrans)
3212                              && trie->trans[base + charid - 1 -
3213                                     trie->uniquecharcount].check == state)
3214                         {
3215                             state = trie->trans[base + charid - 1 -
3216                                 trie->uniquecharcount ].next;
3217                         }
3218                         else {
3219                             state = 0;
3220                         }
3221                         uc += len;
3222
3223                     }
3224                     else {
3225                         state = 0;
3226                     }
3227                     DEBUG_TRIE_EXECUTE_r(
3228                         PerlIO_printf( Perl_debug_log,
3229                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3230                             charid, uvc, (UV)state, PL_colors[5] );
3231                     );
3232                 }
3233                 if (!ST.accepted )
3234                    sayNO;
3235
3236                 DEBUG_EXECUTE_r(
3237                     PerlIO_printf( Perl_debug_log,
3238                         "%*s  %sgot %"IVdf" possible matches%s\n",
3239                         REPORT_CODE_OFF + depth * 2, "",
3240                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3241                 );
3242             }}
3243             goto trie_first_try; /* jump into the fail handler */
3244             /* NOTREACHED */
3245         case TRIE_next_fail: /* we failed - try next alterative */
3246             if ( ST.jump) {
3247                 REGCP_UNWIND(ST.cp);
3248                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3249                     PL_regoffs[n].end = -1;
3250                 *PL_reglastparen = n;
3251             }
3252           trie_first_try:
3253             if (do_cutgroup) {
3254                 do_cutgroup = 0;
3255                 no_final = 0;
3256             }
3257
3258             if ( ST.jump) {
3259                 ST.lastparen = *PL_reglastparen;
3260                 REGCP_SET(ST.cp);
3261             }
3262             if ( ST.accepted == 1 ) {
3263                 /* only one choice left - just continue */
3264                 DEBUG_EXECUTE_r({
3265                     AV *const trie_words
3266                         = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3267                     SV ** const tmp = av_fetch( trie_words,
3268                         ST.accept_buff[ 0 ].wordnum-1, 0 );
3269                     SV *sv= tmp ? sv_newmortal() : NULL;
3270
3271                     PerlIO_printf( Perl_debug_log,
3272                         "%*s  %sonly one match left: #%d <%s>%s\n",
3273                         REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3274                         ST.accept_buff[ 0 ].wordnum,
3275                         tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3276                                 PL_colors[0], PL_colors[1],
3277                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3278                             )
3279                         : "not compiled under -Dr",
3280                         PL_colors[5] );
3281                 });
3282                 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3283                 /* in this case we free tmps/leave before we call regmatch
3284                    as we wont be using accept_buff again. */
3285
3286                 locinput = PL_reginput;
3287                 nextchr = UCHARAT(locinput);
3288                 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3289                     scan = ST.B;
3290                 else
3291                     scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3292                 if (!has_cutgroup) {
3293                     FREETMPS;
3294                     LEAVE;
3295                 } else {
3296                     ST.accepted--;
3297                     PUSH_YES_STATE_GOTO(TRIE_next, scan);
3298                 }
3299
3300                 continue; /* execute rest of RE */
3301             }
3302
3303             if ( !ST.accepted-- ) {
3304                 DEBUG_EXECUTE_r({
3305                     PerlIO_printf( Perl_debug_log,
3306                         "%*s  %sTRIE failed...%s\n",
3307                         REPORT_CODE_OFF+depth*2, "",
3308                         PL_colors[4],
3309                         PL_colors[5] );
3310                 });
3311                 FREETMPS;
3312                 LEAVE;
3313                 sayNO_SILENT;
3314                 /*NOTREACHED*/
3315             }
3316
3317             /*
3318                There are at least two accepting states left.  Presumably
3319                the number of accepting states is going to be low,
3320                typically two. So we simply scan through to find the one
3321                with lowest wordnum.  Once we find it, we swap the last
3322                state into its place and decrement the size. We then try to
3323                match the rest of the pattern at the point where the word
3324                ends. If we succeed, control just continues along the
3325                regex; if we fail we return here to try the next accepting
3326                state
3327              */
3328
3329             {
3330                 U32 best = 0;
3331                 U32 cur;
3332                 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3333                     DEBUG_TRIE_EXECUTE_r(
3334                         PerlIO_printf( Perl_debug_log,
3335                             "%*s  %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3336                             REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3337                             (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3338                             ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3339                     );
3340
3341                     if (ST.accept_buff[cur].wordnum <
3342                             ST.accept_buff[best].wordnum)
3343                         best = cur;
3344                 }
3345
3346                 DEBUG_EXECUTE_r({
3347                     AV *const trie_words
3348                         = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3349                     SV ** const tmp = av_fetch( trie_words,
3350                         ST.accept_buff[ best ].wordnum - 1, 0 );
3351                     regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3352                                     ST.B :
3353                                     ST.me + ST.jump[ST.accept_buff[best].wordnum];
3354                     SV *sv= tmp ? sv_newmortal() : NULL;
3355
3356                     PerlIO_printf( Perl_debug_log,
3357                         "%*s  %strying alternation #%d <%s> at node #%d %s\n",
3358                         REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3359                         ST.accept_buff[best].wordnum,
3360                         tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3361                                 PL_colors[0], PL_colors[1],
3362                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3363                             ) : "not compiled under -Dr",
3364                             REG_NODE_NUM(nextop),
3365                         PL_colors[5] );
3366                 });
3367
3368                 if ( best<ST.accepted ) {
3369                     reg_trie_accepted tmp = ST.accept_buff[ best ];
3370                     ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3371                     ST.accept_buff[ ST.accepted ] = tmp;
3372                     best = ST.accepted;
3373                 }
3374                 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3375                 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3376                     scan = ST.B;
3377                 } else {
3378                     scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3379                 }
3380                 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3381                 /* NOTREACHED */
3382             }
3383             /* NOTREACHED */
3384         case TRIE_next:
3385             /* we dont want to throw this away, see bug 57042*/
3386             if (oreplsv != GvSV(PL_replgv))
3387                 sv_setsv(oreplsv, GvSV(PL_replgv));
3388             FREETMPS;
3389             LEAVE;
3390             sayYES;
3391 #undef  ST
3392
3393         case EXACT: {
3394             char *s = STRING(scan);
3395             ln = STR_LEN(scan);
3396             if (do_utf8 != UTF) {
3397                 /* The target and the pattern have differing utf8ness. */
3398                 char *l = locinput;
3399                 const char * const e = s + ln;
3400
3401                 if (do_utf8) {
3402                     /* The target is utf8, the pattern is not utf8. */
3403                     while (s < e) {
3404                         STRLEN ulen;
3405                         if (l >= PL_regeol)
3406                              sayNO;
3407                         if (NATIVE_TO_UNI(*(U8*)s) !=
3408                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3409                                             uniflags))
3410                              sayNO;
3411                         l += ulen;
3412                         s ++;
3413                     }
3414                 }
3415                 else {
3416                     /* The target is not utf8, the pattern is utf8. */
3417                     while (s < e) {
3418                         STRLEN ulen;
3419                         if (l >= PL_regeol)
3420                             sayNO;
3421                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3422                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3423                                            uniflags))
3424                             sayNO;
3425                         s += ulen;
3426                         l ++;
3427                     }
3428                 }
3429                 locinput = l;
3430                 nextchr = UCHARAT(locinput);
3431                 break;
3432             }
3433             /* The target and the pattern have the same utf8ness. */
3434             /* Inline the first character, for speed. */
3435             if (UCHARAT(s) != nextchr)
3436                 sayNO;
3437             if (PL_regeol - locinput < ln)
3438                 sayNO;
3439             if (ln > 1 && memNE(s, locinput, ln))
3440                 sayNO;
3441             locinput += ln;
3442             nextchr = UCHARAT(locinput);
3443             break;
3444             }
3445         case EXACTFL:
3446             PL_reg_flags |= RF_tainted;
3447             /* FALL THROUGH */
3448         case EXACTF: {
3449             char * const s = STRING(scan);
3450             ln = STR_LEN(scan);
3451
3452             if (do_utf8 || UTF) {
3453               /* Either target or the pattern are utf8. */
3454                 const char * const l = locinput;
3455                 char *e = PL_regeol;
3456
3457                 if (ibcmp_utf8(s, 0,  ln, (bool)UTF,
3458                                l, &e, 0,  do_utf8)) {
3459                      /* One more case for the sharp s:
3460                       * pack("U0U*", 0xDF) =~ /ss/i,
3461                       * the 0xC3 0x9F are the UTF-8
3462                       * byte sequence for the U+00DF. */
3463
3464                      if (!(do_utf8 &&
3465                            toLOWER(s[0]) == 's' &&
3466                            ln >= 2 &&
3467                            toLOWER(s[1]) == 's' &&
3468                            (U8)l[0] == 0xC3 &&
3469                            e - l >= 2 &&
3470                            (U8)l[1] == 0x9F))
3471                           sayNO;
3472                 }
3473                 locinput = e;
3474                 nextchr = UCHARAT(locinput);
3475                 break;
3476             }
3477
3478             /* Neither the target and the pattern are utf8. */
3479
3480             /* Inline the first character, for speed. */
3481             if (UCHARAT(s) != nextchr &&
3482                 UCHARAT(s) != ((OP(scan) == EXACTF)
3483                                ? PL_fold : PL_fold_locale)[nextchr])
3484                 sayNO;
3485             if (PL_regeol - locinput < ln)
3486                 sayNO;
3487             if (ln > 1 && (OP(scan) == EXACTF
3488                            ? ibcmp(s, locinput, ln)
3489                            : ibcmp_locale(s, locinput, ln)))
3490                 sayNO;
3491             locinput += ln;
3492             nextchr = UCHARAT(locinput);
3493             break;
3494             }
3495         case BOUNDL:
3496         case NBOUNDL:
3497             PL_reg_flags |= RF_tainted;
3498             /* FALL THROUGH */
3499         case BOUND:
3500         case NBOUND:
3501             /* was last char in word? */
3502             if (do_utf8) {
3503                 if (locinput == PL_bostr)
3504                     ln = '\n';
3505                 else {
3506                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3507
3508                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3509                 }
3510                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3511                     ln = isALNUM_uni(ln);
3512                     LOAD_UTF8_CHARCLASS_ALNUM();
3513                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3514                 }
3515                 else {
3516                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3517                     n = isALNUM_LC_utf8((U8*)locinput);
3518                 }
3519             }
3520             else {
3521                 ln = (locinput != PL_bostr) ?
3522                     UCHARAT(locinput - 1) : '\n';
3523                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3524                     ln = isALNUM(ln);
3525                     n = isALNUM(nextchr);
3526                 }
3527                 else {
3528                     ln = isALNUM_LC(ln);
3529                     n = isALNUM_LC(nextchr);
3530                 }
3531             }
3532             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3533                                     OP(scan) == BOUNDL))
3534                     sayNO;
3535             break;
3536         case ANYOF:
3537             if (do_utf8) {
3538                 STRLEN inclasslen = PL_regeol - locinput;
3539
3540                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3541                     goto anyof_fail;
3542                 if (locinput >= PL_regeol)
3543                     sayNO;
3544                 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3545                 nextchr = UCHARAT(locinput);
3546                 break;
3547             }
3548             else {
3549                 if (nextchr < 0)
3550                     nextchr = UCHARAT(locinput);
3551                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3552                     goto anyof_fail;
3553                 if (!nextchr && locinput >= PL_regeol)
3554                     sayNO;
3555                 nextchr = UCHARAT(++locinput);
3556                 break;
3557             }
3558         anyof_fail:
3559             /* If we might have the case of the German sharp s
3560              * in a casefolding Unicode character class. */
3561
3562             if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3563                  locinput += SHARP_S_SKIP;
3564                  nextchr = UCHARAT(locinput);
3565             }
3566             else
3567                  sayNO;
3568             break;
3569         /* Special char classes - The defines start on line 129 or so */
3570         CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3571         CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3572
3573         CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3574         CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3575
3576         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3577         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3578
3579         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3580                        a Unicode extended Grapheme Cluster */
3581             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3582               extended Grapheme Cluster is:
3583
3584                CR LF
3585                | Prepend* Begin Extend*
3586                | .
3587
3588                Begin is (Hangul-syllable | ! Control)
3589                Extend is (Grapheme_Extend | Spacing_Mark)
3590                Control is [ GCB_Control CR LF ]
3591
3592                The discussion below shows how the code for CLUMP is derived
3593                from this regex.  Note that most of these concepts are from
3594                property values of the Grapheme Cluster Boundary (GCB) property.
3595                No code point can have multiple property values for a given
3596                property.  Thus a code point in Prepend can't be in Control, but
3597                it must be in !Control.  This is why Control above includes
3598                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3599                property separately, and so can't be in GCB_Control, even though
3600                they logically are controls.  Control is not the same as gc=cc,
3601                but includes format and other characters as well.
3602
3603                The Unicode definition of Hangul-syllable is:
3604                    L+
3605                    | (L* ( ( V | LV ) V* | LVT ) T*)
3606                    | T+
3607                   )
3608                Each of these is a value for the GCB property, and hence must be
3609                disjoint, so the order they are tested is immaterial, so the
3610                above can safely be changed to
3611                    T+
3612                    | L+
3613                    | (L* ( LVT | ( V | LV ) V*) T*)
3614
3615                The last two terms can be combined like this:
3616                    L* ( L
3617                         | (( LVT | ( V | LV ) V*) T*))
3618
3619                And refactored into this:
3620                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3621
3622                That means that if we have seen any L's at all we can quit
3623                there, but if the next character is a LVT, a V or and LV we
3624                should keep going.
3625
3626                There is a subtlety with Prepend* which showed up in testing.
3627                Note that the Begin, and only the Begin is required in:
3628                 | Prepend* Begin Extend*
3629                Also, Begin contains '! Control'.  A Prepend must be a '!
3630                Control', which means it must be a Begin.  What it comes down to
3631                is that if we match Prepend* and then find no suitable Begin
3632                afterwards, that if we backtrack the last Prepend, that one will
3633                be a suitable Begin.
3634             */
3635
3636             if (locinput >= PL_regeol)
3637                 sayNO;
3638             if  (! do_utf8) {
3639
3640                 /* Match either CR LF  or '.', as all the other possibilities
3641                  * require utf8 */
3642                 locinput++;         /* Match the . or CR */
3643                 if (nextchr == '\r'
3644                     && locinput < PL_regeol
3645                     && UCHARAT(locinput) == '\n') locinput++;
3646             }
3647             else {
3648
3649                 /* Utf8: See if is ( CR LF ); already know that locinput <
3650                  * PL_regeol, so locinput+1 is in bounds */
3651                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3652                     locinput += 2;
3653                 }
3654                 else {
3655                     /* In case have to backtrack to beginning, then match '.' */
3656                     char *starting = locinput;
3657
3658                     /* In case have to backtrack the last prepend */
3659                     char *previous_prepend = 0;
3660
3661                     LOAD_UTF8_CHARCLASS_GCB();
3662
3663                     /* Match (prepend)* */
3664                     while (locinput < PL_regeol
3665                            && swash_fetch(PL_utf8_X_prepend,
3666                                           (U8*)locinput, do_utf8))
3667                     {
3668                         previous_prepend = locinput;
3669                         locinput += UTF8SKIP(locinput);
3670                     }
3671
3672                     /* As noted above, if we matched a prepend character, but
3673                      * the next thing won't match, back off the last prepend we
3674                      * matched, as it is guaranteed to match the begin */
3675                     if (previous_prepend
3676                         && (locinput >=  PL_regeol
3677                             || ! swash_fetch(PL_utf8_X_begin,
3678                                              (U8*)locinput, do_utf8)))
3679                     {
3680                         locinput = previous_prepend;
3681                     }
3682
3683                     /* Note that here we know PL_regeol > locinput, as we
3684                      * tested that upon input to this switch case, and if we
3685                      * moved locinput forward, we tested the result just above
3686                      * and it either passed, or we backed off so that it will
3687                      * now pass */
3688                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) {
3689
3690                         /* Here did not match the required 'Begin' in the
3691                          * second term.  So just match the very first
3692                          * character, the '.' of the final term of the regex */
3693                         locinput = starting + UTF8SKIP(starting);
3694                     } else {
3695
3696                         /* Here is the beginning of a character that can have
3697                          * an extender.  It is either a hangul syllable, or a
3698                          * non-control */
3699                         if (swash_fetch(PL_utf8_X_non_hangul,
3700                                         (U8*)locinput, do_utf8))
3701                         {
3702
3703                             /* Here not a Hangul syllable, must be a
3704                              * ('!  * Control') */
3705                             locinput += UTF8SKIP(locinput);
3706                         } else {
3707
3708                             /* Here is a Hangul syllable.  It can be composed
3709                              * of several individual characters.  One
3710                              * possibility is T+ */
3711                             if (swash_fetch(PL_utf8_X_T,
3712                                             (U8*)locinput, do_utf8))
3713                             {
3714                                 while (locinput < PL_regeol
3715                                         && swash_fetch(PL_utf8_X_T,
3716                                                         (U8*)locinput, do_utf8))
3717                                 {
3718                                     locinput += UTF8SKIP(locinput);
3719                                 }
3720                             } else {
3721
3722                                 /* Here, not T+, but is a Hangul.  That means
3723                                  * it is one of the others: L, LV, LVT or V,
3724                                  * and matches:
3725                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3726
3727                                 /* Match L*           */
3728                                 while (locinput < PL_regeol
3729                                         && swash_fetch(PL_utf8_X_L,
3730                                                         (U8*)locinput, do_utf8))
3731                                 {
3732                                     locinput += UTF8SKIP(locinput);
3733                                 }
3734
3735                                 /* Here, have exhausted L*.  If the next
3736                                  * character is not an LV, LVT nor V, it means
3737                                  * we had to have at least one L, so matches L+
3738                                  * in the original equation, we have a complete
3739                                  * hangul syllable.  Are done. */
3740
3741                                 if (locinput < PL_regeol
3742                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3743                                                     (U8*)locinput, do_utf8))
3744                                 {
3745
3746                                     /* Otherwise keep going.  Must be LV, LVT
3747                                      * or V.  See if LVT */
3748                                     if (swash_fetch(PL_utf8_X_LVT,
3749                                                     (U8*)locinput, do_utf8))
3750                                     {
3751                                         locinput += UTF8SKIP(locinput);
3752                                     } else {
3753
3754                                         /* Must be  V or LV.  Take it, then
3755                                          * match V*     */
3756                                         locinput += UTF8SKIP(locinput);
3757                                         while (locinput < PL_regeol
3758                                                 && swash_fetch(PL_utf8_X_V,
3759                                                          (U8*)locinput, do_utf8))
3760                                         {
3761                                             locinput += UTF8SKIP(locinput);
3762                                         }
3763                                     }
3764
3765                                     /* And any of LV, LVT, or V can be followed
3766                                      * by T*            */
3767                                     while (locinput < PL_regeol
3768                                            && swash_fetch(PL_utf8_X_T,
3769                                                            (U8*)locinput,
3770                                                            do_utf8))
3771                                     {
3772                                         locinput += UTF8SKIP(locinput);
3773                                     }
3774                                 }
3775                             }
3776                         }
3777
3778                         /* Match any extender */
3779                         while (locinput < PL_regeol
3780                                 && swash_fetch(PL_utf8_X_extend,
3781                                                 (U8*)locinput, do_utf8))
3782                         {
3783                             locinput += UTF8SKIP(locinput);
3784                         }
3785                     }
3786                 }
3787                 if (locinput > PL_regeol) sayNO;
3788             }
3789             nextchr = UCHARAT(locinput);
3790             break;
3791
3792         case NREFFL:
3793         {
3794             char *s;
3795             char type;
3796             PL_reg_flags |= RF_tainted;
3797             /* FALL THROUGH */
3798         case NREF:
3799         case NREFF:
3800             type = OP(scan);
3801             n = reg_check_named_buff_matched(rex,scan);
3802
3803             if ( n ) {
3804                 type = REF + ( type - NREF );
3805                 goto do_ref;
3806             } else {
3807                 sayNO;
3808             }
3809             /* unreached */
3810         case REFFL:
3811             PL_reg_flags |= RF_tainted;
3812             /* FALL THROUGH */
3813         case REF:
3814         case REFF:
3815             n = ARG(scan);  /* which paren pair */
3816             type = OP(scan);
3817           do_ref:
3818             ln = PL_regoffs[n].start;
3819             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
3820             if (*PL_reglastparen < n || ln == -1)
3821                 sayNO;                  /* Do not match unless seen CLOSEn. */
3822             if (ln == PL_regoffs[n].end)
3823                 break;
3824
3825             s = PL_bostr + ln;
3826             if (do_utf8 && type != REF) {       /* REF can do byte comparison */
3827                 char *l = locinput;
3828                 const char *e = PL_bostr + PL_regoffs[n].end;
3829                 /*
3830                  * Note that we can't do the "other character" lookup trick as
3831                  * in the 8-bit case (no pun intended) because in Unicode we
3832                  * have to map both upper and title case to lower case.
3833                  */
3834                 if (type == REFF) {
3835                     while (s < e) {
3836                         STRLEN ulen1, ulen2;
3837                         U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3838                         U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3839
3840                         if (l >= PL_regeol)
3841                             sayNO;
3842                         toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3843                         toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3844                         if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3845                             sayNO;
3846                         s += ulen1;
3847                         l += ulen2;
3848                     }
3849                 }
3850                 locinput = l;
3851                 nextchr = UCHARAT(locinput);
3852                 break;
3853             }
3854
3855             /* Inline the first character, for speed. */
3856             if (UCHARAT(s) != nextchr &&
3857                 (type == REF ||
3858                  (UCHARAT(s) != (type == REFF
3859                                   ? PL_fold : PL_fold_locale)[nextchr])))
3860                 sayNO;
3861             ln = PL_regoffs[n].end - ln;
3862             if (locinput + ln > PL_regeol)
3863                 sayNO;
3864             if (ln > 1 && (type == REF
3865                            ? memNE(s, locinput, ln)
3866                            : (type == REFF
3867                               ? ibcmp(s, locinput, ln)
3868                               : ibcmp_locale(s, locinput, ln))))
3869                 sayNO;
3870             locinput += ln;
3871             nextchr = UCHARAT(locinput);
3872             break;
3873         }
3874         case NOTHING:
3875         case TAIL:
3876             break;
3877         case BACK:
3878             break;
3879
3880 #undef  ST
3881 #define ST st->u.eval
3882         {
3883             SV *ret;
3884             REGEXP *re_sv;
3885             regexp *re;
3886             regexp_internal *rei;
3887             regnode *startpoint;
3888
3889         case GOSTART:
3890         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
3891             if (cur_eval && cur_eval->locinput==locinput) {
3892                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3893                     Perl_croak(aTHX_ "Infinite recursion in regex");
3894                 if ( ++nochange_depth > max_nochange_depth )
3895                     Perl_croak(aTHX_
3896                         "Pattern subroutine nesting without pos change"
3897                         " exceeded limit in regex");
3898             } else {
3899                 nochange_depth = 0;
3900             }
3901             re_sv = rex_sv;
3902             re = rex;
3903             rei = rexi;
3904             (void)ReREFCNT_inc(rex_sv);
3905             if (OP(scan)==GOSUB) {
3906                 startpoint = scan + ARG2L(scan);
3907                 ST.close_paren = ARG(scan);
3908             } else {
3909                 startpoint = rei->program+1;
3910                 ST.close_paren = 0;
3911             }
3912             goto eval_recurse_doit;
3913             /* NOTREACHED */
3914         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
3915             if (cur_eval && cur_eval->locinput==locinput) {
3916                 if ( ++nochange_depth > max_nochange_depth )
3917                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3918             } else {
3919                 nochange_depth = 0;
3920             }
3921             {
3922                 /* execute the code in the {...} */
3923                 dSP;
3924                 SV ** const before = SP;
3925                 OP_4tree * const oop = PL_op;
3926                 COP * const ocurcop = PL_curcop;
3927                 PAD *old_comppad;
3928                 char *saved_regeol = PL_regeol;
3929
3930                 n = ARG(scan);
3931                 PL_op = (OP_4tree*)rexi->data->data[n];
3932                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3933                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3934                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3935                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3936
3937                 if (sv_yes_mark) {
3938                     SV *sv_mrk = get_sv("REGMARK", 1);
3939                     sv_setsv(sv_mrk, sv_yes_mark);
3940                 }
3941
3942                 CALLRUNOPS(aTHX);                       /* Scalar context. */
3943                 SPAGAIN;
3944                 if (SP == before)
3945                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
3946                 else {
3947                     ret = POPs;
3948                     PUTBACK;
3949                 }
3950
3951                 PL_op = oop;
3952                 PAD_RESTORE_LOCAL(old_comppad);
3953                 PL_curcop = ocurcop;
3954                 PL_regeol = saved_regeol;
3955                 if (!logical) {
3956                     /* /(?{...})/ */
3957                     sv_setsv(save_scalar(PL_replgv), ret);
3958                     break;
3959                 }
3960             }
3961             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3962                 logical = 0;
3963                 {
3964                     /* extract RE object from returned value; compiling if
3965                      * necessary */
3966                     MAGIC *mg = NULL;
3967                     REGEXP *rx = NULL;
3968
3969                     if (SvROK(ret)) {
3970                         SV *const sv = SvRV(ret);
3971
3972                         if (SvTYPE(sv) == SVt_REGEXP) {
3973                             rx = (REGEXP*) sv;
3974                         } else if (SvSMAGICAL(sv)) {
3975                             mg = mg_find(sv, PERL_MAGIC_qr);
3976                             assert(mg);
3977                         }
3978                     } else if (SvTYPE(ret) == SVt_REGEXP) {
3979                         rx = (REGEXP*) ret;
3980                     } else if (SvSMAGICAL(ret)) {
3981                         if (SvGMAGICAL(ret)) {
3982                             /* I don't believe that there is ever qr magic
3983                                here.  */
3984                             assert(!mg_find(ret, PERL_MAGIC_qr));
3985                             sv_unmagic(ret, PERL_MAGIC_qr);
3986                         }
3987                         else {
3988                             mg = mg_find(ret, PERL_MAGIC_qr);
3989                             /* testing suggests mg only ends up non-NULL for
3990                                scalars who were upgraded and compiled in the
3991                                else block below. In turn, this is only
3992                                triggered in the "postponed utf8 string" tests
3993                                in t/op/pat.t  */
3994                         }
3995                     }
3996
3997                     if (mg) {
3998                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
3999                         assert(rx);
4000                     }
4001                     if (rx) {
4002                         rx = reg_temp_copy(NULL, rx);
4003                     }
4004                     else {
4005                         U32 pm_flags = 0;
4006                         const I32 osize = PL_regsize;
4007
4008                         if (DO_UTF8(ret)) {
4009                             assert (SvUTF8(ret));
4010                         } else if (SvUTF8(ret)) {
4011                             /* Not doing UTF-8, despite what the SV says. Is
4012                                this only if we're trapped in use 'bytes'?  */
4013                             /* Make a copy of the octet sequence, but without
4014                                the flag on, as the compiler now honours the
4015                                SvUTF8 flag on ret.  */
4016                             STRLEN len;
4017                             const char *const p = SvPV(ret, len);
4018                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4019                         }
4020                         rx = CALLREGCOMP(ret, pm_flags);
4021                         if (!(SvFLAGS(ret)
4022                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4023                                  | SVs_GMG))) {
4024                             /* This isn't a first class regexp. Instead, it's
4025                                caching a regexp onto an existing, Perl visible
4026                                scalar.  */
4027                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4028                         }
4029                         PL_regsize = osize;
4030                     }
4031                     re_sv = rx;
4032                     re = (struct regexp *)SvANY(rx);
4033                 }
4034                 RXp_MATCH_COPIED_off(re);
4035                 re->subbeg = rex->subbeg;
4036                 re->sublen = rex->sublen;
4037                 rei = RXi_GET(re);
4038                 DEBUG_EXECUTE_r(
4039                     debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
4040                         "Matching embedded");
4041                 );
4042                 startpoint = rei->program + 1;
4043                 ST.close_paren = 0; /* only used for GOSUB */
4044                 /* borrowed from regtry */
4045                 if (PL_reg_start_tmpl <= re->nparens) {
4046                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4047                     if(PL_reg_start_tmp)
4048                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4049                     else
4050                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4051                 }
4052
4053         eval_recurse_doit: /* Share code with GOSUB below this line */
4054                 /* run the pattern returned from (??{...}) */
4055                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4056                 REGCP_SET(ST.lastcp);
4057
4058                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4059
4060                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4061                 PL_reglastparen = &re->lastparen;
4062                 PL_reglastcloseparen = &re->lastcloseparen;
4063                 re->lastparen = 0;
4064                 re->lastcloseparen = 0;
4065
4066                 PL_reginput = locinput;
4067                 PL_regsize = 0;
4068
4069                 /* XXXX This is too dramatic a measure... */
4070                 PL_reg_maxiter = 0;
4071
4072                 ST.toggle_reg_flags = PL_reg_flags;
4073                 if (RX_UTF8(re_sv))
4074                     PL_reg_flags |= RF_utf8;
4075                 else
4076                     PL_reg_flags &= ~RF_utf8;
4077                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4078
4079                 ST.prev_rex = rex_sv;
4080                 ST.prev_curlyx = cur_curlyx;
4081                 SETREX(rex_sv,re_sv);
4082                 rex = re;
4083                 rexi = rei;
4084                 cur_curlyx = NULL;
4085                 ST.B = next;
4086                 ST.prev_eval = cur_eval;
4087                 cur_eval = st;
4088                 /* now continue from first node in postoned RE */
4089                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4090                 /* NOTREACHED */
4091             }
4092             /* logical is 1,   /(?(?{...})X|Y)/ */
4093             sw = (bool)SvTRUE(ret);
4094             logical = 0;
4095             break;
4096         }
4097
4098         case EVAL_AB: /* cleanup after a successful (??{A})B */
4099             /* note: this is called twice; first after popping B, then A */
4100             PL_reg_flags ^= ST.toggle_reg_flags;
4101             ReREFCNT_dec(rex_sv);
4102             SETREX(rex_sv,ST.prev_rex);
4103             rex = (struct regexp *)SvANY(rex_sv);
4104             rexi = RXi_GET(rex);
4105             regcpblow(ST.cp);
4106             cur_eval = ST.prev_eval;
4107             cur_curlyx = ST.prev_curlyx;
4108
4109             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4110             PL_reglastparen = &rex->lastparen;
4111             PL_reglastcloseparen = &rex->lastcloseparen;
4112             /* also update PL_regoffs */
4113             PL_regoffs = rex->offs;
4114
4115             /* XXXX This is too dramatic a measure... */
4116             PL_reg_maxiter = 0;
4117             if ( nochange_depth )
4118                 nochange_depth--;
4119             sayYES;
4120
4121
4122         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4123             /* note: this is called twice; first after popping B, then A */
4124             PL_reg_flags ^= ST.toggle_reg_flags;
4125             ReREFCNT_dec(rex_sv);
4126             SETREX(rex_sv,ST.prev_rex);
4127             rex = (struct regexp *)SvANY(rex_sv);
4128             rexi = RXi_GET(rex);
4129             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4130             PL_reglastparen = &rex->lastparen;
4131             PL_reglastcloseparen = &rex->lastcloseparen;
4132
4133             PL_reginput = locinput;
4134             REGCP_UNWIND(ST.lastcp);
4135             regcppop(rex);
4136             cur_eval = ST.prev_eval;
4137             cur_curlyx = ST.prev_curlyx;
4138             /* XXXX This is too dramatic a measure... */
4139             PL_reg_maxiter = 0;
4140             if ( nochange_depth )
4141                 nochange_depth--;
4142             sayNO_SILENT;
4143 #undef ST
4144
4145         case OPEN:
4146             n = ARG(scan);  /* which paren pair */
4147             PL_reg_start_tmp[n] = locinput;
4148             if (n > PL_regsize)
4149                 PL_regsize = n;
4150             lastopen = n;
4151             break;
4152         case CLOSE:
4153             n = ARG(scan);  /* which paren pair */
4154             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4155             PL_regoffs[n].end = locinput - PL_bostr;
4156             /*if (n > PL_regsize)
4157                 PL_regsize = n;*/
4158             if (n > *PL_reglastparen)
4159                 *PL_reglastparen = n;
4160             *PL_reglastcloseparen = n;
4161             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4162                 goto fake_end;
4163             }
4164             break;
4165         case ACCEPT:
4166             if (ARG(scan)){
4167                 regnode *cursor;
4168                 for (cursor=scan;
4169                      cursor && OP(cursor)!=END;
4170                      cursor=regnext(cursor))
4171                 {
4172                     if ( OP(cursor)==CLOSE ){
4173                         n = ARG(cursor);
4174                         if ( n <= lastopen ) {
4175                             PL_regoffs[n].start
4176                                 = PL_reg_start_tmp[n] - PL_bostr;
4177                             PL_regoffs[n].end = locinput - PL_bostr;
4178                             /*if (n > PL_regsize)
4179                             PL_regsize = n;*/
4180                             if (n > *PL_reglastparen)
4181                                 *PL_reglastparen = n;
4182                             *PL_reglastcloseparen = n;
4183                             if ( n == ARG(scan) || (cur_eval &&
4184                                 cur_eval->u.eval.close_paren == n))
4185                                 break;
4186                         }
4187                     }
4188                 }
4189             }
4190             goto fake_end;
4191             /*NOTREACHED*/
4192         case GROUPP:
4193             n = ARG(scan);  /* which paren pair */
4194             sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4195             break;
4196         case NGROUPP:
4197             /* reg_check_named_buff_matched returns 0 for no match */
4198             sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
4199             break;
4200         case INSUBP:
4201             n = ARG(scan);
4202             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4203             break;
4204         case DEFINEP:
4205             sw = 0;
4206             break;
4207         case IFTHEN:
4208             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4209             if (sw)
4210                 next = NEXTOPER(NEXTOPER(scan));
4211             else {
4212                 next = scan + ARG(scan);
4213                 if (OP(next) == IFTHEN) /* Fake one. */
4214                     next = NEXTOPER(NEXTOPER(next));
4215             }
4216             break;
4217         case LOGICAL:
4218             logical = scan->flags;
4219             break;
4220
4221 /*******************************************************************
4222
4223 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4224 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4225 STAR/PLUS/CURLY/CURLYN are used instead.)
4226
4227 A*B is compiled as <CURLYX><A><WHILEM><B>
4228
4229 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4230 state, which contains the current count, initialised to -1. It also sets
4231 cur_curlyx to point to this state, with any previous value saved in the
4232 state block.
4233
4234 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4235 since the pattern may possibly match zero times (i.e. it's a while {} loop
4236 rather than a do {} while loop).
4237
4238 Each entry to WHILEM represents a successful match of A. The count in the
4239 CURLYX block is incremented, another WHILEM state is pushed, and execution
4240 passes to A or B depending on greediness and the current count.
4241
4242 For example, if matching against the string a1a2a3b (where the aN are
4243 substrings that match /A/), then the match progresses as follows: (the
4244 pushed states are interspersed with the bits of strings matched so far):
4245
4246     <CURLYX cnt=-1>
4247     <CURLYX cnt=0><WHILEM>
4248     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4249     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4250     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4251     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4252
4253 (Contrast this with something like CURLYM, which maintains only a single
4254 backtrack state:
4255
4256     <CURLYM cnt=0> a1
4257     a1 <CURLYM cnt=1> a2
4258     a1 a2 <CURLYM cnt=2> a3
4259     a1 a2 a3 <CURLYM cnt=3> b
4260 )
4261
4262 Each WHILEM state block marks a point to backtrack to upon partial failure
4263 of A or B, and also contains some minor state data related to that
4264 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4265 overall state, such as the count, and pointers to the A and B ops.
4266
4267 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4268 must always point to the *current* CURLYX block, the rules are:
4269
4270 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4271 and set cur_curlyx to point the new block.
4272
4273 When popping the CURLYX block after a successful or unsuccessful match,
4274 restore the previous cur_curlyx.
4275
4276 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4277 to the outer one saved in the CURLYX block.
4278
4279 When popping the WHILEM block after a successful or unsuccessful B match,
4280 restore the previous cur_curlyx.
4281
4282 Here's an example for the pattern (AI* BI)*BO
4283 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4284
4285 cur_
4286 curlyx backtrack stack
4287 ------ ---------------
4288 NULL
4289 CO     <CO prev=NULL> <WO>
4290 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4291 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4292 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4293
4294 At this point the pattern succeeds, and we work back down the stack to
4295 clean up, restoring as we go:
4296
4297 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4298 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4299 CO     <CO prev=NULL> <WO>
4300 NULL
4301
4302 *******************************************************************/
4303
4304 #define ST st->u.curlyx
4305
4306         case CURLYX:    /* start of /A*B/  (for complex A) */
4307         {
4308             /* No need to save/restore up to this paren */
4309             I32 parenfloor = scan->flags;
4310
4311             assert(next); /* keep Coverity happy */
4312             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4313                 next += ARG(next);
4314
4315             /* XXXX Probably it is better to teach regpush to support
4316                parenfloor > PL_regsize... */
4317             if (parenfloor > (I32)*PL_reglastparen)
4318                 parenfloor = *PL_reglastparen; /* Pessimization... */
4319
4320             ST.prev_curlyx= cur_curlyx;
4321             cur_curlyx = st;
4322             ST.cp = PL_savestack_ix;
4323
4324             /* these fields contain the state of the current curly.
4325              * they are accessed by subsequent WHILEMs */
4326             ST.parenfloor = parenfloor;
4327             ST.min = ARG1(scan);
4328             ST.max = ARG2(scan);
4329             ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4330             ST.B = next;
4331             ST.minmod = minmod;
4332             minmod = 0;
4333             ST.count = -1;      /* this will be updated by WHILEM */
4334             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4335
4336             PL_reginput = locinput;
4337             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4338             /* NOTREACHED */
4339         }
4340
4341         case CURLYX_end: /* just finished matching all of A*B */
4342             cur_curlyx = ST.prev_curlyx;
4343             sayYES;
4344             /* NOTREACHED */
4345
4346         case CURLYX_end_fail: /* just failed to match all of A*B */
4347             regcpblow(ST.cp);
4348             cur_curlyx = ST.prev_curlyx;
4349             sayNO;
4350             /* NOTREACHED */
4351
4352
4353 #undef ST
4354 #define ST st->u.whilem
4355
4356         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4357         {
4358             /* see the discussion above about CURLYX/WHILEM */
4359             I32 n;
4360             assert(cur_curlyx); /* keep Coverity happy */
4361             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4362             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4363             ST.cache_offset = 0;
4364             ST.cache_mask = 0;
4365
4366             PL_reginput = locinput;
4367
4368             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4369                   "%*s  whilem: matched %ld out of %ld..%ld\n",
4370                   REPORT_CODE_OFF+depth*2, "", (long)n,
4371                   (long)cur_curlyx->u.curlyx.min,
4372                   (long)cur_curlyx->u.curlyx.max)
4373             );
4374
4375             /* First just match a string of min A's. */
4376
4377             if (n < cur_curlyx->u.curlyx.min) {
4378                 cur_curlyx->u.curlyx.lastloc = locinput;
4379                 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4380                 /* NOTREACHED */
4381             }
4382
4383             /* If degenerate A matches "", assume A done. */
4384
4385             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4386                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4387                    "%*s  whilem: empty match detected, trying continuation...\n",
4388                    REPORT_CODE_OFF+depth*2, "")
4389                 );
4390                 goto do_whilem_B_max;
4391             }
4392
4393             /* super-linear cache processing */
4394
4395             if (scan->flags) {
4396
4397                 if (!PL_reg_maxiter) {
4398                     /* start the countdown: Postpone detection until we
4399                      * know the match is not *that* much linear. */
4400                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4401                     /* possible overflow for long strings and many CURLYX's */
4402                     if (PL_reg_maxiter < 0)
4403                         PL_reg_maxiter = I32_MAX;
4404                     PL_reg_leftiter = PL_reg_maxiter;
4405                 }
4406
4407                 if (PL_reg_leftiter-- == 0) {
4408                     /* initialise cache */
4409                     const I32 size = (PL_reg_maxiter + 7)/8;
4410                     if (PL_reg_poscache) {
4411                         if ((I32)PL_reg_poscache_size < size) {
4412                             Renew(PL_reg_poscache, size, char);
4413                             PL_reg_poscache_size = size;
4414                         }
4415                         Zero(PL_reg_poscache, size, char);
4416                     }
4417                     else {
4418                         PL_reg_poscache_size = size;
4419                         Newxz(PL_reg_poscache, size, char);
4420                     }
4421                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4422       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4423                               PL_colors[4], PL_colors[5])
4424                     );
4425                 }
4426
4427                 if (PL_reg_leftiter < 0) {
4428                     /* have we already failed at this position? */
4429                     I32 offset, mask;
4430                     offset  = (scan->flags & 0xf) - 1
4431                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4432                     mask    = 1 << (offset % 8);
4433                     offset /= 8;
4434                     if (PL_reg_poscache[offset] & mask) {
4435                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4436                             "%*s  whilem: (cache) already tried at this position...\n",
4437                             REPORT_CODE_OFF+depth*2, "")
4438                         );
4439                         sayNO; /* cache records failure */
4440                     }
4441                     ST.cache_offset = offset;
4442                     ST.cache_mask   = mask;
4443                 }
4444             }
4445
4446             /* Prefer B over A for minimal matching. */
4447
4448             if (cur_curlyx->u.curlyx.minmod) {
4449                 ST.save_curlyx = cur_curlyx;
4450                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4451                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4452                 REGCP_SET(ST.lastcp);
4453                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4454                 /* NOTREACHED */
4455             }
4456
4457             /* Prefer A over B for maximal matching. */
4458
4459             if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4460                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4461                 cur_curlyx->u.curlyx.lastloc = locinput;
4462                 REGCP_SET(ST.lastcp);
4463                 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4464                 /* NOTREACHED */
4465             }
4466             goto do_whilem_B_max;
4467         }
4468         /* NOTREACHED */
4469
4470         case WHILEM_B_min: /* just matched B in a minimal match */
4471         case WHILEM_B_max: /* just matched B in a maximal match */
4472             cur_curlyx = ST.save_curlyx;
4473             sayYES;
4474             /* NOTREACHED */
4475
4476         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4477             cur_curlyx = ST.save_curlyx;
4478             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4479             cur_curlyx->u.curlyx.count--;
4480             CACHEsayNO;
4481             /* NOTREACHED */
4482
4483         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4484             REGCP_UNWIND(ST.lastcp);
4485             regcppop(rex);
4486             /* FALL THROUGH */
4487         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4488             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4489             cur_curlyx->u.curlyx.count--;
4490             CACHEsayNO;
4491             /* NOTREACHED */
4492
4493         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4494             REGCP_UNWIND(ST.lastcp);
4495             regcppop(rex);      /* Restore some previous $<digit>s? */
4496             PL_reginput = locinput;
4497             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4498                 "%*s  whilem: failed, trying continuation...\n",
4499                 REPORT_CODE_OFF+depth*2, "")
4500             );
4501           do_whilem_B_max:
4502             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4503                 && ckWARN(WARN_REGEXP)
4504                 && !(PL_reg_flags & RF_warned))
4505             {
4506                 PL_reg_flags |= RF_warned;
4507                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4508                      "Complex regular subexpression recursion",
4509                      REG_INFTY - 1);
4510             }
4511
4512             /* now try B */
4513             ST.save_curlyx = cur_curlyx;
4514             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4515             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4516             /* NOTREACHED */
4517
4518         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4519             cur_curlyx = ST.save_curlyx;
4520             REGCP_UNWIND(ST.lastcp);
4521             regcppop(rex);
4522
4523             if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4524                 /* Maximum greed exceeded */
4525                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4526                     && ckWARN(WARN_REGEXP)
4527                     && !(PL_reg_flags & RF_warned))
4528                 {
4529                     PL_reg_flags |= RF_warned;
4530                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4531                         "%s limit (%d) exceeded",
4532                         "Complex regular subexpression recursion",
4533                         REG_INFTY - 1);
4534                 }
4535                 cur_curlyx->u.curlyx.count--;
4536                 CACHEsayNO;
4537             }
4538
4539             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4540                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4541             );
4542             /* Try grabbing another A and see if it helps. */
4543             PL_reginput = locinput;
4544             cur_curlyx->u.curlyx.lastloc = locinput;
4545             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4546             REGCP_SET(ST.lastcp);
4547             PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4548             /* NOTREACHED */
4549
4550 #undef  ST
4551 #define ST st->u.branch
4552
4553         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4554             next = scan + ARG(scan);
4555             if (next == scan)
4556                 next = NULL;
4557             scan = NEXTOPER(scan);
4558             /* FALL THROUGH */
4559
4560         case BRANCH:        /*  /(...|A|...)/ */
4561             scan = NEXTOPER(scan); /* scan now points to inner node */
4562             ST.lastparen = *PL_reglastparen;
4563             ST.next_branch = next;
4564             REGCP_SET(ST.cp);
4565             PL_reginput = locinput;
4566
4567             /* Now go into the branch */
4568             if (has_cutgroup) {
4569                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4570             } else {
4571                 PUSH_STATE_GOTO(BRANCH_next, scan);
4572             }
4573             /* NOTREACHED */
4574         case CUTGROUP:
4575             PL_reginput = locinput;
4576             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4577                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4578             PUSH_STATE_GOTO(CUTGROUP_next,next);
4579             /* NOTREACHED */
4580         case CUTGROUP_next_fail:
4581             do_cutgroup = 1;
4582             no_final = 1;
4583             if (st->u.mark.mark_name)
4584                 sv_commit = st->u.mark.mark_name;
4585             sayNO;
4586             /* NOTREACHED */
4587         case BRANCH_next:
4588             sayYES;
4589             /* NOTREACHED */
4590         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4591             if (do_cutgroup) {
4592                 do_cutgroup = 0;
4593                 no_final = 0;
4594             }
4595             REGCP_UNWIND(ST.cp);
4596             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4597                 PL_regoffs[n].end = -1;
4598             *PL_reglastparen = n;
4599             /*dmq: *PL_reglastcloseparen = n; */
4600             scan = ST.next_branch;
4601             /* no more branches? */
4602             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4603                 DEBUG_EXECUTE_r({
4604                     PerlIO_printf( Perl_debug_log,
4605                         "%*s  %sBRANCH failed...%s\n",
4606                         REPORT_CODE_OFF+depth*2, "",
4607                         PL_colors[4],
4608                         PL_colors[5] );
4609                 });
4610                 sayNO_SILENT;
4611             }
4612             continue; /* execute next BRANCH[J] op */
4613             /* NOTREACHED */
4614
4615         case MINMOD:
4616             minmod = 1;
4617             break;
4618
4619 #undef  ST
4620 #define ST st->u.curlym
4621
4622         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4623
4624             /* This is an optimisation of CURLYX that enables us to push
4625              * only a single backtracking state, no matter how many matches
4626              * there are in {m,n}. It relies on the pattern being constant
4627              * length, with no parens to influence future backrefs
4628              */
4629
4630             ST.me = scan;
4631             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4632
4633             /* if paren positive, emulate an OPEN/CLOSE around A */
4634             if (ST.me->flags) {
4635                 U32 paren = ST.me->flags;
4636                 if (paren > PL_regsize)
4637                     PL_regsize = paren;
4638                 if (paren > *PL_reglastparen)
4639                     *PL_reglastparen = paren;
4640                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4641             }
4642             ST.A = scan;
4643             ST.B = next;
4644             ST.alen = 0;
4645             ST.count = 0;
4646             ST.minmod = minmod;
4647             minmod = 0;
4648             ST.c1 = CHRTEST_UNINIT;
4649             REGCP_SET(ST.cp);
4650
4651             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4652                 goto curlym_do_B;
4653
4654           curlym_do_A: /* execute the A in /A{m,n}B/  */
4655             PL_reginput = locinput;
4656             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4657             /* NOTREACHED */
4658
4659         case CURLYM_A: /* we've just matched an A */
4660             locinput = st->locinput;
4661             nextchr = UCHARAT(locinput);
4662
4663             ST.count++;
4664             /* after first match, determine A's length: u.curlym.alen */
4665             if (ST.count == 1) {
4666                 if (PL_reg_match_utf8) {
4667                     char *s = locinput;
4668                     while (s < PL_reginput) {
4669                         ST.alen++;
4670                         s += UTF8SKIP(s);
4671                     }
4672                 }
4673                 else {
4674                     ST.alen = PL_reginput - locinput;
4675                 }
4676                 if (ST.alen == 0)
4677                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4678             }
4679             DEBUG_EXECUTE_r(
4680                 PerlIO_printf(Perl_debug_log,
4681                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4682                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4683                           (IV) ST.count, (IV)ST.alen)
4684             );
4685
4686             locinput = PL_reginput;
4687
4688             if (cur_eval && cur_eval->u.eval.close_paren &&
4689                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4690                 goto fake_end;
4691
4692             {
4693                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4694                 if ( max == REG_INFTY || ST.count < max )
4695                     goto curlym_do_A; /* try to match another A */
4696             }
4697             goto curlym_do_B; /* try to match B */
4698
4699         case CURLYM_A_fail: /* just failed to match an A */
4700             REGCP_UNWIND(ST.cp);
4701
4702             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4703                 || (cur_eval && cur_eval->u.eval.close_paren &&
4704                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4705                 sayNO;
4706
4707           curlym_do_B: /* execute the B in /A{m,n}B/  */
4708             PL_reginput = locinput;
4709             if (ST.c1 == CHRTEST_UNINIT) {
4710                 /* calculate c1 and c2 for possible match of 1st char
4711                  * following curly */
4712                 ST.c1 = ST.c2 = CHRTEST_VOID;
4713                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4714                     regnode *text_node = ST.B;
4715                     if (! HAS_TEXT(text_node))
4716                         FIND_NEXT_IMPT(text_node);
4717                     /* this used to be
4718
4719                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4720
4721                         But the former is redundant in light of the latter.
4722
4723                         if this changes back then the macro for
4724                         IS_TEXT and friends need to change.
4725                      */
4726                     if (PL_regkind[OP(text_node)] == EXACT)
4727                     {
4728
4729                         ST.c1 = (U8)*STRING(text_node);
4730                         ST.c2 =
4731                             (IS_TEXTF(text_node))
4732                             ? PL_fold[ST.c1]
4733                             : (IS_TEXTFL(text_node))
4734                                 ? PL_fold_locale[ST.c1]
4735                                 : ST.c1;
4736                     }
4737                 }
4738             }
4739
4740             DEBUG_EXECUTE_r(
4741                 PerlIO_printf(Perl_debug_log,
4742                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4743                     (int)(REPORT_CODE_OFF+(depth*2)),
4744                     "", (IV)ST.count)
4745                 );
4746             if (ST.c1 != CHRTEST_VOID
4747                     && UCHARAT(PL_reginput) != ST.c1
4748                     && UCHARAT(PL_reginput) != ST.c2)
4749             {
4750                 /* simulate B failing */
4751                 DEBUG_OPTIMISE_r(
4752                     PerlIO_printf(Perl_debug_log,
4753                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4754                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4755                         (IV)ST.c1,(IV)ST.c2
4756                 ));
4757                 state_num = CURLYM_B_fail;
4758                 goto reenter_switch;
4759             }
4760
4761             if (ST.me->flags) {
4762                 /* mark current A as captured */
4763                 I32 paren = ST.me->flags;
4764                 if (ST.count) {
4765                     PL_regoffs[paren].start
4766                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4767                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4768                     /*dmq: *PL_reglastcloseparen = paren; */
4769                 }
4770                 else
4771                     PL_regoffs[paren].end = -1;
4772                 if (cur_eval && cur_eval->u.eval.close_paren &&
4773                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4774                 {
4775                     if (ST.count)
4776                         goto fake_end;
4777                     else
4778                         sayNO;
4779                 }
4780             }
4781
4782             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4783             /* NOTREACHED */
4784
4785         case CURLYM_B_fail: /* just failed to match a B */
4786             REGCP_UNWIND(ST.cp);
4787             if (ST.minmod) {
4788                 I32 max = ARG2(ST.me);
4789                 if (max != REG_INFTY && ST.count == max)
4790                     sayNO;
4791                 goto curlym_do_A; /* try to match a further A */
4792             }
4793             /* backtrack one A */
4794             if (ST.count == ARG1(ST.me) /* min */)
4795                 sayNO;
4796             ST.count--;
4797             locinput = HOPc(locinput, -ST.alen);
4798             goto curlym_do_B; /* try to match B */
4799
4800 #undef ST
4801 #define ST st->u.curly
4802
4803 #define CURLY_SETPAREN(paren, success) \
4804     if (paren) { \
4805         if (success) { \
4806             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4807             PL_regoffs[paren].end = locinput - PL_bostr; \
4808             *PL_reglastcloseparen = paren; \
4809         } \
4810         else \
4811             PL_regoffs[paren].end = -1; \
4812     }
4813
4814         case STAR:              /*  /A*B/ where A is width 1 */
4815             ST.paren = 0;
4816             ST.min = 0;
4817             ST.max = REG_INFTY;
4818             scan = NEXTOPER(scan);
4819             goto repeat;
4820         case PLUS:              /*  /A+B/ where A is width 1 */
4821             ST.paren = 0;
4822             ST.min = 1;
4823             ST.max = REG_INFTY;
4824             scan = NEXTOPER(scan);
4825             goto repeat;
4826         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
4827             ST.paren = scan->flags;     /* Which paren to set */
4828             if (ST.paren > PL_regsize)
4829                 PL_regsize = ST.paren;
4830             if (ST.paren > *PL_reglastparen)
4831                 *PL_reglastparen = ST.paren;
4832             ST.min = ARG1(scan);  /* min to match */
4833             ST.max = ARG2(scan);  /* max to match */
4834             if (cur_eval && cur_eval->u.eval.close_paren &&
4835                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4836                 ST.min=1;
4837                 ST.max=1;
4838             }
4839             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4840             goto repeat;
4841         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
4842             ST.paren = 0;
4843             ST.min = ARG1(scan);  /* min to match */
4844             ST.max = ARG2(scan);  /* max to match */
4845             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4846           repeat:
4847             /*
4848             * Lookahead to avoid useless match attempts
4849             * when we know what character comes next.
4850             *
4851             * Used to only do .*x and .*?x, but now it allows
4852             * for )'s, ('s and (?{ ... })'s to be in the way
4853             * of the quantifier and the EXACT-like node.  -- japhy
4854             */
4855
4856             if (ST.min > ST.max) /* XXX make this a compile-time check? */
4857                 sayNO;
4858             if (HAS_TEXT(next) || JUMPABLE(next)) {
4859                 U8 *s;
4860                 regnode *text_node = next;
4861
4862                 if (! HAS_TEXT(text_node))
4863                     FIND_NEXT_IMPT(text_node);
4864
4865                 if (! HAS_TEXT(text_node))
4866                     ST.c1 = ST.c2 = CHRTEST_VOID;
4867                 else {
4868                     if ( PL_regkind[OP(text_node)] != EXACT ) {
4869                         ST.c1 = ST.c2 = CHRTEST_VOID;
4870                         goto assume_ok_easy;
4871                     }
4872                     else
4873                         s = (U8*)STRING(text_node);
4874
4875                     /*  Currently we only get here when
4876
4877                         PL_rekind[OP(text_node)] == EXACT
4878
4879                         if this changes back then the macro for IS_TEXT and
4880                         friends need to change. */
4881                     if (!UTF) {
4882                         ST.c2 = ST.c1 = *s;
4883                         if (IS_TEXTF(text_node))
4884                             ST.c2 = PL_fold[ST.c1];
4885                         else if (IS_TEXTFL(text_node))
4886                             ST.c2 = PL_fold_locale[ST.c1];
4887                     }
4888                     else { /* UTF */
4889                         if (IS_TEXTF(text_node)) {
4890                              STRLEN ulen1, ulen2;
4891                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4892                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4893
4894                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4895                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4896 #ifdef EBCDIC
4897                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4898                                                     ckWARN(WARN_UTF8) ?
4899                                                     0 : UTF8_ALLOW_ANY);
4900                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4901                                                     ckWARN(WARN_UTF8) ?
4902                                                     0 : UTF8_ALLOW_ANY);
4903 #else
4904                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4905                                                     uniflags);
4906                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4907                                                     uniflags);
4908 #endif
4909                         }
4910                         else {
4911                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4912                                                      uniflags);
4913                         }
4914                     }
4915                 }
4916             }
4917             else
4918                 ST.c1 = ST.c2 = CHRTEST_VOID;
4919         assume_ok_easy:
4920
4921             ST.A = scan;
4922             ST.B = next;
4923             PL_reginput = locinput;
4924             if (minmod) {
4925                 minmod = 0;
4926                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4927                     sayNO;
4928                 ST.count = ST.min;
4929                 locinput = PL_reginput;
4930                 REGCP_SET(ST.cp);
4931                 if (ST.c1 == CHRTEST_VOID)
4932                     goto curly_try_B_min;
4933
4934                 ST.oldloc = locinput;
4935
4936                 /* set ST.maxpos to the furthest point along the
4937                  * string that could possibly match */
4938                 if  (ST.max == REG_INFTY) {
4939                     ST.maxpos = PL_regeol - 1;
4940                     if (do_utf8)
4941                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4942                             ST.maxpos--;
4943                 }
4944                 else if (do_utf8) {
4945                     int m = ST.max - ST.min;
4946                     for (ST.maxpos = locinput;
4947                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4948                         ST.maxpos += UTF8SKIP(ST.maxpos);
4949                 }
4950                 else {
4951                     ST.maxpos = locinput + ST.max - ST.min;
4952                     if (ST.maxpos >= PL_regeol)
4953                         ST.maxpos = PL_regeol - 1;
4954                 }
4955                 goto curly_try_B_min_known;
4956
4957             }
4958             else {
4959                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4960                 locinput = PL_reginput;
4961                 if (ST.count < ST.min)
4962                     sayNO;
4963                 if ((ST.count > ST.min)
4964                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4965                 {
4966                     /* A{m,n} must come at the end of the string, there's
4967                      * no point in backing off ... */
4968                     ST.min = ST.count;
4969                     /* ...except that $ and \Z can match before *and* after
4970                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
4971                        We may back off by one in this case. */
4972                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4973                         ST.min--;
4974                 }
4975                 REGCP_SET(ST.cp);
4976                 goto curly_try_B_max;
4977             }
4978             /* NOTREACHED */
4979
4980
4981         case CURLY_B_min_known_fail:
4982             /* failed to find B in a non-greedy match where c1,c2 valid */
4983             if (ST.paren && ST.count)
4984                 PL_regoffs[ST.paren].end = -1;
4985
4986             PL_reginput = locinput;     /* Could be reset... */
4987             REGCP_UNWIND(ST.cp);
4988             /* Couldn't or didn't -- move forward. */
4989             ST.oldloc = locinput;
4990             if (do_utf8)
4991                 locinput += UTF8SKIP(locinput);
4992             else
4993                 locinput++;
4994             ST.count++;
4995           curly_try_B_min_known:
4996              /* find the next place where 'B' could work, then call B */
4997             {
4998                 int n;
4999                 if (do_utf8) {
5000                     n = (ST.oldloc == locinput) ? 0 : 1;
5001                     if (ST.c1 == ST.c2) {
5002                         STRLEN len;
5003                         /* set n to utf8_distance(oldloc, locinput) */
5004                         while (locinput <= ST.maxpos &&
5005                                utf8n_to_uvchr((U8*)locinput,
5006                                               UTF8_MAXBYTES, &len,
5007                                               uniflags) != (UV)ST.c1) {
5008                             locinput += len;
5009                             n++;
5010                         }
5011                     }
5012                     else {
5013                         /* set n to utf8_distance(oldloc, locinput) */
5014                         while (locinput <= ST.maxpos) {
5015                             STRLEN len;
5016                             const UV c = utf8n_to_uvchr((U8*)locinput,
5017                                                   UTF8_MAXBYTES, &len,
5018                                                   uniflags);
5019                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5020                                 break;
5021                             locinput += len;
5022                             n++;
5023                         }
5024                     }
5025                 }
5026                 else {
5027                     if (ST.c1 == ST.c2) {
5028                         while (locinput <= ST.maxpos &&
5029                                UCHARAT(locinput) != ST.c1)
5030                             locinput++;
5031                     }
5032                     else {
5033                         while (locinput <= ST.maxpos
5034                                && UCHARAT(locinput) != ST.c1
5035                                && UCHARAT(locinput) != ST.c2)
5036                             locinput++;
5037                     }
5038                     n = locinput - ST.oldloc;
5039                 }
5040                 if (locinput > ST.maxpos)
5041                     sayNO;
5042                 /* PL_reginput == oldloc now */
5043                 if (n) {
5044                     ST.count += n;
5045                     if (regrepeat(rex, ST.A, n, depth) < n)
5046                         sayNO;
5047                 }
5048                 PL_reginput = locinput;
5049                 CURLY_SETPAREN(ST.paren, ST.count);
5050                 if (cur_eval && cur_eval->u.eval.close_paren &&
5051                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5052                     goto fake_end;
5053                 }
5054                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5055             }
5056             /* NOTREACHED */
5057
5058
5059         case CURLY_B_min_fail:
5060             /* failed to find B in a non-greedy match where c1,c2 invalid */
5061             if (ST.paren && ST.count)
5062                 PL_regoffs[ST.paren].end = -1;
5063
5064             REGCP_UNWIND(ST.cp);
5065             /* failed -- move forward one */
5066             PL_reginput = locinput;
5067             if (regrepeat(rex, ST.A, 1, depth)) {
5068                 ST.count++;
5069                 locinput = PL_reginput;
5070                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5071                         ST.count > 0)) /* count overflow ? */
5072                 {
5073                   curly_try_B_min:
5074                     CURLY_SETPAREN(ST.paren, ST.count);
5075                     if (cur_eval && cur_eval->u.eval.close_paren &&
5076                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5077                         goto fake_end;
5078                     }
5079                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5080                 }
5081             }
5082             sayNO;
5083             /* NOTREACHED */
5084
5085
5086         curly_try_B_max:
5087             /* a successful greedy match: now try to match B */
5088             if (cur_eval && cur_eval->u.eval.close_paren &&
5089                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5090                 goto fake_end;
5091             }
5092             {
5093                 UV c = 0;
5094                 if (ST.c1 != CHRTEST_VOID)
5095                     c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
5096                                            UTF8_MAXBYTES, 0, uniflags)
5097                                 : (UV) UCHARAT(PL_reginput);
5098                 /* If it could work, try it. */
5099                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5100                     CURLY_SETPAREN(ST.paren, ST.count);
5101                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5102                     /* NOTREACHED */
5103                 }
5104             }
5105             /* FALL THROUGH */
5106         case CURLY_B_max_fail:
5107             /* failed to find B in a greedy match */
5108             if (ST.paren && ST.count)
5109                 PL_regoffs[ST.paren].end = -1;
5110
5111             REGCP_UNWIND(ST.cp);
5112             /*  back up. */
5113             if (--ST.count < ST.min)
5114                 sayNO;
5115             PL_reginput = locinput = HOPc(locinput, -1);
5116             goto curly_try_B_max;
5117
5118 #undef ST
5119
5120         case END:
5121             fake_end:
5122             if (cur_eval) {
5123                 /* we've just finished A in /(??{A})B/; now continue with B */
5124                 I32 tmpix;
5125                 st->u.eval.toggle_reg_flags
5126                             = cur_eval->u.eval.toggle_reg_flags;
5127                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5128
5129                 st->u.eval.prev_rex = rex_sv;           /* inner */
5130                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5131                 rex = (struct regexp *)SvANY(rex_sv);
5132                 rexi = RXi_GET(rex);
5133                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5134                 ReREFCNT_inc(rex_sv);
5135                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5136
5137                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5138                 PL_reglastparen = &rex->lastparen;
5139                 PL_reglastcloseparen = &rex->lastcloseparen;
5140
5141                 REGCP_SET(st->u.eval.lastcp);
5142                 PL_reginput = locinput;
5143
5144                 /* Restore parens of the outer rex without popping the
5145                  * savestack */
5146                 tmpix = PL_savestack_ix;
5147                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5148                 regcppop(rex);
5149                 PL_savestack_ix = tmpix;
5150
5151                 st->u.eval.prev_eval = cur_eval;
5152                 cur_eval = cur_eval->u.eval.prev_eval;
5153                 DEBUG_EXECUTE_r(
5154                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5155                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5156                 if ( nochange_depth )
5157                     nochange_depth--;
5158
5159                 PUSH_YES_STATE_GOTO(EVAL_AB,
5160                         st->u.eval.prev_eval->u.eval.B); /* match B */
5161             }
5162
5163             if (locinput < reginfo->till) {
5164                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5165                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5166                                       PL_colors[4],
5167                                       (long)(locinput - PL_reg_starttry),
5168                                       (long)(reginfo->till - PL_reg_starttry),
5169                                       PL_colors[5]));
5170
5171                 sayNO_SILENT;           /* Cannot match: too short. */
5172             }
5173             PL_reginput = locinput;     /* put where regtry can find it */
5174             sayYES;                     /* Success! */
5175
5176         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5177             DEBUG_EXECUTE_r(
5178             PerlIO_printf(Perl_debug_log,
5179                 "%*s  %ssubpattern success...%s\n",
5180                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5181             PL_reginput = locinput;     /* put where regtry can find it */
5182             sayYES;                     /* Success! */
5183
5184 #undef  ST
5185 #define ST st->u.ifmatch
5186
5187         case SUSPEND:   /* (?>A) */
5188             ST.wanted = 1;
5189             PL_reginput = locinput;
5190             goto do_ifmatch;
5191
5192         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5193             ST.wanted = 0;
5194             goto ifmatch_trivial_fail_test;
5195
5196         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5197             ST.wanted = 1;
5198           ifmatch_trivial_fail_test:
5199             if (scan->flags) {
5200                 char * const s = HOPBACKc(locinput, scan->flags);
5201                 if (!s) {
5202                     /* trivial fail */
5203                     if (logical) {
5204                         logical = 0;
5205                         sw = 1 - (bool)ST.wanted;
5206                     }
5207                     else if (ST.wanted)
5208                         sayNO;
5209                     next = scan + ARG(scan);
5210                     if (next == scan)
5211                         next = NULL;
5212                     break;
5213                 }
5214                 PL_reginput = s;
5215             }
5216             else
5217                 PL_reginput = locinput;
5218
5219           do_ifmatch:
5220             ST.me = scan;
5221             ST.logical = logical;
5222             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5223
5224             /* execute body of (?...A) */
5225             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5226             /* NOTREACHED */
5227
5228         case IFMATCH_A_fail: /* body of (?...A) failed */
5229             ST.wanted = !ST.wanted;
5230             /* FALL THROUGH */
5231
5232         case IFMATCH_A: /* body of (?...A) succeeded */
5233             if (ST.logical) {
5234                 sw = (bool)ST.wanted;
5235             }
5236             else if (!ST.wanted)
5237                 sayNO;
5238
5239             if (OP(ST.me) == SUSPEND)
5240                 locinput = PL_reginput;
5241             else {
5242                 locinput = PL_reginput = st->locinput;
5243                 nextchr = UCHARAT(locinput);
5244             }
5245             scan = ST.me + ARG(ST.me);
5246             if (scan == ST.me)
5247                 scan = NULL;
5248             continue; /* execute B */
5249
5250 #undef ST
5251
5252         case LONGJMP:
5253             next = scan + ARG(scan);
5254             if (next == scan)
5255                 next = NULL;
5256             break;
5257         case COMMIT:
5258             reginfo->cutpoint = PL_regeol;
5259             /* FALLTHROUGH */
5260         case PRUNE:
5261             PL_reginput = locinput;
5262             if (!scan->flags)
5263                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5264             PUSH_STATE_GOTO(COMMIT_next,next);
5265             /* NOTREACHED */
5266         case COMMIT_next_fail:
5267             no_final = 1;
5268             /* FALLTHROUGH */
5269         case OPFAIL:
5270             sayNO;
5271             /* NOTREACHED */
5272
5273 #define ST st->u.mark
5274         case MARKPOINT:
5275             ST.prev_mark = mark_state;
5276             ST.mark_name = sv_commit = sv_yes_mark
5277                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5278             mark_state = st;
5279             ST.mark_loc = PL_reginput = locinput;
5280             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5281             /* NOTREACHED */
5282         case MARKPOINT_next:
5283             mark_state = ST.prev_mark;
5284             sayYES;
5285             /* NOTREACHED */
5286         case MARKPOINT_next_fail:
5287             if (popmark && sv_eq(ST.mark_name,popmark))
5288             {
5289                 if (ST.mark_loc > startpoint)
5290                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5291                 popmark = NULL; /* we found our mark */
5292                 sv_commit = ST.mark_name;
5293
5294                 DEBUG_EXECUTE_r({
5295                         PerlIO_printf(Perl_debug_log,
5296                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5297                             REPORT_CODE_OFF+depth*2, "",
5298                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5299                 });
5300             }
5301             mark_state = ST.prev_mark;
5302             sv_yes_mark = mark_state ?
5303                 mark_state->u.mark.mark_name : NULL;
5304             sayNO;
5305             /* NOTREACHED */
5306         case SKIP:
5307             PL_reginput = locinput;
5308             if (scan->flags) {
5309                 /* (*SKIP) : if we fail we cut here*/
5310                 ST.mark_name = NULL;
5311                 ST.mark_loc = locinput;
5312                 PUSH_STATE_GOTO(SKIP_next,next);
5313             } else {
5314                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5315                    otherwise do nothing.  Meaning we need to scan
5316                  */
5317                 regmatch_state *cur = mark_state;
5318                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5319
5320                 while (cur) {
5321                     if ( sv_eq( cur->u.mark.mark_name,
5322                                 find ) )
5323                     {
5324                         ST.mark_name = find;
5325                         PUSH_STATE_GOTO( SKIP_next, next );
5326                     }
5327                     cur = cur->u.mark.prev_mark;
5328                 }
5329             }
5330             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5331             break;
5332         case SKIP_next_fail:
5333             if (ST.mark_name) {
5334                 /* (*CUT:NAME) - Set up to search for the name as we
5335                    collapse the stack*/
5336                 popmark = ST.mark_name;
5337             } else {
5338                 /* (*CUT) - No name, we cut here.*/
5339                 if (ST.mark_loc > startpoint)
5340                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5341                 /* but we set sv_commit to latest mark_name if there
5342                    is one so they can test to see how things lead to this
5343                    cut */
5344                 if (mark_state)
5345                     sv_commit=mark_state->u.mark.mark_name;
5346             }
5347             no_final = 1;
5348             sayNO;
5349             /* NOTREACHED */
5350 #undef ST
5351         case FOLDCHAR:
5352             n = ARG(scan);
5353             if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5354                 locinput += ln;
5355             } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5356                 sayNO;
5357             } else  {
5358                 U8 folded[UTF8_MAXBYTES_CASE+1];
5359                 STRLEN foldlen;
5360                 const char * const l = locinput;
5361                 char *e = PL_regeol;
5362                 to_uni_fold(n, folded, &foldlen);
5363
5364                 if (ibcmp_utf8((const char*) folded, 0,  foldlen, 1,
5365                                l, &e, 0,  do_utf8)) {
5366                         sayNO;
5367                 }
5368                 locinput = e;
5369             }
5370             nextchr = UCHARAT(locinput);
5371             break;
5372         case LNBREAK:
5373             if ((n=is_LNBREAK(locinput,do_utf8))) {
5374                 locinput += n;
5375                 nextchr = UCHARAT(locinput);
5376             } else
5377                 sayNO;
5378             break;
5379
5380 #define CASE_CLASS(nAmE)                              \
5381         case nAmE:                                    \
5382             if ((n=is_##nAmE(locinput,do_utf8))) {    \
5383                 locinput += n;                        \
5384                 nextchr = UCHARAT(locinput);          \
5385             } else                                    \
5386                 sayNO;                                \
5387             break;                                    \
5388         case N##nAmE:                                 \
5389             if ((n=is_##nAmE(locinput,do_utf8))) {    \
5390                 sayNO;                                \
5391             } else {                                  \
5392                 locinput += UTF8SKIP(locinput);       \
5393                 nextchr = UCHARAT(locinput);          \
5394             }                                         \
5395             break
5396
5397         CASE_CLASS(VERTWS);
5398         CASE_CLASS(HORIZWS);
5399 #undef CASE_CLASS
5400
5401         default:
5402             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5403                           PTR2UV(scan), OP(scan));
5404             Perl_croak(aTHX_ "regexp memory corruption");
5405
5406         } /* end switch */
5407
5408         /* switch break jumps here */
5409         scan = next; /* prepare to execute the next op and ... */
5410         continue;    /* ... jump back to the top, reusing st */
5411         /* NOTREACHED */
5412
5413       push_yes_state:
5414         /* push a state that backtracks on success */
5415         st->u.yes.prev_yes_state = yes_state;
5416         yes_state = st;
5417         /* FALL THROUGH */
5418       push_state:
5419         /* push a new regex state, then continue at scan  */
5420         {
5421             regmatch_state *newst;
5422
5423             DEBUG_STACK_r({
5424                 regmatch_state *cur = st;
5425                 regmatch_state *curyes = yes_state;
5426                 int curd = depth;
5427                 regmatch_slab *slab = PL_regmatch_slab;
5428                 for (;curd > -1;cur--,curd--) {
5429                     if (cur < SLAB_FIRST(slab)) {
5430                         slab = slab->prev;
5431                         cur = SLAB_LAST(slab);
5432                     }
5433                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5434                         REPORT_CODE_OFF + 2 + depth * 2,"",
5435                         curd, PL_reg_name[cur->resume_state],
5436                         (curyes == cur) ? "yes" : ""
5437                     );
5438                     if (curyes == cur)
5439                         curyes = cur->u.yes.prev_yes_state;
5440                 }
5441             } else
5442                 DEBUG_STATE_pp("push")
5443             );
5444             depth++;
5445             st->locinput = locinput;
5446             newst = st+1;
5447             if (newst >  SLAB_LAST(PL_regmatch_slab))
5448                 newst = S_push_slab(aTHX);
5449             PL_regmatch_state = newst;
5450
5451             locinput = PL_reginput;
5452             nextchr = UCHARAT(locinput);
5453             st = newst;
5454             continue;
5455             /* NOTREACHED */
5456         }
5457     }
5458
5459     /*
5460     * We get here only if there's trouble -- normally "case END" is
5461     * the terminating point.
5462     */
5463     Perl_croak(aTHX_ "corrupted regexp pointers");
5464     /*NOTREACHED*/
5465     sayNO;
5466
5467 yes:
5468     if (yes_state) {
5469         /* we have successfully completed a subexpression, but we must now
5470          * pop to the state marked by yes_state and continue from there */
5471         assert(st != yes_state);
5472 #ifdef DEBUGGING
5473         while (st != yes_state) {
5474             st--;
5475             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5476                 PL_regmatch_slab = PL_regmatch_slab->prev;
5477                 st = SLAB_LAST(PL_regmatch_slab);
5478             }
5479             DEBUG_STATE_r({
5480                 if (no_final) {
5481                     DEBUG_STATE_pp("pop (no final)");
5482                 } else {
5483                     DEBUG_STATE_pp("pop (yes)");
5484                 }
5485             });
5486             depth--;
5487         }
5488 #else
5489         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5490             || yes_state > SLAB_LAST(PL_regmatch_slab))
5491         {
5492             /* not in this slab, pop slab */
5493             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5494             PL_regmatch_slab = PL_regmatch_slab->prev;
5495             st = SLAB_LAST(PL_regmatch_slab);
5496         }
5497         depth -= (st - yes_state);
5498 #endif
5499         st = yes_state;
5500         yes_state = st->u.yes.prev_yes_state;
5501         PL_regmatch_state = st;
5502
5503         if (no_final) {
5504             locinput= st->locinput;
5505             nextchr = UCHARAT(locinput);
5506         }
5507         state_num = st->resume_state + no_final;
5508         goto reenter_switch;
5509     }
5510
5511     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5512                           PL_colors[4], PL_colors[5]));
5513
5514     if (PL_reg_eval_set) {
5515         /* each successfully executed (?{...}) block does the equivalent of
5516          *   local $^R = do {...}
5517          * When popping the save stack, all these locals would be undone;
5518          * bypass this by setting the outermost saved $^R to the latest
5519          * value */
5520         if (oreplsv != GvSV(PL_replgv))
5521             sv_setsv(oreplsv, GvSV(PL_replgv));
5522     }
5523     result = 1;
5524     goto final_exit;
5525
5526 no:
5527     DEBUG_EXECUTE_r(
5528         PerlIO_printf(Perl_debug_log,
5529             "%*s  %sfailed...%s\n",
5530             REPORT_CODE_OFF+depth*2, "",
5531             PL_colors[4], PL_colors[5])
5532         );
5533
5534 no_silent:
5535     if (no_final) {
5536         if (yes_state) {
5537             goto yes;
5538         } else {
5539             goto final_exit;
5540         }
5541     }
5542     if (depth) {
5543         /* there's a previous state to backtrack to */
5544         st--;
5545         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5546             PL_regmatch_slab = PL_regmatch_slab->prev;
5547             st = SLAB_LAST(PL_regmatch_slab);
5548         }
5549         PL_regmatch_state = st;
5550         locinput= st->locinput;
5551         nextchr = UCHARAT(locinput);
5552
5553         DEBUG_STATE_pp("pop");
5554         depth--;
5555         if (yes_state == st)
5556             yes_state = st->u.yes.prev_yes_state;
5557
5558         state_num = st->resume_state + 1; /* failure = success + 1 */
5559         goto reenter_switch;
5560     }
5561     result = 0;
5562
5563   final_exit:
5564     if (rex->intflags & PREGf_VERBARG_SEEN) {
5565         SV *sv_err = get_sv("REGERROR", 1);
5566         SV *sv_mrk = get_sv("REGMARK", 1);
5567         if (result) {
5568             sv_commit = &PL_sv_no;
5569             if (!sv_yes_mark)
5570                 sv_yes_mark = &PL_sv_yes;
5571         } else {
5572             if (!sv_commit)
5573                 sv_commit = &PL_sv_yes;
5574             sv_yes_mark = &PL_sv_no;
5575         }
5576         sv_setsv(sv_err, sv_commit);
5577         sv_setsv(sv_mrk, sv_yes_mark);
5578     }
5579
5580     /* clean up; in particular, free all slabs above current one */
5581     LEAVE_SCOPE(oldsave);
5582
5583     return result;
5584 }
5585
5586 /*
5587  - regrepeat - repeatedly match something simple, report how many
5588  */
5589 /*
5590  * [This routine now assumes that it will only match on things of length 1.
5591  * That was true before, but now we assume scan - reginput is the count,
5592  * rather than incrementing count on every character.  [Er, except utf8.]]
5593  */
5594 STATIC I32
5595 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5596 {
5597     dVAR;
5598     register char *scan;
5599     register I32 c;
5600     register char *loceol = PL_regeol;
5601     register I32 hardcount = 0;
5602     register bool do_utf8 = PL_reg_match_utf8;
5603 #ifndef DEBUGGING
5604     PERL_UNUSED_ARG(depth);
5605 #endif
5606
5607     PERL_ARGS_ASSERT_REGREPEAT;
5608
5609     scan = PL_reginput;
5610     if (max == REG_INFTY)
5611         max = I32_MAX;
5612     else if (max < loceol - scan)
5613         loceol = scan + max;
5614     switch (OP(p)) {
5615     case REG_ANY:
5616         if (do_utf8) {
5617             loceol = PL_regeol;
5618             while (scan < loceol && hardcount < max && *scan != '\n') {
5619                 scan += UTF8SKIP(scan);
5620                 hardcount++;
5621             }
5622         } else {
5623             while (scan < loceol && *scan != '\n')
5624                 scan++;
5625         }
5626         break;
5627     case SANY:
5628         if (do_utf8) {
5629             loceol = PL_regeol;
5630             while (scan < loceol && hardcount < max) {
5631                 scan += UTF8SKIP(scan);
5632                 hardcount++;
5633             }
5634         }
5635         else
5636             scan = loceol;
5637         break;
5638     case CANY:
5639         scan = loceol;
5640         break;
5641     case EXACT:         /* length of string is 1 */
5642         c = (U8)*STRING(p);
5643         while (scan < loceol && UCHARAT(scan) == c)
5644             scan++;
5645         break;
5646     case EXACTF:        /* length of string is 1 */
5647         c = (U8)*STRING(p);
5648         while (scan < loceol &&
5649                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5650             scan++;
5651         break;
5652     case EXACTFL:       /* length of string is 1 */
5653         PL_reg_flags |= RF_tainted;
5654         c = (U8)*STRING(p);
5655         while (scan < loceol &&
5656                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5657             scan++;
5658         break;
5659     case ANYOF:
5660         if (do_utf8) {
5661             loceol = PL_regeol;
5662             while (hardcount < max && scan < loceol &&
5663                    reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5664                 scan += UTF8SKIP(scan);
5665                 hardcount++;
5666             }
5667         } else {
5668             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5669                 scan++;
5670         }
5671         break;
5672     case ALNUM:
5673         if (do_utf8) {
5674             loceol = PL_regeol;
5675             LOAD_UTF8_CHARCLASS_ALNUM();
5676             while (hardcount < max && scan < loceol &&
5677                    swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5678                 scan += UTF8SKIP(scan);
5679                 hardcount++;
5680             }
5681         } else {
5682             while (scan < loceol && isALNUM(*scan))
5683                 scan++;
5684         }
5685         break;
5686     case ALNUML:
5687         PL_reg_flags |= RF_tainted;
5688         if (do_utf8) {
5689             loceol = PL_regeol;
5690             while (hardcount < max && scan < loceol &&
5691                    isALNUM_LC_utf8((U8*)scan)) {
5692                 scan += UTF8SKIP(scan);
5693                 hardcount++;
5694             }
5695         } else {
5696             while (scan < loceol && isALNUM_LC(*scan))
5697                 scan++;
5698         }
5699         break;
5700     case NALNUM:
5701         if (do_utf8) {
5702             loceol = PL_regeol;
5703             LOAD_UTF8_CHARCLASS_ALNUM();
5704             while (hardcount < max && scan < loceol &&
5705                    !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5706                 scan += UTF8SKIP(scan);
5707                 hardcount++;
5708             }
5709         } else {
5710             while (scan < loceol && !isALNUM(*scan))
5711                 scan++;
5712         }
5713         break;
5714     case NALNUML:
5715         PL_reg_flags |= RF_tainted;
5716         if (do_utf8) {
5717             loceol = PL_regeol;
5718             while (hardcount < max && scan < loceol &&
5719                    !isALNUM_LC_utf8((U8*)scan)) {
5720                 scan += UTF8SKIP(scan);
5721                 hardcount++;
5722             }
5723         } else {
5724             while (scan < loceol && !isALNUM_LC(*scan))
5725                 scan++;
5726         }
5727         break;
5728     case SPACE:
5729         if (do_utf8) {
5730             loceol = PL_regeol;
5731             LOAD_UTF8_CHARCLASS_SPACE();
5732             while (hardcount < max && scan < loceol &&
5733                    (*scan == ' ' ||
5734                     swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5735                 scan += UTF8SKIP(scan);
5736                 hardcount++;
5737             }
5738         } else {
5739             while (scan < loceol && isSPACE(*scan))
5740                 scan++;
5741         }
5742         break;
5743     case SPACEL:
5744         PL_reg_flags |= RF_tainted;
5745         if (do_utf8) {
5746             loceol = PL_regeol;
5747             while (hardcount < max && scan < loceol &&
5748                    (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5749                 scan += UTF8SKIP(scan);
5750                 hardcount++;
5751             }
5752         } else {
5753             while (scan < loceol && isSPACE_LC(*scan))
5754                 scan++;
5755         }
5756         break;
5757     case NSPACE:
5758         if (do_utf8) {
5759             loceol = PL_regeol;
5760             LOAD_UTF8_CHARCLASS_SPACE();
5761             while (hardcount < max && scan < loceol &&
5762                    !(*scan == ' ' ||
5763                      swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5764                 scan += UTF8SKIP(scan);
5765                 hardcount++;
5766             }
5767         } else {
5768             while (scan < loceol && !isSPACE(*scan))
5769                 scan++;
5770         }
5771         break;
5772     case NSPACEL:
5773         PL_reg_flags |= RF_tainted;
5774         if (do_utf8) {
5775             loceol = PL_regeol;
5776             while (hardcount < max && scan < loceol &&
5777                    !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5778                 scan += UTF8SKIP(scan);
5779                 hardcount++;
5780             }
5781         } else {
5782             while (scan < loceol && !isSPACE_LC(*scan))
5783                 scan++;
5784         }
5785         break;
5786     case DIGIT:
5787         if (do_utf8) {
5788             loceol = PL_regeol;
5789             LOAD_UTF8_CHARCLASS_DIGIT();
5790             while (hardcount < max && scan < loceol &&
5791                    swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5792                 scan += UTF8SKIP(scan);
5793                 hardcount++;
5794             }
5795         } else {
5796             while (scan < loceol && isDIGIT(*scan))
5797                 scan++;
5798         }
5799         break;
5800     case NDIGIT:
5801         if (do_utf8) {
5802             loceol = PL_regeol;
5803             LOAD_UTF8_CHARCLASS_DIGIT();
5804             while (hardcount < max && scan < loceol &&
5805                    !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5806                 scan += UTF8SKIP(scan);
5807                 hardcount++;
5808             }
5809         } else {
5810             while (scan < loceol && !isDIGIT(*scan))
5811                 scan++;
5812         }
5813     case LNBREAK:
5814         if (do_utf8) {
5815             loceol = PL_regeol;
5816             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5817                 scan += c;
5818                 hardcount++;
5819             }
5820         } else {
5821             /*
5822               LNBREAK can match two latin chars, which is ok,
5823               because we have a null terminated string, but we
5824               have to use hardcount in this situation
5825             */
5826             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
5827                 scan+=c;
5828                 hardcount++;
5829             }
5830         }
5831         break;
5832     case HORIZWS:
5833         if (do_utf8) {
5834             loceol = PL_regeol;
5835             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5836                 scan += c;
5837                 hardcount++;
5838             }
5839         } else {
5840             while (scan < loceol && is_HORIZWS_latin1(scan))
5841                 scan++;
5842         }
5843         break;
5844     case NHORIZWS:
5845         if (do_utf8) {
5846             loceol = PL_regeol;
5847             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5848                 scan += UTF8SKIP(scan);
5849                 hardcount++;
5850             }
5851         } else {
5852             while (scan < loceol && !is_HORIZWS_latin1(scan))
5853                 scan++;
5854
5855         }
5856         break;
5857     case VERTWS:
5858         if (do_utf8) {
5859             loceol = PL_regeol;
5860             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5861                 scan += c;
5862                 hardcount++;
5863             }
5864         } else {
5865             while (scan < loceol && is_VERTWS_latin1(scan))
5866                 scan++;
5867
5868         }
5869         break;
5870     case NVERTWS:
5871         if (do_utf8) {
5872             loceol = PL_regeol;
5873             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5874                 scan += UTF8SKIP(scan);
5875                 hardcount++;
5876             }
5877         } else {
5878             while (scan < loceol && !is_VERTWS_latin1(scan))
5879                 scan++;
5880
5881         }
5882         break;
5883
5884     default:            /* Called on something of 0 width. */
5885         break;          /* So match right here or not at all. */
5886     }
5887
5888     if (hardcount)
5889         c = hardcount;
5890     else
5891         c = scan - PL_reginput;
5892     PL_reginput = scan;
5893
5894     DEBUG_r({
5895         GET_RE_DEBUG_FLAGS_DECL;
5896         DEBUG_EXECUTE_r({
5897             SV * const prop = sv_newmortal();
5898             regprop(prog, prop, p);
5899             PerlIO_printf(Perl_debug_log,
5900                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
5901                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5902         });
5903     });
5904
5905     return(c);
5906 }
5907
5908
5909 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5910 /*
5911 - regclass_swash - prepare the utf8 swash
5912 */
5913
5914 SV *
5915 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5916 {
5917     dVAR;
5918     SV *sw  = NULL;
5919     SV *si  = NULL;
5920     SV *alt = NULL;
5921     RXi_GET_DECL(prog,progi);
5922     const struct reg_data * const data = prog ? progi->data : NULL;
5923
5924     PERL_ARGS_ASSERT_REGCLASS_SWASH;
5925
5926     if (data && data->count) {
5927         const U32 n = ARG(node);
5928
5929         if (data->what[n] == 's') {
5930             SV * const rv = MUTABLE_SV(data->data[n]);
5931             AV * const av = MUTABLE_AV(SvRV(rv));
5932             SV **const ary = AvARRAY(av);
5933             SV **a, **b;
5934
5935             /* See the end of regcomp.c:S_regclass() for
5936              * documentation of these array elements. */
5937
5938             si = *ary;
5939             a  = SvROK(ary[1]) ? &ary[1] : NULL;
5940             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5941
5942             if (a)
5943                 sw = *a;
5944             else if (si && doinit) {
5945                 sw = swash_init("utf8", "", si, 1, 0);
5946                 (void)av_store(av, 1, sw);
5947             }
5948             if (b)
5949                 alt = *b;
5950         }
5951     }
5952
5953     if (listsvp)
5954         *listsvp = si;
5955     if (altsvp)
5956         *altsvp  = alt;
5957
5958     return sw;
5959 }
5960 #endif
5961
5962 /*
5963  - reginclass - determine if a character falls into a character class
5964
5965   The n is the ANYOF regnode, the p is the target string, lenp
5966   is pointer to the maximum length of how far to go in the p
5967   (if the lenp is zero, UTF8SKIP(p) is used),
5968   do_utf8 tells whether the target string is in UTF-8.
5969
5970  */
5971
5972 STATIC bool
5973 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5974 {
5975     dVAR;
5976     const char flags = ANYOF_FLAGS(n);
5977     bool match = FALSE;
5978     UV c = *p;
5979     STRLEN len = 0;
5980     STRLEN plen;
5981
5982     PERL_ARGS_ASSERT_REGINCLASS;
5983
5984     if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5985         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5986                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
5987                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
5988                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
5989                  * UTF8_ALLOW_FFFF */
5990         if (len == (STRLEN)-1)
5991             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5992     }
5993
5994     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5995     if (do_utf8 || (flags & ANYOF_UNICODE)) {
5996         if (lenp)
5997             *lenp = 0;
5998         if (do_utf8 && !ANYOF_RUNTIME(n)) {
5999             if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
6000                 match = TRUE;
6001         }
6002         if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
6003             match = TRUE;
6004         if (!match) {
6005             AV *av;
6006             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6007
6008             if (sw) {
6009                 U8 * utf8_p;
6010                 if (do_utf8) {
6011                     utf8_p = (U8 *) p;
6012                 } else {
6013                     STRLEN len = 1;
6014                     utf8_p = bytes_to_utf8(p, &len);
6015                 }
6016                 if (swash_fetch(sw, utf8_p, 1))
6017                     match = TRUE;
6018                 else if (flags & ANYOF_FOLD) {
6019                     if (!match && lenp && av) {
6020                         I32 i;
6021                         for (i = 0; i <= av_len(av); i++) {
6022                             SV* const sv = *av_fetch(av, i, FALSE);
6023                             STRLEN len;
6024                             const char * const s = SvPV_const(sv, len);
6025                             if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
6026                                 *lenp = len;
6027                                 match = TRUE;
6028                                 break;
6029                             }
6030                         }
6031                     }
6032                     if (!match) {
6033                         U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
6034
6035                         STRLEN tmplen;
6036                         to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6037                         if (swash_fetch(sw, tmpbuf, 1))
6038                             match = TRUE;
6039                     }
6040                 }
6041
6042                 /* If we allocated a string above, free it */
6043                 if (! do_utf8) Safefree(utf8_p);
6044             }
6045         }
6046         if (match && lenp && *lenp == 0)
6047             *lenp = UNISKIP(NATIVE_TO_UNI(c));
6048     }
6049     if (!match && c < 256) {
6050         if (ANYOF_BITMAP_TEST(n, c))
6051             match = TRUE;
6052         else if (flags & ANYOF_FOLD) {
6053             U8 f;
6054
6055             if (flags & ANYOF_LOCALE) {
6056                 PL_reg_flags |= RF_tainted;
6057                 f = PL_fold_locale[c];
6058             }
6059             else
6060                 f = PL_fold[c];
6061             if (f != c && ANYOF_BITMAP_TEST(n, f))
6062                 match = TRUE;
6063         }
6064
6065         if (!match && (flags & ANYOF_CLASS)) {
6066             PL_reg_flags |= RF_tainted;
6067             if (
6068                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6069                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6070                 (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6071                 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6072                 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6073                 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6074                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6075                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6076                 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6077                 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6078                 (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6079                 (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6080                 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6081                 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6082                 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6083                 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6084                 (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6085                 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6086                 (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6087                 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6088                 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6089                 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6090                 (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6091                 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6092                 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6093                 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6094                 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6095                 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6096                 (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6097                 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6098                 ) /* How's that for a conditional? */
6099             {
6100                 match = TRUE;
6101             }
6102         }
6103     }
6104
6105     return (flags & ANYOF_INVERT) ? !match : match;
6106 }
6107
6108 STATIC U8 *
6109 S_reghop3(U8 *s, I32 off, const U8* lim)
6110 {
6111     dVAR;
6112
6113     PERL_ARGS_ASSERT_REGHOP3;
6114
6115     if (off >= 0) {
6116         while (off-- && s < lim) {
6117             /* XXX could check well-formedness here */
6118             s += UTF8SKIP(s);
6119         }
6120     }
6121     else {
6122         while (off++ && s > lim) {
6123             s--;
6124             if (UTF8_IS_CONTINUED(*s)) {
6125                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6126                     s--;
6127             }
6128             /* XXX could check well-formedness here */
6129         }
6130     }
6131     return s;
6132 }
6133
6134 #ifdef XXX_dmq
6135 /* there are a bunch of places where we use two reghop3's that should
6136    be replaced with this routine. but since thats not done yet
6137    we ifdef it out - dmq
6138 */
6139 STATIC U8 *
6140 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6141 {
6142     dVAR;
6143
6144     PERL_ARGS_ASSERT_REGHOP4;
6145
6146     if (off >= 0) {
6147         while (off-- && s < rlim) {
6148             /* XXX could check well-formedness here */
6149             s += UTF8SKIP(s);
6150         }
6151     }
6152     else {
6153         while (off++ && s > llim) {
6154             s--;
6155             if (UTF8_IS_CONTINUED(*s)) {
6156                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6157                     s--;
6158             }
6159             /* XXX could check well-formedness here */
6160         }
6161     }
6162     return s;
6163 }
6164 #endif
6165
6166 STATIC U8 *
6167 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6168 {
6169     dVAR;
6170
6171     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6172
6173     if (off >= 0) {
6174         while (off-- && s < lim) {
6175             /* XXX could check well-formedness here */
6176             s += UTF8SKIP(s);
6177         }
6178         if (off >= 0)
6179             return NULL;
6180     }
6181     else {
6182         while (off++ && s > lim) {
6183             s--;
6184             if (UTF8_IS_CONTINUED(*s)) {
6185                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6186                     s--;
6187             }
6188             /* XXX could check well-formedness here */
6189         }
6190         if (off <= 0)
6191             return NULL;
6192     }
6193     return s;
6194 }
6195
6196 static void
6197 restore_pos(pTHX_ void *arg)
6198 {
6199     dVAR;
6200     regexp * const rex = (regexp *)arg;
6201     if (PL_reg_eval_set) {
6202         if (PL_reg_oldsaved) {
6203             rex->subbeg = PL_reg_oldsaved;
6204             rex->sublen = PL_reg_oldsavedlen;
6205 #ifdef PERL_OLD_COPY_ON_WRITE
6206             rex->saved_copy = PL_nrs;
6207 #endif
6208             RXp_MATCH_COPIED_on(rex);
6209         }
6210         PL_reg_magic->mg_len = PL_reg_oldpos;
6211         PL_reg_eval_set = 0;
6212         PL_curpm = PL_reg_oldcurpm;
6213     }
6214 }
6215
6216 STATIC void
6217 S_to_utf8_substr(pTHX_ register regexp *prog)
6218 {
6219     int i = 1;
6220
6221     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6222
6223     do {
6224         if (prog->substrs->data[i].substr
6225             && !prog->substrs->data[i].utf8_substr) {
6226             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6227             prog->substrs->data[i].utf8_substr = sv;
6228             sv_utf8_upgrade(sv);
6229             if (SvVALID(prog->substrs->data[i].substr)) {
6230                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6231                 if (flags & FBMcf_TAIL) {
6232                     /* Trim the trailing \n that fbm_compile added last
6233                        time.  */
6234                     SvCUR_set(sv, SvCUR(sv) - 1);
6235                     /* Whilst this makes the SV technically "invalid" (as its
6236                        buffer is no longer followed by "\0") when fbm_compile()
6237                        adds the "\n" back, a "\0" is restored.  */
6238                 }
6239                 fbm_compile(sv, flags);
6240             }
6241             if (prog->substrs->data[i].substr == prog->check_substr)
6242                 prog->check_utf8 = sv;
6243         }
6244     } while (i--);
6245 }
6246
6247 STATIC void
6248 S_to_byte_substr(pTHX_ register regexp *prog)
6249 {
6250     dVAR;
6251     int i = 1;
6252
6253     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6254
6255     do {
6256         if (prog->substrs->data[i].utf8_substr
6257             && !prog->substrs->data[i].substr) {
6258             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6259             if (sv_utf8_downgrade(sv, TRUE)) {
6260                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6261                     const U8 flags
6262                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6263                     if (flags & FBMcf_TAIL) {
6264                         /* Trim the trailing \n that fbm_compile added last
6265                            time.  */
6266                         SvCUR_set(sv, SvCUR(sv) - 1);
6267                     }
6268                     fbm_compile(sv, flags);
6269                 }
6270             } else {
6271                 SvREFCNT_dec(sv);
6272                 sv = &PL_sv_undef;
6273             }
6274             prog->substrs->data[i].substr = sv;
6275             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6276                 prog->check_substr = sv;
6277         }
6278     } while (i--);
6279 }
6280
6281 /*
6282  * Local variables:
6283  * c-indentation-style: bsd
6284  * c-basic-offset: 4
6285  * indent-tabs-mode: t
6286  * End:
6287  *
6288  * ex: set ts=8 sts=4 sw=4 noet:
6289  */