src/5017001/orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1       /* tainted information used? e.g. locale */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #ifndef STATIC
  91 #define STATIC  static
  92 #endif
  93
  94 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  95  * call if there are no complications: i.e., if everything matchable is
  96  * straight forward in the bitmap */
  97 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
  98                                               : ANYOF_BITMAP_TEST(p,*(c)))
  99
 100 /*
 101  * Forwards.
 102  */
 103
 104 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 105 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 106
 107 #define HOPc(pos,off) \
 108         (char *)(PL_reg_match_utf8 \
 109             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 110             : (U8*)(pos + off))
 111 #define HOPBACKc(pos, off) \
 112         (char*)(PL_reg_match_utf8\
 113             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 114             : (pos - off >= PL_bostr)           \
 115                 ? (U8*)pos - off                \
 116                 : NULL)
 117
 118 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 119 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 120
 121 /* these are unrolled below in the CCC_TRY_XXX defined */
 122 #ifdef EBCDIC
 123     /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
 124      * skip the check on EBCDIC platforms */
 125 #   define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
 126 #else
 127 #   define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 128     if (!CAT2(PL_utf8_,class)) { \
 129         bool ok; \
 130         ENTER; save_re_context(); \
 131         ok=CAT2(is_utf8_,class)((const U8*)str); \
 132         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 133 #endif
 134
 135 /* Doesn't do an assert to verify that is correct */
 136 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 137     if (!CAT2(PL_utf8_,class)) { \
 138         bool throw_away PERL_UNUSED_DECL; \
 139         ENTER; save_re_context(); \
 140         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 141         LEAVE; } } STMT_END
 142
 143 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 144 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 145 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 146
 147 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 148         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 149         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 150         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 151             * assert should likely and hopefully fail on an EBCDIC machine */ \
 152         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 153                                                                             \
 154         /* No asserts are done for these, in case called on an early        \
 155             * Unicode version in which they map to nothing */               \
 156         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 157         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 158         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 159         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 160         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 161         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 162         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 163
 164 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 165
 166 /* The actual code for CCC_TRY, which uses several variables from the routine
 167  * it's callable from.  It is designed to be the bulk of a case statement.
 168  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 169  *      nextchr matches the class.
 170  * UTF8_TEST is the whole test string to use for utf8 targets
 171  * LOAD is what to use to test, and if not present to load in the swash for the
 172  *      class
 173  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 174  *      UTF8_TEST test.
 175  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 176  * utf8 and a variant, load the swash if necessary and test using the utf8
 177  * test.  Advance to the next character if test is ok, otherwise fail; If not
 178  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 179  * fails, or advance to the next character */
 180
 181 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 182     if (locinput >= PL_regeol) {                                              \
 183         sayNO;                                                                \
 184     }                                                                         \
 185     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 186         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 187         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 188             sayNO;                                                            \
 189         }                                                                     \
 190         locinput += PL_utf8skip[nextchr];                                     \
 191         nextchr = UCHARAT(locinput);                                          \
 192         break;                                                                \
 193     }                                                                         \
 194     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 195         sayNO;                                                                \
 196     }                                                                         \
 197     nextchr = UCHARAT(++locinput);                                            \
 198     break;
 199
 200 /* Handle the non-locale cases for a character class and its complement.  It
 201  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 202  * This is because that code fails when the test succeeds, so we want to have
 203  * the test fail so that the code succeeds.  The swash is stored in a
 204  * predictable PL_ place */
 205 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 206                            CLASS, STR)                                        \
 207     case NAME:                                                                \
 208         _CCC_TRY_CODE( !, FUNC,                                               \
 209                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 210                                             (U8*)locinput, TRUE)),            \
 211                           CLASS, STR)                                         \
 212     case NNAME:                                                               \
 213         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 214                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 215                                             (U8*)locinput, TRUE)),            \
 216                           CLASS, STR)                                         \
 217
 218 /* Generate the case statements for both locale and non-locale character
 219  * classes in regmatch for classes that don't have special unicode semantics.
 220  * Locales don't use an immediate swash, but an intermediary special locale
 221  * function that is called on the pointer to the current place in the input
 222  * string.  That function will resolve to needing the same swash.  One might
 223  * think that because we don't know what the locale will match, we shouldn't
 224  * check with the swash loading function that it loaded properly; ie, that we
 225  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 226  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 227  * irrelevant here */
 228 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 229                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 230                 NAMEA, NNAMEA, FUNCA,                                         \
 231                 CLASS, STR)                                                   \
 232     case NAMEL:                                                               \
 233         PL_reg_flags |= RF_tainted;                                           \
 234         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 235     case NNAMEL:                                                              \
 236         PL_reg_flags |= RF_tainted;                                           \
 237         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 238                        CLASS, STR)                                            \
 239     case NAMEA:                                                               \
 240         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 241             sayNO;                                                            \
 242         }                                                                     \
 243         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 244         nextchr = UCHARAT(++locinput);                                        \
 245         break;                                                                \
 246     case NNAMEA:                                                              \
 247         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 248             sayNO;                                                            \
 249         }                                                                     \
 250         if (utf8_target) {                                                    \
 251             locinput += PL_utf8skip[nextchr];                                 \
 252             nextchr = UCHARAT(locinput);                                      \
 253         }                                                                     \
 254         else {                                                                \
 255             nextchr = UCHARAT(++locinput);                                    \
 256         }                                                                     \
 257         break;                                                                \
 258     /* Generate the non-locale cases */                                       \
 259     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 260
 261 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 262  * statements to handle separate Unicode semantics nodes */
 263 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 264                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 265                   NAMEU, NNAMEU, FUNCU,                                        \
 266                   NAMEA, NNAMEA, FUNCA,                                        \
 267                   CLASS, STR)                                                  \
 268     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 269             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 270             NAMEA, NNAMEA, FUNCA,                                              \
 271             CLASS, STR)                                                        \
 272     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 273
 274 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 275
 276 /* for use after a quantifier and before an EXACT-like node -- japhy */
 277 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 278  *
 279  * NOTE that *nothing* that affects backtracking should be in here, specifically
 280  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 281  * node that is in between two EXACT like nodes when ascertaining what the required
 282  * "follow" character is. This should probably be moved to regex compile time
 283  * although it may be done at run time beause of the REF possibility - more
 284  * investigation required. -- demerphq
 285 */
 286 #define JUMPABLE(rn) (      \
 287     OP(rn) == OPEN ||       \
 288     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 289     OP(rn) == EVAL ||   \
 290     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 291     OP(rn) == PLUS || OP(rn) == MINMOD || \
 292     OP(rn) == KEEPS || \
 293     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 294 )
 295 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 296
 297 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 298
 299 #if 0
 300 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 301    we don't need this definition. */
 302 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 303 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 304 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 305
 306 #else
 307 /* ... so we use this as its faster. */
 308 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 309 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 310 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 311 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 312
 313 #endif
 314
 315 /*
 316   Search for mandatory following text node; for lookahead, the text must
 317   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 318 */
 319 #define FIND_NEXT_IMPT(rn) STMT_START { \
 320     while (JUMPABLE(rn)) { \
 321         const OPCODE type = OP(rn); \
 322         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 323             rn = NEXTOPER(NEXTOPER(rn)); \
 324         else if (type == PLUS) \
 325             rn = NEXTOPER(rn); \
 326         else if (type == IFMATCH) \
 327             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 328         else rn += NEXT_OFF(rn); \
 329     } \
 330 } STMT_END
 331
 332
 333 static void restore_pos(pTHX_ void *arg);
 334
 335 #define REGCP_PAREN_ELEMS 3
 336 #define REGCP_OTHER_ELEMS 3
 337 #define REGCP_FRAME_ELEMS 1
 338 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 339  * are needed for the regexp context stack bookkeeping. */
 340
 341 STATIC CHECKPOINT
 342 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 343 {
 344     dVAR;
 345     const int retval = PL_savestack_ix;
 346     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 347     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 348     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 349     I32 p;
 350     GET_RE_DEBUG_FLAGS_DECL;
 351
 352     PERL_ARGS_ASSERT_REGCPPUSH;
 353
 354     if (paren_elems_to_push < 0)
 355         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 356                    paren_elems_to_push);
 357
 358     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 359         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 360                    " out of range (%lu-%ld)",
 361                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 362
 363     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 364
 365     DEBUG_BUFFERS_r(
 366         if ((int)PL_regsize > (int)parenfloor)
 367             PerlIO_printf(Perl_debug_log,
 368                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 369                 PTR2UV(rex),
 370                 PTR2UV(rex->offs)
 371             );
 372     );
 373     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 374 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 375         SSPUSHINT(rex->offs[p].end);
 376         SSPUSHINT(rex->offs[p].start);
 377         SSPUSHINT(rex->offs[p].start_tmp);
 378         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 379             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 380             (UV)p,
 381             (IV)rex->offs[p].start,
 382             (IV)rex->offs[p].start_tmp,
 383             (IV)rex->offs[p].end
 384         ));
 385     }
 386 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 387     SSPUSHINT(PL_regsize);
 388     SSPUSHINT(rex->lastparen);
 389     SSPUSHINT(rex->lastcloseparen);
 390     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 391
 392     return retval;
 393 }
 394
 395 /* These are needed since we do not localize EVAL nodes: */
 396 #define REGCP_SET(cp)                                           \
 397     DEBUG_STATE_r(                                              \
 398             PerlIO_printf(Perl_debug_log,                       \
 399                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 400                 (IV)PL_savestack_ix));                          \
 401     cp = PL_savestack_ix
 402
 403 #define REGCP_UNWIND(cp)                                        \
 404     DEBUG_STATE_r(                                              \
 405         if (cp != PL_savestack_ix)                              \
 406             PerlIO_printf(Perl_debug_log,                       \
 407                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 408                 (IV)(cp), (IV)PL_savestack_ix));                \
 409     regcpblow(cp)
 410
 411 #define UNWIND_PAREN(lp, lcp)               \
 412     for (n = rex->lastparen; n > lp; n--)   \
 413         rex->offs[n].end = -1;              \
 414     rex->lastparen = n;                     \
 415     rex->lastcloseparen = lcp;
 416
 417
 418 STATIC void
 419 S_regcppop(pTHX_ regexp *rex)
 420 {
 421     dVAR;
 422     UV i;
 423     U32 paren;
 424     GET_RE_DEBUG_FLAGS_DECL;
 425
 426     PERL_ARGS_ASSERT_REGCPPOP;
 427
 428     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 429     i = SSPOPUV;
 430     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 431     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 432     rex->lastcloseparen = SSPOPINT;
 433     rex->lastparen = SSPOPINT;
 434     PL_regsize = SSPOPINT;
 435
 436     i -= REGCP_OTHER_ELEMS;
 437     /* Now restore the parentheses context. */
 438     DEBUG_BUFFERS_r(
 439         if (i || rex->lastparen + 1 <= rex->nparens)
 440             PerlIO_printf(Perl_debug_log,
 441                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 442                 PTR2UV(rex),
 443                 PTR2UV(rex->offs)
 444             );
 445     );
 446     paren = PL_regsize;
 447     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 448         I32 tmps;
 449         rex->offs[paren].start_tmp = SSPOPINT;
 450         rex->offs[paren].start = SSPOPINT;
 451         tmps = SSPOPINT;
 452         if (paren <= rex->lastparen)
 453             rex->offs[paren].end = tmps;
 454         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 455             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 456             (UV)paren,
 457             (IV)rex->offs[paren].start,
 458             (IV)rex->offs[paren].start_tmp,
 459             (IV)rex->offs[paren].end,
 460             (paren > rex->lastparen ? "(skipped)" : ""));
 461         );
 462         paren--;
 463     }
 464 #if 1
 465     /* It would seem that the similar code in regtry()
 466      * already takes care of this, and in fact it is in
 467      * a better location to since this code can #if 0-ed out
 468      * but the code in regtry() is needed or otherwise tests
 469      * requiring null fields (pat.t#187 and split.t#{13,14}
 470      * (as of patchlevel 7877)  will fail.  Then again,
 471      * this code seems to be necessary or otherwise
 472      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 473      * --jhi updated by dapm */
 474     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 475         if (i > PL_regsize)
 476             rex->offs[i].start = -1;
 477         rex->offs[i].end = -1;
 478         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 479             "    \\%"UVuf": %s   ..-1 undeffing\n",
 480             (UV)i,
 481             (i > PL_regsize) ? "-1" : "  "
 482         ));
 483     }
 484 #endif
 485 }
 486
 487 /* restore the parens and associated vars at savestack position ix,
 488  * but without popping the stack */
 489
 490 STATIC void
 491 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 492 {
 493     I32 tmpix = PL_savestack_ix;
 494     PL_savestack_ix = ix;
 495     regcppop(rex);
 496     PL_savestack_ix = tmpix;
 497 }
 498
 499 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 500
 501 /*
 502  * pregexec and friends
 503  */
 504
 505 #ifndef PERL_IN_XSUB_RE
 506 /*
 507  - pregexec - match a regexp against a string
 508  */
 509 I32
 510 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 511          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 512 /* strend: pointer to null at end of string */
 513 /* strbeg: real beginning of string */
 514 /* minend: end of match must be >=minend after stringarg. */
 515 /* nosave: For optimizations. */
 516 {
 517     PERL_ARGS_ASSERT_PREGEXEC;
 518
 519     return
 520         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 521                       nosave ? 0 : REXEC_COPY_STR);
 522 }
 523 #endif
 524
 525 /*
 526  * Need to implement the following flags for reg_anch:
 527  *
 528  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 529  * USE_INTUIT_ML
 530  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 531  * INTUIT_AUTORITATIVE_ML
 532  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 533  * INTUIT_ONCE_ML
 534  *
 535  * Another flag for this function: SECOND_TIME (so that float substrs
 536  * with giant delta may be not rechecked).
 537  */
 538
 539 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 540
 541 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 542    Otherwise, only SvCUR(sv) is used to get strbeg. */
 543
 544 /* XXXX We assume that strpos is strbeg unless sv. */
 545
 546 /* XXXX Some places assume that there is a fixed substring.
 547         An update may be needed if optimizer marks as "INTUITable"
 548         RExen without fixed substrings.  Similarly, it is assumed that
 549         lengths of all the strings are no more than minlen, thus they
 550         cannot come from lookahead.
 551         (Or minlen should take into account lookahead.)
 552   NOTE: Some of this comment is not correct. minlen does now take account
 553   of lookahead/behind. Further research is required. -- demerphq
 554
 555 */
 556
 557 /* A failure to find a constant substring means that there is no need to make
 558    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 559    finding a substring too deep into the string means that less calls to
 560    regtry() should be needed.
 561
 562    REx compiler's optimizer found 4 possible hints:
 563         a) Anchored substring;
 564         b) Fixed substring;
 565         c) Whether we are anchored (beginning-of-line or \G);
 566         d) First node (of those at offset 0) which may distinguish positions;
 567    We use a)b)d) and multiline-part of c), and try to find a position in the
 568    string which does not contradict any of them.
 569  */
 570
 571 /* Most of decisions we do here should have been done at compile time.
 572    The nodes of the REx which we used for the search should have been
 573    deleted from the finite automaton. */
 574
 575 char *
 576 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 577                      char *strend, const U32 flags, re_scream_pos_data *data)
 578 {
 579     dVAR;
 580     struct regexp *const prog = (struct regexp *)SvANY(rx);
 581     register I32 start_shift = 0;
 582     /* Should be nonnegative! */
 583     register I32 end_shift   = 0;
 584     register char *s;
 585     register SV *check;
 586     char *strbeg;
 587     char *t;
 588     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 589     I32 ml_anch;
 590     register char *other_last = NULL;   /* other substr checked before this */
 591     char *check_at = NULL;              /* check substr found at this pos */
 592     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 593     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 594     RXi_GET_DECL(prog,progi);
 595 #ifdef DEBUGGING
 596     const char * const i_strpos = strpos;
 597 #endif
 598     GET_RE_DEBUG_FLAGS_DECL;
 599
 600     PERL_ARGS_ASSERT_RE_INTUIT_START;
 601     PERL_UNUSED_ARG(flags);
 602     PERL_UNUSED_ARG(data);
 603
 604     RX_MATCH_UTF8_set(rx,utf8_target);
 605
 606     if (RX_UTF8(rx)) {
 607         PL_reg_flags |= RF_utf8;
 608     }
 609     DEBUG_EXECUTE_r(
 610         debug_start_match(rx, utf8_target, strpos, strend,
 611             sv ? "Guessing start of match in sv for"
 612                : "Guessing start of match in string for");
 613               );
 614
 615     /* CHR_DIST() would be more correct here but it makes things slow. */
 616     if (prog->minlen > strend - strpos) {
 617         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 618                               "String too short... [re_intuit_start]\n"));
 619         goto fail;
 620     }
 621
 622     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 623     PL_regeol = strend;
 624     if (utf8_target) {
 625         if (!prog->check_utf8 && prog->check_substr)
 626             to_utf8_substr(prog);
 627         check = prog->check_utf8;
 628     } else {
 629         if (!prog->check_substr && prog->check_utf8)
 630             to_byte_substr(prog);
 631         check = prog->check_substr;
 632     }
 633     if (check == &PL_sv_undef) {
 634         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 635                 "Non-utf8 string cannot match utf8 check string\n"));
 636         goto fail;
 637     }
 638     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 639         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 640                      || ( (prog->extflags & RXf_ANCH_BOL)
 641                           && !multiline ) );    /* Check after \n? */
 642
 643         if (!ml_anch) {
 644           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 645                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 646                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 647                && sv && !SvROK(sv)
 648                && (strpos != strbeg)) {
 649               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 650               goto fail;
 651           }
 652           if (prog->check_offset_min == prog->check_offset_max &&
 653               !(prog->extflags & RXf_CANY_SEEN)) {
 654             /* Substring at constant offset from beg-of-str... */
 655             I32 slen;
 656
 657             s = HOP3c(strpos, prog->check_offset_min, strend);
 658
 659             if (SvTAIL(check)) {
 660                 slen = SvCUR(check);    /* >= 1 */
 661
 662                 if ( strend - s > slen || strend - s < slen - 1
 663                      || (strend - s == slen && strend[-1] != '\n')) {
 664                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 665                     goto fail_finish;
 666                 }
 667                 /* Now should match s[0..slen-2] */
 668                 slen--;
 669                 if (slen && (*SvPVX_const(check) != *s
 670                              || (slen > 1
 671                                  && memNE(SvPVX_const(check), s, slen)))) {
 672                   report_neq:
 673                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 674                     goto fail_finish;
 675                 }
 676             }
 677             else if (*SvPVX_const(check) != *s
 678                      || ((slen = SvCUR(check)) > 1
 679                          && memNE(SvPVX_const(check), s, slen)))
 680                 goto report_neq;
 681             check_at = s;
 682             goto success_at_start;
 683           }
 684         }
 685         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 686         s = strpos;
 687         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 688         end_shift = prog->check_end_shift;
 689
 690         if (!ml_anch) {
 691             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 692                                          - (SvTAIL(check) != 0);
 693             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 694
 695             if (end_shift < eshift)
 696                 end_shift = eshift;
 697         }
 698     }
 699     else {                              /* Can match at random position */
 700         ml_anch = 0;
 701         s = strpos;
 702         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 703         end_shift = prog->check_end_shift;
 704
 705         /* end shift should be non negative here */
 706     }
 707
 708 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 709     if (end_shift < 0)
 710         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 711                    (IV)end_shift, RX_PRECOMP(prog));
 712 #endif
 713
 714   restart:
 715     /* Find a possible match in the region s..strend by looking for
 716        the "check" substring in the region corrected by start/end_shift. */
 717
 718     {
 719         I32 srch_start_shift = start_shift;
 720         I32 srch_end_shift = end_shift;
 721         U8* start_point;
 722         U8* end_point;
 723         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 724             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 725             srch_start_shift = strbeg - s;
 726         }
 727     DEBUG_OPTIMISE_MORE_r({
 728         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 729             (IV)prog->check_offset_min,
 730             (IV)srch_start_shift,
 731             (IV)srch_end_shift,
 732             (IV)prog->check_end_shift);
 733     });
 734
 735         if (prog->extflags & RXf_CANY_SEEN) {
 736             start_point= (U8*)(s + srch_start_shift);
 737             end_point= (U8*)(strend - srch_end_shift);
 738         } else {
 739             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 740             end_point= HOP3(strend, -srch_end_shift, strbeg);
 741         }
 742         DEBUG_OPTIMISE_MORE_r({
 743             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 744                 (int)(end_point - start_point),
 745                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 746                 start_point);
 747         });
 748
 749         s = fbm_instr( start_point, end_point,
 750                       check, multiline ? FBMrf_MULTILINE : 0);
 751     }
 752     /* Update the count-of-usability, remove useless subpatterns,
 753         unshift s.  */
 754
 755     DEBUG_EXECUTE_r({
 756         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 757             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 758         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 759                           (s ? "Found" : "Did not find"),
 760             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 761                 ? "anchored" : "floating"),
 762             quoted,
 763             RE_SV_TAIL(check),
 764             (s ? " at offset " : "...\n") );
 765     });
 766
 767     if (!s)
 768         goto fail_finish;
 769     /* Finish the diagnostic message */
 770     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 771
 772     /* XXX dmq: first branch is for positive lookbehind...
 773        Our check string is offset from the beginning of the pattern.
 774        So we need to do any stclass tests offset forward from that
 775        point. I think. :-(
 776      */
 777
 778
 779
 780     check_at=s;
 781
 782
 783     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 784        Start with the other substr.
 785        XXXX no SCREAM optimization yet - and a very coarse implementation
 786        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 787                 *always* match.  Probably should be marked during compile...
 788        Probably it is right to do no SCREAM here...
 789      */
 790
 791     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 792                 : (prog->float_substr && prog->anchored_substr))
 793     {
 794         /* Take into account the "other" substring. */
 795         /* XXXX May be hopelessly wrong for UTF... */
 796         if (!other_last)
 797             other_last = strpos;
 798         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 799           do_other_anchored:
 800             {
 801                 char * const last = HOP3c(s, -start_shift, strbeg);
 802                 char *last1, *last2;
 803                 char * const saved_s = s;
 804                 SV* must;
 805
 806                 t = s - prog->check_offset_max;
 807                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 808                     && (!utf8_target
 809                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 810                             && t > strpos)))
 811                     NOOP;
 812                 else
 813                     t = strpos;
 814                 t = HOP3c(t, prog->anchored_offset, strend);
 815                 if (t < other_last)     /* These positions already checked */
 816                     t = other_last;
 817                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 818                 if (last < last1)
 819                     last1 = last;
 820                 /* XXXX It is not documented what units *_offsets are in.
 821                    We assume bytes, but this is clearly wrong.
 822                    Meaning this code needs to be carefully reviewed for errors.
 823                    dmq.
 824                   */
 825
 826                 /* On end-of-str: see comment below. */
 827                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 828                 if (must == &PL_sv_undef) {
 829                     s = (char*)NULL;
 830                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 831                 }
 832                 else
 833                     s = fbm_instr(
 834                         (unsigned char*)t,
 835                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 836                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 837                         must,
 838                         multiline ? FBMrf_MULTILINE : 0
 839                     );
 840                 DEBUG_EXECUTE_r({
 841                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 842                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 843                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 844                         (s ? "Found" : "Contradicts"),
 845                         quoted, RE_SV_TAIL(must));
 846                 });
 847
 848
 849                 if (!s) {
 850                     if (last1 >= last2) {
 851                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 852                                                 ", giving up...\n"));
 853                         goto fail_finish;
 854                     }
 855                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 856                         ", trying floating at offset %ld...\n",
 857                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 858                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 859                     s = HOP3c(last, 1, strend);
 860                     goto restart;
 861                 }
 862                 else {
 863                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 864                           (long)(s - i_strpos)));
 865                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 866                     other_last = HOP3c(s, 1, strend);
 867                     s = saved_s;
 868                     if (t == strpos)
 869                         goto try_at_start;
 870                     goto try_at_offset;
 871                 }
 872             }
 873         }
 874         else {          /* Take into account the floating substring. */
 875             char *last, *last1;
 876             char * const saved_s = s;
 877             SV* must;
 878
 879             t = HOP3c(s, -start_shift, strbeg);
 880             last1 = last =
 881                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 882             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 883                 last = HOP3c(t, prog->float_max_offset, strend);
 884             s = HOP3c(t, prog->float_min_offset, strend);
 885             if (s < other_last)
 886                 s = other_last;
 887  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 888             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 889             /* fbm_instr() takes into account exact value of end-of-str
 890                if the check is SvTAIL(ed).  Since false positives are OK,
 891                and end-of-str is not later than strend we are OK. */
 892             if (must == &PL_sv_undef) {
 893                 s = (char*)NULL;
 894                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 895             }
 896             else
 897                 s = fbm_instr((unsigned char*)s,
 898                               (unsigned char*)last + SvCUR(must)
 899                                   - (SvTAIL(must)!=0),
 900                               must, multiline ? FBMrf_MULTILINE : 0);
 901             DEBUG_EXECUTE_r({
 902                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 903                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 904                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 905                     (s ? "Found" : "Contradicts"),
 906                     quoted, RE_SV_TAIL(must));
 907             });
 908             if (!s) {
 909                 if (last1 == last) {
 910                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 911                                             ", giving up...\n"));
 912                     goto fail_finish;
 913                 }
 914                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 915                     ", trying anchored starting at offset %ld...\n",
 916                     (long)(saved_s + 1 - i_strpos)));
 917                 other_last = last;
 918                 s = HOP3c(t, 1, strend);
 919                 goto restart;
 920             }
 921             else {
 922                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 923                       (long)(s - i_strpos)));
 924                 other_last = s; /* Fix this later. --Hugo */
 925                 s = saved_s;
 926                 if (t == strpos)
 927                     goto try_at_start;
 928                 goto try_at_offset;
 929             }
 930         }
 931     }
 932
 933
 934     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 935
 936     DEBUG_OPTIMISE_MORE_r(
 937         PerlIO_printf(Perl_debug_log,
 938             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 939             (IV)prog->check_offset_min,
 940             (IV)prog->check_offset_max,
 941             (IV)(s-strpos),
 942             (IV)(t-strpos),
 943             (IV)(t-s),
 944             (IV)(strend-strpos)
 945         )
 946     );
 947
 948     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 949         && (!utf8_target
 950             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 951                  && t > strpos)))
 952     {
 953         /* Fixed substring is found far enough so that the match
 954            cannot start at strpos. */
 955       try_at_offset:
 956         if (ml_anch && t[-1] != '\n') {
 957             /* Eventually fbm_*() should handle this, but often
 958                anchored_offset is not 0, so this check will not be wasted. */
 959             /* XXXX In the code below we prefer to look for "^" even in
 960                presence of anchored substrings.  And we search even
 961                beyond the found float position.  These pessimizations
 962                are historical artefacts only.  */
 963           find_anchor:
 964             while (t < strend - prog->minlen) {
 965                 if (*t == '\n') {
 966                     if (t < check_at - prog->check_offset_min) {
 967                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 968                             /* Since we moved from the found position,
 969                                we definitely contradict the found anchored
 970                                substr.  Due to the above check we do not
 971                                contradict "check" substr.
 972                                Thus we can arrive here only if check substr
 973                                is float.  Redo checking for "other"=="fixed".
 974                              */
 975                             strpos = t + 1;
 976                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 977                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 978                             goto do_other_anchored;
 979                         }
 980                         /* We don't contradict the found floating substring. */
 981                         /* XXXX Why not check for STCLASS? */
 982                         s = t + 1;
 983                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 984                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 985                         goto set_useful;
 986                     }
 987                     /* Position contradicts check-string */
 988                     /* XXXX probably better to look for check-string
 989                        than for "\n", so one should lower the limit for t? */
 990                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 991                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 992                     other_last = strpos = s = t + 1;
 993                     goto restart;
 994                 }
 995                 t++;
 996             }
 997             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 998                         PL_colors[0], PL_colors[1]));
 999             goto fail_finish;
1000         }
1001         else {
1002             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1003                         PL_colors[0], PL_colors[1]));
1004         }
1005         s = t;
1006       set_useful:
1007         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1008     }
1009     else {
1010         /* The found string does not prohibit matching at strpos,
1011            - no optimization of calling REx engine can be performed,
1012            unless it was an MBOL and we are not after MBOL,
1013            or a future STCLASS check will fail this. */
1014       try_at_start:
1015         /* Even in this situation we may use MBOL flag if strpos is offset
1016            wrt the start of the string. */
1017         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1018             && (strpos != strbeg) && strpos[-1] != '\n'
1019             /* May be due to an implicit anchor of m{.*foo}  */
1020             && !(prog->intflags & PREGf_IMPLICIT))
1021         {
1022             t = strpos;
1023             goto find_anchor;
1024         }
1025         DEBUG_EXECUTE_r( if (ml_anch)
1026             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1027                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1028         );
1029       success_at_start:
1030         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1031             && (utf8_target ? (
1032                 prog->check_utf8                /* Could be deleted already */
1033                 && --BmUSEFUL(prog->check_utf8) < 0
1034                 && (prog->check_utf8 == prog->float_utf8)
1035             ) : (
1036                 prog->check_substr              /* Could be deleted already */
1037                 && --BmUSEFUL(prog->check_substr) < 0
1038                 && (prog->check_substr == prog->float_substr)
1039             )))
1040         {
1041             /* If flags & SOMETHING - do not do it many times on the same match */
1042             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1043             /* XXX Does the destruction order has to change with utf8_target? */
1044             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1045             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1046             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1047             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1048             check = NULL;                       /* abort */
1049             s = strpos;
1050             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1051                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1052             if (prog->intflags & PREGf_IMPLICIT)
1053                 prog->extflags &= ~RXf_ANCH_MBOL;
1054             /* XXXX This is a remnant of the old implementation.  It
1055                     looks wasteful, since now INTUIT can use many
1056                     other heuristics. */
1057             prog->extflags &= ~RXf_USE_INTUIT;
1058             /* XXXX What other flags might need to be cleared in this branch? */
1059         }
1060         else
1061             s = strpos;
1062     }
1063
1064     /* Last resort... */
1065     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1066     /* trie stclasses are too expensive to use here, we are better off to
1067        leave it to regmatch itself */
1068     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1069         /* minlen == 0 is possible if regstclass is \b or \B,
1070            and the fixed substr is ''$.
1071            Since minlen is already taken into account, s+1 is before strend;
1072            accidentally, minlen >= 1 guaranties no false positives at s + 1
1073            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1074            regstclass does not come from lookahead...  */
1075         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1076            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1077         const U8* const str = (U8*)STRING(progi->regstclass);
1078         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1079                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1080                     : 1);
1081         char * endpos;
1082         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1083             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1084         else if (prog->float_substr || prog->float_utf8)
1085             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1086         else
1087             endpos= strend;
1088
1089         if (checked_upto < s)
1090            checked_upto = s;
1091         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1092                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1093
1094         t = s;
1095         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1096         if (s) {
1097             checked_upto = s;
1098         } else {
1099 #ifdef DEBUGGING
1100             const char *what = NULL;
1101 #endif
1102             if (endpos == strend) {
1103                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1104                                 "Could not match STCLASS...\n") );
1105                 goto fail;
1106             }
1107             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1108                                    "This position contradicts STCLASS...\n") );
1109             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1110                 goto fail;
1111             checked_upto = HOPBACKc(endpos, start_shift);
1112             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1113                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1114             /* Contradict one of substrings */
1115             if (prog->anchored_substr || prog->anchored_utf8) {
1116                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1117                     DEBUG_EXECUTE_r( what = "anchored" );
1118                   hop_and_restart:
1119                     s = HOP3c(t, 1, strend);
1120                     if (s + start_shift + end_shift > strend) {
1121                         /* XXXX Should be taken into account earlier? */
1122                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1123                                                "Could not match STCLASS...\n") );
1124                         goto fail;
1125                     }
1126                     if (!check)
1127                         goto giveup;
1128                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1129                                 "Looking for %s substr starting at offset %ld...\n",
1130                                  what, (long)(s + start_shift - i_strpos)) );
1131                     goto restart;
1132                 }
1133                 /* Have both, check_string is floating */
1134                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1135                     goto retry_floating_check;
1136                 /* Recheck anchored substring, but not floating... */
1137                 s = check_at;
1138                 if (!check)
1139                     goto giveup;
1140                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1141                           "Looking for anchored substr starting at offset %ld...\n",
1142                           (long)(other_last - i_strpos)) );
1143                 goto do_other_anchored;
1144             }
1145             /* Another way we could have checked stclass at the
1146                current position only: */
1147             if (ml_anch) {
1148                 s = t = t + 1;
1149                 if (!check)
1150                     goto giveup;
1151                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1152                           "Looking for /%s^%s/m starting at offset %ld...\n",
1153                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1154                 goto try_at_offset;
1155             }
1156             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1157                 goto fail;
1158             /* Check is floating substring. */
1159           retry_floating_check:
1160             t = check_at - start_shift;
1161             DEBUG_EXECUTE_r( what = "floating" );
1162             goto hop_and_restart;
1163         }
1164         if (t != s) {
1165             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1166                         "By STCLASS: moving %ld --> %ld\n",
1167                                   (long)(t - i_strpos), (long)(s - i_strpos))
1168                    );
1169         }
1170         else {
1171             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1172                                   "Does not contradict STCLASS...\n");
1173                    );
1174         }
1175     }
1176   giveup:
1177     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1178                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1179                           PL_colors[5], (long)(s - i_strpos)) );
1180     return s;
1181
1182   fail_finish:                          /* Substring not found */
1183     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1184         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1185   fail:
1186     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1187                           PL_colors[4], PL_colors[5]));
1188     return NULL;
1189 }
1190
1191 #define DECL_TRIE_TYPE(scan) \
1192     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1193                     trie_type = ((scan->flags == EXACT) \
1194                               ? (utf8_target ? trie_utf8 : trie_plain) \
1195                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1196
1197 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1198 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1199     STRLEN skiplen;                                                                 \
1200     switch (trie_type) {                                                            \
1201     case trie_utf8_fold:                                                            \
1202         if ( foldlen>0 ) {                                                          \
1203             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1204             foldlen -= len;                                                         \
1205             uscan += len;                                                           \
1206             len=0;                                                                  \
1207         } else {                                                                    \
1208             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1209             len = UTF8SKIP(uc);                                                     \
1210             skiplen = UNISKIP( uvc );                                               \
1211             foldlen -= skiplen;                                                     \
1212             uscan = foldbuf + skiplen;                                              \
1213         }                                                                           \
1214         break;                                                                      \
1215     case trie_latin_utf8_fold:                                                      \
1216         if ( foldlen>0 ) {                                                          \
1217             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1218             foldlen -= len;                                                         \
1219             uscan += len;                                                           \
1220             len=0;                                                                  \
1221         } else {                                                                    \
1222             len = 1;                                                                \
1223             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1224             skiplen = UNISKIP( uvc );                                               \
1225             foldlen -= skiplen;                                                     \
1226             uscan = foldbuf + skiplen;                                              \
1227         }                                                                           \
1228         break;                                                                      \
1229     case trie_utf8:                                                                 \
1230         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1231         break;                                                                      \
1232     case trie_plain:                                                                \
1233         uvc = (UV)*uc;                                                              \
1234         len = 1;                                                                    \
1235     }                                                                               \
1236     if (uvc < 256) {                                                                \
1237         charid = trie->charmap[ uvc ];                                              \
1238     }                                                                               \
1239     else {                                                                          \
1240         charid = 0;                                                                 \
1241         if (widecharmap) {                                                          \
1242             SV** const svpp = hv_fetch(widecharmap,                                 \
1243                         (char*)&uvc, sizeof(UV), 0);                                \
1244             if (svpp)                                                               \
1245                 charid = (U16)SvIV(*svpp);                                          \
1246         }                                                                           \
1247     }                                                                               \
1248 } STMT_END
1249
1250 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1251 STMT_START {                                              \
1252     while (s <= e) {                                      \
1253         if ( (CoNd)                                       \
1254              && (ln == 1 || folder(s, pat_string, ln))    \
1255              && (!reginfo || regtry(reginfo, &s)) )       \
1256             goto got_it;                                  \
1257         s++;                                              \
1258     }                                                     \
1259 } STMT_END
1260
1261 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1262 STMT_START {                                          \
1263     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1264         CoDe                                          \
1265         s += uskip;                                   \
1266     }                                                 \
1267 } STMT_END
1268
1269 #define REXEC_FBC_SCAN(CoDe)                          \
1270 STMT_START {                                          \
1271     while (s < strend) {                              \
1272         CoDe                                          \
1273         s++;                                          \
1274     }                                                 \
1275 } STMT_END
1276
1277 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1278 REXEC_FBC_UTF8_SCAN(                                  \
1279     if (CoNd) {                                       \
1280         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1281             goto got_it;                              \
1282         else                                          \
1283             tmp = doevery;                            \
1284     }                                                 \
1285     else                                              \
1286         tmp = 1;                                      \
1287 )
1288
1289 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1290 REXEC_FBC_SCAN(                                       \
1291     if (CoNd) {                                       \
1292         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1293             goto got_it;                              \
1294         else                                          \
1295             tmp = doevery;                            \
1296     }                                                 \
1297     else                                              \
1298         tmp = 1;                                      \
1299 )
1300
1301 #define REXEC_FBC_TRYIT               \
1302 if ((!reginfo || regtry(reginfo, &s))) \
1303     goto got_it
1304
1305 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1306     if (utf8_target) {                                             \
1307         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1308     }                                                          \
1309     else {                                                     \
1310         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1311     }
1312
1313 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1314     if (utf8_target) {                                             \
1315         UtFpReLoAd;                                            \
1316         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1317     }                                                          \
1318     else {                                                     \
1319         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1320     }
1321
1322 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1323     PL_reg_flags |= RF_tainted;                                \
1324     if (utf8_target) {                                             \
1325         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1326     }                                                          \
1327     else {                                                     \
1328         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1329     }
1330
1331 #define DUMP_EXEC_POS(li,s,doutf8) \
1332     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1333
1334
1335 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1336         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1337         tmp = TEST_NON_UTF8(tmp);                                              \
1338         REXEC_FBC_UTF8_SCAN(                                                   \
1339             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1340                 tmp = !tmp;                                                    \
1341                 IF_SUCCESS;                                                    \
1342             }                                                                  \
1343             else {                                                             \
1344                 IF_FAIL;                                                       \
1345             }                                                                  \
1346         );                                                                     \
1347
1348 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1349         if (s == PL_bostr) {                                                   \
1350             tmp = '\n';                                                        \
1351         }                                                                      \
1352         else {                                                                 \
1353             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1354             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1355         }                                                                      \
1356         tmp = TeSt1_UtF8;                                                      \
1357         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1358         REXEC_FBC_UTF8_SCAN(                                                   \
1359             if (tmp == ! (TeSt2_UtF8)) { \
1360                 tmp = !tmp;                                                    \
1361                 IF_SUCCESS;                                                    \
1362             }                                                                  \
1363             else {                                                             \
1364                 IF_FAIL;                                                       \
1365             }                                                                  \
1366         );                                                                     \
1367
1368 /* The only difference between the BOUND and NBOUND cases is that
1369  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1370  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1371  * with the other one being empty */
1372 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1373     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1374
1375 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1376     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1377
1378 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1379     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1380
1381 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1382     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1383
1384
1385 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1386  * be passed in completely with the variable name being tested, which isn't
1387  * such a clean interface, but this is easier to read than it was before.  We
1388  * are looking for the boundary (or non-boundary between a word and non-word
1389  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1390  * must be different.  Find the "wordness" of the character just prior to this
1391  * one, and compare it with the wordness of this one.  If they differ, we have
1392  * a boundary.  At the beginning of the string, pretend that the previous
1393  * character was a new-line */
1394 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1395     if (utf8_target) {                                                         \
1396                 UTF8_CODE \
1397     }                                                                          \
1398     else {  /* Not utf8 */                                                     \
1399         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1400         tmp = TEST_NON_UTF8(tmp);                                              \
1401         REXEC_FBC_SCAN(                                                        \
1402             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1403                 tmp = !tmp;                                                    \
1404                 IF_SUCCESS;                                                    \
1405             }                                                                  \
1406             else {                                                             \
1407                 IF_FAIL;                                                       \
1408             }                                                                  \
1409         );                                                                     \
1410     }                                                                          \
1411     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1412         goto got_it;
1413
1414 /* We know what class REx starts with.  Try to find this position... */
1415 /* if reginfo is NULL, its a dryrun */
1416 /* annoyingly all the vars in this routine have different names from their counterparts
1417    in regmatch. /grrr */
1418
1419 STATIC char *
1420 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1421     const char *strend, regmatch_info *reginfo)
1422 {
1423         dVAR;
1424         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1425         char *pat_string;   /* The pattern's exactish string */
1426         char *pat_end;      /* ptr to end char of pat_string */
1427         re_fold_t folder;       /* Function for computing non-utf8 folds */
1428         const U8 *fold_array;   /* array for folding ords < 256 */
1429         STRLEN ln;
1430         STRLEN lnc;
1431         register STRLEN uskip;
1432         U8 c1;
1433         U8 c2;
1434         char *e;
1435         register I32 tmp = 1;   /* Scratch variable? */
1436         register const bool utf8_target = PL_reg_match_utf8;
1437         UV utf8_fold_flags = 0;
1438         RXi_GET_DECL(prog,progi);
1439
1440         PERL_ARGS_ASSERT_FIND_BYCLASS;
1441
1442         /* We know what class it must start with. */
1443         switch (OP(c)) {
1444         case ANYOFV:
1445         case ANYOF:
1446             if (utf8_target || OP(c) == ANYOFV) {
1447                 STRLEN inclasslen = strend - s;
1448                 REXEC_FBC_UTF8_CLASS_SCAN(
1449                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1450             }
1451             else {
1452                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1453             }
1454             break;
1455         case CANY:
1456             REXEC_FBC_SCAN(
1457                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1458                     goto got_it;
1459                 else
1460                     tmp = doevery;
1461             );
1462             break;
1463
1464         case EXACTFA:
1465             if (UTF_PATTERN || utf8_target) {
1466                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1467                 goto do_exactf_utf8;
1468             }
1469             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1470             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1471             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1472
1473         case EXACTF:
1474             if (utf8_target) {
1475
1476                 /* regcomp.c already folded this if pattern is in UTF-8 */
1477                 utf8_fold_flags = 0;
1478                 goto do_exactf_utf8;
1479             }
1480             fold_array = PL_fold;
1481             folder = foldEQ;
1482             goto do_exactf_non_utf8;
1483
1484         case EXACTFL:
1485             if (UTF_PATTERN || utf8_target) {
1486                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1487                 goto do_exactf_utf8;
1488             }
1489             fold_array = PL_fold_locale;
1490             folder = foldEQ_locale;
1491             goto do_exactf_non_utf8;
1492
1493         case EXACTFU_SS:
1494             if (UTF_PATTERN) {
1495                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1496             }
1497             goto do_exactf_utf8;
1498
1499         case EXACTFU_TRICKYFOLD:
1500         case EXACTFU:
1501             if (UTF_PATTERN || utf8_target) {
1502                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1503                 goto do_exactf_utf8;
1504             }
1505
1506             /* Any 'ss' in the pattern should have been replaced by regcomp,
1507              * so we don't have to worry here about this single special case
1508              * in the Latin1 range */
1509             fold_array = PL_fold_latin1;
1510             folder = foldEQ_latin1;
1511
1512             /* FALL THROUGH */
1513
1514         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1515                                are no glitches with fold-length differences
1516                                between the target string and pattern */
1517
1518             /* The idea in the non-utf8 EXACTF* cases is to first find the
1519              * first character of the EXACTF* node and then, if necessary,
1520              * case-insensitively compare the full text of the node.  c1 is the
1521              * first character.  c2 is its fold.  This logic will not work for
1522              * Unicode semantics and the german sharp ss, which hence should
1523              * not be compiled into a node that gets here. */
1524             pat_string = STRING(c);
1525             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1526
1527             /* We know that we have to match at least 'ln' bytes (which is the
1528              * same as characters, since not utf8).  If we have to match 3
1529              * characters, and there are only 2 availabe, we know without
1530              * trying that it will fail; so don't start a match past the
1531              * required minimum number from the far end */
1532             e = HOP3c(strend, -((I32)ln), s);
1533
1534             if (!reginfo && e < s) {
1535                 e = s;                  /* Due to minlen logic of intuit() */
1536             }
1537
1538             c1 = *pat_string;
1539             c2 = fold_array[c1];
1540             if (c1 == c2) { /* If char and fold are the same */
1541                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1542             }
1543             else {
1544                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1545             }
1546             break;
1547
1548         do_exactf_utf8:
1549         {
1550             unsigned expansion;
1551
1552
1553             /* If one of the operands is in utf8, we can't use the simpler
1554              * folding above, due to the fact that many different characters
1555              * can have the same fold, or portion of a fold, or different-
1556              * length fold */
1557             pat_string = STRING(c);
1558             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1559             pat_end = pat_string + ln;
1560             lnc = (UTF_PATTERN) /* length to match in characters */
1561                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1562                     : ln;
1563
1564             /* We have 'lnc' characters to match in the pattern, but because of
1565              * multi-character folding, each character in the target can match
1566              * up to 3 characters (Unicode guarantees it will never exceed
1567              * this) if it is utf8-encoded; and up to 2 if not (based on the
1568              * fact that the Latin 1 folds are already determined, and the
1569              * only multi-char fold in that range is the sharp-s folding to
1570              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1571              * string character.  Adjust lnc accordingly, rounding up, so that
1572              * if we need to match at least 4+1/3 chars, that really is 5. */
1573             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1574             lnc = (lnc + expansion - 1) / expansion;
1575
1576             /* As in the non-UTF8 case, if we have to match 3 characters, and
1577              * only 2 are left, it's guaranteed to fail, so don't start a
1578              * match that would require us to go beyond the end of the string
1579              */
1580             e = HOP3c(strend, -((I32)lnc), s);
1581
1582             if (!reginfo && e < s) {
1583                 e = s;                  /* Due to minlen logic of intuit() */
1584             }
1585
1586             /* XXX Note that we could recalculate e to stop the loop earlier,
1587              * as the worst case expansion above will rarely be met, and as we
1588              * go along we would usually find that e moves further to the left.
1589              * This would happen only after we reached the point in the loop
1590              * where if there were no expansion we should fail.  Unclear if
1591              * worth the expense */
1592
1593             while (s <= e) {
1594                 char *my_strend= (char *)strend;
1595                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1596                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1597                     && (!reginfo || regtry(reginfo, &s)) )
1598                 {
1599                     goto got_it;
1600                 }
1601                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1602             }
1603             break;
1604         }
1605         case BOUNDL:
1606             PL_reg_flags |= RF_tainted;
1607             FBC_BOUND(isALNUM_LC,
1608                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1609                       isALNUM_LC_utf8((U8*)s));
1610             break;
1611         case NBOUNDL:
1612             PL_reg_flags |= RF_tainted;
1613             FBC_NBOUND(isALNUM_LC,
1614                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1615                        isALNUM_LC_utf8((U8*)s));
1616             break;
1617         case BOUND:
1618             FBC_BOUND(isWORDCHAR,
1619                       isALNUM_uni(tmp),
1620                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1621             break;
1622         case BOUNDA:
1623             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1624                              isWORDCHAR_A(tmp),
1625                              isWORDCHAR_A((U8*)s));
1626             break;
1627         case NBOUND:
1628             FBC_NBOUND(isWORDCHAR,
1629                        isALNUM_uni(tmp),
1630                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1631             break;
1632         case NBOUNDA:
1633             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1634                               isWORDCHAR_A(tmp),
1635                               isWORDCHAR_A((U8*)s));
1636             break;
1637         case BOUNDU:
1638             FBC_BOUND(isWORDCHAR_L1,
1639                       isALNUM_uni(tmp),
1640                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1641             break;
1642         case NBOUNDU:
1643             FBC_NBOUND(isWORDCHAR_L1,
1644                        isALNUM_uni(tmp),
1645                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1646             break;
1647         case ALNUML:
1648             REXEC_FBC_CSCAN_TAINT(
1649                 isALNUM_LC_utf8((U8*)s),
1650                 isALNUM_LC(*s)
1651             );
1652             break;
1653         case ALNUMU:
1654             REXEC_FBC_CSCAN_PRELOAD(
1655                 LOAD_UTF8_CHARCLASS_ALNUM(),
1656                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1657                 isWORDCHAR_L1((U8) *s)
1658             );
1659             break;
1660         case ALNUM:
1661             REXEC_FBC_CSCAN_PRELOAD(
1662                 LOAD_UTF8_CHARCLASS_ALNUM(),
1663                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1664                 isWORDCHAR((U8) *s)
1665             );
1666             break;
1667         case ALNUMA:
1668             /* Don't need to worry about utf8, as it can match only a single
1669              * byte invariant character */
1670             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1671             break;
1672         case NALNUMU:
1673             REXEC_FBC_CSCAN_PRELOAD(
1674                 LOAD_UTF8_CHARCLASS_ALNUM(),
1675                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1676                 ! isWORDCHAR_L1((U8) *s)
1677             );
1678             break;
1679         case NALNUM:
1680             REXEC_FBC_CSCAN_PRELOAD(
1681                 LOAD_UTF8_CHARCLASS_ALNUM(),
1682                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1683                 ! isALNUM(*s)
1684             );
1685             break;
1686         case NALNUMA:
1687             REXEC_FBC_CSCAN(
1688                 !isWORDCHAR_A(*s),
1689                 !isWORDCHAR_A(*s)
1690             );
1691             break;
1692         case NALNUML:
1693             REXEC_FBC_CSCAN_TAINT(
1694                 !isALNUM_LC_utf8((U8*)s),
1695                 !isALNUM_LC(*s)
1696             );
1697             break;
1698         case SPACEU:
1699             REXEC_FBC_CSCAN_PRELOAD(
1700                 LOAD_UTF8_CHARCLASS_SPACE(),
1701                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1702                 isSPACE_L1((U8) *s)
1703             );
1704             break;
1705         case SPACE:
1706             REXEC_FBC_CSCAN_PRELOAD(
1707                 LOAD_UTF8_CHARCLASS_SPACE(),
1708                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1709                 isSPACE((U8) *s)
1710             );
1711             break;
1712         case SPACEA:
1713             /* Don't need to worry about utf8, as it can match only a single
1714              * byte invariant character */
1715             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1716             break;
1717         case SPACEL:
1718             REXEC_FBC_CSCAN_TAINT(
1719                 isSPACE_LC_utf8((U8*)s),
1720                 isSPACE_LC(*s)
1721             );
1722             break;
1723         case NSPACEU:
1724             REXEC_FBC_CSCAN_PRELOAD(
1725                 LOAD_UTF8_CHARCLASS_SPACE(),
1726                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1727                 ! isSPACE_L1((U8) *s)
1728             );
1729             break;
1730         case NSPACE:
1731             REXEC_FBC_CSCAN_PRELOAD(
1732                 LOAD_UTF8_CHARCLASS_SPACE(),
1733                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1734                 ! isSPACE((U8) *s)
1735             );
1736             break;
1737         case NSPACEA:
1738             REXEC_FBC_CSCAN(
1739                 !isSPACE_A(*s),
1740                 !isSPACE_A(*s)
1741             );
1742             break;
1743         case NSPACEL:
1744             REXEC_FBC_CSCAN_TAINT(
1745                 !isSPACE_LC_utf8((U8*)s),
1746                 !isSPACE_LC(*s)
1747             );
1748             break;
1749         case DIGIT:
1750             REXEC_FBC_CSCAN_PRELOAD(
1751                 LOAD_UTF8_CHARCLASS_DIGIT(),
1752                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1753                 isDIGIT(*s)
1754             );
1755             break;
1756         case DIGITA:
1757             /* Don't need to worry about utf8, as it can match only a single
1758              * byte invariant character */
1759             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1760             break;
1761         case DIGITL:
1762             REXEC_FBC_CSCAN_TAINT(
1763                 isDIGIT_LC_utf8((U8*)s),
1764                 isDIGIT_LC(*s)
1765             );
1766             break;
1767         case NDIGIT:
1768             REXEC_FBC_CSCAN_PRELOAD(
1769                 LOAD_UTF8_CHARCLASS_DIGIT(),
1770                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1771                 !isDIGIT(*s)
1772             );
1773             break;
1774         case NDIGITA:
1775             REXEC_FBC_CSCAN(
1776                 !isDIGIT_A(*s),
1777                 !isDIGIT_A(*s)
1778             );
1779             break;
1780         case NDIGITL:
1781             REXEC_FBC_CSCAN_TAINT(
1782                 !isDIGIT_LC_utf8((U8*)s),
1783                 !isDIGIT_LC(*s)
1784             );
1785             break;
1786         case LNBREAK:
1787             REXEC_FBC_CSCAN(
1788                 is_LNBREAK_utf8(s),
1789                 is_LNBREAK_latin1(s)
1790             );
1791             break;
1792         case VERTWS:
1793             REXEC_FBC_CSCAN(
1794                 is_VERTWS_utf8(s),
1795                 is_VERTWS_latin1(s)
1796             );
1797             break;
1798         case NVERTWS:
1799             REXEC_FBC_CSCAN(
1800                 !is_VERTWS_utf8(s),
1801                 !is_VERTWS_latin1(s)
1802             );
1803             break;
1804         case HORIZWS:
1805             REXEC_FBC_CSCAN(
1806                 is_HORIZWS_utf8(s),
1807                 is_HORIZWS_latin1(s)
1808             );
1809             break;
1810         case NHORIZWS:
1811             REXEC_FBC_CSCAN(
1812                 !is_HORIZWS_utf8(s),
1813                 !is_HORIZWS_latin1(s)
1814             );
1815             break;
1816         case AHOCORASICKC:
1817         case AHOCORASICK:
1818             {
1819                 DECL_TRIE_TYPE(c);
1820                 /* what trie are we using right now */
1821                 reg_ac_data *aho
1822                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1823                 reg_trie_data *trie
1824                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1825                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1826
1827                 const char *last_start = strend - trie->minlen;
1828 #ifdef DEBUGGING
1829                 const char *real_start = s;
1830 #endif
1831                 STRLEN maxlen = trie->maxlen;
1832                 SV *sv_points;
1833                 U8 **points; /* map of where we were in the input string
1834                                 when reading a given char. For ASCII this
1835                                 is unnecessary overhead as the relationship
1836                                 is always 1:1, but for Unicode, especially
1837                                 case folded Unicode this is not true. */
1838                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1839                 U8 *bitmap=NULL;
1840
1841
1842                 GET_RE_DEBUG_FLAGS_DECL;
1843
1844                 /* We can't just allocate points here. We need to wrap it in
1845                  * an SV so it gets freed properly if there is a croak while
1846                  * running the match */
1847                 ENTER;
1848                 SAVETMPS;
1849                 sv_points=newSV(maxlen * sizeof(U8 *));
1850                 SvCUR_set(sv_points,
1851                     maxlen * sizeof(U8 *));
1852                 SvPOK_on(sv_points);
1853                 sv_2mortal(sv_points);
1854                 points=(U8**)SvPV_nolen(sv_points );
1855                 if ( trie_type != trie_utf8_fold
1856                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1857                 {
1858                     if (trie->bitmap)
1859                         bitmap=(U8*)trie->bitmap;
1860                     else
1861                         bitmap=(U8*)ANYOF_BITMAP(c);
1862                 }
1863                 /* this is the Aho-Corasick algorithm modified a touch
1864                    to include special handling for long "unknown char"
1865                    sequences. The basic idea being that we use AC as long
1866                    as we are dealing with a possible matching char, when
1867                    we encounter an unknown char (and we have not encountered
1868                    an accepting state) we scan forward until we find a legal
1869                    starting char.
1870                    AC matching is basically that of trie matching, except
1871                    that when we encounter a failing transition, we fall back
1872                    to the current states "fail state", and try the current char
1873                    again, a process we repeat until we reach the root state,
1874                    state 1, or a legal transition. If we fail on the root state
1875                    then we can either terminate if we have reached an accepting
1876                    state previously, or restart the entire process from the beginning
1877                    if we have not.
1878
1879                  */
1880                 while (s <= last_start) {
1881                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1882                     U8 *uc = (U8*)s;
1883                     U16 charid = 0;
1884                     U32 base = 1;
1885                     U32 state = 1;
1886                     UV uvc = 0;
1887                     STRLEN len = 0;
1888                     STRLEN foldlen = 0;
1889                     U8 *uscan = (U8*)NULL;
1890                     U8 *leftmost = NULL;
1891 #ifdef DEBUGGING
1892                     U32 accepted_word= 0;
1893 #endif
1894                     U32 pointpos = 0;
1895
1896                     while ( state && uc <= (U8*)strend ) {
1897                         int failed=0;
1898                         U32 word = aho->states[ state ].wordnum;
1899
1900                         if( state==1 ) {
1901                             if ( bitmap ) {
1902                                 DEBUG_TRIE_EXECUTE_r(
1903                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1904                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1905                                             (char *)uc, utf8_target );
1906                                         PerlIO_printf( Perl_debug_log,
1907                                             " Scanning for legal start char...\n");
1908                                     }
1909                                 );
1910                                 if (utf8_target) {
1911                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1912                                         uc += UTF8SKIP(uc);
1913                                     }
1914                                 } else {
1915                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1916                                         uc++;
1917                                     }
1918                                 }
1919                                 s= (char *)uc;
1920                             }
1921                             if (uc >(U8*)last_start) break;
1922                         }
1923
1924                         if ( word ) {
1925                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1926                             if (!leftmost || lpos < leftmost) {
1927                                 DEBUG_r(accepted_word=word);
1928                                 leftmost= lpos;
1929                             }
1930                             if (base==0) break;
1931
1932                         }
1933                         points[pointpos++ % maxlen]= uc;
1934                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1935                                              uscan, len, uvc, charid, foldlen,
1936                                              foldbuf, uniflags);
1937                         DEBUG_TRIE_EXECUTE_r({
1938                             dump_exec_pos( (char *)uc, c, strend, real_start,
1939                                 s,   utf8_target );
1940                             PerlIO_printf(Perl_debug_log,
1941                                 " Charid:%3u CP:%4"UVxf" ",
1942                                  charid, uvc);
1943                         });
1944
1945                         do {
1946 #ifdef DEBUGGING
1947                             word = aho->states[ state ].wordnum;
1948 #endif
1949                             base = aho->states[ state ].trans.base;
1950
1951                             DEBUG_TRIE_EXECUTE_r({
1952                                 if (failed)
1953                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1954                                         s,   utf8_target );
1955                                 PerlIO_printf( Perl_debug_log,
1956                                     "%sState: %4"UVxf", word=%"UVxf,
1957                                     failed ? " Fail transition to " : "",
1958                                     (UV)state, (UV)word);
1959                             });
1960                             if ( base ) {
1961                                 U32 tmp;
1962                                 I32 offset;
1963                                 if (charid &&
1964                                      ( ((offset = base + charid
1965                                         - 1 - trie->uniquecharcount)) >= 0)
1966                                      && ((U32)offset < trie->lasttrans)
1967                                      && trie->trans[offset].check == state
1968                                      && (tmp=trie->trans[offset].next))
1969                                 {
1970                                     DEBUG_TRIE_EXECUTE_r(
1971                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1972                                     state = tmp;
1973                                     break;
1974                                 }
1975                                 else {
1976                                     DEBUG_TRIE_EXECUTE_r(
1977                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1978                                     failed = 1;
1979                                     state = aho->fail[state];
1980                                 }
1981                             }
1982                             else {
1983                                 /* we must be accepting here */
1984                                 DEBUG_TRIE_EXECUTE_r(
1985                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1986                                 failed = 1;
1987                                 break;
1988                             }
1989                         } while(state);
1990                         uc += len;
1991                         if (failed) {
1992                             if (leftmost)
1993                                 break;
1994                             if (!state) state = 1;
1995                         }
1996                     }
1997                     if ( aho->states[ state ].wordnum ) {
1998                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1999                         if (!leftmost || lpos < leftmost) {
2000                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2001                             leftmost = lpos;
2002                         }
2003                     }
2004                     if (leftmost) {
2005                         s = (char*)leftmost;
2006                         DEBUG_TRIE_EXECUTE_r({
2007                             PerlIO_printf(
2008                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2009                                 (UV)accepted_word, (IV)(s - real_start)
2010                             );
2011                         });
2012                         if (!reginfo || regtry(reginfo, &s)) {
2013                             FREETMPS;
2014                             LEAVE;
2015                             goto got_it;
2016                         }
2017                         s = HOPc(s,1);
2018                         DEBUG_TRIE_EXECUTE_r({
2019                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2020                         });
2021                     } else {
2022                         DEBUG_TRIE_EXECUTE_r(
2023                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2024                         break;
2025                     }
2026                 }
2027                 FREETMPS;
2028                 LEAVE;
2029             }
2030             break;
2031         default:
2032             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2033             break;
2034         }
2035         return 0;
2036       got_it:
2037         return s;
2038 }
2039
2040
2041 /*
2042  - regexec_flags - match a regexp against a string
2043  */
2044 I32
2045 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2046               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2047 /* strend: pointer to null at end of string */
2048 /* strbeg: real beginning of string */
2049 /* minend: end of match must be >=minend after stringarg. */
2050 /* data: May be used for some additional optimizations.
2051          Currently its only used, with a U32 cast, for transmitting
2052          the ganch offset when doing a /g match. This will change */
2053 /* nosave: For optimizations. */
2054 {
2055     dVAR;
2056     struct regexp *const prog = (struct regexp *)SvANY(rx);
2057     /*register*/ char *s;
2058     register regnode *c;
2059     /*register*/ char *startpos = stringarg;
2060     I32 minlen;         /* must match at least this many chars */
2061     I32 dontbother = 0; /* how many characters not to try at end */
2062     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2063     I32 scream_pos = -1;                /* Internal iterator of scream. */
2064     char *scream_olds = NULL;
2065     const bool utf8_target = cBOOL(DO_UTF8(sv));
2066     I32 multiline;
2067     RXi_GET_DECL(prog,progi);
2068     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2069     regexp_paren_pair *swap = NULL;
2070     GET_RE_DEBUG_FLAGS_DECL;
2071
2072     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2073     PERL_UNUSED_ARG(data);
2074
2075     /* Be paranoid... */
2076     if (prog == NULL || startpos == NULL) {
2077         Perl_croak(aTHX_ "NULL regexp parameter");
2078         return 0;
2079     }
2080
2081     multiline = prog->extflags & RXf_PMf_MULTILINE;
2082     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2083
2084     RX_MATCH_UTF8_set(rx, utf8_target);
2085     DEBUG_EXECUTE_r(
2086         debug_start_match(rx, utf8_target, startpos, strend,
2087         "Matching");
2088     );
2089
2090     minlen = prog->minlen;
2091
2092     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2093         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2094                               "String too short [regexec_flags]...\n"));
2095         goto phooey;
2096     }
2097
2098
2099     /* Check validity of program. */
2100     if (UCHARAT(progi->program) != REG_MAGIC) {
2101         Perl_croak(aTHX_ "corrupted regexp program");
2102     }
2103
2104     PL_reg_flags = 0;
2105     PL_reg_state.re_state_eval_setup_done = FALSE;
2106     PL_reg_maxiter = 0;
2107
2108     if (RX_UTF8(rx))
2109         PL_reg_flags |= RF_utf8;
2110
2111     /* Mark beginning of line for ^ and lookbehind. */
2112     reginfo.bol = startpos; /* XXX not used ??? */
2113     PL_bostr  = strbeg;
2114     reginfo.sv = sv;
2115
2116     /* Mark end of line for $ (and such) */
2117     PL_regeol = strend;
2118
2119     /* see how far we have to get to not match where we matched before */
2120     reginfo.till = startpos+minend;
2121
2122     /* If there is a "must appear" string, look for it. */
2123     s = startpos;
2124
2125     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2126         MAGIC *mg;
2127         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2128             reginfo.ganch = startpos + prog->gofs;
2129             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2130               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2131         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2132                   && SvMAGIC(sv)
2133                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2134                   && mg->mg_len >= 0) {
2135             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2136             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2137                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2138
2139             if (prog->extflags & RXf_ANCH_GPOS) {
2140                 if (s > reginfo.ganch)
2141                     goto phooey;
2142                 s = reginfo.ganch - prog->gofs;
2143                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2144                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2145                 if (s < strbeg)
2146                     goto phooey;
2147             }
2148         }
2149         else if (data) {
2150             reginfo.ganch = strbeg + PTR2UV(data);
2151             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2152                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2153
2154         } else {                                /* pos() not defined */
2155             reginfo.ganch = strbeg;
2156             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2157                  "GPOS: reginfo.ganch = strbeg\n"));
2158         }
2159     }
2160     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2161         /* We have to be careful. If the previous successful match
2162            was from this regex we don't want a subsequent partially
2163            successful match to clobber the old results.
2164            So when we detect this possibility we add a swap buffer
2165            to the re, and switch the buffer each match. If we fail
2166            we switch it back, otherwise we leave it swapped.
2167         */
2168         swap = prog->offs;
2169         /* do we need a save destructor here for eval dies? */
2170         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2171         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2172             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2173             PTR2UV(prog),
2174             PTR2UV(swap),
2175             PTR2UV(prog->offs)
2176         ));
2177     }
2178     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2179         re_scream_pos_data d;
2180
2181         d.scream_olds = &scream_olds;
2182         d.scream_pos = &scream_pos;
2183         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2184         if (!s) {
2185             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2186             goto phooey;        /* not present */
2187         }
2188     }
2189
2190
2191
2192     /* Simplest case:  anchored match need be tried only once. */
2193     /*  [unless only anchor is BOL and multiline is set] */
2194     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2195         if (s == startpos && regtry(&reginfo, &startpos))
2196             goto got_it;
2197         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2198                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2199         {
2200             char *end;
2201
2202             if (minlen)
2203                 dontbother = minlen - 1;
2204             end = HOP3c(strend, -dontbother, strbeg) - 1;
2205             /* for multiline we only have to try after newlines */
2206             if (prog->check_substr || prog->check_utf8) {
2207                 /* because of the goto we can not easily reuse the macros for bifurcating the
2208                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2209                 if (utf8_target) {
2210                     if (s == startpos)
2211                         goto after_try_utf8;
2212                     while (1) {
2213                         if (regtry(&reginfo, &s)) {
2214                             goto got_it;
2215                         }
2216                       after_try_utf8:
2217                         if (s > end) {
2218                             goto phooey;
2219                         }
2220                         if (prog->extflags & RXf_USE_INTUIT) {
2221                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2222                             if (!s) {
2223                                 goto phooey;
2224                             }
2225                         }
2226                         else {
2227                             s += UTF8SKIP(s);
2228                         }
2229                     }
2230                 } /* end search for check string in unicode */
2231                 else {
2232                     if (s == startpos) {
2233                         goto after_try_latin;
2234                     }
2235                     while (1) {
2236                         if (regtry(&reginfo, &s)) {
2237                             goto got_it;
2238                         }
2239                       after_try_latin:
2240                         if (s > end) {
2241                             goto phooey;
2242                         }
2243                         if (prog->extflags & RXf_USE_INTUIT) {
2244                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2245                             if (!s) {
2246                                 goto phooey;
2247                             }
2248                         }
2249                         else {
2250                             s++;
2251                         }
2252                     }
2253                 } /* end search for check string in latin*/
2254             } /* end search for check string */
2255             else { /* search for newline */
2256                 if (s > startpos) {
2257                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2258                     s--;
2259                 }
2260                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2261                 while (s <= end) { /* note it could be possible to match at the end of the string */
2262                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2263                         if (regtry(&reginfo, &s))
2264                             goto got_it;
2265                     }
2266                 }
2267             } /* end search for newline */
2268         } /* end anchored/multiline check string search */
2269         goto phooey;
2270     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2271     {
2272         /* the warning about reginfo.ganch being used without initialization
2273            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2274            and we only enter this block when the same bit is set. */
2275         char *tmp_s = reginfo.ganch - prog->gofs;
2276
2277         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2278             goto got_it;
2279         goto phooey;
2280     }
2281
2282     /* Messy cases:  unanchored match. */
2283     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2284         /* we have /x+whatever/ */
2285         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2286         char ch;
2287 #ifdef DEBUGGING
2288         int did_match = 0;
2289 #endif
2290         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2291             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2292         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2293
2294         if (utf8_target) {
2295             REXEC_FBC_SCAN(
2296                 if (*s == ch) {
2297                     DEBUG_EXECUTE_r( did_match = 1 );
2298                     if (regtry(&reginfo, &s)) goto got_it;
2299                     s += UTF8SKIP(s);
2300                     while (s < strend && *s == ch)
2301                         s += UTF8SKIP(s);
2302                 }
2303             );
2304         }
2305         else {
2306             REXEC_FBC_SCAN(
2307                 if (*s == ch) {
2308                     DEBUG_EXECUTE_r( did_match = 1 );
2309                     if (regtry(&reginfo, &s)) goto got_it;
2310                     s++;
2311                     while (s < strend && *s == ch)
2312                         s++;
2313                 }
2314             );
2315         }
2316         DEBUG_EXECUTE_r(if (!did_match)
2317                 PerlIO_printf(Perl_debug_log,
2318                                   "Did not find anchored character...\n")
2319                );
2320     }
2321     else if (prog->anchored_substr != NULL
2322               || prog->anchored_utf8 != NULL
2323               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2324                   && prog->float_max_offset < strend - s)) {
2325         SV *must;
2326         I32 back_max;
2327         I32 back_min;
2328         char *last;
2329         char *last1;            /* Last position checked before */
2330 #ifdef DEBUGGING
2331         int did_match = 0;
2332 #endif
2333         if (prog->anchored_substr || prog->anchored_utf8) {
2334             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2335                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2336             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2337             back_max = back_min = prog->anchored_offset;
2338         } else {
2339             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2340                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2341             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2342             back_max = prog->float_max_offset;
2343             back_min = prog->float_min_offset;
2344         }
2345
2346
2347         if (must == &PL_sv_undef)
2348             /* could not downgrade utf8 check substring, so must fail */
2349             goto phooey;
2350
2351         if (back_min<0) {
2352             last = strend;
2353         } else {
2354             last = HOP3c(strend,        /* Cannot start after this */
2355                   -(I32)(CHR_SVLEN(must)
2356                          - (SvTAIL(must) != 0) + back_min), strbeg);
2357         }
2358         if (s > PL_bostr)
2359             last1 = HOPc(s, -1);
2360         else
2361             last1 = s - 1;      /* bogus */
2362
2363         /* XXXX check_substr already used to find "s", can optimize if
2364            check_substr==must. */
2365         scream_pos = -1;
2366         dontbother = end_shift;
2367         strend = HOPc(strend, -dontbother);
2368         while ( (s <= last) &&
2369                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2370                                   (unsigned char*)strend, must,
2371                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2372             DEBUG_EXECUTE_r( did_match = 1 );
2373             if (HOPc(s, -back_max) > last1) {
2374                 last1 = HOPc(s, -back_min);
2375                 s = HOPc(s, -back_max);
2376             }
2377             else {
2378                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2379
2380                 last1 = HOPc(s, -back_min);
2381                 s = t;
2382             }
2383             if (utf8_target) {
2384                 while (s <= last1) {
2385                     if (regtry(&reginfo, &s))
2386                         goto got_it;
2387                     s += UTF8SKIP(s);
2388                 }
2389             }
2390             else {
2391                 while (s <= last1) {
2392                     if (regtry(&reginfo, &s))
2393                         goto got_it;
2394                     s++;
2395                 }
2396             }
2397         }
2398         DEBUG_EXECUTE_r(if (!did_match) {
2399             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2400                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2401             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2402                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2403                                ? "anchored" : "floating"),
2404                 quoted, RE_SV_TAIL(must));
2405         });
2406         goto phooey;
2407     }
2408     else if ( (c = progi->regstclass) ) {
2409         if (minlen) {
2410             const OPCODE op = OP(progi->regstclass);
2411             /* don't bother with what can't match */
2412             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2413                 strend = HOPc(strend, -(minlen - 1));
2414         }
2415         DEBUG_EXECUTE_r({
2416             SV * const prop = sv_newmortal();
2417             regprop(prog, prop, c);
2418             {
2419                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2420                     s,strend-s,60);
2421                 PerlIO_printf(Perl_debug_log,
2422                     "Matching stclass %.*s against %s (%d bytes)\n",
2423                     (int)SvCUR(prop), SvPVX_const(prop),
2424                      quoted, (int)(strend - s));
2425             }
2426         });
2427         if (find_byclass(prog, c, s, strend, &reginfo))
2428             goto got_it;
2429         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2430     }
2431     else {
2432         dontbother = 0;
2433         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2434             /* Trim the end. */
2435             char *last= NULL;
2436             SV* float_real;
2437             STRLEN len;
2438             const char *little;
2439
2440             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2441                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2442             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2443
2444             little = SvPV_const(float_real, len);
2445             if (SvTAIL(float_real)) {
2446                     /* This means that float_real contains an artificial \n on the end
2447                      * due to the presence of something like this: /foo$/
2448                      * where we can match both "foo" and "foo\n" at the end of the string.
2449                      * So we have to compare the end of the string first against the float_real
2450                      * without the \n and then against the full float_real with the string.
2451                      * We have to watch out for cases where the string might be smaller
2452                      * than the float_real or the float_real without the \n.
2453                      */
2454                     char *checkpos= strend - len;
2455                     DEBUG_OPTIMISE_r(
2456                         PerlIO_printf(Perl_debug_log,
2457                             "%sChecking for float_real.%s\n",
2458                             PL_colors[4], PL_colors[5]));
2459                     if (checkpos + 1 < strbeg) {
2460                         /* can't match, even if we remove the trailing \n string is too short to match */
2461                         DEBUG_EXECUTE_r(
2462                             PerlIO_printf(Perl_debug_log,
2463                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2464                                 PL_colors[4], PL_colors[5]));
2465                         goto phooey;
2466                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2467                         /* can match, the end of the string matches without the "\n" */
2468                         last = checkpos + 1;
2469                     } else if (checkpos < strbeg) {
2470                         /* cant match, string is too short when the "\n" is included */
2471                         DEBUG_EXECUTE_r(
2472                             PerlIO_printf(Perl_debug_log,
2473                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2474                                 PL_colors[4], PL_colors[5]));
2475                         goto phooey;
2476                     } else if (!multiline) {
2477                         /* non multiline match, so compare with the "\n" at the end of the string */
2478                         if (memEQ(checkpos, little, len)) {
2479                             last= checkpos;
2480                         } else {
2481                             DEBUG_EXECUTE_r(
2482                                 PerlIO_printf(Perl_debug_log,
2483                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2484                                     PL_colors[4], PL_colors[5]));
2485                             goto phooey;
2486                         }
2487                     } else {
2488                         /* multiline match, so we have to search for a place where the full string is located */
2489                         goto find_last;
2490                     }
2491             } else {
2492                   find_last:
2493                     if (len)
2494                         last = rninstr(s, strend, little, little + len);
2495                     else
2496                         last = strend;  /* matching "$" */
2497             }
2498             if (!last) {
2499                 /* at one point this block contained a comment which was probably
2500                  * incorrect, which said that this was a "should not happen" case.
2501                  * Even if it was true when it was written I am pretty sure it is
2502                  * not anymore, so I have removed the comment and replaced it with
2503                  * this one. Yves */
2504                 DEBUG_EXECUTE_r(
2505                     PerlIO_printf(Perl_debug_log,
2506                         "String does not contain required substring, cannot match.\n"
2507                     ));
2508                 goto phooey;
2509             }
2510             dontbother = strend - last + prog->float_min_offset;
2511         }
2512         if (minlen && (dontbother < minlen))
2513             dontbother = minlen - 1;
2514         strend -= dontbother;              /* this one's always in bytes! */
2515         /* We don't know much -- general case. */
2516         if (utf8_target) {
2517             for (;;) {
2518                 if (regtry(&reginfo, &s))
2519                     goto got_it;
2520                 if (s >= strend)
2521                     break;
2522                 s += UTF8SKIP(s);
2523             };
2524         }
2525         else {
2526             do {
2527                 if (regtry(&reginfo, &s))
2528                     goto got_it;
2529             } while (s++ < strend);
2530         }
2531     }
2532
2533     /* Failure. */
2534     goto phooey;
2535
2536 got_it:
2537     DEBUG_BUFFERS_r(
2538         if (swap)
2539             PerlIO_printf(Perl_debug_log,
2540                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2541                 PTR2UV(prog),
2542                 PTR2UV(swap)
2543             );
2544     );
2545     Safefree(swap);
2546     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2547
2548     if (PL_reg_state.re_state_eval_setup_done)
2549         restore_pos(aTHX_ prog);
2550     if (RXp_PAREN_NAMES(prog))
2551         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2552
2553     /* make sure $`, $&, $', and $digit will work later */
2554     if ( !(flags & REXEC_NOT_FIRST) ) {
2555         RX_MATCH_COPY_FREE(rx);
2556         if (flags & REXEC_COPY_STR) {
2557             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2558 #ifdef PERL_OLD_COPY_ON_WRITE
2559             if ((SvIsCOW(sv)
2560                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2561                 if (DEBUG_C_TEST) {
2562                     PerlIO_printf(Perl_debug_log,
2563                                   "Copy on write: regexp capture, type %d\n",
2564                                   (int) SvTYPE(sv));
2565                 }
2566                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2567                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2568                 assert (SvPOKp(prog->saved_copy));
2569             } else
2570 #endif
2571             {
2572                 RX_MATCH_COPIED_on(rx);
2573                 s = savepvn(strbeg, i);
2574                 prog->subbeg = s;
2575             }
2576             prog->sublen = i;
2577         }
2578         else {
2579             prog->subbeg = strbeg;
2580             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2581         }
2582     }
2583
2584     return 1;
2585
2586 phooey:
2587     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2588                           PL_colors[4], PL_colors[5]));
2589     if (PL_reg_state.re_state_eval_setup_done)
2590         restore_pos(aTHX_ prog);
2591     if (swap) {
2592         /* we failed :-( roll it back */
2593         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2594             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2595             PTR2UV(prog),
2596             PTR2UV(prog->offs),
2597             PTR2UV(swap)
2598         ));
2599         Safefree(prog->offs);
2600         prog->offs = swap;
2601     }
2602
2603     return 0;
2604 }
2605
2606
2607 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2608  * Do inc before dec, in case old and new rex are the same */
2609 #define SET_reg_curpm(Re2) \
2610     if (PL_reg_state.re_state_eval_setup_done) {    \
2611         (void)ReREFCNT_inc(Re2);                    \
2612         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2613         PM_SETRE((PL_reg_curpm), (Re2));            \
2614     }
2615
2616
2617 /*
2618  - regtry - try match at specific point
2619  */
2620 STATIC I32                      /* 0 failure, 1 success */
2621 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2622 {
2623     dVAR;
2624     CHECKPOINT lastcp;
2625     REGEXP *const rx = reginfo->prog;
2626     regexp *const prog = (struct regexp *)SvANY(rx);
2627     RXi_GET_DECL(prog,progi);
2628     GET_RE_DEBUG_FLAGS_DECL;
2629
2630     PERL_ARGS_ASSERT_REGTRY;
2631
2632     reginfo->cutpoint=NULL;
2633
2634     if ((prog->extflags & RXf_EVAL_SEEN)
2635         && !PL_reg_state.re_state_eval_setup_done)
2636     {
2637         MAGIC *mg;
2638
2639         PL_reg_state.re_state_eval_setup_done = TRUE;
2640         if (reginfo->sv) {
2641             /* Make $_ available to executed code. */
2642             if (reginfo->sv != DEFSV) {
2643                 SAVE_DEFSV;
2644                 DEFSV_set(reginfo->sv);
2645             }
2646
2647             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2648                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2649                 /* prepare for quick setting of pos */
2650 #ifdef PERL_OLD_COPY_ON_WRITE
2651                 if (SvIsCOW(reginfo->sv))
2652                     sv_force_normal_flags(reginfo->sv, 0);
2653 #endif
2654                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2655                                  &PL_vtbl_mglob, NULL, 0);
2656                 mg->mg_len = -1;
2657             }
2658             PL_reg_magic    = mg;
2659             PL_reg_oldpos   = mg->mg_len;
2660             SAVEDESTRUCTOR_X(restore_pos, prog);
2661         }
2662         if (!PL_reg_curpm) {
2663             Newxz(PL_reg_curpm, 1, PMOP);
2664 #ifdef USE_ITHREADS
2665             {
2666                 SV* const repointer = &PL_sv_undef;
2667                 /* this regexp is also owned by the new PL_reg_curpm, which
2668                    will try to free it.  */
2669                 av_push(PL_regex_padav, repointer);
2670                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2671                 PL_regex_pad = AvARRAY(PL_regex_padav);
2672             }
2673 #endif
2674         }
2675         SET_reg_curpm(rx);
2676         PL_reg_oldcurpm = PL_curpm;
2677         PL_curpm = PL_reg_curpm;
2678         if (RXp_MATCH_COPIED(prog)) {
2679             /*  Here is a serious problem: we cannot rewrite subbeg,
2680                 since it may be needed if this match fails.  Thus
2681                 $` inside (?{}) could fail... */
2682             PL_reg_oldsaved = prog->subbeg;
2683             PL_reg_oldsavedlen = prog->sublen;
2684 #ifdef PERL_OLD_COPY_ON_WRITE
2685             PL_nrs = prog->saved_copy;
2686 #endif
2687             RXp_MATCH_COPIED_off(prog);
2688         }
2689         else
2690             PL_reg_oldsaved = NULL;
2691         prog->subbeg = PL_bostr;
2692         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2693     }
2694 #ifdef DEBUGGING
2695     PL_reg_starttry = *startpos;
2696 #endif
2697     prog->offs[0].start = *startpos - PL_bostr;
2698     PL_reginput = *startpos;
2699     prog->lastparen = 0;
2700     prog->lastcloseparen = 0;
2701     PL_regsize = 0;
2702
2703     /* XXXX What this code is doing here?!!!  There should be no need
2704        to do this again and again, prog->lastparen should take care of
2705        this!  --ilya*/
2706
2707     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2708      * Actually, the code in regcppop() (which Ilya may be meaning by
2709      * prog->lastparen), is not needed at all by the test suite
2710      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2711      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2712      * Meanwhile, this code *is* needed for the
2713      * above-mentioned test suite tests to succeed.  The common theme
2714      * on those tests seems to be returning null fields from matches.
2715      * --jhi updated by dapm */
2716 #if 1
2717     if (prog->nparens) {
2718         regexp_paren_pair *pp = prog->offs;
2719         register I32 i;
2720         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2721             ++pp;
2722             pp->start = -1;
2723             pp->end = -1;
2724         }
2725     }
2726 #endif
2727     REGCP_SET(lastcp);
2728     if (regmatch(reginfo, progi->program + 1)) {
2729         prog->offs[0].end = PL_reginput - PL_bostr;
2730         return 1;
2731     }
2732     if (reginfo->cutpoint)
2733         *startpos= reginfo->cutpoint;
2734     REGCP_UNWIND(lastcp);
2735     return 0;
2736 }
2737
2738
2739 #define sayYES goto yes
2740 #define sayNO goto no
2741 #define sayNO_SILENT goto no_silent
2742
2743 /* we dont use STMT_START/END here because it leads to
2744    "unreachable code" warnings, which are bogus, but distracting. */
2745 #define CACHEsayNO \
2746     if (ST.cache_mask) \
2747        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2748     sayNO
2749
2750 /* this is used to determine how far from the left messages like
2751    'failed...' are printed. It should be set such that messages
2752    are inline with the regop output that created them.
2753 */
2754 #define REPORT_CODE_OFF 32
2755
2756
2757 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2758 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2759
2760 #define SLAB_FIRST(s) (&(s)->states[0])
2761 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2762
2763 /* grab a new slab and return the first slot in it */
2764
2765 STATIC regmatch_state *
2766 S_push_slab(pTHX)
2767 {
2768 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2769     dMY_CXT;
2770 #endif
2771     regmatch_slab *s = PL_regmatch_slab->next;
2772     if (!s) {
2773         Newx(s, 1, regmatch_slab);
2774         s->prev = PL_regmatch_slab;
2775         s->next = NULL;
2776         PL_regmatch_slab->next = s;
2777     }
2778     PL_regmatch_slab = s;
2779     return SLAB_FIRST(s);
2780 }
2781
2782
2783 /* push a new state then goto it */
2784
2785 #define PUSH_STATE_GOTO(state, node) \
2786     scan = node; \
2787     st->resume_state = state; \
2788     goto push_state;
2789
2790 /* push a new state with success backtracking, then goto it */
2791
2792 #define PUSH_YES_STATE_GOTO(state, node) \
2793     scan = node; \
2794     st->resume_state = state; \
2795     goto push_yes_state;
2796
2797
2798
2799 /*
2800
2801 regmatch() - main matching routine
2802
2803 This is basically one big switch statement in a loop. We execute an op,
2804 set 'next' to point the next op, and continue. If we come to a point which
2805 we may need to backtrack to on failure such as (A|B|C), we push a
2806 backtrack state onto the backtrack stack. On failure, we pop the top
2807 state, and re-enter the loop at the state indicated. If there are no more
2808 states to pop, we return failure.
2809
2810 Sometimes we also need to backtrack on success; for example /A+/, where
2811 after successfully matching one A, we need to go back and try to
2812 match another one; similarly for lookahead assertions: if the assertion
2813 completes successfully, we backtrack to the state just before the assertion
2814 and then carry on.  In these cases, the pushed state is marked as
2815 'backtrack on success too'. This marking is in fact done by a chain of
2816 pointers, each pointing to the previous 'yes' state. On success, we pop to
2817 the nearest yes state, discarding any intermediate failure-only states.
2818 Sometimes a yes state is pushed just to force some cleanup code to be
2819 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2820 it to free the inner regex.
2821
2822 Note that failure backtracking rewinds the cursor position, while
2823 success backtracking leaves it alone.
2824
2825 A pattern is complete when the END op is executed, while a subpattern
2826 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2827 ops trigger the "pop to last yes state if any, otherwise return true"
2828 behaviour.
2829
2830 A common convention in this function is to use A and B to refer to the two
2831 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2832 the subpattern to be matched possibly multiple times, while B is the entire
2833 rest of the pattern. Variable and state names reflect this convention.
2834
2835 The states in the main switch are the union of ops and failure/success of
2836 substates associated with with that op.  For example, IFMATCH is the op
2837 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2838 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2839 successfully matched A and IFMATCH_A_fail is a state saying that we have
2840 just failed to match A. Resume states always come in pairs. The backtrack
2841 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2842 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2843 on success or failure.
2844
2845 The struct that holds a backtracking state is actually a big union, with
2846 one variant for each major type of op. The variable st points to the
2847 top-most backtrack struct. To make the code clearer, within each
2848 block of code we #define ST to alias the relevant union.
2849
2850 Here's a concrete example of a (vastly oversimplified) IFMATCH
2851 implementation:
2852
2853     switch (state) {
2854     ....
2855
2856 #define ST st->u.ifmatch
2857
2858     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2859         ST.foo = ...; // some state we wish to save
2860         ...
2861         // push a yes backtrack state with a resume value of
2862         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2863         // first node of A:
2864         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2865         // NOTREACHED
2866
2867     case IFMATCH_A: // we have successfully executed A; now continue with B
2868         next = B;
2869         bar = ST.foo; // do something with the preserved value
2870         break;
2871
2872     case IFMATCH_A_fail: // A failed, so the assertion failed
2873         ...;   // do some housekeeping, then ...
2874         sayNO; // propagate the failure
2875
2876 #undef ST
2877
2878     ...
2879     }
2880
2881 For any old-timers reading this who are familiar with the old recursive
2882 approach, the code above is equivalent to:
2883
2884     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2885     {
2886         int foo = ...
2887         ...
2888         if (regmatch(A)) {
2889             next = B;
2890             bar = foo;
2891             break;
2892         }
2893         ...;   // do some housekeeping, then ...
2894         sayNO; // propagate the failure
2895     }
2896
2897 The topmost backtrack state, pointed to by st, is usually free. If you
2898 want to claim it, populate any ST.foo fields in it with values you wish to
2899 save, then do one of
2900
2901         PUSH_STATE_GOTO(resume_state, node);
2902         PUSH_YES_STATE_GOTO(resume_state, node);
2903
2904 which sets that backtrack state's resume value to 'resume_state', pushes a
2905 new free entry to the top of the backtrack stack, then goes to 'node'.
2906 On backtracking, the free slot is popped, and the saved state becomes the
2907 new free state. An ST.foo field in this new top state can be temporarily
2908 accessed to retrieve values, but once the main loop is re-entered, it
2909 becomes available for reuse.
2910
2911 Note that the depth of the backtrack stack constantly increases during the
2912 left-to-right execution of the pattern, rather than going up and down with
2913 the pattern nesting. For example the stack is at its maximum at Z at the
2914 end of the pattern, rather than at X in the following:
2915
2916     /(((X)+)+)+....(Y)+....Z/
2917
2918 The only exceptions to this are lookahead/behind assertions and the cut,
2919 (?>A), which pop all the backtrack states associated with A before
2920 continuing.
2921
2922 Backtrack state structs are allocated in slabs of about 4K in size.
2923 PL_regmatch_state and st always point to the currently active state,
2924 and PL_regmatch_slab points to the slab currently containing
2925 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2926 allocated, and is never freed until interpreter destruction. When the slab
2927 is full, a new one is allocated and chained to the end. At exit from
2928 regmatch(), slabs allocated since entry are freed.
2929
2930 */
2931
2932
2933 #define DEBUG_STATE_pp(pp)                                  \
2934     DEBUG_STATE_r({                                         \
2935         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2936         PerlIO_printf(Perl_debug_log,                       \
2937             "    %*s"pp" %s%s%s%s%s\n",                     \
2938             depth*2, "",                                    \
2939             PL_reg_name[st->resume_state],                     \
2940             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2941             ((st==yes_state) ? "Y" : ""),                   \
2942             ((st==mark_state) ? "M" : ""),                  \
2943             ((st==yes_state||st==mark_state) ? "]" : "")    \
2944         );                                                  \
2945     });
2946
2947
2948 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2949
2950 #ifdef DEBUGGING
2951
2952 STATIC void
2953 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2954     const char *start, const char *end, const char *blurb)
2955 {
2956     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2957
2958     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2959
2960     if (!PL_colorset)
2961             reginitcolors();
2962     {
2963         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2964             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2965
2966         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2967             start, end - start, 60);
2968
2969         PerlIO_printf(Perl_debug_log,
2970             "%s%s REx%s %s against %s\n",
2971                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2972
2973         if (utf8_target||utf8_pat)
2974             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2975                 utf8_pat ? "pattern" : "",
2976                 utf8_pat && utf8_target ? " and " : "",
2977                 utf8_target ? "string" : ""
2978             );
2979     }
2980 }
2981
2982 STATIC void
2983 S_dump_exec_pos(pTHX_ const char *locinput,
2984                       const regnode *scan,
2985                       const char *loc_regeol,
2986                       const char *loc_bostr,
2987                       const char *loc_reg_starttry,
2988                       const bool utf8_target)
2989 {
2990     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2991     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2992     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2993     /* The part of the string before starttry has one color
2994        (pref0_len chars), between starttry and current
2995        position another one (pref_len - pref0_len chars),
2996        after the current position the third one.
2997        We assume that pref0_len <= pref_len, otherwise we
2998        decrease pref0_len.  */
2999     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3000         ? (5 + taill) - l : locinput - loc_bostr;
3001     int pref0_len;
3002
3003     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3004
3005     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3006         pref_len++;
3007     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3008     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3009         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3010               ? (5 + taill) - pref_len : loc_regeol - locinput);
3011     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3012         l--;
3013     if (pref0_len < 0)
3014         pref0_len = 0;
3015     if (pref0_len > pref_len)
3016         pref0_len = pref_len;
3017     {
3018         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3019
3020         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3021             (locinput - pref_len),pref0_len, 60, 4, 5);
3022
3023         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3024                     (locinput - pref_len + pref0_len),
3025                     pref_len - pref0_len, 60, 2, 3);
3026
3027         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3028                     locinput, loc_regeol - locinput, 10, 0, 1);
3029
3030         const STRLEN tlen=len0+len1+len2;
3031         PerlIO_printf(Perl_debug_log,
3032                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3033                     (IV)(locinput - loc_bostr),
3034                     len0, s0,
3035                     len1, s1,
3036                     (docolor ? "" : "> <"),
3037                     len2, s2,
3038                     (int)(tlen > 19 ? 0 :  19 - tlen),
3039                     "");
3040     }
3041 }
3042
3043 #endif
3044
3045 /* reg_check_named_buff_matched()
3046  * Checks to see if a named buffer has matched. The data array of
3047  * buffer numbers corresponding to the buffer is expected to reside
3048  * in the regexp->data->data array in the slot stored in the ARG() of
3049  * node involved. Note that this routine doesn't actually care about the
3050  * name, that information is not preserved from compilation to execution.
3051  * Returns the index of the leftmost defined buffer with the given name
3052  * or 0 if non of the buffers matched.
3053  */
3054 STATIC I32
3055 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3056 {
3057     I32 n;
3058     RXi_GET_DECL(rex,rexi);
3059     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3060     I32 *nums=(I32*)SvPVX(sv_dat);
3061
3062     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3063
3064     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3065         if ((I32)rex->lastparen >= nums[n] &&
3066             rex->offs[nums[n]].end != -1)
3067         {
3068             return nums[n];
3069         }
3070     }
3071     return 0;
3072 }
3073
3074
3075 /* free all slabs above current one  - called during LEAVE_SCOPE */
3076
3077 STATIC void
3078 S_clear_backtrack_stack(pTHX_ void *p)
3079 {
3080     regmatch_slab *s = PL_regmatch_slab->next;
3081     PERL_UNUSED_ARG(p);
3082
3083     if (!s)
3084         return;
3085     PL_regmatch_slab->next = NULL;
3086     while (s) {
3087         regmatch_slab * const osl = s;
3088         s = s->next;
3089         Safefree(osl);
3090     }
3091 }
3092
3093
3094 STATIC I32                      /* 0 failure, 1 success */
3095 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3096 {
3097 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3098     dMY_CXT;
3099 #endif
3100     dVAR;
3101     register const bool utf8_target = PL_reg_match_utf8;
3102     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3103     REGEXP *rex_sv = reginfo->prog;
3104     regexp *rex = (struct regexp *)SvANY(rex_sv);
3105     RXi_GET_DECL(rex,rexi);
3106     I32 oldsave;
3107     /* the current state. This is a cached copy of PL_regmatch_state */
3108     register regmatch_state *st;
3109     /* cache heavy used fields of st in registers */
3110     register regnode *scan;
3111     register regnode *next;
3112     register U32 n = 0; /* general value; init to avoid compiler warning */
3113     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3114     register char *locinput = PL_reginput;
3115     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3116
3117     bool result = 0;        /* return value of S_regmatch */
3118     int depth = 0;          /* depth of backtrack stack */
3119     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3120     const U32 max_nochange_depth =
3121         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3122         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3123     regmatch_state *yes_state = NULL; /* state to pop to on success of
3124                                                             subpattern */
3125     /* mark_state piggy backs on the yes_state logic so that when we unwind
3126        the stack on success we can update the mark_state as we go */
3127     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3128     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3129     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3130     U32 state_num;
3131     bool no_final = 0;      /* prevent failure from backtracking? */
3132     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3133     char *startpoint = PL_reginput;
3134     SV *popmark = NULL;     /* are we looking for a mark? */
3135     SV *sv_commit = NULL;   /* last mark name seen in failure */
3136     SV *sv_yes_mark = NULL; /* last mark name we have seen
3137                                during a successful match */
3138     U32 lastopen = 0;       /* last open we saw */
3139     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3140     SV* const oreplsv = GvSV(PL_replgv);
3141     /* these three flags are set by various ops to signal information to
3142      * the very next op. They have a useful lifetime of exactly one loop
3143      * iteration, and are not preserved or restored by state pushes/pops
3144      */
3145     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3146     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3147     int logical = 0;        /* the following EVAL is:
3148                                 0: (?{...})
3149                                 1: (?(?{...})X|Y)
3150                                 2: (??{...})
3151                                or the following IFMATCH/UNLESSM is:
3152                                 false: plain (?=foo)
3153                                 true:  used as a condition: (?(?=foo))
3154                             */
3155     PAD* last_pad = NULL;
3156     dMULTICALL;
3157     I32 gimme = G_SCALAR;
3158     CV *caller_cv = NULL;       /* who called us */
3159     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3160     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3161
3162 #ifdef DEBUGGING
3163     GET_RE_DEBUG_FLAGS_DECL;
3164 #endif
3165
3166     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3167     multicall_oldcatch = 0;
3168     multicall_cv = NULL;
3169     cx = NULL;
3170
3171
3172     PERL_ARGS_ASSERT_REGMATCH;
3173
3174     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3175             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3176     }));
3177     /* on first ever call to regmatch, allocate first slab */
3178     if (!PL_regmatch_slab) {
3179         Newx(PL_regmatch_slab, 1, regmatch_slab);
3180         PL_regmatch_slab->prev = NULL;
3181         PL_regmatch_slab->next = NULL;
3182         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3183     }
3184
3185     oldsave = PL_savestack_ix;
3186     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3187     SAVEVPTR(PL_regmatch_slab);
3188     SAVEVPTR(PL_regmatch_state);
3189
3190     /* grab next free state slot */
3191     st = ++PL_regmatch_state;
3192     if (st >  SLAB_LAST(PL_regmatch_slab))
3193         st = PL_regmatch_state = S_push_slab(aTHX);
3194
3195     /* Note that nextchr is a byte even in UTF */
3196     nextchr = UCHARAT(locinput);
3197     scan = prog;
3198     while (scan != NULL) {
3199
3200         DEBUG_EXECUTE_r( {
3201             SV * const prop = sv_newmortal();
3202             regnode *rnext=regnext(scan);
3203             DUMP_EXEC_POS( locinput, scan, utf8_target );
3204             regprop(rex, prop, scan);
3205
3206             PerlIO_printf(Perl_debug_log,
3207                     "%3"IVdf":%*s%s(%"IVdf")\n",
3208                     (IV)(scan - rexi->program), depth*2, "",
3209                     SvPVX_const(prop),
3210                     (PL_regkind[OP(scan)] == END || !rnext) ?
3211                         0 : (IV)(rnext - rexi->program));
3212         });
3213
3214         next = scan + NEXT_OFF(scan);
3215         if (next == scan)
3216             next = NULL;
3217         state_num = OP(scan);
3218
3219       reenter_switch:
3220
3221         switch (state_num) {
3222         case BOL:
3223             if (locinput == PL_bostr)
3224             {
3225                 /* reginfo->till = reginfo->bol; */
3226                 break;
3227             }
3228             sayNO;
3229         case MBOL:
3230             if (locinput == PL_bostr ||
3231                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3232             {
3233                 break;
3234             }
3235             sayNO;
3236         case SBOL:
3237             if (locinput == PL_bostr)
3238                 break;
3239             sayNO;
3240         case GPOS:
3241             if (locinput == reginfo->ganch)
3242                 break;
3243             sayNO;
3244
3245         case KEEPS:
3246             /* update the startpoint */
3247             st->u.keeper.val = rex->offs[0].start;
3248             PL_reginput = locinput;
3249             rex->offs[0].start = locinput - PL_bostr;
3250             PUSH_STATE_GOTO(KEEPS_next, next);
3251             /*NOT-REACHED*/
3252         case KEEPS_next_fail:
3253             /* rollback the start point change */
3254             rex->offs[0].start = st->u.keeper.val;
3255             sayNO_SILENT;
3256             /*NOT-REACHED*/
3257         case EOL:
3258                 goto seol;
3259         case MEOL:
3260             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3261                 sayNO;
3262             break;
3263         case SEOL:
3264           seol:
3265             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3266                 sayNO;
3267             if (PL_regeol - locinput > 1)
3268                 sayNO;
3269             break;
3270         case EOS:
3271             if (PL_regeol != locinput)
3272                 sayNO;
3273             break;
3274         case SANY:
3275             if (!nextchr && locinput >= PL_regeol)
3276                 sayNO;
3277             if (utf8_target) {
3278                 locinput += PL_utf8skip[nextchr];
3279                 if (locinput > PL_regeol)
3280                     sayNO;
3281                 nextchr = UCHARAT(locinput);
3282             }
3283             else
3284                 nextchr = UCHARAT(++locinput);
3285             break;
3286         case CANY:
3287             if (!nextchr && locinput >= PL_regeol)
3288                 sayNO;
3289             nextchr = UCHARAT(++locinput);
3290             break;
3291         case REG_ANY:
3292             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3293                 sayNO;
3294             if (utf8_target) {
3295                 locinput += PL_utf8skip[nextchr];
3296                 if (locinput > PL_regeol)
3297                     sayNO;
3298                 nextchr = UCHARAT(locinput);
3299             }
3300             else
3301                 nextchr = UCHARAT(++locinput);
3302             break;
3303
3304 #undef  ST
3305 #define ST st->u.trie
3306         case TRIEC:
3307             /* In this case the charclass data is available inline so
3308                we can fail fast without a lot of extra overhead.
3309              */
3310             if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3311                 DEBUG_EXECUTE_r(
3312                     PerlIO_printf(Perl_debug_log,
3313                               "%*s  %sfailed to match trie start class...%s\n",
3314                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3315                 );
3316                 sayNO_SILENT;
3317                 assert(0); /* NOTREACHED */
3318             }
3319             /* FALL THROUGH */
3320         case TRIE:
3321             /* the basic plan of execution of the trie is:
3322              * At the beginning, run though all the states, and
3323              * find the longest-matching word. Also remember the position
3324              * of the shortest matching word. For example, this pattern:
3325              *    1  2 3 4    5
3326              *    ab|a|x|abcd|abc
3327              * when matched against the string "abcde", will generate
3328              * accept states for all words except 3, with the longest
3329              * matching word being 4, and the shortest being 1 (with
3330              * the position being after char 1 of the string).
3331              *
3332              * Then for each matching word, in word order (i.e. 1,2,4,5),
3333              * we run the remainder of the pattern; on each try setting
3334              * the current position to the character following the word,
3335              * returning to try the next word on failure.
3336              *
3337              * We avoid having to build a list of words at runtime by
3338              * using a compile-time structure, wordinfo[].prev, which
3339              * gives, for each word, the previous accepting word (if any).
3340              * In the case above it would contain the mappings 1->2, 2->0,
3341              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3342              * the longest word (4 above), a list of all words, by
3343              * following the list of prev pointers; this gives us the
3344              * unordered list 4,5,1,2. Then given the current word we have
3345              * just tried, we can go through the list and find the
3346              * next-biggest word to try (so if we just failed on word 2,
3347              * the next in the list is 4).
3348              *
3349              * Since at runtime we don't record the matching position in
3350              * the string for each word, we have to work that out for
3351              * each word we're about to process. The wordinfo table holds
3352              * the character length of each word; given that we recorded
3353              * at the start: the position of the shortest word and its
3354              * length in chars, we just need to move the pointer the
3355              * difference between the two char lengths. Depending on
3356              * Unicode status and folding, that's cheap or expensive.
3357              *
3358              * This algorithm is optimised for the case where are only a
3359              * small number of accept states, i.e. 0,1, or maybe 2.
3360              * With lots of accepts states, and having to try all of them,
3361              * it becomes quadratic on number of accept states to find all
3362              * the next words.
3363              */
3364
3365             {
3366                 /* what type of TRIE am I? (utf8 makes this contextual) */
3367                 DECL_TRIE_TYPE(scan);
3368
3369                 /* what trie are we using right now */
3370                 reg_trie_data * const trie
3371                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3372                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3373                 U32 state = trie->startstate;
3374
3375                 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3376                     if (trie->states[ state ].wordnum) {
3377                          DEBUG_EXECUTE_r(
3378                             PerlIO_printf(Perl_debug_log,
3379                                           "%*s  %smatched empty string...%s\n",
3380                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3381                         );
3382                         if (!trie->jump)
3383                             break;
3384                     } else {
3385                         DEBUG_EXECUTE_r(
3386                             PerlIO_printf(Perl_debug_log,
3387                                           "%*s  %sfailed to match trie start class...%s\n",
3388                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3389                         );
3390                         sayNO_SILENT;
3391                    }
3392                 }
3393
3394             {
3395                 U8 *uc = ( U8* )locinput;
3396
3397                 STRLEN len = 0;
3398                 STRLEN foldlen = 0;
3399                 U8 *uscan = (U8*)NULL;
3400                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3401                 U32 charcount = 0; /* how many input chars we have matched */
3402                 U32 accepted = 0; /* have we seen any accepting states? */
3403
3404                 ST.jump = trie->jump;
3405                 ST.me = scan;
3406                 ST.firstpos = NULL;
3407                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3408                 ST.nextword = 0;
3409
3410                 /* fully traverse the TRIE; note the position of the
3411                    shortest accept state and the wordnum of the longest
3412                    accept state */
3413
3414                 while ( state && uc <= (U8*)PL_regeol ) {
3415                     U32 base = trie->states[ state ].trans.base;
3416                     UV uvc = 0;
3417                     U16 charid = 0;
3418                     U16 wordnum;
3419                     wordnum = trie->states[ state ].wordnum;
3420
3421                     if (wordnum) { /* it's an accept state */
3422                         if (!accepted) {
3423                             accepted = 1;
3424                             /* record first match position */
3425                             if (ST.longfold) {
3426                                 ST.firstpos = (U8*)locinput;
3427                                 ST.firstchars = 0;
3428                             }
3429                             else {
3430                                 ST.firstpos = uc;
3431                                 ST.firstchars = charcount;
3432                             }
3433                         }
3434                         if (!ST.nextword || wordnum < ST.nextword)
3435                             ST.nextword = wordnum;
3436                         ST.topword = wordnum;
3437                     }
3438
3439                     DEBUG_TRIE_EXECUTE_r({
3440                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3441                                 PerlIO_printf( Perl_debug_log,
3442                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3443                                     2+depth * 2, "", PL_colors[4],
3444                                     (UV)state, (accepted ? 'Y' : 'N'));
3445                     });
3446
3447                     /* read a char and goto next state */
3448                     if ( base ) {
3449                         I32 offset;
3450                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3451                                              uscan, len, uvc, charid, foldlen,
3452                                              foldbuf, uniflags);
3453                         charcount++;
3454                         if (foldlen>0)
3455                             ST.longfold = TRUE;
3456                         if (charid &&
3457                              ( ((offset =
3458                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3459
3460                              && ((U32)offset < trie->lasttrans)
3461                              && trie->trans[offset].check == state)
3462                         {
3463                             state = trie->trans[offset].next;
3464                         }
3465                         else {
3466                             state = 0;
3467                         }
3468                         uc += len;
3469
3470                     }
3471                     else {
3472                         state = 0;
3473                     }
3474                     DEBUG_TRIE_EXECUTE_r(
3475                         PerlIO_printf( Perl_debug_log,
3476                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3477                             charid, uvc, (UV)state, PL_colors[5] );
3478                     );
3479                 }
3480                 if (!accepted)
3481                    sayNO;
3482
3483                 /* calculate total number of accept states */
3484                 {
3485                     U16 w = ST.topword;
3486                     accepted = 0;
3487                     while (w) {
3488                         w = trie->wordinfo[w].prev;
3489                         accepted++;
3490                     }
3491                     ST.accepted = accepted;
3492                 }
3493
3494                 DEBUG_EXECUTE_r(
3495                     PerlIO_printf( Perl_debug_log,
3496                         "%*s  %sgot %"IVdf" possible matches%s\n",
3497                         REPORT_CODE_OFF + depth * 2, "",
3498                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3499                 );
3500                 goto trie_first_try; /* jump into the fail handler */
3501             }}
3502             assert(0); /* NOTREACHED */
3503
3504         case TRIE_next_fail: /* we failed - try next alternative */
3505             if ( ST.jump) {
3506                 REGCP_UNWIND(ST.cp);
3507                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3508             }
3509             if (!--ST.accepted) {
3510                 DEBUG_EXECUTE_r({
3511                     PerlIO_printf( Perl_debug_log,
3512                         "%*s  %sTRIE failed...%s\n",
3513                         REPORT_CODE_OFF+depth*2, "",
3514                         PL_colors[4],
3515                         PL_colors[5] );
3516                 });
3517                 sayNO_SILENT;
3518             }
3519             {
3520                 /* Find next-highest word to process.  Note that this code
3521                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3522                 register U16 min = 0;
3523                 register U16 word;
3524                 register U16 const nextword = ST.nextword;
3525                 register reg_trie_wordinfo * const wordinfo
3526                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3527                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3528                     if (word > nextword && (!min || word < min))
3529                         min = word;
3530                 }
3531                 ST.nextword = min;
3532             }
3533
3534           trie_first_try:
3535             if (do_cutgroup) {
3536                 do_cutgroup = 0;
3537                 no_final = 0;
3538             }
3539
3540             if ( ST.jump) {
3541                 ST.lastparen = rex->lastparen;
3542                 ST.lastcloseparen = rex->lastcloseparen;
3543                 REGCP_SET(ST.cp);
3544             }
3545
3546             /* find start char of end of current word */
3547             {
3548                 U32 chars; /* how many chars to skip */
3549                 U8 *uc = ST.firstpos;
3550                 reg_trie_data * const trie
3551                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3552
3553                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3554                             >=  ST.firstchars);
3555                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3556                             - ST.firstchars;
3557
3558                 if (ST.longfold) {
3559                     /* the hard option - fold each char in turn and find
3560                      * its folded length (which may be different */
3561                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3562                     STRLEN foldlen;
3563                     STRLEN len;
3564                     UV uvc;
3565                     U8 *uscan;
3566
3567                     while (chars) {
3568                         if (utf8_target) {
3569                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3570                                                     uniflags);
3571                             uc += len;
3572                         }
3573                         else {
3574                             uvc = *uc;
3575                             uc++;
3576                         }
3577                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3578                         uscan = foldbuf;
3579                         while (foldlen) {
3580                             if (!--chars)
3581                                 break;
3582                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3583                                             uniflags);
3584                             uscan += len;
3585                             foldlen -= len;
3586                         }
3587                     }
3588                 }
3589                 else {
3590                     if (utf8_target)
3591                         while (chars--)
3592                             uc += UTF8SKIP(uc);
3593                     else
3594                         uc += chars;
3595                 }
3596                 PL_reginput = (char *)uc;
3597             }
3598
3599             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3600                             ? ST.jump[ST.nextword]
3601                             : NEXT_OFF(ST.me));
3602
3603             DEBUG_EXECUTE_r({
3604                 PerlIO_printf( Perl_debug_log,
3605                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3606                     REPORT_CODE_OFF+depth*2, "",
3607                     PL_colors[4],
3608                     ST.nextword,
3609                     PL_colors[5]
3610                     );
3611             });
3612
3613             if (ST.accepted > 1 || has_cutgroup) {
3614                 PUSH_STATE_GOTO(TRIE_next, scan);
3615                 assert(0); /* NOTREACHED */
3616             }
3617             /* only one choice left - just continue */
3618             DEBUG_EXECUTE_r({
3619                 AV *const trie_words
3620                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3621                 SV ** const tmp = av_fetch( trie_words,
3622                     ST.nextword-1, 0 );
3623                 SV *sv= tmp ? sv_newmortal() : NULL;
3624
3625                 PerlIO_printf( Perl_debug_log,
3626                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3627                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3628                     ST.nextword,
3629                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3630                             PL_colors[0], PL_colors[1],
3631                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3632                         )
3633                     : "not compiled under -Dr",
3634                     PL_colors[5] );
3635             });
3636
3637             locinput = PL_reginput;
3638             nextchr = UCHARAT(locinput);
3639             continue; /* execute rest of RE */
3640             assert(0); /* NOTREACHED */
3641 #undef  ST
3642
3643         case EXACT: {
3644             char *s = STRING(scan);
3645             ln = STR_LEN(scan);
3646             if (utf8_target != UTF_PATTERN) {
3647                 /* The target and the pattern have differing utf8ness. */
3648                 char *l = locinput;
3649                 const char * const e = s + ln;
3650
3651                 if (utf8_target) {
3652                     /* The target is utf8, the pattern is not utf8. */
3653                     while (s < e) {
3654                         STRLEN ulen;
3655                         if (l >= PL_regeol)
3656                              sayNO;
3657                         if (NATIVE_TO_UNI(*(U8*)s) !=
3658                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3659                                             uniflags))
3660                              sayNO;
3661                         l += ulen;
3662                         s ++;
3663                     }
3664                 }
3665                 else {
3666                     /* The target is not utf8, the pattern is utf8. */
3667                     while (s < e) {
3668                         STRLEN ulen;
3669                         if (l >= PL_regeol)
3670                             sayNO;
3671                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3672                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3673                                            uniflags))
3674                             sayNO;
3675                         s += ulen;
3676                         l ++;
3677                     }
3678                 }
3679                 locinput = l;
3680                 nextchr = UCHARAT(locinput);
3681                 break;
3682             }
3683             /* The target and the pattern have the same utf8ness. */
3684             /* Inline the first character, for speed. */
3685             if (UCHARAT(s) != nextchr)
3686                 sayNO;
3687             if (PL_regeol - locinput < ln)
3688                 sayNO;
3689             if (ln > 1 && memNE(s, locinput, ln))
3690                 sayNO;
3691             locinput += ln;
3692             nextchr = UCHARAT(locinput);
3693             break;
3694             }
3695         case EXACTFL: {
3696             re_fold_t folder;
3697             const U8 * fold_array;
3698             const char * s;
3699             U32 fold_utf8_flags;
3700
3701             PL_reg_flags |= RF_tainted;
3702             folder = foldEQ_locale;
3703             fold_array = PL_fold_locale;
3704             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3705             goto do_exactf;
3706
3707         case EXACTFU_SS:
3708         case EXACTFU_TRICKYFOLD:
3709         case EXACTFU:
3710             folder = foldEQ_latin1;
3711             fold_array = PL_fold_latin1;
3712             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3713             goto do_exactf;
3714
3715         case EXACTFA:
3716             folder = foldEQ_latin1;
3717             fold_array = PL_fold_latin1;
3718             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3719             goto do_exactf;
3720
3721         case EXACTF:
3722             folder = foldEQ;
3723             fold_array = PL_fold;
3724             fold_utf8_flags = 0;
3725
3726           do_exactf:
3727             s = STRING(scan);
3728             ln = STR_LEN(scan);
3729
3730             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3731               /* Either target or the pattern are utf8, or has the issue where
3732                * the fold lengths may differ. */
3733                 const char * const l = locinput;
3734                 char *e = PL_regeol;
3735
3736                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3737                                         l, &e, 0,  utf8_target, fold_utf8_flags))
3738                 {
3739                     sayNO;
3740                 }
3741                 locinput = e;
3742                 nextchr = UCHARAT(locinput);
3743                 break;
3744             }
3745
3746             /* Neither the target nor the pattern are utf8 */
3747             if (UCHARAT(s) != nextchr &&
3748                 UCHARAT(s) != fold_array[nextchr])
3749             {
3750                 sayNO;
3751             }
3752             if (PL_regeol - locinput < ln)
3753                 sayNO;
3754             if (ln > 1 && ! folder(s, locinput, ln))
3755                 sayNO;
3756             locinput += ln;
3757             nextchr = UCHARAT(locinput);
3758             break;
3759         }
3760
3761         /* XXX Could improve efficiency by separating these all out using a
3762          * macro or in-line function.  At that point regcomp.c would no longer
3763          * have to set the FLAGS fields of these */
3764         case BOUNDL:
3765         case NBOUNDL:
3766             PL_reg_flags |= RF_tainted;
3767             /* FALL THROUGH */
3768         case BOUND:
3769         case BOUNDU:
3770         case BOUNDA:
3771         case NBOUND:
3772         case NBOUNDU:
3773         case NBOUNDA:
3774             /* was last char in word? */
3775             if (utf8_target
3776                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3777                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3778             {
3779                 if (locinput == PL_bostr)
3780                     ln = '\n';
3781                 else {
3782                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3783
3784                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3785                 }
3786                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3787                     ln = isALNUM_uni(ln);
3788                     LOAD_UTF8_CHARCLASS_ALNUM();
3789                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3790                 }
3791                 else {
3792                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3793                     n = isALNUM_LC_utf8((U8*)locinput);
3794                 }
3795             }
3796             else {
3797
3798                 /* Here the string isn't utf8, or is utf8 and only ascii
3799                  * characters are to match \w.  In the latter case looking at
3800                  * the byte just prior to the current one may be just the final
3801                  * byte of a multi-byte character.  This is ok.  There are two
3802                  * cases:
3803                  * 1) it is a single byte character, and then the test is doing
3804                  *      just what it's supposed to.
3805                  * 2) it is a multi-byte character, in which case the final
3806                  *      byte is never mistakable for ASCII, and so the test
3807                  *      will say it is not a word character, which is the
3808                  *      correct answer. */
3809                 ln = (locinput != PL_bostr) ?
3810                     UCHARAT(locinput - 1) : '\n';
3811                 switch (FLAGS(scan)) {
3812                     case REGEX_UNICODE_CHARSET:
3813                         ln = isWORDCHAR_L1(ln);
3814                         n = isWORDCHAR_L1(nextchr);
3815                         break;
3816                     case REGEX_LOCALE_CHARSET:
3817                         ln = isALNUM_LC(ln);
3818                         n = isALNUM_LC(nextchr);
3819                         break;
3820                     case REGEX_DEPENDS_CHARSET:
3821                         ln = isALNUM(ln);
3822                         n = isALNUM(nextchr);
3823                         break;
3824                     case REGEX_ASCII_RESTRICTED_CHARSET:
3825                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3826                         ln = isWORDCHAR_A(ln);
3827                         n = isWORDCHAR_A(nextchr);
3828                         break;
3829                     default:
3830                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3831                         break;
3832                 }
3833             }
3834             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3835              * regcomp.sym */
3836             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3837                     sayNO;
3838             break;
3839         case ANYOFV:
3840         case ANYOF:
3841             if (utf8_target || state_num == ANYOFV) {
3842                 STRLEN inclasslen = PL_regeol - locinput;
3843                 if (locinput >= PL_regeol)
3844                     sayNO;
3845
3846                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3847                     sayNO;
3848                 locinput += inclasslen;
3849                 nextchr = UCHARAT(locinput);
3850                 break;
3851             }
3852             else {
3853                 if (nextchr < 0)
3854                     nextchr = UCHARAT(locinput);
3855                 if (!nextchr && locinput >= PL_regeol)
3856                     sayNO;
3857                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3858                     sayNO;
3859                 nextchr = UCHARAT(++locinput);
3860                 break;
3861             }
3862             break;
3863         /* Special char classes - The defines start on line 129 or so */
3864         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3865                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3866                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3867                   ALNUMA, NALNUMA, isWORDCHAR_A,
3868                   alnum, "a");
3869
3870         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3871                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3872                   SPACEU, NSPACEU, isSPACE_L1,
3873                   SPACEA, NSPACEA, isSPACE_A,
3874                   space, " ");
3875
3876         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3877                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3878                 DIGITA, NDIGITA, isDIGIT_A,
3879                 digit, "0");
3880
3881         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3882                        a Unicode extended Grapheme Cluster */
3883             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3884               extended Grapheme Cluster is:
3885
3886                CR LF
3887                | Prepend* Begin Extend*
3888                | .
3889
3890                Begin is (Hangul-syllable | ! Control)
3891                Extend is (Grapheme_Extend | Spacing_Mark)
3892                Control is [ GCB_Control CR LF ]
3893
3894                The discussion below shows how the code for CLUMP is derived
3895                from this regex.  Note that most of these concepts are from
3896                property values of the Grapheme Cluster Boundary (GCB) property.
3897                No code point can have multiple property values for a given
3898                property.  Thus a code point in Prepend can't be in Control, but
3899                it must be in !Control.  This is why Control above includes
3900                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3901                property separately, and so can't be in GCB_Control, even though
3902                they logically are controls.  Control is not the same as gc=cc,
3903                but includes format and other characters as well.
3904
3905                The Unicode definition of Hangul-syllable is:
3906                    L+
3907                    | (L* ( ( V | LV ) V* | LVT ) T*)
3908                    | T+
3909                   )
3910                Each of these is a value for the GCB property, and hence must be
3911                disjoint, so the order they are tested is immaterial, so the
3912                above can safely be changed to
3913                    T+
3914                    | L+
3915                    | (L* ( LVT | ( V | LV ) V*) T*)
3916
3917                The last two terms can be combined like this:
3918                    L* ( L
3919                         | (( LVT | ( V | LV ) V*) T*))
3920
3921                And refactored into this:
3922                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3923
3924                That means that if we have seen any L's at all we can quit
3925                there, but if the next character is an LVT, a V, or an LV we
3926                should keep going.
3927
3928                There is a subtlety with Prepend* which showed up in testing.
3929                Note that the Begin, and only the Begin is required in:
3930                 | Prepend* Begin Extend*
3931                Also, Begin contains '! Control'.  A Prepend must be a
3932                '!  Control', which means it must also be a Begin.  What it
3933                comes down to is that if we match Prepend* and then find no
3934                suitable Begin afterwards, that if we backtrack the last
3935                Prepend, that one will be a suitable Begin.
3936             */
3937
3938             if (locinput >= PL_regeol)
3939                 sayNO;
3940             if  (! utf8_target) {
3941
3942                 /* Match either CR LF  or '.', as all the other possibilities
3943                  * require utf8 */
3944                 locinput++;         /* Match the . or CR */
3945                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3946                                        match the LF */
3947                     && locinput < PL_regeol
3948                     && UCHARAT(locinput) == '\n') locinput++;
3949             }
3950             else {
3951
3952                 /* Utf8: See if is ( CR LF ); already know that locinput <
3953                  * PL_regeol, so locinput+1 is in bounds */
3954                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3955                     locinput += 2;
3956                 }
3957                 else {
3958                     /* In case have to backtrack to beginning, then match '.' */
3959                     char *starting = locinput;
3960
3961                     /* In case have to backtrack the last prepend */
3962                     char *previous_prepend = 0;
3963
3964                     LOAD_UTF8_CHARCLASS_GCB();
3965
3966                     /* Match (prepend)* */
3967                     while (locinput < PL_regeol
3968                            && swash_fetch(PL_utf8_X_prepend,
3969                                           (U8*)locinput, utf8_target))
3970                     {
3971                         previous_prepend = locinput;
3972                         locinput += UTF8SKIP(locinput);
3973                     }
3974
3975                     /* As noted above, if we matched a prepend character, but
3976                      * the next thing won't match, back off the last prepend we
3977                      * matched, as it is guaranteed to match the begin */
3978                     if (previous_prepend
3979                         && (locinput >=  PL_regeol
3980                             || ! swash_fetch(PL_utf8_X_begin,
3981                                              (U8*)locinput, utf8_target)))
3982                     {
3983                         locinput = previous_prepend;
3984                     }
3985
3986                     /* Note that here we know PL_regeol > locinput, as we
3987                      * tested that upon input to this switch case, and if we
3988                      * moved locinput forward, we tested the result just above
3989                      * and it either passed, or we backed off so that it will
3990                      * now pass */
3991                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3992
3993                         /* Here did not match the required 'Begin' in the
3994                          * second term.  So just match the very first
3995                          * character, the '.' of the final term of the regex */
3996                         locinput = starting + UTF8SKIP(starting);
3997                     } else {
3998
3999                         /* Here is the beginning of a character that can have
4000                          * an extender.  It is either a hangul syllable, or a
4001                          * non-control */
4002                         if (swash_fetch(PL_utf8_X_non_hangul,
4003                                         (U8*)locinput, utf8_target))
4004                         {
4005
4006                             /* Here not a Hangul syllable, must be a
4007                              * ('!  * Control') */
4008                             locinput += UTF8SKIP(locinput);
4009                         } else {
4010
4011                             /* Here is a Hangul syllable.  It can be composed
4012                              * of several individual characters.  One
4013                              * possibility is T+ */
4014                             if (swash_fetch(PL_utf8_X_T,
4015                                             (U8*)locinput, utf8_target))
4016                             {
4017                                 while (locinput < PL_regeol
4018                                         && swash_fetch(PL_utf8_X_T,
4019                                                         (U8*)locinput, utf8_target))
4020                                 {
4021                                     locinput += UTF8SKIP(locinput);
4022                                 }
4023                             } else {
4024
4025                                 /* Here, not T+, but is a Hangul.  That means
4026                                  * it is one of the others: L, LV, LVT or V,
4027                                  * and matches:
4028                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
4029
4030                                 /* Match L*           */
4031                                 while (locinput < PL_regeol
4032                                         && swash_fetch(PL_utf8_X_L,
4033                                                         (U8*)locinput, utf8_target))
4034                                 {
4035                                     locinput += UTF8SKIP(locinput);
4036                                 }
4037
4038                                 /* Here, have exhausted L*.  If the next
4039                                  * character is not an LV, LVT nor V, it means
4040                                  * we had to have at least one L, so matches L+
4041                                  * in the original equation, we have a complete
4042                                  * hangul syllable.  Are done. */
4043
4044                                 if (locinput < PL_regeol
4045                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
4046                                                     (U8*)locinput, utf8_target))
4047                                 {
4048
4049                                     /* Otherwise keep going.  Must be LV, LVT
4050                                      * or V.  See if LVT */
4051                                     if (swash_fetch(PL_utf8_X_LVT,
4052                                                     (U8*)locinput, utf8_target))
4053                                     {
4054                                         locinput += UTF8SKIP(locinput);
4055                                     } else {
4056
4057                                         /* Must be  V or LV.  Take it, then
4058                                          * match V*     */
4059                                         locinput += UTF8SKIP(locinput);
4060                                         while (locinput < PL_regeol
4061                                                 && swash_fetch(PL_utf8_X_V,
4062                                                          (U8*)locinput, utf8_target))
4063                                         {
4064                                             locinput += UTF8SKIP(locinput);
4065                                         }
4066                                     }
4067
4068                                     /* And any of LV, LVT, or V can be followed
4069                                      * by T*            */
4070                                     while (locinput < PL_regeol
4071                                            && swash_fetch(PL_utf8_X_T,
4072                                                            (U8*)locinput,
4073                                                            utf8_target))
4074                                     {
4075                                         locinput += UTF8SKIP(locinput);
4076                                     }
4077                                 }
4078                             }
4079                         }
4080
4081                         /* Match any extender */
4082                         while (locinput < PL_regeol
4083                                 && swash_fetch(PL_utf8_X_extend,
4084                                                 (U8*)locinput, utf8_target))
4085                         {
4086                             locinput += UTF8SKIP(locinput);
4087                         }
4088                     }
4089                 }
4090                 if (locinput > PL_regeol) sayNO;
4091             }
4092             nextchr = UCHARAT(locinput);
4093             break;
4094
4095         case NREFFL:
4096         {   /* The capture buffer cases.  The ones beginning with N for the
4097                named buffers just convert to the equivalent numbered and
4098                pretend they were called as the corresponding numbered buffer
4099                op.  */
4100             /* don't initialize these in the declaration, it makes C++
4101                unhappy */
4102             char *s;
4103             char type;
4104             re_fold_t folder;
4105             const U8 *fold_array;
4106             UV utf8_fold_flags;
4107
4108             PL_reg_flags |= RF_tainted;
4109             folder = foldEQ_locale;
4110             fold_array = PL_fold_locale;
4111             type = REFFL;
4112             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4113             goto do_nref;
4114
4115         case NREFFA:
4116             folder = foldEQ_latin1;
4117             fold_array = PL_fold_latin1;
4118             type = REFFA;
4119             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4120             goto do_nref;
4121
4122         case NREFFU:
4123             folder = foldEQ_latin1;
4124             fold_array = PL_fold_latin1;
4125             type = REFFU;
4126             utf8_fold_flags = 0;
4127             goto do_nref;
4128
4129         case NREFF:
4130             folder = foldEQ;
4131             fold_array = PL_fold;
4132             type = REFF;
4133             utf8_fold_flags = 0;
4134             goto do_nref;
4135
4136         case NREF:
4137             type = REF;
4138             folder = NULL;
4139             fold_array = NULL;
4140             utf8_fold_flags = 0;
4141           do_nref:
4142
4143             /* For the named back references, find the corresponding buffer
4144              * number */
4145             n = reg_check_named_buff_matched(rex,scan);
4146
4147             if ( ! n ) {
4148                 sayNO;
4149             }
4150             goto do_nref_ref_common;
4151
4152         case REFFL:
4153             PL_reg_flags |= RF_tainted;
4154             folder = foldEQ_locale;
4155             fold_array = PL_fold_locale;
4156             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4157             goto do_ref;
4158
4159         case REFFA:
4160             folder = foldEQ_latin1;
4161             fold_array = PL_fold_latin1;
4162             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4163             goto do_ref;
4164
4165         case REFFU:
4166             folder = foldEQ_latin1;
4167             fold_array = PL_fold_latin1;
4168             utf8_fold_flags = 0;
4169             goto do_ref;
4170
4171         case REFF:
4172             folder = foldEQ;
4173             fold_array = PL_fold;
4174             utf8_fold_flags = 0;
4175             goto do_ref;
4176
4177         case REF:
4178             folder = NULL;
4179             fold_array = NULL;
4180             utf8_fold_flags = 0;
4181
4182           do_ref:
4183             type = OP(scan);
4184             n = ARG(scan);  /* which paren pair */
4185
4186           do_nref_ref_common:
4187             ln = rex->offs[n].start;
4188             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4189             if (rex->lastparen < n || ln == -1)
4190                 sayNO;                  /* Do not match unless seen CLOSEn. */
4191             if (ln == rex->offs[n].end)
4192                 break;
4193
4194             s = PL_bostr + ln;
4195             if (type != REF     /* REF can do byte comparison */
4196                 && (utf8_target || type == REFFU))
4197             { /* XXX handle REFFL better */
4198                 char * limit = PL_regeol;
4199
4200                 /* This call case insensitively compares the entire buffer
4201                     * at s, with the current input starting at locinput, but
4202                     * not going off the end given by PL_regeol, and returns in
4203                     * limit upon success, how much of the current input was
4204                     * matched */
4205                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4206                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4207                 {
4208                     sayNO;
4209                 }
4210                 locinput = limit;
4211                 nextchr = UCHARAT(locinput);
4212                 break;
4213             }
4214
4215             /* Not utf8:  Inline the first character, for speed. */
4216             if (UCHARAT(s) != nextchr &&
4217                 (type == REF ||
4218                  UCHARAT(s) != fold_array[nextchr]))
4219                 sayNO;
4220             ln = rex->offs[n].end - ln;
4221             if (locinput + ln > PL_regeol)
4222                 sayNO;
4223             if (ln > 1 && (type == REF
4224                            ? memNE(s, locinput, ln)
4225                            : ! folder(s, locinput, ln)))
4226                 sayNO;
4227             locinput += ln;
4228             nextchr = UCHARAT(locinput);
4229             break;
4230         }
4231         case NOTHING:
4232         case TAIL:
4233             break;
4234         case BACK:
4235             break;
4236
4237 #undef  ST
4238 #define ST st->u.eval
4239         {
4240             SV *ret;
4241             REGEXP *re_sv;
4242             regexp *re;
4243             regexp_internal *rei;
4244             regnode *startpoint;
4245
4246         case GOSTART:
4247         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4248             if (cur_eval && cur_eval->locinput==locinput) {
4249                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4250                     Perl_croak(aTHX_ "Infinite recursion in regex");
4251                 if ( ++nochange_depth > max_nochange_depth )
4252                     Perl_croak(aTHX_
4253                         "Pattern subroutine nesting without pos change"
4254                         " exceeded limit in regex");
4255             } else {
4256                 nochange_depth = 0;
4257             }
4258             re_sv = rex_sv;
4259             re = rex;
4260             rei = rexi;
4261             if (OP(scan)==GOSUB) {
4262                 startpoint = scan + ARG2L(scan);
4263                 ST.close_paren = ARG(scan);
4264             } else {
4265                 startpoint = rei->program+1;
4266                 ST.close_paren = 0;
4267             }
4268             goto eval_recurse_doit;
4269             assert(0); /* NOTREACHED */
4270         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4271             if (cur_eval && cur_eval->locinput==locinput) {
4272                 if ( ++nochange_depth > max_nochange_depth )
4273                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4274             } else {
4275                 nochange_depth = 0;
4276             }
4277             {
4278                 /* execute the code in the {...} */
4279
4280                 dSP;
4281                 SV ** before;
4282                 OP * const oop = PL_op;
4283                 COP * const ocurcop = PL_curcop;
4284                 OP *nop;
4285                 char *saved_regeol = PL_regeol;
4286                 struct re_save_state saved_state;
4287                 CV *newcv;
4288
4289                 /* save *all* paren positions */
4290                 regcppush(rex, 0);
4291                 REGCP_SET(runops_cp);
4292
4293                 /* To not corrupt the existing regex state while executing the
4294                  * eval we would normally put it on the save stack, like with
4295                  * save_re_context. However, re-evals have a weird scoping so we
4296                  * can't just add ENTER/LEAVE here. With that, things like
4297                  *
4298                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4299                  *
4300                  * would break, as they expect the localisation to be unwound
4301                  * only when the re-engine backtracks through the bit that
4302                  * localised it.
4303                  *
4304                  * What we do instead is just saving the state in a local c
4305                  * variable.
4306                  */
4307                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4308
4309                 PL_reg_state.re_reparsing = FALSE;
4310
4311                 if (!caller_cv)
4312                     caller_cv = find_runcv(NULL);
4313
4314                 n = ARG(scan);
4315
4316                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4317                     newcv = ((struct regexp *)SvANY(
4318                                                 (REGEXP*)(rexi->data->data[n])
4319                                             ))->qr_anoncv
4320                                         ;
4321                     nop = (OP*)rexi->data->data[n+1];
4322                 }
4323                 else if (rexi->data->what[n] == 'l') { /* literal code */
4324                     newcv = caller_cv;
4325                     nop = (OP*)rexi->data->data[n];
4326                     assert(CvDEPTH(newcv));
4327                 }
4328                 else {
4329                     /* literal with own CV */
4330                     assert(rexi->data->what[n] == 'L');
4331                     newcv = rex->qr_anoncv;
4332                     nop = (OP*)rexi->data->data[n];
4333                 }
4334
4335                 /* normally if we're about to execute code from the same
4336                  * CV that we used previously, we just use the existing
4337                  * CX stack entry. However, its possible that in the
4338                  * meantime we may have backtracked, popped from the save
4339                  * stack, and undone the SAVECOMPPAD(s) associated with
4340                  * PUSH_MULTICALL; in which case PL_comppad no longer
4341                  * points to newcv's pad. */
4342                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4343                 {
4344                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4345                     if (last_pushed_cv) {
4346                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4347                     }
4348                     else {
4349                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4350                     }
4351                     last_pushed_cv = newcv;
4352                 }
4353                 last_pad = PL_comppad;
4354
4355                 /* the initial nextstate you would normally execute
4356                  * at the start of an eval (which would cause error
4357                  * messages to come from the eval), may be optimised
4358                  * away from the execution path in the regex code blocks;
4359                  * so manually set PL_curcop to it initially */
4360                 {
4361                     OP *o = cUNOPx(nop)->op_first;
4362                     assert(o->op_type == OP_NULL);
4363                     if (o->op_targ == OP_SCOPE) {
4364                         o = cUNOPo->op_first;
4365                     }
4366                     else {
4367                         assert(o->op_targ == OP_LEAVE);
4368                         o = cUNOPo->op_first;
4369                         assert(o->op_type == OP_ENTER);
4370                         o = o->op_sibling;
4371                     }
4372
4373                     if (o->op_type != OP_STUB) {
4374                         assert(    o->op_type == OP_NEXTSTATE
4375                                 || o->op_type == OP_DBSTATE
4376                                 || (o->op_type == OP_NULL
4377                                     &&  (  o->op_targ == OP_NEXTSTATE
4378                                         || o->op_targ == OP_DBSTATE
4379                                         )
4380                                     )
4381                         );
4382                         PL_curcop = (COP*)o;
4383                     }
4384                 }
4385                 nop = nop->op_next;
4386
4387                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4388                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4389
4390                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4391
4392                 if (sv_yes_mark) {
4393                     SV *sv_mrk = get_sv("REGMARK", 1);
4394                     sv_setsv(sv_mrk, sv_yes_mark);
4395                 }
4396
4397                 /* we don't use MULTICALL here as we want to call the
4398                  * first op of the block of interest, rather than the
4399                  * first op of the sub */
4400                 before = SP;
4401                 PL_op = nop;
4402                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4403                 SPAGAIN;
4404                 if (SP == before)
4405                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4406                 else {
4407                     ret = POPs;
4408                     PUTBACK;
4409                 }
4410
4411                 /* before restoring everything, evaluate the returned
4412                  * value, so that 'uninit' warnings don't use the wrong
4413                  * PL_op or pad. Also need to process any magic vars
4414                  * (e.g. $1) *before* parentheses are restored */
4415
4416                 PL_op = NULL;
4417
4418                 re_sv = NULL;
4419                 if (logical == 0)        /*   (?{})/   */
4420                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4421                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4422                     sw = cBOOL(SvTRUE(ret));
4423                     logical = 0;
4424                 }
4425                 else {                   /*  /(??{})  */
4426                     /*  if its overloaded, let the regex compiler handle
4427                      *  it; otherwise extract regex, or stringify  */
4428                     if (!SvAMAGIC(ret)) {
4429                         SV *sv = ret;
4430                         if (SvROK(sv))
4431                             sv = SvRV(sv);
4432                         if (SvTYPE(sv) == SVt_REGEXP)
4433                             re_sv = (REGEXP*) sv;
4434                         else if (SvSMAGICAL(sv)) {
4435                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4436                             if (mg)
4437                                 re_sv = (REGEXP *) mg->mg_obj;
4438                         }
4439
4440                         /* force any magic, undef warnings here */
4441                         if (!re_sv) {
4442                             ret = sv_mortalcopy(ret);
4443                             (void) SvPV_force_nolen(ret);
4444                         }
4445                     }
4446
4447                 }
4448
4449                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4450
4451                 /* *** Note that at this point we don't restore
4452                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4453                  * be used again soon. This is safe as long as nothing
4454                  * in the regexp code uses the pad ! */
4455                 PL_op = oop;
4456                 PL_curcop = ocurcop;
4457                 PL_regeol = saved_regeol;
4458                 S_regcp_restore(aTHX_ rex, runops_cp);
4459
4460                 if (logical != 2)
4461                     break;
4462             }
4463
4464                 /* only /(??{})/  from now on */
4465                 logical = 0;
4466                 {
4467                     /* extract RE object from returned value; compiling if
4468                      * necessary */
4469
4470                     if (re_sv) {
4471                         re_sv = reg_temp_copy(NULL, re_sv);
4472                     }
4473                     else {
4474                         U32 pm_flags = 0;
4475                         const I32 osize = PL_regsize;
4476
4477                         if (SvUTF8(ret) && IN_BYTES) {
4478                             /* In use 'bytes': make a copy of the octet
4479                              * sequence, but without the flag on */
4480                             STRLEN len;
4481                             const char *const p = SvPV(ret, len);
4482                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4483                         }
4484                         if (rex->intflags & PREGf_USE_RE_EVAL)
4485                             pm_flags |= PMf_USE_RE_EVAL;
4486
4487                         /* if we got here, it should be an engine which
4488                          * supports compiling code blocks and stuff */
4489                         assert(rex->engine && rex->engine->op_comp);
4490                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4491                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4492                                     rex->engine, NULL, NULL,
4493                                     /* copy /msix etc to inner pattern */
4494                                     scan->flags,
4495                                     pm_flags);
4496
4497                         if (!(SvFLAGS(ret)
4498                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4499                                  | SVs_GMG))) {
4500                             /* This isn't a first class regexp. Instead, it's
4501                                caching a regexp onto an existing, Perl visible
4502                                scalar.  */
4503                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4504                         }
4505                         PL_regsize = osize;
4506                         /* safe to do now that any $1 etc has been
4507                          * interpolated into the new pattern string and
4508                          * compiled */
4509                         S_regcp_restore(aTHX_ rex, runops_cp);
4510                     }
4511                     re = (struct regexp *)SvANY(re_sv);
4512                 }
4513                 RXp_MATCH_COPIED_off(re);
4514                 re->subbeg = rex->subbeg;
4515                 re->sublen = rex->sublen;
4516                 rei = RXi_GET(re);
4517                 DEBUG_EXECUTE_r(
4518                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4519                         "Matching embedded");
4520                 );
4521                 startpoint = rei->program + 1;
4522                 ST.close_paren = 0; /* only used for GOSUB */
4523
4524         eval_recurse_doit: /* Share code with GOSUB below this line */
4525                 /* run the pattern returned from (??{...}) */
4526                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4527                 REGCP_SET(ST.lastcp);
4528
4529                 re->lastparen = 0;
4530                 re->lastcloseparen = 0;
4531
4532                 PL_reginput = locinput;
4533                 PL_regsize = 0;
4534
4535                 /* XXXX This is too dramatic a measure... */
4536                 PL_reg_maxiter = 0;
4537
4538                 ST.toggle_reg_flags = PL_reg_flags;
4539                 if (RX_UTF8(re_sv))
4540                     PL_reg_flags |= RF_utf8;
4541                 else
4542                     PL_reg_flags &= ~RF_utf8;
4543                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4544
4545                 ST.prev_rex = rex_sv;
4546                 ST.prev_curlyx = cur_curlyx;
4547                 rex_sv = re_sv;
4548                 SET_reg_curpm(rex_sv);
4549                 rex = re;
4550                 rexi = rei;
4551                 cur_curlyx = NULL;
4552                 ST.B = next;
4553                 ST.prev_eval = cur_eval;
4554                 cur_eval = st;
4555                 /* now continue from first node in postoned RE */
4556                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4557                 assert(0); /* NOTREACHED */
4558         }
4559
4560         case EVAL_AB: /* cleanup after a successful (??{A})B */
4561             /* note: this is called twice; first after popping B, then A */
4562             PL_reg_flags ^= ST.toggle_reg_flags;
4563             rex_sv = ST.prev_rex;
4564             SET_reg_curpm(rex_sv);
4565             rex = (struct regexp *)SvANY(rex_sv);
4566             rexi = RXi_GET(rex);
4567             regcpblow(ST.cp);
4568             cur_eval = ST.prev_eval;
4569             cur_curlyx = ST.prev_curlyx;
4570
4571             /* XXXX This is too dramatic a measure... */
4572             PL_reg_maxiter = 0;
4573             if ( nochange_depth )
4574                 nochange_depth--;
4575             sayYES;
4576
4577
4578         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4579             /* note: this is called twice; first after popping B, then A */
4580             PL_reg_flags ^= ST.toggle_reg_flags;
4581             rex_sv = ST.prev_rex;
4582             SET_reg_curpm(rex_sv);
4583             rex = (struct regexp *)SvANY(rex_sv);
4584             rexi = RXi_GET(rex);
4585
4586             PL_reginput = locinput;
4587             REGCP_UNWIND(ST.lastcp);
4588             regcppop(rex);
4589             cur_eval = ST.prev_eval;
4590             cur_curlyx = ST.prev_curlyx;
4591             /* XXXX This is too dramatic a measure... */
4592             PL_reg_maxiter = 0;
4593             if ( nochange_depth )
4594                 nochange_depth--;
4595             sayNO_SILENT;
4596 #undef ST
4597
4598         case OPEN:
4599             n = ARG(scan);  /* which paren pair */
4600             rex->offs[n].start_tmp = locinput - PL_bostr;
4601             if (n > PL_regsize)
4602                 PL_regsize = n;
4603             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4604                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4605                 PTR2UV(rex),
4606                 PTR2UV(rex->offs),
4607                 (UV)n,
4608                 (IV)rex->offs[n].start_tmp,
4609                 (UV)PL_regsize
4610             ));
4611             lastopen = n;
4612             break;
4613
4614 /* XXX really need to log other places start/end are set too */
4615 #define CLOSE_CAPTURE \
4616     rex->offs[n].start = rex->offs[n].start_tmp; \
4617     rex->offs[n].end = locinput - PL_bostr; \
4618     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4619         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4620         PTR2UV(rex), \
4621         PTR2UV(rex->offs), \
4622         (UV)n, \
4623         (IV)rex->offs[n].start, \
4624         (IV)rex->offs[n].end \
4625     ))
4626
4627         case CLOSE:
4628             n = ARG(scan);  /* which paren pair */
4629             CLOSE_CAPTURE;
4630             /*if (n > PL_regsize)
4631                 PL_regsize = n;*/
4632             if (n > rex->lastparen)
4633                 rex->lastparen = n;
4634             rex->lastcloseparen = n;
4635             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4636                 goto fake_end;
4637             }
4638             break;
4639         case ACCEPT:
4640             if (ARG(scan)){
4641                 regnode *cursor;
4642                 for (cursor=scan;
4643                      cursor && OP(cursor)!=END;
4644                      cursor=regnext(cursor))
4645                 {
4646                     if ( OP(cursor)==CLOSE ){
4647                         n = ARG(cursor);
4648                         if ( n <= lastopen ) {
4649                             CLOSE_CAPTURE;
4650                             /*if (n > PL_regsize)
4651                             PL_regsize = n;*/
4652                             if (n > rex->lastparen)
4653                                 rex->lastparen = n;
4654                             rex->lastcloseparen = n;
4655                             if ( n == ARG(scan) || (cur_eval &&
4656                                 cur_eval->u.eval.close_paren == n))
4657                                 break;
4658                         }
4659                     }
4660                 }
4661             }
4662             goto fake_end;
4663             /*NOTREACHED*/
4664         case GROUPP:
4665             n = ARG(scan);  /* which paren pair */
4666             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
4667             break;
4668         case NGROUPP:
4669             /* reg_check_named_buff_matched returns 0 for no match */
4670             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4671             break;
4672         case INSUBP:
4673             n = ARG(scan);
4674             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4675             break;
4676         case DEFINEP:
4677             sw = 0;
4678             break;
4679         case IFTHEN:
4680             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4681             if (sw)
4682                 next = NEXTOPER(NEXTOPER(scan));
4683             else {
4684                 next = scan + ARG(scan);
4685                 if (OP(next) == IFTHEN) /* Fake one. */
4686                     next = NEXTOPER(NEXTOPER(next));
4687             }
4688             break;
4689         case LOGICAL:
4690             logical = scan->flags;
4691             break;
4692
4693 /*******************************************************************
4694
4695 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4696 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4697 STAR/PLUS/CURLY/CURLYN are used instead.)
4698
4699 A*B is compiled as <CURLYX><A><WHILEM><B>
4700
4701 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4702 state, which contains the current count, initialised to -1. It also sets
4703 cur_curlyx to point to this state, with any previous value saved in the
4704 state block.
4705
4706 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4707 since the pattern may possibly match zero times (i.e. it's a while {} loop
4708 rather than a do {} while loop).
4709
4710 Each entry to WHILEM represents a successful match of A. The count in the
4711 CURLYX block is incremented, another WHILEM state is pushed, and execution
4712 passes to A or B depending on greediness and the current count.
4713
4714 For example, if matching against the string a1a2a3b (where the aN are
4715 substrings that match /A/), then the match progresses as follows: (the
4716 pushed states are interspersed with the bits of strings matched so far):
4717
4718     <CURLYX cnt=-1>
4719     <CURLYX cnt=0><WHILEM>
4720     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4721     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4722     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4723     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4724
4725 (Contrast this with something like CURLYM, which maintains only a single
4726 backtrack state:
4727
4728     <CURLYM cnt=0> a1
4729     a1 <CURLYM cnt=1> a2
4730     a1 a2 <CURLYM cnt=2> a3
4731     a1 a2 a3 <CURLYM cnt=3> b
4732 )
4733
4734 Each WHILEM state block marks a point to backtrack to upon partial failure
4735 of A or B, and also contains some minor state data related to that
4736 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4737 overall state, such as the count, and pointers to the A and B ops.
4738
4739 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4740 must always point to the *current* CURLYX block, the rules are:
4741
4742 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4743 and set cur_curlyx to point the new block.
4744
4745 When popping the CURLYX block after a successful or unsuccessful match,
4746 restore the previous cur_curlyx.
4747
4748 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4749 to the outer one saved in the CURLYX block.
4750
4751 When popping the WHILEM block after a successful or unsuccessful B match,
4752 restore the previous cur_curlyx.
4753
4754 Here's an example for the pattern (AI* BI)*BO
4755 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4756
4757 cur_
4758 curlyx backtrack stack
4759 ------ ---------------
4760 NULL
4761 CO     <CO prev=NULL> <WO>
4762 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4763 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4764 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4765
4766 At this point the pattern succeeds, and we work back down the stack to
4767 clean up, restoring as we go:
4768
4769 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4770 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4771 CO     <CO prev=NULL> <WO>
4772 NULL
4773
4774 *******************************************************************/
4775
4776 #define ST st->u.curlyx
4777
4778         case CURLYX:    /* start of /A*B/  (for complex A) */
4779         {
4780             /* No need to save/restore up to this paren */
4781             I32 parenfloor = scan->flags;
4782
4783             assert(next); /* keep Coverity happy */
4784             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4785                 next += ARG(next);
4786
4787             /* XXXX Probably it is better to teach regpush to support
4788                parenfloor > PL_regsize... */
4789             if (parenfloor > (I32)rex->lastparen)
4790                 parenfloor = rex->lastparen; /* Pessimization... */
4791
4792             ST.prev_curlyx= cur_curlyx;
4793             cur_curlyx = st;
4794             ST.cp = PL_savestack_ix;
4795
4796             /* these fields contain the state of the current curly.
4797              * they are accessed by subsequent WHILEMs */
4798             ST.parenfloor = parenfloor;
4799             ST.me = scan;
4800             ST.B = next;
4801             ST.minmod = minmod;
4802             minmod = 0;
4803             ST.count = -1;      /* this will be updated by WHILEM */
4804             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4805
4806             PL_reginput = locinput;
4807             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4808             assert(0); /* NOTREACHED */
4809         }
4810
4811         case CURLYX_end: /* just finished matching all of A*B */
4812             cur_curlyx = ST.prev_curlyx;
4813             sayYES;
4814             assert(0); /* NOTREACHED */
4815
4816         case CURLYX_end_fail: /* just failed to match all of A*B */
4817             regcpblow(ST.cp);
4818             cur_curlyx = ST.prev_curlyx;
4819             sayNO;
4820             assert(0); /* NOTREACHED */
4821
4822
4823 #undef ST
4824 #define ST st->u.whilem
4825
4826         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4827         {
4828             /* see the discussion above about CURLYX/WHILEM */
4829             I32 n;
4830             int min = ARG1(cur_curlyx->u.curlyx.me);
4831             int max = ARG2(cur_curlyx->u.curlyx.me);
4832             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4833
4834             assert(cur_curlyx); /* keep Coverity happy */
4835             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4836             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4837             ST.cache_offset = 0;
4838             ST.cache_mask = 0;
4839
4840             PL_reginput = locinput;
4841
4842             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4843                   "%*s  whilem: matched %ld out of %d..%d\n",
4844                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4845             );
4846
4847             /* First just match a string of min A's. */
4848
4849             if (n < min) {
4850                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4851                 cur_curlyx->u.curlyx.lastloc = locinput;
4852                 REGCP_SET(ST.lastcp);
4853
4854                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4855                 assert(0); /* NOTREACHED */
4856             }
4857
4858             /* If degenerate A matches "", assume A done. */
4859
4860             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4861                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4862                    "%*s  whilem: empty match detected, trying continuation...\n",
4863                    REPORT_CODE_OFF+depth*2, "")
4864                 );
4865                 goto do_whilem_B_max;
4866             }
4867
4868             /* super-linear cache processing */
4869
4870             if (scan->flags) {
4871
4872                 if (!PL_reg_maxiter) {
4873                     /* start the countdown: Postpone detection until we
4874                      * know the match is not *that* much linear. */
4875                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4876                     /* possible overflow for long strings and many CURLYX's */
4877                     if (PL_reg_maxiter < 0)
4878                         PL_reg_maxiter = I32_MAX;
4879                     PL_reg_leftiter = PL_reg_maxiter;
4880                 }
4881
4882                 if (PL_reg_leftiter-- == 0) {
4883                     /* initialise cache */
4884                     const I32 size = (PL_reg_maxiter + 7)/8;
4885                     if (PL_reg_poscache) {
4886                         if ((I32)PL_reg_poscache_size < size) {
4887                             Renew(PL_reg_poscache, size, char);
4888                             PL_reg_poscache_size = size;
4889                         }
4890                         Zero(PL_reg_poscache, size, char);
4891                     }
4892                     else {
4893                         PL_reg_poscache_size = size;
4894                         Newxz(PL_reg_poscache, size, char);
4895                     }
4896                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4897       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4898                               PL_colors[4], PL_colors[5])
4899                     );
4900                 }
4901
4902                 if (PL_reg_leftiter < 0) {
4903                     /* have we already failed at this position? */
4904                     I32 offset, mask;
4905                     offset  = (scan->flags & 0xf) - 1
4906                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4907                     mask    = 1 << (offset % 8);
4908                     offset /= 8;
4909                     if (PL_reg_poscache[offset] & mask) {
4910                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4911                             "%*s  whilem: (cache) already tried at this position...\n",
4912                             REPORT_CODE_OFF+depth*2, "")
4913                         );
4914                         sayNO; /* cache records failure */
4915                     }
4916                     ST.cache_offset = offset;
4917                     ST.cache_mask   = mask;
4918                 }
4919             }
4920
4921             /* Prefer B over A for minimal matching. */
4922
4923             if (cur_curlyx->u.curlyx.minmod) {
4924                 ST.save_curlyx = cur_curlyx;
4925                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4926                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
4927                 REGCP_SET(ST.lastcp);
4928                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4929                 assert(0); /* NOTREACHED */
4930             }
4931
4932             /* Prefer A over B for maximal matching. */
4933
4934             if (n < max) { /* More greed allowed? */
4935                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4936                 cur_curlyx->u.curlyx.lastloc = locinput;
4937                 REGCP_SET(ST.lastcp);
4938                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4939                 assert(0); /* NOTREACHED */
4940             }
4941             goto do_whilem_B_max;
4942         }
4943         assert(0); /* NOTREACHED */
4944
4945         case WHILEM_B_min: /* just matched B in a minimal match */
4946         case WHILEM_B_max: /* just matched B in a maximal match */
4947             cur_curlyx = ST.save_curlyx;
4948             sayYES;
4949             assert(0); /* NOTREACHED */
4950
4951         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4952             cur_curlyx = ST.save_curlyx;
4953             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4954             cur_curlyx->u.curlyx.count--;
4955             CACHEsayNO;
4956             assert(0); /* NOTREACHED */
4957
4958         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4959             /* FALL THROUGH */
4960         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4961             REGCP_UNWIND(ST.lastcp);
4962             regcppop(rex);
4963             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4964             cur_curlyx->u.curlyx.count--;
4965             CACHEsayNO;
4966             assert(0); /* NOTREACHED */
4967
4968         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4969             REGCP_UNWIND(ST.lastcp);
4970             regcppop(rex);      /* Restore some previous $<digit>s? */
4971             PL_reginput = locinput;
4972             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4973                 "%*s  whilem: failed, trying continuation...\n",
4974                 REPORT_CODE_OFF+depth*2, "")
4975             );
4976           do_whilem_B_max:
4977             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4978                 && ckWARN(WARN_REGEXP)
4979                 && !(PL_reg_flags & RF_warned))
4980             {
4981                 PL_reg_flags |= RF_warned;
4982                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4983                      "Complex regular subexpression recursion limit (%d) "
4984                      "exceeded",
4985                      REG_INFTY - 1);
4986             }
4987
4988             /* now try B */
4989             ST.save_curlyx = cur_curlyx;
4990             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4991             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4992             assert(0); /* NOTREACHED */
4993
4994         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4995             cur_curlyx = ST.save_curlyx;
4996             REGCP_UNWIND(ST.lastcp);
4997             regcppop(rex);
4998
4999             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5000                 /* Maximum greed exceeded */
5001                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5002                     && ckWARN(WARN_REGEXP)
5003                     && !(PL_reg_flags & RF_warned))
5004                 {
5005                     PL_reg_flags |= RF_warned;
5006                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5007                         "Complex regular subexpression recursion "
5008                         "limit (%d) exceeded",
5009                         REG_INFTY - 1);
5010                 }
5011                 cur_curlyx->u.curlyx.count--;
5012                 CACHEsayNO;
5013             }
5014
5015             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5016                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5017             );
5018             /* Try grabbing another A and see if it helps. */
5019             PL_reginput = locinput;
5020             cur_curlyx->u.curlyx.lastloc = locinput;
5021             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5022             REGCP_SET(ST.lastcp);
5023             PUSH_STATE_GOTO(WHILEM_A_min,
5024                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
5025             assert(0); /* NOTREACHED */
5026
5027 #undef  ST
5028 #define ST st->u.branch
5029
5030         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5031             next = scan + ARG(scan);
5032             if (next == scan)
5033                 next = NULL;
5034             scan = NEXTOPER(scan);
5035             /* FALL THROUGH */
5036
5037         case BRANCH:        /*  /(...|A|...)/ */
5038             scan = NEXTOPER(scan); /* scan now points to inner node */
5039             ST.lastparen = rex->lastparen;
5040             ST.lastcloseparen = rex->lastcloseparen;
5041             ST.next_branch = next;
5042             REGCP_SET(ST.cp);
5043             PL_reginput = locinput;
5044
5045             /* Now go into the branch */
5046             if (has_cutgroup) {
5047                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
5048             } else {
5049                 PUSH_STATE_GOTO(BRANCH_next, scan);
5050             }
5051             assert(0); /* NOTREACHED */
5052         case CUTGROUP:
5053             PL_reginput = locinput;
5054             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5055                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5056             PUSH_STATE_GOTO(CUTGROUP_next,next);
5057             assert(0); /* NOTREACHED */
5058         case CUTGROUP_next_fail:
5059             do_cutgroup = 1;
5060             no_final = 1;
5061             if (st->u.mark.mark_name)
5062                 sv_commit = st->u.mark.mark_name;
5063             sayNO;
5064             assert(0); /* NOTREACHED */
5065         case BRANCH_next:
5066             sayYES;
5067             assert(0); /* NOTREACHED */
5068         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5069             if (do_cutgroup) {
5070                 do_cutgroup = 0;
5071                 no_final = 0;
5072             }
5073             REGCP_UNWIND(ST.cp);
5074             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5075             scan = ST.next_branch;
5076             /* no more branches? */
5077             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5078                 DEBUG_EXECUTE_r({
5079                     PerlIO_printf( Perl_debug_log,
5080                         "%*s  %sBRANCH failed...%s\n",
5081                         REPORT_CODE_OFF+depth*2, "",
5082                         PL_colors[4],
5083                         PL_colors[5] );
5084                 });
5085                 sayNO_SILENT;
5086             }
5087             continue; /* execute next BRANCH[J] op */
5088             assert(0); /* NOTREACHED */
5089
5090         case MINMOD:
5091             minmod = 1;
5092             break;
5093
5094 #undef  ST
5095 #define ST st->u.curlym
5096
5097         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5098
5099             /* This is an optimisation of CURLYX that enables us to push
5100              * only a single backtracking state, no matter how many matches
5101              * there are in {m,n}. It relies on the pattern being constant
5102              * length, with no parens to influence future backrefs
5103              */
5104
5105             ST.me = scan;
5106             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5107
5108             ST.lastparen      = rex->lastparen;
5109             ST.lastcloseparen = rex->lastcloseparen;
5110
5111             /* if paren positive, emulate an OPEN/CLOSE around A */
5112             if (ST.me->flags) {
5113                 U32 paren = ST.me->flags;
5114                 if (paren > PL_regsize)
5115                     PL_regsize = paren;
5116                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5117             }
5118             ST.A = scan;
5119             ST.B = next;
5120             ST.alen = 0;
5121             ST.count = 0;
5122             ST.minmod = minmod;
5123             minmod = 0;
5124             ST.c1 = CHRTEST_UNINIT;
5125             REGCP_SET(ST.cp);
5126
5127             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5128                 goto curlym_do_B;
5129
5130           curlym_do_A: /* execute the A in /A{m,n}B/  */
5131             PL_reginput = locinput;
5132             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5133             assert(0); /* NOTREACHED */
5134
5135         case CURLYM_A: /* we've just matched an A */
5136             locinput = st->locinput;
5137             nextchr = UCHARAT(locinput);
5138
5139             ST.count++;
5140             /* after first match, determine A's length: u.curlym.alen */
5141             if (ST.count == 1) {
5142                 if (PL_reg_match_utf8) {
5143                     char *s = locinput;
5144                     while (s < PL_reginput) {
5145                         ST.alen++;
5146                         s += UTF8SKIP(s);
5147                     }
5148                 }
5149                 else {
5150                     ST.alen = PL_reginput - locinput;
5151                 }
5152                 if (ST.alen == 0)
5153                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5154             }
5155             DEBUG_EXECUTE_r(
5156                 PerlIO_printf(Perl_debug_log,
5157                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5158                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5159                           (IV) ST.count, (IV)ST.alen)
5160             );
5161
5162             locinput = PL_reginput;
5163
5164             if (cur_eval && cur_eval->u.eval.close_paren &&
5165                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5166                 goto fake_end;
5167
5168             {
5169                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5170                 if ( max == REG_INFTY || ST.count < max )
5171                     goto curlym_do_A; /* try to match another A */
5172             }
5173             goto curlym_do_B; /* try to match B */
5174
5175         case CURLYM_A_fail: /* just failed to match an A */
5176             REGCP_UNWIND(ST.cp);
5177
5178             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5179                 || (cur_eval && cur_eval->u.eval.close_paren &&
5180                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5181                 sayNO;
5182
5183           curlym_do_B: /* execute the B in /A{m,n}B/  */
5184             PL_reginput = locinput;
5185             if (ST.c1 == CHRTEST_UNINIT) {
5186                 /* calculate c1 and c2 for possible match of 1st char
5187                  * following curly */
5188                 ST.c1 = ST.c2 = CHRTEST_VOID;
5189                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5190                     regnode *text_node = ST.B;
5191                     if (! HAS_TEXT(text_node))
5192                         FIND_NEXT_IMPT(text_node);
5193                     /* this used to be
5194
5195                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5196
5197                         But the former is redundant in light of the latter.
5198
5199                         if this changes back then the macro for
5200                         IS_TEXT and friends need to change.
5201                      */
5202                     if (PL_regkind[OP(text_node)] == EXACT)
5203                     {
5204
5205                         ST.c1 = (U8)*STRING(text_node);
5206                         switch (OP(text_node)) {
5207                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5208                             case EXACTFA:
5209                             case EXACTFU_SS:
5210                             case EXACTFU_TRICKYFOLD:
5211                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5212                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5213                             default: ST.c2 = ST.c1;
5214                         }
5215                     }
5216                 }
5217             }
5218
5219             DEBUG_EXECUTE_r(
5220                 PerlIO_printf(Perl_debug_log,
5221                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5222                     (int)(REPORT_CODE_OFF+(depth*2)),
5223                     "", (IV)ST.count)
5224                 );
5225             if (ST.c1 != CHRTEST_VOID
5226                     && UCHARAT(PL_reginput) != ST.c1
5227                     && UCHARAT(PL_reginput) != ST.c2)
5228             {
5229                 /* simulate B failing */
5230                 DEBUG_OPTIMISE_r(
5231                     PerlIO_printf(Perl_debug_log,
5232                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5233                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5234                         (IV)ST.c1,(IV)ST.c2
5235                 ));
5236                 state_num = CURLYM_B_fail;
5237                 goto reenter_switch;
5238             }
5239
5240             if (ST.me->flags) {
5241                 /* emulate CLOSE: mark current A as captured */
5242                 I32 paren = ST.me->flags;
5243                 if (ST.count) {
5244                     rex->offs[paren].start
5245                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5246                     rex->offs[paren].end = PL_reginput - PL_bostr;
5247                     if ((U32)paren > rex->lastparen)
5248                         rex->lastparen = paren;
5249                     rex->lastcloseparen = paren;
5250                 }
5251                 else
5252                     rex->offs[paren].end = -1;
5253                 if (cur_eval && cur_eval->u.eval.close_paren &&
5254                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5255                 {
5256                     if (ST.count)
5257                         goto fake_end;
5258                     else
5259                         sayNO;
5260                 }
5261             }
5262
5263             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5264             assert(0); /* NOTREACHED */
5265
5266         case CURLYM_B_fail: /* just failed to match a B */
5267             REGCP_UNWIND(ST.cp);
5268             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5269             if (ST.minmod) {
5270                 I32 max = ARG2(ST.me);
5271                 if (max != REG_INFTY && ST.count == max)
5272                     sayNO;
5273                 goto curlym_do_A; /* try to match a further A */
5274             }
5275             /* backtrack one A */
5276             if (ST.count == ARG1(ST.me) /* min */)
5277                 sayNO;
5278             ST.count--;
5279             locinput = HOPc(locinput, -ST.alen);
5280             goto curlym_do_B; /* try to match B */
5281
5282 #undef ST
5283 #define ST st->u.curly
5284
5285 #define CURLY_SETPAREN(paren, success) \
5286     if (paren) { \
5287         if (success) { \
5288             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5289             rex->offs[paren].end = locinput - PL_bostr; \
5290             if (paren > rex->lastparen) \
5291                 rex->lastparen = paren; \
5292             rex->lastcloseparen = paren; \
5293         } \
5294         else { \
5295             rex->offs[paren].end = -1; \
5296             rex->lastparen      = ST.lastparen; \
5297             rex->lastcloseparen = ST.lastcloseparen; \
5298         } \
5299     }
5300
5301         case STAR:              /*  /A*B/ where A is width 1 */
5302             ST.paren = 0;
5303             ST.min = 0;
5304             ST.max = REG_INFTY;
5305             scan = NEXTOPER(scan);
5306             goto repeat;
5307         case PLUS:              /*  /A+B/ where A is width 1 */
5308             ST.paren = 0;
5309             ST.min = 1;
5310             ST.max = REG_INFTY;
5311             scan = NEXTOPER(scan);
5312             goto repeat;
5313         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5314             ST.paren = scan->flags;     /* Which paren to set */
5315             ST.lastparen      = rex->lastparen;
5316             ST.lastcloseparen = rex->lastcloseparen;
5317             if (ST.paren > PL_regsize)
5318                 PL_regsize = ST.paren;
5319             ST.min = ARG1(scan);  /* min to match */
5320             ST.max = ARG2(scan);  /* max to match */
5321             if (cur_eval && cur_eval->u.eval.close_paren &&
5322                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5323                 ST.min=1;
5324                 ST.max=1;
5325             }
5326             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5327             goto repeat;
5328         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5329             ST.paren = 0;
5330             ST.min = ARG1(scan);  /* min to match */
5331             ST.max = ARG2(scan);  /* max to match */
5332             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5333           repeat:
5334             /*
5335             * Lookahead to avoid useless match attempts
5336             * when we know what character comes next.
5337             *
5338             * Used to only do .*x and .*?x, but now it allows
5339             * for )'s, ('s and (?{ ... })'s to be in the way
5340             * of the quantifier and the EXACT-like node.  -- japhy
5341             */
5342
5343             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5344                 sayNO;
5345             if (HAS_TEXT(next) || JUMPABLE(next)) {
5346                 U8 *s;
5347                 regnode *text_node = next;
5348
5349                 if (! HAS_TEXT(text_node))
5350                     FIND_NEXT_IMPT(text_node);
5351
5352                 if (! HAS_TEXT(text_node))
5353                     ST.c1 = ST.c2 = CHRTEST_VOID;
5354                 else {
5355                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5356                         ST.c1 = ST.c2 = CHRTEST_VOID;
5357                         goto assume_ok_easy;
5358                     }
5359                     else
5360                         s = (U8*)STRING(text_node);
5361
5362                     /*  Currently we only get here when
5363
5364                         PL_rekind[OP(text_node)] == EXACT
5365
5366                         if this changes back then the macro for IS_TEXT and
5367                         friends need to change. */
5368                     if (!UTF_PATTERN) {
5369                         ST.c1 = *s;
5370                         switch (OP(text_node)) {
5371                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5372                             case EXACTFA:
5373                             case EXACTFU_SS:
5374                             case EXACTFU_TRICKYFOLD:
5375                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5376                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5377                             default: ST.c2 = ST.c1; break;
5378                         }
5379                     }
5380                     else { /* UTF_PATTERN */
5381                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5382                              STRLEN ulen;
5383                              U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5384
5385                              to_utf8_fold((U8*)s, tmpbuf, &ulen);
5386                              ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
5387                                                     uniflags);
5388                         }
5389                         else {
5390                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5391                                                      uniflags);
5392                         }
5393                     }
5394                 }
5395             }
5396             else
5397                 ST.c1 = ST.c2 = CHRTEST_VOID;
5398         assume_ok_easy:
5399
5400             ST.A = scan;
5401             ST.B = next;
5402             PL_reginput = locinput;
5403             if (minmod) {
5404                 minmod = 0;
5405                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5406                     sayNO;
5407                 ST.count = ST.min;
5408                 locinput = PL_reginput;
5409                 REGCP_SET(ST.cp);
5410                 if (ST.c1 == CHRTEST_VOID)
5411                     goto curly_try_B_min;
5412
5413                 ST.oldloc = locinput;
5414
5415                 /* set ST.maxpos to the furthest point along the
5416                  * string that could possibly match */
5417                 if  (ST.max == REG_INFTY) {
5418                     ST.maxpos = PL_regeol - 1;
5419                     if (utf8_target)
5420                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5421                             ST.maxpos--;
5422                 }
5423                 else if (utf8_target) {
5424                     int m = ST.max - ST.min;
5425                     for (ST.maxpos = locinput;
5426                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5427                         ST.maxpos += UTF8SKIP(ST.maxpos);
5428                 }
5429                 else {
5430                     ST.maxpos = locinput + ST.max - ST.min;
5431                     if (ST.maxpos >= PL_regeol)
5432                         ST.maxpos = PL_regeol - 1;
5433                 }
5434                 goto curly_try_B_min_known;
5435
5436             }
5437             else {
5438                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5439                 locinput = PL_reginput;
5440                 if (ST.count < ST.min)
5441                     sayNO;
5442                 if ((ST.count > ST.min)
5443                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5444                 {
5445                     /* A{m,n} must come at the end of the string, there's
5446                      * no point in backing off ... */
5447                     ST.min = ST.count;
5448                     /* ...except that $ and \Z can match before *and* after
5449                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5450                        We may back off by one in this case. */
5451                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5452                         ST.min--;
5453                 }
5454                 REGCP_SET(ST.cp);
5455                 goto curly_try_B_max;
5456             }
5457             assert(0); /* NOTREACHED */
5458
5459
5460         case CURLY_B_min_known_fail:
5461             /* failed to find B in a non-greedy match where c1,c2 valid */
5462
5463             PL_reginput = locinput;     /* Could be reset... */
5464             REGCP_UNWIND(ST.cp);
5465             if (ST.paren) {
5466                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5467             }
5468             /* Couldn't or didn't -- move forward. */
5469             ST.oldloc = locinput;
5470             if (utf8_target)
5471                 locinput += UTF8SKIP(locinput);
5472             else
5473                 locinput++;
5474             ST.count++;
5475           curly_try_B_min_known:
5476              /* find the next place where 'B' could work, then call B */
5477             {
5478                 int n;
5479                 if (utf8_target) {
5480                     n = (ST.oldloc == locinput) ? 0 : 1;
5481                     if (ST.c1 == ST.c2) {
5482                         STRLEN len;
5483                         /* set n to utf8_distance(oldloc, locinput) */
5484                         while (locinput <= ST.maxpos &&
5485                                utf8n_to_uvchr((U8*)locinput,
5486                                               UTF8_MAXBYTES, &len,
5487                                               uniflags) != (UV)ST.c1) {
5488                             locinput += len;
5489                             n++;
5490                         }
5491                     }
5492                     else {
5493                         /* set n to utf8_distance(oldloc, locinput) */
5494                         while (locinput <= ST.maxpos) {
5495                             STRLEN len;
5496                             const UV c = utf8n_to_uvchr((U8*)locinput,
5497                                                   UTF8_MAXBYTES, &len,
5498                                                   uniflags);
5499                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5500                                 break;
5501                             locinput += len;
5502                             n++;
5503                         }
5504                     }
5505                 }
5506                 else {
5507                     if (ST.c1 == ST.c2) {
5508                         while (locinput <= ST.maxpos &&
5509                                UCHARAT(locinput) != ST.c1)
5510                             locinput++;
5511                     }
5512                     else {
5513                         while (locinput <= ST.maxpos
5514                                && UCHARAT(locinput) != ST.c1
5515                                && UCHARAT(locinput) != ST.c2)
5516                             locinput++;
5517                     }
5518                     n = locinput - ST.oldloc;
5519                 }
5520                 if (locinput > ST.maxpos)
5521                     sayNO;
5522                 /* PL_reginput == oldloc now */
5523                 if (n) {
5524                     ST.count += n;
5525                     if (regrepeat(rex, ST.A, n, depth) < n)
5526                         sayNO;
5527                 }
5528                 PL_reginput = locinput;
5529                 CURLY_SETPAREN(ST.paren, ST.count);
5530                 if (cur_eval && cur_eval->u.eval.close_paren &&
5531                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5532                     goto fake_end;
5533                 }
5534                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5535             }
5536             assert(0); /* NOTREACHED */
5537
5538
5539         case CURLY_B_min_fail:
5540             /* failed to find B in a non-greedy match where c1,c2 invalid */
5541
5542             REGCP_UNWIND(ST.cp);
5543             if (ST.paren) {
5544                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5545             }
5546             /* failed -- move forward one */
5547             PL_reginput = locinput;
5548             if (regrepeat(rex, ST.A, 1, depth)) {
5549                 ST.count++;
5550                 locinput = PL_reginput;
5551                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5552                         ST.count > 0)) /* count overflow ? */
5553                 {
5554                   curly_try_B_min:
5555                     CURLY_SETPAREN(ST.paren, ST.count);
5556                     if (cur_eval && cur_eval->u.eval.close_paren &&
5557                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5558                         goto fake_end;
5559                     }
5560                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5561                 }
5562             }
5563             sayNO;
5564             assert(0); /* NOTREACHED */
5565
5566
5567         curly_try_B_max:
5568             /* a successful greedy match: now try to match B */
5569             if (cur_eval && cur_eval->u.eval.close_paren &&
5570                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5571                 goto fake_end;
5572             }
5573             {
5574                 UV c = 0;
5575                 if (ST.c1 != CHRTEST_VOID)
5576                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5577                                            UTF8_MAXBYTES, 0, uniflags)
5578                                 : (UV) UCHARAT(PL_reginput);
5579                 /* If it could work, try it. */
5580                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5581                     CURLY_SETPAREN(ST.paren, ST.count);
5582                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5583                     assert(0); /* NOTREACHED */
5584                 }
5585             }
5586             /* FALL THROUGH */
5587         case CURLY_B_max_fail:
5588             /* failed to find B in a greedy match */
5589
5590             REGCP_UNWIND(ST.cp);
5591             if (ST.paren) {
5592                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5593             }
5594             /*  back up. */
5595             if (--ST.count < ST.min)
5596                 sayNO;
5597             PL_reginput = locinput = HOPc(locinput, -1);
5598             goto curly_try_B_max;
5599
5600 #undef ST
5601
5602         case END:
5603             fake_end:
5604             if (cur_eval) {
5605                 /* we've just finished A in /(??{A})B/; now continue with B */
5606                 st->u.eval.toggle_reg_flags
5607                             = cur_eval->u.eval.toggle_reg_flags;
5608                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5609
5610                 st->u.eval.prev_rex = rex_sv;           /* inner */
5611                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
5612                 rex_sv = cur_eval->u.eval.prev_rex;
5613                 SET_reg_curpm(rex_sv);
5614                 rex = (struct regexp *)SvANY(rex_sv);
5615                 rexi = RXi_GET(rex);
5616                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5617
5618                 REGCP_SET(st->u.eval.lastcp);
5619                 PL_reginput = locinput;
5620
5621                 /* Restore parens of the outer rex without popping the
5622                  * savestack */
5623                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
5624
5625                 st->u.eval.prev_eval = cur_eval;
5626                 cur_eval = cur_eval->u.eval.prev_eval;
5627                 DEBUG_EXECUTE_r(
5628                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5629                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5630                 if ( nochange_depth )
5631                     nochange_depth--;
5632
5633                 PUSH_YES_STATE_GOTO(EVAL_AB,
5634                         st->u.eval.prev_eval->u.eval.B); /* match B */
5635             }
5636
5637             if (locinput < reginfo->till) {
5638                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5639                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5640                                       PL_colors[4],
5641                                       (long)(locinput - PL_reg_starttry),
5642                                       (long)(reginfo->till - PL_reg_starttry),
5643                                       PL_colors[5]));
5644
5645                 sayNO_SILENT;           /* Cannot match: too short. */
5646             }
5647             PL_reginput = locinput;     /* put where regtry can find it */
5648             sayYES;                     /* Success! */
5649
5650         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5651             DEBUG_EXECUTE_r(
5652             PerlIO_printf(Perl_debug_log,
5653                 "%*s  %ssubpattern success...%s\n",
5654                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5655             PL_reginput = locinput;     /* put where regtry can find it */
5656             sayYES;                     /* Success! */
5657
5658 #undef  ST
5659 #define ST st->u.ifmatch
5660
5661         case SUSPEND:   /* (?>A) */
5662             ST.wanted = 1;
5663             PL_reginput = locinput;
5664             goto do_ifmatch;
5665
5666         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5667             ST.wanted = 0;
5668             goto ifmatch_trivial_fail_test;
5669
5670         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5671             ST.wanted = 1;
5672           ifmatch_trivial_fail_test:
5673             if (scan->flags) {
5674                 char * const s = HOPBACKc(locinput, scan->flags);
5675                 if (!s) {
5676                     /* trivial fail */
5677                     if (logical) {
5678                         logical = 0;
5679                         sw = 1 - cBOOL(ST.wanted);
5680                     }
5681                     else if (ST.wanted)
5682                         sayNO;
5683                     next = scan + ARG(scan);
5684                     if (next == scan)
5685                         next = NULL;
5686                     break;
5687                 }
5688                 PL_reginput = s;
5689             }
5690             else
5691                 PL_reginput = locinput;
5692
5693           do_ifmatch:
5694             ST.me = scan;
5695             ST.logical = logical;
5696             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5697
5698             /* execute body of (?...A) */
5699             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5700             assert(0); /* NOTREACHED */
5701
5702         case IFMATCH_A_fail: /* body of (?...A) failed */
5703             ST.wanted = !ST.wanted;
5704             /* FALL THROUGH */
5705
5706         case IFMATCH_A: /* body of (?...A) succeeded */
5707             if (ST.logical) {
5708                 sw = cBOOL(ST.wanted);
5709             }
5710             else if (!ST.wanted)
5711                 sayNO;
5712
5713             if (OP(ST.me) == SUSPEND)
5714                 locinput = PL_reginput;
5715             else {
5716                 locinput = PL_reginput = st->locinput;
5717                 nextchr = UCHARAT(locinput);
5718             }
5719             scan = ST.me + ARG(ST.me);
5720             if (scan == ST.me)
5721                 scan = NULL;
5722             continue; /* execute B */
5723
5724 #undef ST
5725
5726         case LONGJMP:
5727             next = scan + ARG(scan);
5728             if (next == scan)
5729                 next = NULL;
5730             break;
5731         case COMMIT:
5732             reginfo->cutpoint = PL_regeol;
5733             /* FALLTHROUGH */
5734         case PRUNE:
5735             PL_reginput = locinput;
5736             if (!scan->flags)
5737                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5738             PUSH_STATE_GOTO(COMMIT_next,next);
5739             assert(0); /* NOTREACHED */
5740         case COMMIT_next_fail:
5741             no_final = 1;
5742             /* FALLTHROUGH */
5743         case OPFAIL:
5744             sayNO;
5745             assert(0); /* NOTREACHED */
5746
5747 #define ST st->u.mark
5748         case MARKPOINT:
5749             ST.prev_mark = mark_state;
5750             ST.mark_name = sv_commit = sv_yes_mark
5751                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5752             mark_state = st;
5753             ST.mark_loc = PL_reginput = locinput;
5754             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5755             assert(0); /* NOTREACHED */
5756         case MARKPOINT_next:
5757             mark_state = ST.prev_mark;
5758             sayYES;
5759             assert(0); /* NOTREACHED */
5760         case MARKPOINT_next_fail:
5761             if (popmark && sv_eq(ST.mark_name,popmark))
5762             {
5763                 if (ST.mark_loc > startpoint)
5764                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5765                 popmark = NULL; /* we found our mark */
5766                 sv_commit = ST.mark_name;
5767
5768                 DEBUG_EXECUTE_r({
5769                         PerlIO_printf(Perl_debug_log,
5770                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5771                             REPORT_CODE_OFF+depth*2, "",
5772                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5773                 });
5774             }
5775             mark_state = ST.prev_mark;
5776             sv_yes_mark = mark_state ?
5777                 mark_state->u.mark.mark_name : NULL;
5778             sayNO;
5779             assert(0); /* NOTREACHED */
5780         case SKIP:
5781             PL_reginput = locinput;
5782             if (scan->flags) {
5783                 /* (*SKIP) : if we fail we cut here*/
5784                 ST.mark_name = NULL;
5785                 ST.mark_loc = locinput;
5786                 PUSH_STATE_GOTO(SKIP_next,next);
5787             } else {
5788                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5789                    otherwise do nothing.  Meaning we need to scan
5790                  */
5791                 regmatch_state *cur = mark_state;
5792                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5793
5794                 while (cur) {
5795                     if ( sv_eq( cur->u.mark.mark_name,
5796                                 find ) )
5797                     {
5798                         ST.mark_name = find;
5799                         PUSH_STATE_GOTO( SKIP_next, next );
5800                     }
5801                     cur = cur->u.mark.prev_mark;
5802                 }
5803             }
5804             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5805             break;
5806         case SKIP_next_fail:
5807             if (ST.mark_name) {
5808                 /* (*CUT:NAME) - Set up to search for the name as we
5809                    collapse the stack*/
5810                 popmark = ST.mark_name;
5811             } else {
5812                 /* (*CUT) - No name, we cut here.*/
5813                 if (ST.mark_loc > startpoint)
5814                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5815                 /* but we set sv_commit to latest mark_name if there
5816                    is one so they can test to see how things lead to this
5817                    cut */
5818                 if (mark_state)
5819                     sv_commit=mark_state->u.mark.mark_name;
5820             }
5821             no_final = 1;
5822             sayNO;
5823             assert(0); /* NOTREACHED */
5824 #undef ST
5825         case LNBREAK:
5826             if ((n=is_LNBREAK(locinput,utf8_target))) {
5827                 locinput += n;
5828                 nextchr = UCHARAT(locinput);
5829             } else
5830                 sayNO;
5831             break;
5832
5833 #define CASE_CLASS(nAmE)                              \
5834         case nAmE:                                    \
5835             if (locinput >= PL_regeol)                \
5836                 sayNO;                                \
5837             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5838                 locinput += n;                        \
5839                 nextchr = UCHARAT(locinput);          \
5840             } else                                    \
5841                 sayNO;                                \
5842             break;                                    \
5843         case N##nAmE:                                 \
5844             if (locinput >= PL_regeol)                \
5845                 sayNO;                                \
5846             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5847                 sayNO;                                \
5848             } else {                                  \
5849                 locinput += UTF8SKIP(locinput);       \
5850                 nextchr = UCHARAT(locinput);          \
5851             }                                         \
5852             break
5853
5854         CASE_CLASS(VERTWS);
5855         CASE_CLASS(HORIZWS);
5856 #undef CASE_CLASS
5857
5858         default:
5859             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5860                           PTR2UV(scan), OP(scan));
5861             Perl_croak(aTHX_ "regexp memory corruption");
5862
5863         } /* end switch */
5864
5865         /* switch break jumps here */
5866         scan = next; /* prepare to execute the next op and ... */
5867         continue;    /* ... jump back to the top, reusing st */
5868         assert(0); /* NOTREACHED */
5869
5870       push_yes_state:
5871         /* push a state that backtracks on success */
5872         st->u.yes.prev_yes_state = yes_state;
5873         yes_state = st;
5874         /* FALL THROUGH */
5875       push_state:
5876         /* push a new regex state, then continue at scan  */
5877         {
5878             regmatch_state *newst;
5879
5880             DEBUG_STACK_r({
5881                 regmatch_state *cur = st;
5882                 regmatch_state *curyes = yes_state;
5883                 int curd = depth;
5884                 regmatch_slab *slab = PL_regmatch_slab;
5885                 for (;curd > -1;cur--,curd--) {
5886                     if (cur < SLAB_FIRST(slab)) {
5887                         slab = slab->prev;
5888                         cur = SLAB_LAST(slab);
5889                     }
5890                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5891                         REPORT_CODE_OFF + 2 + depth * 2,"",
5892                         curd, PL_reg_name[cur->resume_state],
5893                         (curyes == cur) ? "yes" : ""
5894                     );
5895                     if (curyes == cur)
5896                         curyes = cur->u.yes.prev_yes_state;
5897                 }
5898             } else
5899                 DEBUG_STATE_pp("push")
5900             );
5901             depth++;
5902             st->locinput = locinput;
5903             newst = st+1;
5904             if (newst >  SLAB_LAST(PL_regmatch_slab))
5905                 newst = S_push_slab(aTHX);
5906             PL_regmatch_state = newst;
5907
5908             locinput = PL_reginput;
5909             nextchr = UCHARAT(locinput);
5910             st = newst;
5911             continue;
5912             assert(0); /* NOTREACHED */
5913         }
5914     }
5915
5916     /*
5917     * We get here only if there's trouble -- normally "case END" is
5918     * the terminating point.
5919     */
5920     Perl_croak(aTHX_ "corrupted regexp pointers");
5921     /*NOTREACHED*/
5922     sayNO;
5923
5924 yes:
5925     if (yes_state) {
5926         /* we have successfully completed a subexpression, but we must now
5927          * pop to the state marked by yes_state and continue from there */
5928         assert(st != yes_state);
5929 #ifdef DEBUGGING
5930         while (st != yes_state) {
5931             st--;
5932             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5933                 PL_regmatch_slab = PL_regmatch_slab->prev;
5934                 st = SLAB_LAST(PL_regmatch_slab);
5935             }
5936             DEBUG_STATE_r({
5937                 if (no_final) {
5938                     DEBUG_STATE_pp("pop (no final)");
5939                 } else {
5940                     DEBUG_STATE_pp("pop (yes)");
5941                 }
5942             });
5943             depth--;
5944         }
5945 #else
5946         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5947             || yes_state > SLAB_LAST(PL_regmatch_slab))
5948         {
5949             /* not in this slab, pop slab */
5950             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5951             PL_regmatch_slab = PL_regmatch_slab->prev;
5952             st = SLAB_LAST(PL_regmatch_slab);
5953         }
5954         depth -= (st - yes_state);
5955 #endif
5956         st = yes_state;
5957         yes_state = st->u.yes.prev_yes_state;
5958         PL_regmatch_state = st;
5959
5960         if (no_final) {
5961             locinput= st->locinput;
5962             nextchr = UCHARAT(locinput);
5963         }
5964         state_num = st->resume_state + no_final;
5965         goto reenter_switch;
5966     }
5967
5968     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5969                           PL_colors[4], PL_colors[5]));
5970
5971     if (PL_reg_state.re_state_eval_setup_done) {
5972         /* each successfully executed (?{...}) block does the equivalent of
5973          *   local $^R = do {...}
5974          * When popping the save stack, all these locals would be undone;
5975          * bypass this by setting the outermost saved $^R to the latest
5976          * value */
5977         if (oreplsv != GvSV(PL_replgv))
5978             sv_setsv(oreplsv, GvSV(PL_replgv));
5979     }
5980     result = 1;
5981     goto final_exit;
5982
5983 no:
5984     DEBUG_EXECUTE_r(
5985         PerlIO_printf(Perl_debug_log,
5986             "%*s  %sfailed...%s\n",
5987             REPORT_CODE_OFF+depth*2, "",
5988             PL_colors[4], PL_colors[5])
5989         );
5990
5991 no_silent:
5992     if (no_final) {
5993         if (yes_state) {
5994             goto yes;
5995         } else {
5996             goto final_exit;
5997         }
5998     }
5999     if (depth) {
6000         /* there's a previous state to backtrack to */
6001         st--;
6002         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6003             PL_regmatch_slab = PL_regmatch_slab->prev;
6004             st = SLAB_LAST(PL_regmatch_slab);
6005         }
6006         PL_regmatch_state = st;
6007         locinput= st->locinput;
6008         nextchr = UCHARAT(locinput);
6009
6010         DEBUG_STATE_pp("pop");
6011         depth--;
6012         if (yes_state == st)
6013             yes_state = st->u.yes.prev_yes_state;
6014
6015         state_num = st->resume_state + 1; /* failure = success + 1 */
6016         goto reenter_switch;
6017     }
6018     result = 0;
6019
6020   final_exit:
6021     if (rex->intflags & PREGf_VERBARG_SEEN) {
6022         SV *sv_err = get_sv("REGERROR", 1);
6023         SV *sv_mrk = get_sv("REGMARK", 1);
6024         if (result) {
6025             sv_commit = &PL_sv_no;
6026             if (!sv_yes_mark)
6027                 sv_yes_mark = &PL_sv_yes;
6028         } else {
6029             if (!sv_commit)
6030                 sv_commit = &PL_sv_yes;
6031             sv_yes_mark = &PL_sv_no;
6032         }
6033         sv_setsv(sv_err, sv_commit);
6034         sv_setsv(sv_mrk, sv_yes_mark);
6035     }
6036
6037
6038     if (last_pushed_cv) {
6039         dSP;
6040         POP_MULTICALL;
6041     }
6042
6043     /* clean up; in particular, free all slabs above current one */
6044     LEAVE_SCOPE(oldsave);
6045
6046     return result;
6047 }
6048
6049 /*
6050  - regrepeat - repeatedly match something simple, report how many
6051  */
6052 /*
6053  * [This routine now assumes that it will only match on things of length 1.
6054  * That was true before, but now we assume scan - reginput is the count,
6055  * rather than incrementing count on every character.  [Er, except utf8.]]
6056  */
6057 STATIC I32
6058 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
6059 {
6060     dVAR;
6061     register char *scan;
6062     register I32 c;
6063     register char *loceol = PL_regeol;
6064     register I32 hardcount = 0;
6065     register bool utf8_target = PL_reg_match_utf8;
6066     UV utf8_flags;
6067 #ifndef DEBUGGING
6068     PERL_UNUSED_ARG(depth);
6069 #endif
6070
6071     PERL_ARGS_ASSERT_REGREPEAT;
6072
6073     scan = PL_reginput;
6074     if (max == REG_INFTY)
6075         max = I32_MAX;
6076     else if (max < loceol - scan)
6077         loceol = scan + max;
6078     switch (OP(p)) {
6079     case REG_ANY:
6080         if (utf8_target) {
6081             loceol = PL_regeol;
6082             while (scan < loceol && hardcount < max && *scan != '\n') {
6083                 scan += UTF8SKIP(scan);
6084                 hardcount++;
6085             }
6086         } else {
6087             while (scan < loceol && *scan != '\n')
6088                 scan++;
6089         }
6090         break;
6091     case SANY:
6092         if (utf8_target) {
6093             loceol = PL_regeol;
6094             while (scan < loceol && hardcount < max) {
6095                 scan += UTF8SKIP(scan);
6096                 hardcount++;
6097             }
6098         }
6099         else
6100             scan = loceol;
6101         break;
6102     case CANY:
6103         scan = loceol;
6104         break;
6105     case EXACT:
6106         /* To get here, EXACTish nodes must have *byte* length == 1.  That
6107          * means they match only characters in the string that can be expressed
6108          * as a single byte.  For non-utf8 strings, that means a simple match.
6109          * For utf8 strings, the character matched must be an invariant, or
6110          * downgradable to a single byte.  The pattern's utf8ness is
6111          * irrelevant, as since it's a single byte, it either isn't utf8, or if
6112          * it is, it's an invariant */
6113
6114         c = (U8)*STRING(p);
6115         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6116
6117         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6118             while (scan < loceol && UCHARAT(scan) == c) {
6119                 scan++;
6120             }
6121         }
6122         else {
6123
6124             /* Here, the string is utf8, and the pattern char is different
6125              * in utf8 than not, so can't compare them directly.  Outside the
6126              * loop, find the two utf8 bytes that represent c, and then
6127              * look for those in sequence in the utf8 string */
6128             U8 high = UTF8_TWO_BYTE_HI(c);
6129             U8 low = UTF8_TWO_BYTE_LO(c);
6130             loceol = PL_regeol;
6131
6132             while (hardcount < max
6133                     && scan + 1 < loceol
6134                     && UCHARAT(scan) == high
6135                     && UCHARAT(scan + 1) == low)
6136             {
6137                 scan += 2;
6138                 hardcount++;
6139             }
6140         }
6141         break;
6142     case EXACTFA:
6143         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6144         goto do_exactf;
6145
6146     case EXACTFL:
6147         PL_reg_flags |= RF_tainted;
6148         utf8_flags = FOLDEQ_UTF8_LOCALE;
6149         goto do_exactf;
6150
6151     case EXACTF:
6152             utf8_flags = 0;
6153             goto do_exactf;
6154
6155     case EXACTFU_SS:
6156     case EXACTFU_TRICKYFOLD:
6157     case EXACTFU:
6158         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6159
6160         /* The comments for the EXACT case above apply as well to these fold
6161          * ones */
6162
6163     do_exactf:
6164         c = (U8)*STRING(p);
6165         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6166
6167         if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6168             char *tmpeol = loceol;
6169             while (hardcount < max
6170                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6171                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6172             {
6173                 scan = tmpeol;
6174                 tmpeol = loceol;
6175                 hardcount++;
6176             }
6177
6178             /* XXX Note that the above handles properly the German sharp s in
6179              * the pattern matching ss in the string.  But it doesn't handle
6180              * properly cases where the string contains say 'LIGATURE ff' and
6181              * the pattern is 'f+'.  This would require, say, a new function or
6182              * revised interface to foldEQ_utf8(), in which the maximum number
6183              * of characters to match could be passed and it would return how
6184              * many actually did.  This is just one of many cases where
6185              * multi-char folds don't work properly, and so the fix is being
6186              * deferred */
6187         }
6188         else {
6189             U8 folded;
6190
6191             /* Here, the string isn't utf8 and c is a single byte; and either
6192              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6193              * doesn't affect c.  Can just do simple comparisons for exact or
6194              * fold matching. */
6195             switch (OP(p)) {
6196                 case EXACTF: folded = PL_fold[c]; break;
6197                 case EXACTFA:
6198                 case EXACTFU_TRICKYFOLD:
6199                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6200                 case EXACTFL: folded = PL_fold_locale[c]; break;
6201                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6202             }
6203             while (scan < loceol &&
6204                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6205             {
6206                 scan++;
6207             }
6208         }
6209         break;
6210     case ANYOFV:
6211     case ANYOF:
6212         if (utf8_target || OP(p) == ANYOFV) {
6213             STRLEN inclasslen;
6214             loceol = PL_regeol;
6215             inclasslen = loceol - scan;
6216             while (hardcount < max
6217                    && ((inclasslen = loceol - scan) > 0)
6218                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6219             {
6220                 scan += inclasslen;
6221                 hardcount++;
6222             }
6223         } else {
6224             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6225                 scan++;
6226         }
6227         break;
6228     case ALNUMU:
6229         if (utf8_target) {
6230     utf8_wordchar:
6231             loceol = PL_regeol;
6232             LOAD_UTF8_CHARCLASS_ALNUM();
6233             while (hardcount < max && scan < loceol &&
6234                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6235             {
6236                 scan += UTF8SKIP(scan);
6237                 hardcount++;
6238             }
6239         } else {
6240             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6241                 scan++;
6242             }
6243         }
6244         break;
6245     case ALNUM:
6246         if (utf8_target)
6247             goto utf8_wordchar;
6248         while (scan < loceol && isALNUM((U8) *scan)) {
6249             scan++;
6250         }
6251         break;
6252     case ALNUMA:
6253         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6254             scan++;
6255         }
6256         break;
6257     case ALNUML:
6258         PL_reg_flags |= RF_tainted;
6259         if (utf8_target) {
6260             loceol = PL_regeol;
6261             while (hardcount < max && scan < loceol &&
6262                    isALNUM_LC_utf8((U8*)scan)) {
6263                 scan += UTF8SKIP(scan);
6264                 hardcount++;
6265             }
6266         } else {
6267             while (scan < loceol && isALNUM_LC(*scan))
6268                 scan++;
6269         }
6270         break;
6271     case NALNUMU:
6272         if (utf8_target) {
6273
6274     utf8_Nwordchar:
6275
6276             loceol = PL_regeol;
6277             LOAD_UTF8_CHARCLASS_ALNUM();
6278             while (hardcount < max && scan < loceol &&
6279                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6280             {
6281                 scan += UTF8SKIP(scan);
6282                 hardcount++;
6283             }
6284         } else {
6285             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6286                 scan++;
6287             }
6288         }
6289         break;
6290     case NALNUM:
6291         if (utf8_target)
6292             goto utf8_Nwordchar;
6293         while (scan < loceol && ! isALNUM((U8) *scan)) {
6294             scan++;
6295         }
6296         break;
6297     case NALNUMA:
6298         if (utf8_target) {
6299             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6300                 scan += UTF8SKIP(scan);
6301             }
6302         }
6303         else {
6304             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6305                 scan++;
6306             }
6307         }
6308         break;
6309     case NALNUML:
6310         PL_reg_flags |= RF_tainted;
6311         if (utf8_target) {
6312             loceol = PL_regeol;
6313             while (hardcount < max && scan < loceol &&
6314                    !isALNUM_LC_utf8((U8*)scan)) {
6315                 scan += UTF8SKIP(scan);
6316                 hardcount++;
6317             }
6318         } else {
6319             while (scan < loceol && !isALNUM_LC(*scan))
6320                 scan++;
6321         }
6322         break;
6323     case SPACEU:
6324         if (utf8_target) {
6325
6326     utf8_space:
6327
6328             loceol = PL_regeol;
6329             LOAD_UTF8_CHARCLASS_SPACE();
6330             while (hardcount < max && scan < loceol &&
6331                    (*scan == ' ' ||
6332                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6333             {
6334                 scan += UTF8SKIP(scan);
6335                 hardcount++;
6336             }
6337             break;
6338         }
6339         else {
6340             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6341                 scan++;
6342             }
6343             break;
6344         }
6345     case SPACE:
6346         if (utf8_target)
6347             goto utf8_space;
6348
6349         while (scan < loceol && isSPACE((U8) *scan)) {
6350             scan++;
6351         }
6352         break;
6353     case SPACEA:
6354         while (scan < loceol && isSPACE_A((U8) *scan)) {
6355             scan++;
6356         }
6357         break;
6358     case SPACEL:
6359         PL_reg_flags |= RF_tainted;
6360         if (utf8_target) {
6361             loceol = PL_regeol;
6362             while (hardcount < max && scan < loceol &&
6363                    isSPACE_LC_utf8((U8*)scan)) {
6364                 scan += UTF8SKIP(scan);
6365                 hardcount++;
6366             }
6367         } else {
6368             while (scan < loceol && isSPACE_LC(*scan))
6369                 scan++;
6370         }
6371         break;
6372     case NSPACEU:
6373         if (utf8_target) {
6374
6375     utf8_Nspace:
6376
6377             loceol = PL_regeol;
6378             LOAD_UTF8_CHARCLASS_SPACE();
6379             while (hardcount < max && scan < loceol &&
6380                    ! (*scan == ' ' ||
6381                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6382             {
6383                 scan += UTF8SKIP(scan);
6384                 hardcount++;
6385             }
6386             break;
6387         }
6388         else {
6389             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6390                 scan++;
6391             }
6392         }
6393         break;
6394     case NSPACE:
6395         if (utf8_target)
6396             goto utf8_Nspace;
6397
6398         while (scan < loceol && ! isSPACE((U8) *scan)) {
6399             scan++;
6400         }
6401         break;
6402     case NSPACEA:
6403         if (utf8_target) {
6404             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6405                 scan += UTF8SKIP(scan);
6406             }
6407         }
6408         else {
6409             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6410                 scan++;
6411             }
6412         }
6413         break;
6414     case NSPACEL:
6415         PL_reg_flags |= RF_tainted;
6416         if (utf8_target) {
6417             loceol = PL_regeol;
6418             while (hardcount < max && scan < loceol &&
6419                    !isSPACE_LC_utf8((U8*)scan)) {
6420                 scan += UTF8SKIP(scan);
6421                 hardcount++;
6422             }
6423         } else {
6424             while (scan < loceol && !isSPACE_LC(*scan))
6425                 scan++;
6426         }
6427         break;
6428     case DIGIT:
6429         if (utf8_target) {
6430             loceol = PL_regeol;
6431             LOAD_UTF8_CHARCLASS_DIGIT();
6432             while (hardcount < max && scan < loceol &&
6433                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6434                 scan += UTF8SKIP(scan);
6435                 hardcount++;
6436             }
6437         } else {
6438             while (scan < loceol && isDIGIT(*scan))
6439                 scan++;
6440         }
6441         break;
6442     case DIGITA:
6443         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6444             scan++;
6445         }
6446         break;
6447     case DIGITL:
6448         PL_reg_flags |= RF_tainted;
6449         if (utf8_target) {
6450             loceol = PL_regeol;
6451             while (hardcount < max && scan < loceol &&
6452                    isDIGIT_LC_utf8((U8*)scan)) {
6453                 scan += UTF8SKIP(scan);
6454                 hardcount++;
6455             }
6456         } else {
6457             while (scan < loceol && isDIGIT_LC(*scan))
6458                 scan++;
6459         }
6460         break;
6461     case NDIGIT:
6462         if (utf8_target) {
6463             loceol = PL_regeol;
6464             LOAD_UTF8_CHARCLASS_DIGIT();
6465             while (hardcount < max && scan < loceol &&
6466                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6467                 scan += UTF8SKIP(scan);
6468                 hardcount++;
6469             }
6470         } else {
6471             while (scan < loceol && !isDIGIT(*scan))
6472                 scan++;
6473         }
6474         break;
6475     case NDIGITA:
6476         if (utf8_target) {
6477             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6478                 scan += UTF8SKIP(scan);
6479             }
6480         }
6481         else {
6482             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6483                 scan++;
6484             }
6485         }
6486         break;
6487     case NDIGITL:
6488         PL_reg_flags |= RF_tainted;
6489         if (utf8_target) {
6490             loceol = PL_regeol;
6491             while (hardcount < max && scan < loceol &&
6492                    !isDIGIT_LC_utf8((U8*)scan)) {
6493                 scan += UTF8SKIP(scan);
6494                 hardcount++;
6495             }
6496         } else {
6497             while (scan < loceol && !isDIGIT_LC(*scan))
6498                 scan++;
6499         }
6500         break;
6501     case LNBREAK:
6502         if (utf8_target) {
6503             loceol = PL_regeol;
6504             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6505                 scan += c;
6506                 hardcount++;
6507             }
6508         } else {
6509             /*
6510               LNBREAK can match two latin chars, which is ok,
6511               because we have a null terminated string, but we
6512               have to use hardcount in this situation
6513             */
6514             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6515                 scan+=c;
6516                 hardcount++;
6517             }
6518         }
6519         break;
6520     case HORIZWS:
6521         if (utf8_target) {
6522             loceol = PL_regeol;
6523             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6524                 scan += c;
6525                 hardcount++;
6526             }
6527         } else {
6528             while (scan < loceol && is_HORIZWS_latin1(scan))
6529                 scan++;
6530         }
6531         break;
6532     case NHORIZWS:
6533         if (utf8_target) {
6534             loceol = PL_regeol;
6535             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6536                 scan += UTF8SKIP(scan);
6537                 hardcount++;
6538             }
6539         } else {
6540             while (scan < loceol && !is_HORIZWS_latin1(scan))
6541                 scan++;
6542
6543         }
6544         break;
6545     case VERTWS:
6546         if (utf8_target) {
6547             loceol = PL_regeol;
6548             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6549                 scan += c;
6550                 hardcount++;
6551             }
6552         } else {
6553             while (scan < loceol && is_VERTWS_latin1(scan))
6554                 scan++;
6555
6556         }
6557         break;
6558     case NVERTWS:
6559         if (utf8_target) {
6560             loceol = PL_regeol;
6561             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6562                 scan += UTF8SKIP(scan);
6563                 hardcount++;
6564             }
6565         } else {
6566             while (scan < loceol && !is_VERTWS_latin1(scan))
6567                 scan++;
6568
6569         }
6570         break;
6571
6572     default:            /* Called on something of 0 width. */
6573         break;          /* So match right here or not at all. */
6574     }
6575
6576     if (hardcount)
6577         c = hardcount;
6578     else
6579         c = scan - PL_reginput;
6580     PL_reginput = scan;
6581
6582     DEBUG_r({
6583         GET_RE_DEBUG_FLAGS_DECL;
6584         DEBUG_EXECUTE_r({
6585             SV * const prop = sv_newmortal();
6586             regprop(prog, prop, p);
6587             PerlIO_printf(Perl_debug_log,
6588                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6589                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6590         });
6591     });
6592
6593     return(c);
6594 }
6595
6596
6597 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6598 /*
6599 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
6600 create a copy so that changes the caller makes won't change the shared one
6601  */
6602 SV *
6603 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6604 {
6605     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6606     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6607 }
6608 #endif
6609
6610 STATIC SV *
6611 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6612 {
6613     /* Returns the swash for the input 'node' in the regex 'prog'.
6614      * If <doinit> is true, will attempt to create the swash if not already
6615      *    done.
6616      * If <listsvp> is non-null, will return the swash initialization string in
6617      *    it.
6618      * If <altsvp> is non-null, will return the alternates to the regular swash
6619      *    in it
6620      * Tied intimately to how regcomp.c sets up the data structure */
6621
6622     dVAR;
6623     SV *sw  = NULL;
6624     SV *si  = NULL;
6625     SV *alt = NULL;
6626     SV*  invlist = NULL;
6627
6628     RXi_GET_DECL(prog,progi);
6629     const struct reg_data * const data = prog ? progi->data : NULL;
6630
6631     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6632
6633     assert(ANYOF_NONBITMAP(node));
6634
6635     if (data && data->count) {
6636         const U32 n = ARG(node);
6637
6638         if (data->what[n] == 's') {
6639             SV * const rv = MUTABLE_SV(data->data[n]);
6640             AV * const av = MUTABLE_AV(SvRV(rv));
6641             SV **const ary = AvARRAY(av);
6642             bool invlist_has_user_defined_property;
6643
6644             si = *ary;  /* ary[0] = the string to initialize the swash with */
6645
6646             /* Elements 3 and 4 are either both present or both absent. [3] is
6647              * any inversion list generated at compile time; [4] indicates if
6648              * that inversion list has any user-defined properties in it. */
6649             if (av_len(av) >= 3) {
6650                 invlist = ary[3];
6651                 invlist_has_user_defined_property = cBOOL(SvUV(ary[4]));
6652             }
6653             else {
6654                 invlist = NULL;
6655                 invlist_has_user_defined_property = FALSE;
6656             }
6657
6658             /* Element [1] is reserved for the set-up swash.  If already there,
6659              * return it; if not, create it and store it there */
6660             if (SvROK(ary[1])) {
6661                 sw = ary[1];
6662             }
6663             else if (si && doinit) {
6664
6665                 sw = _core_swash_init("utf8", /* the utf8 package */
6666                                       "", /* nameless */
6667                                       si,
6668                                       1, /* binary */
6669                                       0, /* not from tr/// */
6670                                       FALSE, /* is error if can't find
6671                                                 property */
6672                                       invlist,
6673                                       invlist_has_user_defined_property);
6674                 (void)av_store(av, 1, sw);
6675             }
6676
6677             /* Element [2] is for any multi-char folds.  Note that is a
6678              * fundamentally flawed design, because can't backtrack and try
6679              * again.  See [perl #89774] */
6680             if (SvTYPE(ary[2]) == SVt_PVAV) {
6681                 alt = ary[2];
6682             }
6683         }
6684     }
6685
6686     if (listsvp) {
6687         SV* matches_string = newSVpvn("", 0);
6688         SV** invlistsvp;
6689
6690         /* Use the swash, if any, which has to have incorporated into it all
6691          * possibilities */
6692         if (   sw
6693             && SvROK(sw)
6694             && SvTYPE(SvRV(sw)) == SVt_PVHV
6695             && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE)))
6696         {
6697             invlist = *invlistsvp;
6698         }
6699         else if (si && si != &PL_sv_undef) {
6700
6701             /* If no swash, use the input nitialization string, if available */
6702             sv_catsv(matches_string, si);
6703         }
6704
6705         /* Add the inversion list to whatever we have.  This may have come from
6706          * the swash, or from an input parameter */
6707         if (invlist) {
6708             sv_catsv(matches_string, _invlist_contents(invlist));
6709         }
6710         *listsvp = matches_string;
6711     }
6712
6713     if (altsvp)
6714         *altsvp  = alt;
6715
6716     return sw;
6717 }
6718
6719 /*
6720  - reginclass - determine if a character falls into a character class
6721
6722   n is the ANYOF regnode
6723   p is the target string
6724   lenp is pointer to the maximum number of bytes of how far to go in p
6725     (This is assumed wthout checking to always be at least the current
6726     character's size)
6727   utf8_target tells whether p is in UTF-8.
6728
6729   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6730   from a successful match, the value it points to will be updated to how many
6731   bytes in p were matched.  If there was no match, the value is undefined,
6732   possibly changed from the input.
6733
6734   Note that this can be a synthetic start class, a combination of various
6735   nodes, so things you think might be mutually exclusive, such as locale,
6736   aren't.  It can match both locale and non-locale
6737
6738  */
6739
6740 STATIC bool
6741 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6742 {
6743     dVAR;
6744     const char flags = ANYOF_FLAGS(n);
6745     bool match = FALSE;
6746     UV c = *p;
6747     STRLEN c_len = 0;
6748     STRLEN maxlen;
6749
6750     PERL_ARGS_ASSERT_REGINCLASS;
6751
6752     /* If c is not already the code point, get it */
6753     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6754         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6755                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6756                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6757                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6758                  * UTF8_ALLOW_FFFF */
6759         if (c_len == (STRLEN)-1)
6760             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6761     }
6762     else {
6763         c_len = 1;
6764     }
6765
6766     /* Use passed in max length, or one character if none passed in or less
6767      * than one character.  And assume will match just one character.  This is
6768      * overwritten later if matched more. */
6769     if (lenp) {
6770         maxlen = (*lenp > c_len) ? *lenp : c_len;
6771         *lenp = c_len;
6772
6773     }
6774     else {
6775         maxlen = c_len;
6776     }
6777
6778     /* If this character is potentially in the bitmap, check it */
6779     if (c < 256) {
6780         if (ANYOF_BITMAP_TEST(n, c))
6781             match = TRUE;
6782         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6783                 && ! utf8_target
6784                 && ! isASCII(c))
6785         {
6786             match = TRUE;
6787         }
6788
6789         else if (flags & ANYOF_LOCALE) {
6790             PL_reg_flags |= RF_tainted;
6791
6792             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6793                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6794             {
6795                 match = TRUE;
6796             }
6797             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6798                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6799                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6800                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6801                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6802                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6803                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6804                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6805                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6806                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6807                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6808                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
6809                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
6810                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6811                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6812                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6813                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6814                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6815                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6816                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6817                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6818                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6819                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6820                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6821                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6822                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6823                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6824                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6825                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6826                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
6827                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
6828                      ) /* How's that for a conditional? */
6829             ) {
6830                 match = TRUE;
6831             }
6832         }
6833     }
6834
6835     /* If the bitmap didn't (or couldn't) match, and something outside the
6836      * bitmap could match, try that.  Locale nodes specifiy completely the
6837      * behavior of code points in the bit map (otherwise, a utf8 target would
6838      * cause them to be treated as Unicode and not locale), except in
6839      * the very unlikely event when this node is a synthetic start class, which
6840      * could be a combination of locale and non-locale nodes.  So allow locale
6841      * to match for the synthetic start class, which will give a false
6842      * positive that will be resolved when the match is done again as not part
6843      * of the synthetic start class */
6844     if (!match) {
6845         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6846             match = TRUE;       /* Everything above 255 matches */
6847         }
6848         else if (ANYOF_NONBITMAP(n)
6849                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6850                      || (utf8_target
6851                          && (c >=256
6852                              || (! (flags & ANYOF_LOCALE))
6853                              || (flags & ANYOF_IS_SYNTHETIC)))))
6854         {
6855             AV *av;
6856             SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6857
6858             if (sw) {
6859                 U8 * utf8_p;
6860                 if (utf8_target) {
6861                     utf8_p = (U8 *) p;
6862                 } else {
6863
6864                     /* Not utf8.  Convert as much of the string as available up
6865                      * to the limit of how far the (single) character in the
6866                      * pattern can possibly match (no need to go further).  If
6867                      * the node is a straight ANYOF or not folding, it can't
6868                      * match more than one.  Otherwise, It can match up to how
6869                      * far a single char can fold to.  Since not utf8, each
6870                      * character is a single byte, so the max it can be in
6871                      * bytes is the same as the max it can be in characters */
6872                     STRLEN len = (OP(n) == ANYOF
6873                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6874                                   ? 1
6875                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6876                                     ? maxlen
6877                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6878                     utf8_p = bytes_to_utf8(p, &len);
6879                 }
6880
6881                 if (swash_fetch(sw, utf8_p, TRUE))
6882                     match = TRUE;
6883                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6884
6885                     /* Here, we need to test if the fold of the target string
6886                      * matches.  The non-multi char folds have all been moved to
6887                      * the compilation phase, and the multi-char folds have
6888                      * been stored by regcomp into 'av'; we linearly check to
6889                      * see if any match the target string (folded).   We know
6890                      * that the originals were each one character, but we don't
6891                      * currently know how many characters/bytes each folded to,
6892                      * except we do know that there are small limits imposed by
6893                      * Unicode.  XXX A performance enhancement would be to have
6894                      * regcomp.c store the max number of chars/bytes that are
6895                      * in an av entry, as, say the 0th element.  Even better
6896                      * would be to have a hash of the few characters that can
6897                      * start a multi-char fold to the max number of chars of
6898                      * those folds.
6899                      *
6900                      * If there is a match, we will need to advance (if lenp is
6901                      * specified) the match pointer in the target string.  But
6902                      * what we are comparing here isn't that string directly,
6903                      * but its fold, whose length may differ from the original.
6904                      * As we go along in constructing the fold, therefore, we
6905                      * create a map so that we know how many bytes in the
6906                      * source to advance given that we have matched a certain
6907                      * number of bytes in the fold.  This map is stored in
6908                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6909                      * the fold of the first character that we are folding.
6910                      * Then map_fold_len_back[n] is set to the number of bytes
6911                      * in that first character.  Similarly let m be the
6912                      * corresponding number for the second character to be
6913                      * folded.  Then map_fold_len_back[n+m] is set to the
6914                      * number of bytes occupied by the first two source
6915                      * characters. ... */
6916                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6917                     U8 folded[UTF8_MAXBYTES_CASE+1];
6918                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6919                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6920                                                   chars */
6921
6922                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6923
6924                         /* Here, only need to fold the first char of the target
6925                          * string.  It the source wasn't utf8, is 1 byte long */
6926                         to_utf8_fold(utf8_p, folded, &foldlen);
6927                         total_foldlen = foldlen;
6928                         map_fold_len_back[foldlen] = (utf8_target)
6929                                                      ? UTF8SKIP(utf8_p)
6930                                                      : 1;
6931                     }
6932                     else {
6933
6934                         /* Here, need to fold more than the first char.  Do so
6935                          * up to the limits */
6936                         U8* source_ptr = utf8_p;    /* The source for the fold
6937                                                        is the regex target
6938                                                        string */
6939                         U8* folded_ptr = folded;
6940                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6941                                                        available byte in the
6942                                                        target string */
6943                         U8 i;
6944                         for (i = 0;
6945                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6946                              i++)
6947                         {
6948
6949                             /* Fold the next character */
6950                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6951                             STRLEN this_char_foldlen;
6952                             to_utf8_fold(source_ptr,
6953                                          this_char_folded,
6954                                          &this_char_foldlen);
6955
6956                             /* Bail if it would exceed the byte limit for
6957                              * folding a single char. */
6958                             if (this_char_foldlen + folded_ptr - folded >
6959                                                             UTF8_MAXBYTES_CASE)
6960                             {
6961                                 break;
6962                             }
6963
6964                             /* Add the fold of this character */
6965                             Copy(this_char_folded,
6966                                  folded_ptr,
6967                                  this_char_foldlen,
6968                                  U8);
6969                             source_ptr += UTF8SKIP(source_ptr);
6970                             folded_ptr += this_char_foldlen;
6971                             total_foldlen = folded_ptr - folded;
6972
6973                             /* Create map from the number of bytes in the fold
6974                              * back to the number of bytes in the source.  If
6975                              * the source isn't utf8, the byte count is just
6976                              * the number of characters so far */
6977                             map_fold_len_back[total_foldlen]
6978                                                       = (utf8_target)
6979                                                         ? source_ptr - utf8_p
6980                                                         : i + 1;
6981                         }
6982                         *folded_ptr = '\0';
6983                     }
6984
6985
6986                     /* Do the linear search to see if the fold is in the list
6987                      * of multi-char folds. */
6988                     if (av) {
6989                         I32 i;
6990                         for (i = 0; i <= av_len(av); i++) {
6991                             SV* const sv = *av_fetch(av, i, FALSE);
6992                             STRLEN len;
6993                             const char * const s = SvPV_const(sv, len);
6994
6995                             if (len <= total_foldlen
6996                                 && memEQ(s, (char*)folded, len)
6997
6998                                    /* If 0, means matched a partial char. See
6999                                     * [perl #90536] */
7000                                 && map_fold_len_back[len])
7001                             {
7002
7003                                 /* Advance the target string ptr to account for
7004                                  * this fold, but have to translate from the
7005                                  * folded length to the corresponding source
7006                                  * length. */
7007                                 if (lenp) {
7008                                     *lenp = map_fold_len_back[len];
7009                                 }
7010                                 match = TRUE;
7011                                 break;
7012                             }
7013                         }
7014                     }
7015                 }
7016
7017                 /* If we allocated a string above, free it */
7018                 if (! utf8_target) Safefree(utf8_p);
7019             }
7020         }
7021     }
7022
7023     return (flags & ANYOF_INVERT) ? !match : match;
7024 }
7025
7026 STATIC U8 *
7027 S_reghop3(U8 *s, I32 off, const U8* lim)
7028 {
7029     /* return the position 'off' UTF-8 characters away from 's', forward if
7030      * 'off' >= 0, backwards if negative.  But don't go outside of position
7031      * 'lim', which better be < s  if off < 0 */
7032
7033     dVAR;
7034
7035     PERL_ARGS_ASSERT_REGHOP3;
7036
7037     if (off >= 0) {
7038         while (off-- && s < lim) {
7039             /* XXX could check well-formedness here */
7040             s += UTF8SKIP(s);
7041         }
7042     }
7043     else {
7044         while (off++ && s > lim) {
7045             s--;
7046             if (UTF8_IS_CONTINUED(*s)) {
7047                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7048                     s--;
7049             }
7050             /* XXX could check well-formedness here */
7051         }
7052     }
7053     return s;
7054 }
7055
7056 #ifdef XXX_dmq
7057 /* there are a bunch of places where we use two reghop3's that should
7058    be replaced with this routine. but since thats not done yet
7059    we ifdef it out - dmq
7060 */
7061 STATIC U8 *
7062 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7063 {
7064     dVAR;
7065
7066     PERL_ARGS_ASSERT_REGHOP4;
7067
7068     if (off >= 0) {
7069         while (off-- && s < rlim) {
7070             /* XXX could check well-formedness here */
7071             s += UTF8SKIP(s);
7072         }
7073     }
7074     else {
7075         while (off++ && s > llim) {
7076             s--;
7077             if (UTF8_IS_CONTINUED(*s)) {
7078                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7079                     s--;
7080             }
7081             /* XXX could check well-formedness here */
7082         }
7083     }
7084     return s;
7085 }
7086 #endif
7087
7088 STATIC U8 *
7089 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7090 {
7091     dVAR;
7092
7093     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7094
7095     if (off >= 0) {
7096         while (off-- && s < lim) {
7097             /* XXX could check well-formedness here */
7098             s += UTF8SKIP(s);
7099         }
7100         if (off >= 0)
7101             return NULL;
7102     }
7103     else {
7104         while (off++ && s > lim) {
7105             s--;
7106             if (UTF8_IS_CONTINUED(*s)) {
7107                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7108                     s--;
7109             }
7110             /* XXX could check well-formedness here */
7111         }
7112         if (off <= 0)
7113             return NULL;
7114     }
7115     return s;
7116 }
7117
7118 static void
7119 restore_pos(pTHX_ void *arg)
7120 {
7121     dVAR;
7122     regexp * const rex = (regexp *)arg;
7123     if (PL_reg_state.re_state_eval_setup_done) {
7124         if (PL_reg_oldsaved) {
7125             rex->subbeg = PL_reg_oldsaved;
7126             rex->sublen = PL_reg_oldsavedlen;
7127 #ifdef PERL_OLD_COPY_ON_WRITE
7128             rex->saved_copy = PL_nrs;
7129 #endif
7130             RXp_MATCH_COPIED_on(rex);
7131         }
7132         PL_reg_magic->mg_len = PL_reg_oldpos;
7133         PL_reg_state.re_state_eval_setup_done = FALSE;
7134         PL_curpm = PL_reg_oldcurpm;
7135     }
7136 }
7137
7138 STATIC void
7139 S_to_utf8_substr(pTHX_ register regexp *prog)
7140 {
7141     int i = 1;
7142
7143     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7144
7145     do {
7146         if (prog->substrs->data[i].substr
7147             && !prog->substrs->data[i].utf8_substr) {
7148             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7149             prog->substrs->data[i].utf8_substr = sv;
7150             sv_utf8_upgrade(sv);
7151             if (SvVALID(prog->substrs->data[i].substr)) {
7152                 if (SvTAIL(prog->substrs->data[i].substr)) {
7153                     /* Trim the trailing \n that fbm_compile added last
7154                        time.  */
7155                     SvCUR_set(sv, SvCUR(sv) - 1);
7156                     /* Whilst this makes the SV technically "invalid" (as its
7157                        buffer is no longer followed by "\0") when fbm_compile()
7158                        adds the "\n" back, a "\0" is restored.  */
7159                     fbm_compile(sv, FBMcf_TAIL);
7160                 } else
7161                     fbm_compile(sv, 0);
7162             }
7163             if (prog->substrs->data[i].substr == prog->check_substr)
7164                 prog->check_utf8 = sv;
7165         }
7166     } while (i--);
7167 }
7168
7169 STATIC void
7170 S_to_byte_substr(pTHX_ register regexp *prog)
7171 {
7172     dVAR;
7173     int i = 1;
7174
7175     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7176
7177     do {
7178         if (prog->substrs->data[i].utf8_substr
7179             && !prog->substrs->data[i].substr) {
7180             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7181             if (sv_utf8_downgrade(sv, TRUE)) {
7182                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7183                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7184                         /* Trim the trailing \n that fbm_compile added last
7185                            time.  */
7186                         SvCUR_set(sv, SvCUR(sv) - 1);
7187                         fbm_compile(sv, FBMcf_TAIL);
7188                     } else
7189                         fbm_compile(sv, 0);
7190                 }
7191             } else {
7192                 SvREFCNT_dec(sv);
7193                 sv = &PL_sv_undef;
7194             }
7195             prog->substrs->data[i].substr = sv;
7196             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7197                 prog->check_substr = sv;
7198         }
7199     } while (i--);
7200 }
7201
7202 /*
7203  * Local variables:
7204  * c-indentation-style: bsd
7205  * c-basic-offset: 4
7206  * indent-tabs-mode: nil
7207  * End:
7208  *
7209  * ex: set ts=8 sts=4 sw=4 et:
7210  */