src/5017008/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86 #include "re_defs.h"
  87
  88 #ifdef PERL_IN_XSUB_RE
  89 #  include "re_comp.h"
  90 #else
  91 #  include "regcomp.h"
  92 #endif
  93
  94 #include "inline_invlist.c"
  95 #include "unicode_constants.h"
  96
  97 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98
  99 #ifndef STATIC
 100 #define STATIC  static
 101 #endif
 102
 103 /* Valid for non-utf8 strings: avoids the reginclass
 104  * call if there are no complications: i.e., if everything matchable is
 105  * straight forward in the bitmap */
 106 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 107                                               : ANYOF_BITMAP_TEST(p,*(c)))
 108
 109 /*
 110  * Forwards.
 111  */
 112
 113 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 114 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 115
 116 #define HOPc(pos,off) \
 117         (char *)(PL_reg_match_utf8 \
 118             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 119             : (U8*)(pos + off))
 120 #define HOPBACKc(pos, off) \
 121         (char*)(PL_reg_match_utf8\
 122             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 123             : (pos - off >= PL_bostr)           \
 124                 ? (U8*)pos - off                \
 125                 : NULL)
 126
 127 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 128 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 129
 130
 131 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 132 #define NEXTCHR_IS_EOS (nextchr < 0)
 133
 134 #define SET_nextchr \
 135     nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
 136
 137 #define SET_locinput(p) \
 138     locinput = (p);  \
 139     SET_nextchr
 140
 141
 142 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START {            \
 143         if (!swash_ptr) {                                                     \
 144             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 145             ENTER; save_re_context();                                         \
 146             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 147                                          1, 0, NULL, &flags);                 \
 148             LEAVE;                                                            \
 149             assert(swash_ptr);                                                \
 150         }                                                                     \
 151     } STMT_END
 152
 153 /* If in debug mode, we test that a known character properly matches */
 154 #ifdef DEBUGGING
 155 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 156                                           property_name,                      \
 157                                           utf8_char_in_property)              \
 158         LOAD_UTF8_CHARCLASS(swash_ptr, property_name);                        \
 159         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 160 #else
 161 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 162                                           property_name,                      \
 163                                           utf8_char_in_property)              \
 164         LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
 165 #endif
 166
 167 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 168                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 169                                         swash_property_names[_CC_WORDCHAR],   \
 170                                         GREEK_SMALL_LETTER_IOTA_UTF8)
 171
 172 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 173     STMT_START {                                                              \
 174         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 175                                        "_X_regular_begin",                    \
 176                                        GREEK_SMALL_LETTER_IOTA_UTF8);         \
 177         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 178                                        "_X_extend",                           \
 179                                        COMBINING_GRAVE_ACCENT_UTF8);          \
 180     } STMT_END
 181
 182 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 183 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 184
 185 /* for use after a quantifier and before an EXACT-like node -- japhy */
 186 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 187  *
 188  * NOTE that *nothing* that affects backtracking should be in here, specifically
 189  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 190  * node that is in between two EXACT like nodes when ascertaining what the required
 191  * "follow" character is. This should probably be moved to regex compile time
 192  * although it may be done at run time beause of the REF possibility - more
 193  * investigation required. -- demerphq
 194 */
 195 #define JUMPABLE(rn) (      \
 196     OP(rn) == OPEN ||       \
 197     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 198     OP(rn) == EVAL ||   \
 199     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 200     OP(rn) == PLUS || OP(rn) == MINMOD || \
 201     OP(rn) == KEEPS || \
 202     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 203 )
 204 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 205
 206 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 207
 208 #if 0
 209 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 210    we don't need this definition. */
 211 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 212 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 213 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 214
 215 #else
 216 /* ... so we use this as its faster. */
 217 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 218 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 219 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 220 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 221
 222 #endif
 223
 224 /*
 225   Search for mandatory following text node; for lookahead, the text must
 226   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 227 */
 228 #define FIND_NEXT_IMPT(rn) STMT_START { \
 229     while (JUMPABLE(rn)) { \
 230         const OPCODE type = OP(rn); \
 231         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 232             rn = NEXTOPER(NEXTOPER(rn)); \
 233         else if (type == PLUS) \
 234             rn = NEXTOPER(rn); \
 235         else if (type == IFMATCH) \
 236             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 237         else rn += NEXT_OFF(rn); \
 238     } \
 239 } STMT_END
 240
 241 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 242  * These are for the pre-composed Hangul syllables, which are all in a
 243  * contiguous block and arranged there in such a way so as to facilitate
 244  * alorithmic determination of their characteristics.  As such, they don't need
 245  * a swash, but can be determined by simple arithmetic.  Almost all are
 246  * GCB=LVT, but every 28th one is a GCB=LV */
 247 #define SBASE 0xAC00    /* Start of block */
 248 #define SCount 11172    /* Length of block */
 249 #define TCount 28
 250
 251 static void restore_pos(pTHX_ void *arg);
 252
 253 #define REGCP_PAREN_ELEMS 3
 254 #define REGCP_OTHER_ELEMS 3
 255 #define REGCP_FRAME_ELEMS 1
 256 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 257  * are needed for the regexp context stack bookkeeping. */
 258
 259 STATIC CHECKPOINT
 260 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 261 {
 262     dVAR;
 263     const int retval = PL_savestack_ix;
 264     const int paren_elems_to_push =
 265                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 266     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 267     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 268     I32 p;
 269     GET_RE_DEBUG_FLAGS_DECL;
 270
 271     PERL_ARGS_ASSERT_REGCPPUSH;
 272
 273     if (paren_elems_to_push < 0)
 274         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 275                    paren_elems_to_push);
 276
 277     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 278         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 279                    " out of range (%lu-%ld)",
 280                    total_elems,
 281                    (unsigned long)maxopenparen,
 282                    (long)parenfloor);
 283
 284     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 285
 286     DEBUG_BUFFERS_r(
 287         if ((int)maxopenparen > (int)parenfloor)
 288             PerlIO_printf(Perl_debug_log,
 289                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 290                 PTR2UV(rex),
 291                 PTR2UV(rex->offs)
 292             );
 293     );
 294     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 295 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 296         SSPUSHINT(rex->offs[p].end);
 297         SSPUSHINT(rex->offs[p].start);
 298         SSPUSHINT(rex->offs[p].start_tmp);
 299         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 300             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 301             (UV)p,
 302             (IV)rex->offs[p].start,
 303             (IV)rex->offs[p].start_tmp,
 304             (IV)rex->offs[p].end
 305         ));
 306     }
 307 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 308     SSPUSHINT(maxopenparen);
 309     SSPUSHINT(rex->lastparen);
 310     SSPUSHINT(rex->lastcloseparen);
 311     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 312
 313     return retval;
 314 }
 315
 316 /* These are needed since we do not localize EVAL nodes: */
 317 #define REGCP_SET(cp)                                           \
 318     DEBUG_STATE_r(                                              \
 319             PerlIO_printf(Perl_debug_log,                       \
 320                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 321                 (IV)PL_savestack_ix));                          \
 322     cp = PL_savestack_ix
 323
 324 #define REGCP_UNWIND(cp)                                        \
 325     DEBUG_STATE_r(                                              \
 326         if (cp != PL_savestack_ix)                              \
 327             PerlIO_printf(Perl_debug_log,                       \
 328                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 329                 (IV)(cp), (IV)PL_savestack_ix));                \
 330     regcpblow(cp)
 331
 332 #define UNWIND_PAREN(lp, lcp)               \
 333     for (n = rex->lastparen; n > lp; n--)   \
 334         rex->offs[n].end = -1;              \
 335     rex->lastparen = n;                     \
 336     rex->lastcloseparen = lcp;
 337
 338
 339 STATIC void
 340 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 341 {
 342     dVAR;
 343     UV i;
 344     U32 paren;
 345     GET_RE_DEBUG_FLAGS_DECL;
 346
 347     PERL_ARGS_ASSERT_REGCPPOP;
 348
 349     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 350     i = SSPOPUV;
 351     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 352     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 353     rex->lastcloseparen = SSPOPINT;
 354     rex->lastparen = SSPOPINT;
 355     *maxopenparen_p = SSPOPINT;
 356
 357     i -= REGCP_OTHER_ELEMS;
 358     /* Now restore the parentheses context. */
 359     DEBUG_BUFFERS_r(
 360         if (i || rex->lastparen + 1 <= rex->nparens)
 361             PerlIO_printf(Perl_debug_log,
 362                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 363                 PTR2UV(rex),
 364                 PTR2UV(rex->offs)
 365             );
 366     );
 367     paren = *maxopenparen_p;
 368     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 369         I32 tmps;
 370         rex->offs[paren].start_tmp = SSPOPINT;
 371         rex->offs[paren].start = SSPOPINT;
 372         tmps = SSPOPINT;
 373         if (paren <= rex->lastparen)
 374             rex->offs[paren].end = tmps;
 375         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 376             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 377             (UV)paren,
 378             (IV)rex->offs[paren].start,
 379             (IV)rex->offs[paren].start_tmp,
 380             (IV)rex->offs[paren].end,
 381             (paren > rex->lastparen ? "(skipped)" : ""));
 382         );
 383         paren--;
 384     }
 385 #if 1
 386     /* It would seem that the similar code in regtry()
 387      * already takes care of this, and in fact it is in
 388      * a better location to since this code can #if 0-ed out
 389      * but the code in regtry() is needed or otherwise tests
 390      * requiring null fields (pat.t#187 and split.t#{13,14}
 391      * (as of patchlevel 7877)  will fail.  Then again,
 392      * this code seems to be necessary or otherwise
 393      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 394      * --jhi updated by dapm */
 395     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 396         if (i > *maxopenparen_p)
 397             rex->offs[i].start = -1;
 398         rex->offs[i].end = -1;
 399         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 400             "    \\%"UVuf": %s   ..-1 undeffing\n",
 401             (UV)i,
 402             (i > *maxopenparen_p) ? "-1" : "  "
 403         ));
 404     }
 405 #endif
 406 }
 407
 408 /* restore the parens and associated vars at savestack position ix,
 409  * but without popping the stack */
 410
 411 STATIC void
 412 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 413 {
 414     I32 tmpix = PL_savestack_ix;
 415     PL_savestack_ix = ix;
 416     regcppop(rex, maxopenparen_p);
 417     PL_savestack_ix = tmpix;
 418 }
 419
 420 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 421
 422 STATIC bool
 423 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 424 {
 425     /* Returns a boolean as to whether or not 'character' is a member of the
 426      * Posix character class given by 'classnum' that should be equivalent to a
 427      * value in the typedef '_char_class_number'.
 428      *
 429      * Ideally this could be replaced by a just an array of function pointers
 430      * to the C library functions that implement the macros this calls.
 431      * However, to compile, the precise function signatures are required, and
 432      * these may vary from platform to to platform.  To avoid having to figure
 433      * out what those all are on each platform, I (khw) am using this method,
 434      * which adds an extra layer of function call overhead (unless the C
 435      * optimizer strips it away).  But we don't particularly care about
 436      * performance with locales anyway. */
 437
 438     switch ((_char_class_number) classnum) {
 439         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 440         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 441         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 442         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 443         case _CC_ENUM_CASED:     return isLOWER_LC(character)
 444                                         || isUPPER_LC(character);
 445         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 446         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 447         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 448         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 449         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 450         case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 451         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 452         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 453         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 454         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 455         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 456         default:    /* VERTSPACE should never occur in locales */
 457             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 458     }
 459
 460     assert(0); /* NOTREACHED */
 461     return FALSE;
 462 }
 463
 464 STATIC bool
 465 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 466 {
 467     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 468      * 'character' is a member of the Posix character class given by 'classnum'
 469      * that should be equivalent to a value in the typedef
 470      * '_char_class_number'.
 471      *
 472      * This just calls isFOO_lc on the code point for the character if it is in
 473      * the range 0-255.  Outside that range, all characters avoid Unicode
 474      * rules, ignoring any locale.  So use the Unicode function if this class
 475      * requires a swash, and use the Unicode macro otherwise. */
 476
 477     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 478
 479     if (UTF8_IS_INVARIANT(*character)) {
 480         return isFOO_lc(classnum, *character);
 481     }
 482     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 483         return isFOO_lc(classnum,
 484                         TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
 485     }
 486
 487     if (classnum < _FIRST_NON_SWASH_CC) {
 488
 489         /* Initialize the swash unless done already */
 490         if (! PL_utf8_swash_ptrs[classnum]) {
 491             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 492             PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
 493                 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
 494         }
 495
 496         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 497                                  character,
 498                                  TRUE /* is UTF */ ));
 499     }
 500
 501     switch ((_char_class_number) classnum) {
 502         case _CC_ENUM_SPACE:
 503         case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 504
 505         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 506         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 507         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 508         default:                 return 0;  /* Things like CNTRL are always
 509                                                below 256 */
 510     }
 511
 512     assert(0); /* NOTREACHED */
 513     return FALSE;
 514 }
 515
 516 /*
 517  * pregexec and friends
 518  */
 519
 520 #ifndef PERL_IN_XSUB_RE
 521 /*
 522  - pregexec - match a regexp against a string
 523  */
 524 I32
 525 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 526          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 527 /* stringarg: the point in the string at which to begin matching */
 528 /* strend:    pointer to null at end of string */
 529 /* strbeg:    real beginning of string */
 530 /* minend:    end of match must be >= minend bytes after stringarg. */
 531 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 532  *            itself is accessed via the pointers above */
 533 /* nosave:    For optimizations. */
 534 {
 535     PERL_ARGS_ASSERT_PREGEXEC;
 536
 537     return
 538         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 539                       nosave ? 0 : REXEC_COPY_STR);
 540 }
 541 #endif
 542
 543 /*
 544  * Need to implement the following flags for reg_anch:
 545  *
 546  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 547  * USE_INTUIT_ML
 548  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 549  * INTUIT_AUTORITATIVE_ML
 550  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 551  * INTUIT_ONCE_ML
 552  *
 553  * Another flag for this function: SECOND_TIME (so that float substrs
 554  * with giant delta may be not rechecked).
 555  */
 556
 557 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 558
 559 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 560    Otherwise, only SvCUR(sv) is used to get strbeg. */
 561
 562 /* XXXX We assume that strpos is strbeg unless sv. */
 563
 564 /* XXXX Some places assume that there is a fixed substring.
 565         An update may be needed if optimizer marks as "INTUITable"
 566         RExen without fixed substrings.  Similarly, it is assumed that
 567         lengths of all the strings are no more than minlen, thus they
 568         cannot come from lookahead.
 569         (Or minlen should take into account lookahead.)
 570   NOTE: Some of this comment is not correct. minlen does now take account
 571   of lookahead/behind. Further research is required. -- demerphq
 572
 573 */
 574
 575 /* A failure to find a constant substring means that there is no need to make
 576    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 577    finding a substring too deep into the string means that fewer calls to
 578    regtry() should be needed.
 579
 580    REx compiler's optimizer found 4 possible hints:
 581         a) Anchored substring;
 582         b) Fixed substring;
 583         c) Whether we are anchored (beginning-of-line or \G);
 584         d) First node (of those at offset 0) which may distinguish positions;
 585    We use a)b)d) and multiline-part of c), and try to find a position in the
 586    string which does not contradict any of them.
 587  */
 588
 589 /* Most of decisions we do here should have been done at compile time.
 590    The nodes of the REx which we used for the search should have been
 591    deleted from the finite automaton. */
 592
 593 char *
 594 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 595                      char *strend, const U32 flags, re_scream_pos_data *data)
 596 {
 597     dVAR;
 598     struct regexp *const prog = ReANY(rx);
 599     I32 start_shift = 0;
 600     /* Should be nonnegative! */
 601     I32 end_shift   = 0;
 602     char *s;
 603     SV *check;
 604     char *strbeg;
 605     char *t;
 606     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 607     I32 ml_anch;
 608     char *other_last = NULL;    /* other substr checked before this */
 609     char *check_at = NULL;              /* check substr found at this pos */
 610     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 611     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 612     RXi_GET_DECL(prog,progi);
 613     bool is_utf8_pat;
 614 #ifdef DEBUGGING
 615     const char * const i_strpos = strpos;
 616 #endif
 617     GET_RE_DEBUG_FLAGS_DECL;
 618
 619     PERL_ARGS_ASSERT_RE_INTUIT_START;
 620     PERL_UNUSED_ARG(flags);
 621     PERL_UNUSED_ARG(data);
 622
 623     RX_MATCH_UTF8_set(rx,utf8_target);
 624
 625     is_utf8_pat = cBOOL(RX_UTF8(rx));
 626
 627     DEBUG_EXECUTE_r(
 628         debug_start_match(rx, utf8_target, strpos, strend,
 629             sv ? "Guessing start of match in sv for"
 630                : "Guessing start of match in string for");
 631               );
 632
 633     /* CHR_DIST() would be more correct here but it makes things slow. */
 634     if (prog->minlen > strend - strpos) {
 635         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 636                               "String too short... [re_intuit_start]\n"));
 637         goto fail;
 638     }
 639
 640     /* XXX we need to pass strbeg as a separate arg: the following is
 641      * guesswork and can be wrong... */
 642     if (sv && SvPOK(sv)) {
 643         char * p   = SvPVX(sv);
 644         STRLEN cur = SvCUR(sv);
 645         if (p <= strpos && strpos < p + cur) {
 646             strbeg = p;
 647             assert(p <= strend && strend <= p + cur);
 648         }
 649         else
 650             strbeg = strend - cur;
 651     }
 652     else
 653         strbeg = strpos;
 654
 655     PL_regeol = strend;
 656     if (utf8_target) {
 657         if (!prog->check_utf8 && prog->check_substr)
 658             to_utf8_substr(prog);
 659         check = prog->check_utf8;
 660     } else {
 661         if (!prog->check_substr && prog->check_utf8) {
 662             if (! to_byte_substr(prog)) {
 663                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 664             }
 665         }
 666         check = prog->check_substr;
 667     }
 668     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 669         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 670                      || ( (prog->extflags & RXf_ANCH_BOL)
 671                           && !multiline ) );    /* Check after \n? */
 672
 673         if (!ml_anch) {
 674           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 675                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 676                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 677                && sv && !SvROK(sv)
 678                && (strpos != strbeg)) {
 679               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 680               goto fail;
 681           }
 682           if (prog->check_offset_min == prog->check_offset_max
 683               && !(prog->extflags & RXf_CANY_SEEN)
 684               && ! multiline)   /* /m can cause \n's to match that aren't
 685                                    accounted for in the string max length.
 686                                    See [perl #115242] */
 687           {
 688             /* Substring at constant offset from beg-of-str... */
 689             I32 slen;
 690
 691             s = HOP3c(strpos, prog->check_offset_min, strend);
 692
 693             if (SvTAIL(check)) {
 694                 slen = SvCUR(check);    /* >= 1 */
 695
 696                 if ( strend - s > slen || strend - s < slen - 1
 697                      || (strend - s == slen && strend[-1] != '\n')) {
 698                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 699                     goto fail_finish;
 700                 }
 701                 /* Now should match s[0..slen-2] */
 702                 slen--;
 703                 if (slen && (*SvPVX_const(check) != *s
 704                              || (slen > 1
 705                                  && memNE(SvPVX_const(check), s, slen)))) {
 706                   report_neq:
 707                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 708                     goto fail_finish;
 709                 }
 710             }
 711             else if (*SvPVX_const(check) != *s
 712                      || ((slen = SvCUR(check)) > 1
 713                          && memNE(SvPVX_const(check), s, slen)))
 714                 goto report_neq;
 715             check_at = s;
 716             goto success_at_start;
 717           }
 718         }
 719         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 720         s = strpos;
 721         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 722         end_shift = prog->check_end_shift;
 723
 724         if (!ml_anch) {
 725             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 726                                          - (SvTAIL(check) != 0);
 727             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 728
 729             if (end_shift < eshift)
 730                 end_shift = eshift;
 731         }
 732     }
 733     else {                              /* Can match at random position */
 734         ml_anch = 0;
 735         s = strpos;
 736         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 737         end_shift = prog->check_end_shift;
 738
 739         /* end shift should be non negative here */
 740     }
 741
 742 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 743     if (end_shift < 0)
 744         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 745                    (IV)end_shift, RX_PRECOMP(prog));
 746 #endif
 747
 748   restart:
 749     /* Find a possible match in the region s..strend by looking for
 750        the "check" substring in the region corrected by start/end_shift. */
 751
 752     {
 753         I32 srch_start_shift = start_shift;
 754         I32 srch_end_shift = end_shift;
 755         U8* start_point;
 756         U8* end_point;
 757         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 758             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 759             srch_start_shift = strbeg - s;
 760         }
 761     DEBUG_OPTIMISE_MORE_r({
 762         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 763             (IV)prog->check_offset_min,
 764             (IV)srch_start_shift,
 765             (IV)srch_end_shift,
 766             (IV)prog->check_end_shift);
 767     });
 768
 769         if (prog->extflags & RXf_CANY_SEEN) {
 770             start_point= (U8*)(s + srch_start_shift);
 771             end_point= (U8*)(strend - srch_end_shift);
 772         } else {
 773             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 774             end_point= HOP3(strend, -srch_end_shift, strbeg);
 775         }
 776         DEBUG_OPTIMISE_MORE_r({
 777             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 778                 (int)(end_point - start_point),
 779                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 780                 start_point);
 781         });
 782
 783         s = fbm_instr( start_point, end_point,
 784                       check, multiline ? FBMrf_MULTILINE : 0);
 785     }
 786     /* Update the count-of-usability, remove useless subpatterns,
 787         unshift s.  */
 788
 789     DEBUG_EXECUTE_r({
 790         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 791             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 792         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 793                           (s ? "Found" : "Did not find"),
 794             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 795                 ? "anchored" : "floating"),
 796             quoted,
 797             RE_SV_TAIL(check),
 798             (s ? " at offset " : "...\n") );
 799     });
 800
 801     if (!s)
 802         goto fail_finish;
 803     /* Finish the diagnostic message */
 804     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 805
 806     /* XXX dmq: first branch is for positive lookbehind...
 807        Our check string is offset from the beginning of the pattern.
 808        So we need to do any stclass tests offset forward from that
 809        point. I think. :-(
 810      */
 811
 812
 813
 814     check_at=s;
 815
 816
 817     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 818        Start with the other substr.
 819        XXXX no SCREAM optimization yet - and a very coarse implementation
 820        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 821                 *always* match.  Probably should be marked during compile...
 822        Probably it is right to do no SCREAM here...
 823      */
 824
 825     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 826                 : (prog->float_substr && prog->anchored_substr))
 827     {
 828         /* Take into account the "other" substring. */
 829         /* XXXX May be hopelessly wrong for UTF... */
 830         if (!other_last)
 831             other_last = strpos;
 832         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 833           do_other_anchored:
 834             {
 835                 char * const last = HOP3c(s, -start_shift, strbeg);
 836                 char *last1, *last2;
 837                 char * const saved_s = s;
 838                 SV* must;
 839
 840                 t = s - prog->check_offset_max;
 841                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 842                     && (!utf8_target
 843                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 844                             && t > strpos)))
 845                     NOOP;
 846                 else
 847                     t = strpos;
 848                 t = HOP3c(t, prog->anchored_offset, strend);
 849                 if (t < other_last)     /* These positions already checked */
 850                     t = other_last;
 851                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 852                 if (last < last1)
 853                     last1 = last;
 854                 /* XXXX It is not documented what units *_offsets are in.
 855                    We assume bytes, but this is clearly wrong.
 856                    Meaning this code needs to be carefully reviewed for errors.
 857                    dmq.
 858                   */
 859
 860                 /* On end-of-str: see comment below. */
 861                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 862                 if (must == &PL_sv_undef) {
 863                     s = (char*)NULL;
 864                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 865                 }
 866                 else
 867                     s = fbm_instr(
 868                         (unsigned char*)t,
 869                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 870                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 871                         must,
 872                         multiline ? FBMrf_MULTILINE : 0
 873                     );
 874                 DEBUG_EXECUTE_r({
 875                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 876                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 877                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 878                         (s ? "Found" : "Contradicts"),
 879                         quoted, RE_SV_TAIL(must));
 880                 });
 881
 882
 883                 if (!s) {
 884                     if (last1 >= last2) {
 885                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 886                                                 ", giving up...\n"));
 887                         goto fail_finish;
 888                     }
 889                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 890                         ", trying floating at offset %ld...\n",
 891                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 892                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 893                     s = HOP3c(last, 1, strend);
 894                     goto restart;
 895                 }
 896                 else {
 897                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 898                           (long)(s - i_strpos)));
 899                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 900                     other_last = HOP3c(s, 1, strend);
 901                     s = saved_s;
 902                     if (t == strpos)
 903                         goto try_at_start;
 904                     goto try_at_offset;
 905                 }
 906             }
 907         }
 908         else {          /* Take into account the floating substring. */
 909             char *last, *last1;
 910             char * const saved_s = s;
 911             SV* must;
 912
 913             t = HOP3c(s, -start_shift, strbeg);
 914             last1 = last =
 915                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 916             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 917                 last = HOP3c(t, prog->float_max_offset, strend);
 918             s = HOP3c(t, prog->float_min_offset, strend);
 919             if (s < other_last)
 920                 s = other_last;
 921  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 922             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 923             /* fbm_instr() takes into account exact value of end-of-str
 924                if the check is SvTAIL(ed).  Since false positives are OK,
 925                and end-of-str is not later than strend we are OK. */
 926             if (must == &PL_sv_undef) {
 927                 s = (char*)NULL;
 928                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 929             }
 930             else
 931                 s = fbm_instr((unsigned char*)s,
 932                               (unsigned char*)last + SvCUR(must)
 933                                   - (SvTAIL(must)!=0),
 934                               must, multiline ? FBMrf_MULTILINE : 0);
 935             DEBUG_EXECUTE_r({
 936                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 937                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 938                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 939                     (s ? "Found" : "Contradicts"),
 940                     quoted, RE_SV_TAIL(must));
 941             });
 942             if (!s) {
 943                 if (last1 == last) {
 944                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 945                                             ", giving up...\n"));
 946                     goto fail_finish;
 947                 }
 948                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 949                     ", trying anchored starting at offset %ld...\n",
 950                     (long)(saved_s + 1 - i_strpos)));
 951                 other_last = last;
 952                 s = HOP3c(t, 1, strend);
 953                 goto restart;
 954             }
 955             else {
 956                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 957                       (long)(s - i_strpos)));
 958                 other_last = s; /* Fix this later. --Hugo */
 959                 s = saved_s;
 960                 if (t == strpos)
 961                     goto try_at_start;
 962                 goto try_at_offset;
 963             }
 964         }
 965     }
 966
 967
 968     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 969
 970     DEBUG_OPTIMISE_MORE_r(
 971         PerlIO_printf(Perl_debug_log,
 972             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 973             (IV)prog->check_offset_min,
 974             (IV)prog->check_offset_max,
 975             (IV)(s-strpos),
 976             (IV)(t-strpos),
 977             (IV)(t-s),
 978             (IV)(strend-strpos)
 979         )
 980     );
 981
 982     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 983         && (!utf8_target
 984             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 985                  && t > strpos)))
 986     {
 987         /* Fixed substring is found far enough so that the match
 988            cannot start at strpos. */
 989       try_at_offset:
 990         if (ml_anch && t[-1] != '\n') {
 991             /* Eventually fbm_*() should handle this, but often
 992                anchored_offset is not 0, so this check will not be wasted. */
 993             /* XXXX In the code below we prefer to look for "^" even in
 994                presence of anchored substrings.  And we search even
 995                beyond the found float position.  These pessimizations
 996                are historical artefacts only.  */
 997           find_anchor:
 998             while (t < strend - prog->minlen) {
 999                 if (*t == '\n') {
1000                     if (t < check_at - prog->check_offset_min) {
1001                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1002                             /* Since we moved from the found position,
1003                                we definitely contradict the found anchored
1004                                substr.  Due to the above check we do not
1005                                contradict "check" substr.
1006                                Thus we can arrive here only if check substr
1007                                is float.  Redo checking for "other"=="fixed".
1008                              */
1009                             strpos = t + 1;
1010                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1011                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1012                             goto do_other_anchored;
1013                         }
1014                         /* We don't contradict the found floating substring. */
1015                         /* XXXX Why not check for STCLASS? */
1016                         s = t + 1;
1017                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1018                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1019                         goto set_useful;
1020                     }
1021                     /* Position contradicts check-string */
1022                     /* XXXX probably better to look for check-string
1023                        than for "\n", so one should lower the limit for t? */
1024                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1025                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1026                     other_last = strpos = s = t + 1;
1027                     goto restart;
1028                 }
1029                 t++;
1030             }
1031             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1032                         PL_colors[0], PL_colors[1]));
1033             goto fail_finish;
1034         }
1035         else {
1036             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1037                         PL_colors[0], PL_colors[1]));
1038         }
1039         s = t;
1040       set_useful:
1041         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1042     }
1043     else {
1044         /* The found string does not prohibit matching at strpos,
1045            - no optimization of calling REx engine can be performed,
1046            unless it was an MBOL and we are not after MBOL,
1047            or a future STCLASS check will fail this. */
1048       try_at_start:
1049         /* Even in this situation we may use MBOL flag if strpos is offset
1050            wrt the start of the string. */
1051         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1052             && (strpos != strbeg) && strpos[-1] != '\n'
1053             /* May be due to an implicit anchor of m{.*foo}  */
1054             && !(prog->intflags & PREGf_IMPLICIT))
1055         {
1056             t = strpos;
1057             goto find_anchor;
1058         }
1059         DEBUG_EXECUTE_r( if (ml_anch)
1060             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1061                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1062         );
1063       success_at_start:
1064         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1065             && (utf8_target ? (
1066                 prog->check_utf8                /* Could be deleted already */
1067                 && --BmUSEFUL(prog->check_utf8) < 0
1068                 && (prog->check_utf8 == prog->float_utf8)
1069             ) : (
1070                 prog->check_substr              /* Could be deleted already */
1071                 && --BmUSEFUL(prog->check_substr) < 0
1072                 && (prog->check_substr == prog->float_substr)
1073             )))
1074         {
1075             /* If flags & SOMETHING - do not do it many times on the same match */
1076             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1077             /* XXX Does the destruction order has to change with utf8_target? */
1078             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1079             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1080             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1081             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1082             check = NULL;                       /* abort */
1083             s = strpos;
1084             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1085                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1086             if (prog->intflags & PREGf_IMPLICIT)
1087                 prog->extflags &= ~RXf_ANCH_MBOL;
1088             /* XXXX This is a remnant of the old implementation.  It
1089                     looks wasteful, since now INTUIT can use many
1090                     other heuristics. */
1091             prog->extflags &= ~RXf_USE_INTUIT;
1092             /* XXXX What other flags might need to be cleared in this branch? */
1093         }
1094         else
1095             s = strpos;
1096     }
1097
1098     /* Last resort... */
1099     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1100     /* trie stclasses are too expensive to use here, we are better off to
1101        leave it to regmatch itself */
1102     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1103         /* minlen == 0 is possible if regstclass is \b or \B,
1104            and the fixed substr is ''$.
1105            Since minlen is already taken into account, s+1 is before strend;
1106            accidentally, minlen >= 1 guaranties no false positives at s + 1
1107            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1108            regstclass does not come from lookahead...  */
1109         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1110            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1111         const U8* const str = (U8*)STRING(progi->regstclass);
1112         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1113                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1114                     : 1);
1115         char * endpos;
1116         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1117             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1118         else if (prog->float_substr || prog->float_utf8)
1119             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1120         else
1121             endpos= strend;
1122
1123         if (checked_upto < s)
1124            checked_upto = s;
1125         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1126                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1127
1128         t = s;
1129         s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1130                             NULL, is_utf8_pat);
1131         if (s) {
1132             checked_upto = s;
1133         } else {
1134 #ifdef DEBUGGING
1135             const char *what = NULL;
1136 #endif
1137             if (endpos == strend) {
1138                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1139                                 "Could not match STCLASS...\n") );
1140                 goto fail;
1141             }
1142             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1143                                    "This position contradicts STCLASS...\n") );
1144             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1145                 goto fail;
1146             checked_upto = HOPBACKc(endpos, start_shift);
1147             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1148                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1149             /* Contradict one of substrings */
1150             if (prog->anchored_substr || prog->anchored_utf8) {
1151                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1152                     DEBUG_EXECUTE_r( what = "anchored" );
1153                   hop_and_restart:
1154                     s = HOP3c(t, 1, strend);
1155                     if (s + start_shift + end_shift > strend) {
1156                         /* XXXX Should be taken into account earlier? */
1157                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1158                                                "Could not match STCLASS...\n") );
1159                         goto fail;
1160                     }
1161                     if (!check)
1162                         goto giveup;
1163                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1164                                 "Looking for %s substr starting at offset %ld...\n",
1165                                  what, (long)(s + start_shift - i_strpos)) );
1166                     goto restart;
1167                 }
1168                 /* Have both, check_string is floating */
1169                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1170                     goto retry_floating_check;
1171                 /* Recheck anchored substring, but not floating... */
1172                 s = check_at;
1173                 if (!check)
1174                     goto giveup;
1175                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1176                           "Looking for anchored substr starting at offset %ld...\n",
1177                           (long)(other_last - i_strpos)) );
1178                 goto do_other_anchored;
1179             }
1180             /* Another way we could have checked stclass at the
1181                current position only: */
1182             if (ml_anch) {
1183                 s = t = t + 1;
1184                 if (!check)
1185                     goto giveup;
1186                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1187                           "Looking for /%s^%s/m starting at offset %ld...\n",
1188                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1189                 goto try_at_offset;
1190             }
1191             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1192                 goto fail;
1193             /* Check is floating substring. */
1194           retry_floating_check:
1195             t = check_at - start_shift;
1196             DEBUG_EXECUTE_r( what = "floating" );
1197             goto hop_and_restart;
1198         }
1199         if (t != s) {
1200             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1201                         "By STCLASS: moving %ld --> %ld\n",
1202                                   (long)(t - i_strpos), (long)(s - i_strpos))
1203                    );
1204         }
1205         else {
1206             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1207                                   "Does not contradict STCLASS...\n");
1208                    );
1209         }
1210     }
1211   giveup:
1212     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1213                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1214                           PL_colors[5], (long)(s - i_strpos)) );
1215     return s;
1216
1217   fail_finish:                          /* Substring not found */
1218     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1219         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1220   fail:
1221     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1222                           PL_colors[4], PL_colors[5]));
1223     return NULL;
1224 }
1225
1226 #define DECL_TRIE_TYPE(scan) \
1227     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1228                     trie_type = ((scan->flags == EXACT) \
1229                               ? (utf8_target ? trie_utf8 : trie_plain) \
1230                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1231
1232 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1233 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1234     STRLEN skiplen;                                                                 \
1235     switch (trie_type) {                                                            \
1236     case trie_utf8_fold:                                                            \
1237         if ( foldlen>0 ) {                                                          \
1238             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1239             foldlen -= len;                                                         \
1240             uscan += len;                                                           \
1241             len=0;                                                                  \
1242         } else {                                                                    \
1243             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1244             len = UTF8SKIP(uc);                                                     \
1245             skiplen = UNISKIP( uvc );                                               \
1246             foldlen -= skiplen;                                                     \
1247             uscan = foldbuf + skiplen;                                              \
1248         }                                                                           \
1249         break;                                                                      \
1250     case trie_latin_utf8_fold:                                                      \
1251         if ( foldlen>0 ) {                                                          \
1252             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1253             foldlen -= len;                                                         \
1254             uscan += len;                                                           \
1255             len=0;                                                                  \
1256         } else {                                                                    \
1257             len = 1;                                                                \
1258             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1259             skiplen = UNISKIP( uvc );                                               \
1260             foldlen -= skiplen;                                                     \
1261             uscan = foldbuf + skiplen;                                              \
1262         }                                                                           \
1263         break;                                                                      \
1264     case trie_utf8:                                                                 \
1265         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1266         break;                                                                      \
1267     case trie_plain:                                                                \
1268         uvc = (UV)*uc;                                                              \
1269         len = 1;                                                                    \
1270     }                                                                               \
1271     if (uvc < 256) {                                                                \
1272         charid = trie->charmap[ uvc ];                                              \
1273     }                                                                               \
1274     else {                                                                          \
1275         charid = 0;                                                                 \
1276         if (widecharmap) {                                                          \
1277             SV** const svpp = hv_fetch(widecharmap,                                 \
1278                         (char*)&uvc, sizeof(UV), 0);                                \
1279             if (svpp)                                                               \
1280                 charid = (U16)SvIV(*svpp);                                          \
1281         }                                                                           \
1282     }                                                                               \
1283 } STMT_END
1284
1285 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1286 STMT_START {                                              \
1287     while (s <= e) {                                      \
1288         if ( (CoNd)                                       \
1289              && (ln == 1 || folder(s, pat_string, ln))    \
1290              && (!reginfo || regtry(reginfo, &s)) )       \
1291             goto got_it;                                  \
1292         s++;                                              \
1293     }                                                     \
1294 } STMT_END
1295
1296 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1297 STMT_START {                                          \
1298     while (s < strend) {                              \
1299         CoDe                                          \
1300         s += UTF8SKIP(s);                             \
1301     }                                                 \
1302 } STMT_END
1303
1304 #define REXEC_FBC_SCAN(CoDe)                          \
1305 STMT_START {                                          \
1306     while (s < strend) {                              \
1307         CoDe                                          \
1308         s++;                                          \
1309     }                                                 \
1310 } STMT_END
1311
1312 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1313 REXEC_FBC_UTF8_SCAN(                                  \
1314     if (CoNd) {                                       \
1315         if (tmp && (!reginfo || regtry(reginfo, &s))) \
1316             goto got_it;                              \
1317         else                                          \
1318             tmp = doevery;                            \
1319     }                                                 \
1320     else                                              \
1321         tmp = 1;                                      \
1322 )
1323
1324 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1325 REXEC_FBC_SCAN(                                       \
1326     if (CoNd) {                                       \
1327         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1328             goto got_it;                              \
1329         else                                          \
1330             tmp = doevery;                            \
1331     }                                                 \
1332     else                                              \
1333         tmp = 1;                                      \
1334 )
1335
1336 #define REXEC_FBC_TRYIT               \
1337 if ((!reginfo || regtry(reginfo, &s))) \
1338     goto got_it
1339
1340 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1341     if (utf8_target) {                                             \
1342         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1343     }                                                          \
1344     else {                                                     \
1345         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1346     }
1347
1348 #define DUMP_EXEC_POS(li,s,doutf8) \
1349     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1350
1351
1352 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1353         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1354         tmp = TEST_NON_UTF8(tmp);                                              \
1355         REXEC_FBC_UTF8_SCAN(                                                   \
1356             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1357                 tmp = !tmp;                                                    \
1358                 IF_SUCCESS;                                                    \
1359             }                                                                  \
1360             else {                                                             \
1361                 IF_FAIL;                                                       \
1362             }                                                                  \
1363         );                                                                     \
1364
1365 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1366         if (s == PL_bostr) {                                                   \
1367             tmp = '\n';                                                        \
1368         }                                                                      \
1369         else {                                                                 \
1370             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1371             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1372         }                                                                      \
1373         tmp = TeSt1_UtF8;                                                      \
1374         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1375         REXEC_FBC_UTF8_SCAN(                                                   \
1376             if (tmp == ! (TeSt2_UtF8)) { \
1377                 tmp = !tmp;                                                    \
1378                 IF_SUCCESS;                                                    \
1379             }                                                                  \
1380             else {                                                             \
1381                 IF_FAIL;                                                       \
1382             }                                                                  \
1383         );                                                                     \
1384
1385 /* The only difference between the BOUND and NBOUND cases is that
1386  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1387  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1388  * with the other one being empty */
1389 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1390     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1391
1392 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1393     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1394
1395 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1396     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1397
1398 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1399     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1400
1401
1402 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1403  * be passed in completely with the variable name being tested, which isn't
1404  * such a clean interface, but this is easier to read than it was before.  We
1405  * are looking for the boundary (or non-boundary between a word and non-word
1406  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1407  * must be different.  Find the "wordness" of the character just prior to this
1408  * one, and compare it with the wordness of this one.  If they differ, we have
1409  * a boundary.  At the beginning of the string, pretend that the previous
1410  * character was a new-line */
1411 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1412     if (utf8_target) {                                                         \
1413                 UTF8_CODE \
1414     }                                                                          \
1415     else {  /* Not utf8 */                                                     \
1416         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1417         tmp = TEST_NON_UTF8(tmp);                                              \
1418         REXEC_FBC_SCAN(                                                        \
1419             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1420                 tmp = !tmp;                                                    \
1421                 IF_SUCCESS;                                                    \
1422             }                                                                  \
1423             else {                                                             \
1424                 IF_FAIL;                                                       \
1425             }                                                                  \
1426         );                                                                     \
1427     }                                                                          \
1428     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1429         goto got_it;
1430
1431 /* We know what class REx starts with.  Try to find this position... */
1432 /* if reginfo is NULL, its a dryrun */
1433 /* annoyingly all the vars in this routine have different names from their counterparts
1434    in regmatch. /grrr */
1435
1436 STATIC char *
1437 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1438     const char *strend, regmatch_info *reginfo, bool is_utf8_pat)
1439 {
1440     dVAR;
1441     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1442     char *pat_string;   /* The pattern's exactish string */
1443     char *pat_end;          /* ptr to end char of pat_string */
1444     re_fold_t folder;   /* Function for computing non-utf8 folds */
1445     const U8 *fold_array;   /* array for folding ords < 256 */
1446     STRLEN ln;
1447     STRLEN lnc;
1448     U8 c1;
1449     U8 c2;
1450     char *e;
1451     I32 tmp = 1;        /* Scratch variable? */
1452     const bool utf8_target = PL_reg_match_utf8;
1453     UV utf8_fold_flags = 0;
1454     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1455                                    with a result inverts that result, as 0^1 =
1456                                    1 and 1^1 = 0 */
1457     _char_class_number classnum;
1458
1459     RXi_GET_DECL(prog,progi);
1460
1461     PERL_ARGS_ASSERT_FIND_BYCLASS;
1462
1463     /* We know what class it must start with. */
1464     switch (OP(c)) {
1465     case ANYOF:
1466     case ANYOF_SYNTHETIC:
1467     case ANYOF_WARN_SUPER:
1468         if (utf8_target) {
1469             REXEC_FBC_UTF8_CLASS_SCAN(
1470                       reginclass(prog, c, (U8*)s, utf8_target));
1471         }
1472         else {
1473             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1474         }
1475         break;
1476     case CANY:
1477         REXEC_FBC_SCAN(
1478             if (tmp && (!reginfo || regtry(reginfo, &s)))
1479                 goto got_it;
1480             else
1481                 tmp = doevery;
1482         );
1483         break;
1484
1485     case EXACTFA:
1486         if (is_utf8_pat || utf8_target) {
1487             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1488             goto do_exactf_utf8;
1489         }
1490         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1491         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1492         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1493
1494     case EXACTF:
1495         if (utf8_target) {
1496
1497             /* regcomp.c already folded this if pattern is in UTF-8 */
1498             utf8_fold_flags = 0;
1499             goto do_exactf_utf8;
1500         }
1501         fold_array = PL_fold;
1502         folder = foldEQ;
1503         goto do_exactf_non_utf8;
1504
1505     case EXACTFL:
1506         if (is_utf8_pat || utf8_target) {
1507             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1508             goto do_exactf_utf8;
1509         }
1510         fold_array = PL_fold_locale;
1511         folder = foldEQ_locale;
1512         goto do_exactf_non_utf8;
1513
1514     case EXACTFU_SS:
1515         if (is_utf8_pat) {
1516             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1517         }
1518         goto do_exactf_utf8;
1519
1520     case EXACTFU_TRICKYFOLD:
1521     case EXACTFU:
1522         if (is_utf8_pat || utf8_target) {
1523             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1524             goto do_exactf_utf8;
1525         }
1526
1527         /* Any 'ss' in the pattern should have been replaced by regcomp,
1528          * so we don't have to worry here about this single special case
1529          * in the Latin1 range */
1530         fold_array = PL_fold_latin1;
1531         folder = foldEQ_latin1;
1532
1533         /* FALL THROUGH */
1534
1535     do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1536                            are no glitches with fold-length differences
1537                            between the target string and pattern */
1538
1539         /* The idea in the non-utf8 EXACTF* cases is to first find the
1540          * first character of the EXACTF* node and then, if necessary,
1541          * case-insensitively compare the full text of the node.  c1 is the
1542          * first character.  c2 is its fold.  This logic will not work for
1543          * Unicode semantics and the german sharp ss, which hence should
1544          * not be compiled into a node that gets here. */
1545         pat_string = STRING(c);
1546         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1547
1548         /* We know that we have to match at least 'ln' bytes (which is the
1549          * same as characters, since not utf8).  If we have to match 3
1550          * characters, and there are only 2 availabe, we know without
1551          * trying that it will fail; so don't start a match past the
1552          * required minimum number from the far end */
1553         e = HOP3c(strend, -((I32)ln), s);
1554
1555         if (!reginfo && e < s) {
1556             e = s;                      /* Due to minlen logic of intuit() */
1557         }
1558
1559         c1 = *pat_string;
1560         c2 = fold_array[c1];
1561         if (c1 == c2) { /* If char and fold are the same */
1562             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1563         }
1564         else {
1565             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1566         }
1567         break;
1568
1569     do_exactf_utf8:
1570     {
1571         unsigned expansion;
1572
1573         /* If one of the operands is in utf8, we can't use the simpler folding
1574          * above, due to the fact that many different characters can have the
1575          * same fold, or portion of a fold, or different- length fold */
1576         pat_string = STRING(c);
1577         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1578         pat_end = pat_string + ln;
1579         lnc = is_utf8_pat       /* length to match in characters */
1580                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1581                 : ln;
1582
1583         /* We have 'lnc' characters to match in the pattern, but because of
1584          * multi-character folding, each character in the target can match
1585          * up to 3 characters (Unicode guarantees it will never exceed
1586          * this) if it is utf8-encoded; and up to 2 if not (based on the
1587          * fact that the Latin 1 folds are already determined, and the
1588          * only multi-char fold in that range is the sharp-s folding to
1589          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1590          * string character.  Adjust lnc accordingly, rounding up, so that
1591          * if we need to match at least 4+1/3 chars, that really is 5. */
1592         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1593         lnc = (lnc + expansion - 1) / expansion;
1594
1595         /* As in the non-UTF8 case, if we have to match 3 characters, and
1596          * only 2 are left, it's guaranteed to fail, so don't start a
1597          * match that would require us to go beyond the end of the string
1598          */
1599         e = HOP3c(strend, -((I32)lnc), s);
1600
1601         if (!reginfo && e < s) {
1602             e = s;                      /* Due to minlen logic of intuit() */
1603         }
1604
1605         /* XXX Note that we could recalculate e to stop the loop earlier,
1606          * as the worst case expansion above will rarely be met, and as we
1607          * go along we would usually find that e moves further to the left.
1608          * This would happen only after we reached the point in the loop
1609          * where if there were no expansion we should fail.  Unclear if
1610          * worth the expense */
1611
1612         while (s <= e) {
1613             char *my_strend= (char *)strend;
1614             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1615                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1616                 && (!reginfo || regtry(reginfo, &s)) )
1617             {
1618                 goto got_it;
1619             }
1620             s += (utf8_target) ? UTF8SKIP(s) : 1;
1621         }
1622         break;
1623     }
1624     case BOUNDL:
1625         RXp_MATCH_TAINTED_on(prog);
1626         FBC_BOUND(isWORDCHAR_LC,
1627                   isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1628                   isWORDCHAR_LC_utf8((U8*)s));
1629         break;
1630     case NBOUNDL:
1631         RXp_MATCH_TAINTED_on(prog);
1632         FBC_NBOUND(isWORDCHAR_LC,
1633                    isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1634                    isWORDCHAR_LC_utf8((U8*)s));
1635         break;
1636     case BOUND:
1637         FBC_BOUND(isWORDCHAR,
1638                   isWORDCHAR_uni(tmp),
1639                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1640         break;
1641     case BOUNDA:
1642         FBC_BOUND_NOLOAD(isWORDCHAR_A,
1643                          isWORDCHAR_A(tmp),
1644                          isWORDCHAR_A((U8*)s));
1645         break;
1646     case NBOUND:
1647         FBC_NBOUND(isWORDCHAR,
1648                    isWORDCHAR_uni(tmp),
1649                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1650         break;
1651     case NBOUNDA:
1652         FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1653                           isWORDCHAR_A(tmp),
1654                           isWORDCHAR_A((U8*)s));
1655         break;
1656     case BOUNDU:
1657         FBC_BOUND(isWORDCHAR_L1,
1658                   isWORDCHAR_uni(tmp),
1659                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1660         break;
1661     case NBOUNDU:
1662         FBC_NBOUND(isWORDCHAR_L1,
1663                    isWORDCHAR_uni(tmp),
1664                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1665         break;
1666     case LNBREAK:
1667         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1668                         is_LNBREAK_latin1_safe(s, strend)
1669         );
1670         break;
1671
1672     /* The argument to all the POSIX node types is the class number to pass to
1673      * _generic_isCC() to build a mask for searching in PL_charclass[] */
1674
1675     case NPOSIXL:
1676         to_complement = 1;
1677         /* FALLTHROUGH */
1678
1679     case POSIXL:
1680         RXp_MATCH_TAINTED_on(prog);
1681         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1682                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1683         break;
1684
1685     case NPOSIXD:
1686         to_complement = 1;
1687         /* FALLTHROUGH */
1688
1689     case POSIXD:
1690         if (utf8_target) {
1691             goto posix_utf8;
1692         }
1693         goto posixa;
1694
1695     case NPOSIXA:
1696         if (utf8_target) {
1697             /* The complement of something that matches only ASCII matches all
1698              * UTF-8 variant code points, plus everything in ASCII that isn't
1699              * in the class */
1700             REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1701                                       || ! _generic_isCC_A(*s, FLAGS(c)));
1702             break;
1703         }
1704
1705         to_complement = 1;
1706         /* FALLTHROUGH */
1707
1708     case POSIXA:
1709       posixa:
1710         /* Don't need to worry about utf8, as it can match only a single
1711          * byte invariant character. */
1712         REXEC_FBC_CLASS_SCAN(
1713                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1714         break;
1715
1716     case NPOSIXU:
1717         to_complement = 1;
1718         /* FALLTHROUGH */
1719
1720     case POSIXU:
1721         if (! utf8_target) {
1722             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1723                                                                     FLAGS(c))));
1724         }
1725         else {
1726
1727       posix_utf8:
1728             classnum = (_char_class_number) FLAGS(c);
1729             if (classnum < _FIRST_NON_SWASH_CC) {
1730                 while (s < strend) {
1731
1732                     /* We avoid loading in the swash as long as possible, but
1733                      * should we have to, we jump to a separate loop.  This
1734                      * extra 'if' statement is what keeps this code from being
1735                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1736                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
1737                         goto found_above_latin1;
1738                     }
1739                     if ((UTF8_IS_INVARIANT(*s)
1740                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1741                                                                 classnum)))
1742                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
1743                             && to_complement ^ cBOOL(
1744                                 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1745                                               classnum))))
1746                     {
1747                         if (tmp && (!reginfo || regtry(reginfo, &s)))
1748                             goto got_it;
1749                         else {
1750                             tmp = doevery;
1751                         }
1752                     }
1753                     else {
1754                         tmp = 1;
1755                     }
1756                     s += UTF8SKIP(s);
1757                 }
1758             }
1759             else switch (classnum) {    /* These classes are implemented as
1760                                            macros */
1761                 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1762                                         revert the change of \v matching this */
1763                     /* FALL THROUGH */
1764
1765                 case _CC_ENUM_PSXSPC:
1766                     REXEC_FBC_UTF8_CLASS_SCAN(
1767                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
1768                     break;
1769
1770                 case _CC_ENUM_BLANK:
1771                     REXEC_FBC_UTF8_CLASS_SCAN(
1772                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
1773                     break;
1774
1775                 case _CC_ENUM_XDIGIT:
1776                     REXEC_FBC_UTF8_CLASS_SCAN(
1777                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1778                     break;
1779
1780                 case _CC_ENUM_VERTSPACE:
1781                     REXEC_FBC_UTF8_CLASS_SCAN(
1782                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
1783                     break;
1784
1785                 case _CC_ENUM_CNTRL:
1786                     REXEC_FBC_UTF8_CLASS_SCAN(
1787                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
1788                     break;
1789
1790                 default:
1791                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1792                     assert(0); /* NOTREACHED */
1793             }
1794         }
1795         break;
1796
1797       found_above_latin1:   /* Here we have to load a swash to get the result
1798                                for the current code point */
1799         if (! PL_utf8_swash_ptrs[classnum]) {
1800             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1801             PL_utf8_swash_ptrs[classnum] =
1802                     _core_swash_init("utf8", swash_property_names[classnum],
1803                                      &PL_sv_undef, 1, 0, NULL, &flags);
1804         }
1805
1806         /* This is a copy of the loop above for swash classes, though using the
1807          * FBC macro instead of being expanded out.  Since we've loaded the
1808          * swash, we don't have to check for that each time through the loop */
1809         REXEC_FBC_UTF8_CLASS_SCAN(
1810                 to_complement ^ cBOOL(_generic_utf8(
1811                                       classnum,
1812                                       s,
1813                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
1814                                                   (U8 *) s, TRUE))));
1815         break;
1816
1817     case AHOCORASICKC:
1818     case AHOCORASICK:
1819         {
1820             DECL_TRIE_TYPE(c);
1821             /* what trie are we using right now */
1822             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1823             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1824             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1825
1826             const char *last_start = strend - trie->minlen;
1827 #ifdef DEBUGGING
1828             const char *real_start = s;
1829 #endif
1830             STRLEN maxlen = trie->maxlen;
1831             SV *sv_points;
1832             U8 **points; /* map of where we were in the input string
1833                             when reading a given char. For ASCII this
1834                             is unnecessary overhead as the relationship
1835                             is always 1:1, but for Unicode, especially
1836                             case folded Unicode this is not true. */
1837             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1838             U8 *bitmap=NULL;
1839
1840
1841             GET_RE_DEBUG_FLAGS_DECL;
1842
1843             /* We can't just allocate points here. We need to wrap it in
1844              * an SV so it gets freed properly if there is a croak while
1845              * running the match */
1846             ENTER;
1847             SAVETMPS;
1848             sv_points=newSV(maxlen * sizeof(U8 *));
1849             SvCUR_set(sv_points,
1850                 maxlen * sizeof(U8 *));
1851             SvPOK_on(sv_points);
1852             sv_2mortal(sv_points);
1853             points=(U8**)SvPV_nolen(sv_points );
1854             if ( trie_type != trie_utf8_fold
1855                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
1856             {
1857                 if (trie->bitmap)
1858                     bitmap=(U8*)trie->bitmap;
1859                 else
1860                     bitmap=(U8*)ANYOF_BITMAP(c);
1861             }
1862             /* this is the Aho-Corasick algorithm modified a touch
1863                to include special handling for long "unknown char" sequences.
1864                The basic idea being that we use AC as long as we are dealing
1865                with a possible matching char, when we encounter an unknown char
1866                (and we have not encountered an accepting state) we scan forward
1867                until we find a legal starting char.
1868                AC matching is basically that of trie matching, except that when
1869                we encounter a failing transition, we fall back to the current
1870                states "fail state", and try the current char again, a process
1871                we repeat until we reach the root state, state 1, or a legal
1872                transition. If we fail on the root state then we can either
1873                terminate if we have reached an accepting state previously, or
1874                restart the entire process from the beginning if we have not.
1875
1876              */
1877             while (s <= last_start) {
1878                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1879                 U8 *uc = (U8*)s;
1880                 U16 charid = 0;
1881                 U32 base = 1;
1882                 U32 state = 1;
1883                 UV uvc = 0;
1884                 STRLEN len = 0;
1885                 STRLEN foldlen = 0;
1886                 U8 *uscan = (U8*)NULL;
1887                 U8 *leftmost = NULL;
1888 #ifdef DEBUGGING
1889                 U32 accepted_word= 0;
1890 #endif
1891                 U32 pointpos = 0;
1892
1893                 while ( state && uc <= (U8*)strend ) {
1894                     int failed=0;
1895                     U32 word = aho->states[ state ].wordnum;
1896
1897                     if( state==1 ) {
1898                         if ( bitmap ) {
1899                             DEBUG_TRIE_EXECUTE_r(
1900                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1901                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1902                                         (char *)uc, utf8_target );
1903                                     PerlIO_printf( Perl_debug_log,
1904                                         " Scanning for legal start char...\n");
1905                                 }
1906                             );
1907                             if (utf8_target) {
1908                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1909                                     uc += UTF8SKIP(uc);
1910                                 }
1911                             } else {
1912                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1913                                     uc++;
1914                                 }
1915                             }
1916                             s= (char *)uc;
1917                         }
1918                         if (uc >(U8*)last_start) break;
1919                     }
1920
1921                     if ( word ) {
1922                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1923                         if (!leftmost || lpos < leftmost) {
1924                             DEBUG_r(accepted_word=word);
1925                             leftmost= lpos;
1926                         }
1927                         if (base==0) break;
1928
1929                     }
1930                     points[pointpos++ % maxlen]= uc;
1931                     if (foldlen || uc < (U8*)strend) {
1932                         REXEC_TRIE_READ_CHAR(trie_type, trie,
1933                                          widecharmap, uc,
1934                                          uscan, len, uvc, charid, foldlen,
1935                                          foldbuf, uniflags);
1936                         DEBUG_TRIE_EXECUTE_r({
1937                             dump_exec_pos( (char *)uc, c, strend,
1938                                         real_start, s, utf8_target);
1939                             PerlIO_printf(Perl_debug_log,
1940                                 " Charid:%3u CP:%4"UVxf" ",
1941                                  charid, uvc);
1942                         });
1943                     }
1944                     else {
1945                         len = 0;
1946                         charid = 0;
1947                     }
1948
1949
1950                     do {
1951 #ifdef DEBUGGING
1952                         word = aho->states[ state ].wordnum;
1953 #endif
1954                         base = aho->states[ state ].trans.base;
1955
1956                         DEBUG_TRIE_EXECUTE_r({
1957                             if (failed)
1958                                 dump_exec_pos( (char *)uc, c, strend, real_start,
1959                                     s,   utf8_target );
1960                             PerlIO_printf( Perl_debug_log,
1961                                 "%sState: %4"UVxf", word=%"UVxf,
1962                                 failed ? " Fail transition to " : "",
1963                                 (UV)state, (UV)word);
1964                         });
1965                         if ( base ) {
1966                             U32 tmp;
1967                             I32 offset;
1968                             if (charid &&
1969                                  ( ((offset = base + charid
1970                                     - 1 - trie->uniquecharcount)) >= 0)
1971                                  && ((U32)offset < trie->lasttrans)
1972                                  && trie->trans[offset].check == state
1973                                  && (tmp=trie->trans[offset].next))
1974                             {
1975                                 DEBUG_TRIE_EXECUTE_r(
1976                                     PerlIO_printf( Perl_debug_log," - legal\n"));
1977                                 state = tmp;
1978                                 break;
1979                             }
1980                             else {
1981                                 DEBUG_TRIE_EXECUTE_r(
1982                                     PerlIO_printf( Perl_debug_log," - fail\n"));
1983                                 failed = 1;
1984                                 state = aho->fail[state];
1985                             }
1986                         }
1987                         else {
1988                             /* we must be accepting here */
1989                             DEBUG_TRIE_EXECUTE_r(
1990                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
1991                             failed = 1;
1992                             break;
1993                         }
1994                     } while(state);
1995                     uc += len;
1996                     if (failed) {
1997                         if (leftmost)
1998                             break;
1999                         if (!state) state = 1;
2000                     }
2001                 }
2002                 if ( aho->states[ state ].wordnum ) {
2003                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2004                     if (!leftmost || lpos < leftmost) {
2005                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2006                         leftmost = lpos;
2007                     }
2008                 }
2009                 if (leftmost) {
2010                     s = (char*)leftmost;
2011                     DEBUG_TRIE_EXECUTE_r({
2012                         PerlIO_printf(
2013                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2014                             (UV)accepted_word, (IV)(s - real_start)
2015                         );
2016                     });
2017                     if (!reginfo || regtry(reginfo, &s)) {
2018                         FREETMPS;
2019                         LEAVE;
2020                         goto got_it;
2021                     }
2022                     s = HOPc(s,1);
2023                     DEBUG_TRIE_EXECUTE_r({
2024                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2025                     });
2026                 } else {
2027                     DEBUG_TRIE_EXECUTE_r(
2028                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2029                     break;
2030                 }
2031             }
2032             FREETMPS;
2033             LEAVE;
2034         }
2035         break;
2036     default:
2037         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2038         break;
2039     }
2040     return 0;
2041   got_it:
2042     return s;
2043 }
2044
2045
2046 /*
2047  - regexec_flags - match a regexp against a string
2048  */
2049 I32
2050 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2051               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2052 /* stringarg: the point in the string at which to begin matching */
2053 /* strend:    pointer to null at end of string */
2054 /* strbeg:    real beginning of string */
2055 /* minend:    end of match must be >= minend bytes after stringarg. */
2056 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2057  *            itself is accessed via the pointers above */
2058 /* data:      May be used for some additional optimizations.
2059               Currently its only used, with a U32 cast, for transmitting
2060               the ganch offset when doing a /g match. This will change */
2061 /* nosave:    For optimizations. */
2062
2063 {
2064     dVAR;
2065     struct regexp *const prog = ReANY(rx);
2066     char *s;
2067     regnode *c;
2068     char *startpos = stringarg;
2069     I32 minlen;         /* must match at least this many chars */
2070     I32 dontbother = 0; /* how many characters not to try at end */
2071     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2072     I32 scream_pos = -1;                /* Internal iterator of scream. */
2073     char *scream_olds = NULL;
2074     const bool utf8_target = cBOOL(DO_UTF8(sv));
2075     I32 multiline;
2076     RXi_GET_DECL(prog,progi);
2077     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2078     regexp_paren_pair *swap = NULL;
2079     GET_RE_DEBUG_FLAGS_DECL;
2080
2081     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2082     PERL_UNUSED_ARG(data);
2083
2084     /* Be paranoid... */
2085     if (prog == NULL || startpos == NULL) {
2086         Perl_croak(aTHX_ "NULL regexp parameter");
2087         return 0;
2088     }
2089
2090     multiline = prog->extflags & RXf_PMf_MULTILINE;
2091     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2092
2093     RX_MATCH_UTF8_set(rx, utf8_target);
2094     DEBUG_EXECUTE_r(
2095         debug_start_match(rx, utf8_target, startpos, strend,
2096         "Matching");
2097     );
2098
2099     minlen = prog->minlen;
2100
2101     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2102         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2103                               "String too short [regexec_flags]...\n"));
2104         goto phooey;
2105     }
2106
2107
2108     /* Check validity of program. */
2109     if (UCHARAT(progi->program) != REG_MAGIC) {
2110         Perl_croak(aTHX_ "corrupted regexp program");
2111     }
2112
2113     RX_MATCH_TAINTED_off(rx);
2114     PL_reg_state.re_state_eval_setup_done = FALSE;
2115     PL_reg_maxiter = 0;
2116
2117     reginfo.is_utf8_pat = cBOOL(RX_UTF8(rx));
2118     reginfo.warned = FALSE;
2119     /* Mark beginning of line for ^ and lookbehind. */
2120     reginfo.bol = startpos; /* XXX not used ??? */
2121     PL_bostr  = strbeg;
2122     reginfo.sv = sv;
2123
2124     /* Mark end of line for $ (and such) */
2125     PL_regeol = strend;
2126
2127     /* see how far we have to get to not match where we matched before */
2128     reginfo.till = startpos+minend;
2129
2130     /* If there is a "must appear" string, look for it. */
2131     s = startpos;
2132
2133     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2134         MAGIC *mg;
2135         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2136             reginfo.ganch = startpos + prog->gofs;
2137             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2138               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2139         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2140                   && SvMAGIC(sv)
2141                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2142                   && mg->mg_len >= 0) {
2143             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2144             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2145                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2146
2147             if (prog->extflags & RXf_ANCH_GPOS) {
2148                 if (s > reginfo.ganch)
2149                     goto phooey;
2150                 s = reginfo.ganch - prog->gofs;
2151                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2152                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2153                 if (s < strbeg)
2154                     goto phooey;
2155             }
2156         }
2157         else if (data) {
2158             reginfo.ganch = strbeg + PTR2UV(data);
2159             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2160                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2161
2162         } else {                                /* pos() not defined */
2163             reginfo.ganch = strbeg;
2164             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2165                  "GPOS: reginfo.ganch = strbeg\n"));
2166         }
2167     }
2168     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2169         /* We have to be careful. If the previous successful match
2170            was from this regex we don't want a subsequent partially
2171            successful match to clobber the old results.
2172            So when we detect this possibility we add a swap buffer
2173            to the re, and switch the buffer each match. If we fail,
2174            we switch it back; otherwise we leave it swapped.
2175         */
2176         swap = prog->offs;
2177         /* do we need a save destructor here for eval dies? */
2178         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2179         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2180             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2181             PTR2UV(prog),
2182             PTR2UV(swap),
2183             PTR2UV(prog->offs)
2184         ));
2185     }
2186     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2187         re_scream_pos_data d;
2188
2189         d.scream_olds = &scream_olds;
2190         d.scream_pos = &scream_pos;
2191         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2192         if (!s) {
2193             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2194             goto phooey;        /* not present */
2195         }
2196     }
2197
2198
2199
2200     /* Simplest case:  anchored match need be tried only once. */
2201     /*  [unless only anchor is BOL and multiline is set] */
2202     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2203         if (s == startpos && regtry(&reginfo, &startpos))
2204             goto got_it;
2205         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2206                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2207         {
2208             char *end;
2209
2210             if (minlen)
2211                 dontbother = minlen - 1;
2212             end = HOP3c(strend, -dontbother, strbeg) - 1;
2213             /* for multiline we only have to try after newlines */
2214             if (prog->check_substr || prog->check_utf8) {
2215                 /* because of the goto we can not easily reuse the macros for bifurcating the
2216                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2217                 if (utf8_target) {
2218                     if (s == startpos)
2219                         goto after_try_utf8;
2220                     while (1) {
2221                         if (regtry(&reginfo, &s)) {
2222                             goto got_it;
2223                         }
2224                       after_try_utf8:
2225                         if (s > end) {
2226                             goto phooey;
2227                         }
2228                         if (prog->extflags & RXf_USE_INTUIT) {
2229                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2230                             if (!s) {
2231                                 goto phooey;
2232                             }
2233                         }
2234                         else {
2235                             s += UTF8SKIP(s);
2236                         }
2237                     }
2238                 } /* end search for check string in unicode */
2239                 else {
2240                     if (s == startpos) {
2241                         goto after_try_latin;
2242                     }
2243                     while (1) {
2244                         if (regtry(&reginfo, &s)) {
2245                             goto got_it;
2246                         }
2247                       after_try_latin:
2248                         if (s > end) {
2249                             goto phooey;
2250                         }
2251                         if (prog->extflags & RXf_USE_INTUIT) {
2252                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2253                             if (!s) {
2254                                 goto phooey;
2255                             }
2256                         }
2257                         else {
2258                             s++;
2259                         }
2260                     }
2261                 } /* end search for check string in latin*/
2262             } /* end search for check string */
2263             else { /* search for newline */
2264                 if (s > startpos) {
2265                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2266                     s--;
2267                 }
2268                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2269                 while (s <= end) { /* note it could be possible to match at the end of the string */
2270                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2271                         if (regtry(&reginfo, &s))
2272                             goto got_it;
2273                     }
2274                 }
2275             } /* end search for newline */
2276         } /* end anchored/multiline check string search */
2277         goto phooey;
2278     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2279     {
2280         /* the warning about reginfo.ganch being used without initialization
2281            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2282            and we only enter this block when the same bit is set. */
2283         char *tmp_s = reginfo.ganch - prog->gofs;
2284
2285         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2286             goto got_it;
2287         goto phooey;
2288     }
2289
2290     /* Messy cases:  unanchored match. */
2291     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2292         /* we have /x+whatever/ */
2293         /* it must be a one character string (XXXX Except is_utf8_pat?) */
2294         char ch;
2295 #ifdef DEBUGGING
2296         int did_match = 0;
2297 #endif
2298         if (utf8_target) {
2299             if (! prog->anchored_utf8) {
2300                 to_utf8_substr(prog);
2301             }
2302             ch = SvPVX_const(prog->anchored_utf8)[0];
2303             REXEC_FBC_SCAN(
2304                 if (*s == ch) {
2305                     DEBUG_EXECUTE_r( did_match = 1 );
2306                     if (regtry(&reginfo, &s)) goto got_it;
2307                     s += UTF8SKIP(s);
2308                     while (s < strend && *s == ch)
2309                         s += UTF8SKIP(s);
2310                 }
2311             );
2312
2313         }
2314         else {
2315             if (! prog->anchored_substr) {
2316                 if (! to_byte_substr(prog)) {
2317                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2318                 }
2319             }
2320             ch = SvPVX_const(prog->anchored_substr)[0];
2321             REXEC_FBC_SCAN(
2322                 if (*s == ch) {
2323                     DEBUG_EXECUTE_r( did_match = 1 );
2324                     if (regtry(&reginfo, &s)) goto got_it;
2325                     s++;
2326                     while (s < strend && *s == ch)
2327                         s++;
2328                 }
2329             );
2330         }
2331         DEBUG_EXECUTE_r(if (!did_match)
2332                 PerlIO_printf(Perl_debug_log,
2333                                   "Did not find anchored character...\n")
2334                );
2335     }
2336     else if (prog->anchored_substr != NULL
2337               || prog->anchored_utf8 != NULL
2338               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2339                   && prog->float_max_offset < strend - s)) {
2340         SV *must;
2341         I32 back_max;
2342         I32 back_min;
2343         char *last;
2344         char *last1;            /* Last position checked before */
2345 #ifdef DEBUGGING
2346         int did_match = 0;
2347 #endif
2348         if (prog->anchored_substr || prog->anchored_utf8) {
2349             if (utf8_target) {
2350                 if (! prog->anchored_utf8) {
2351                     to_utf8_substr(prog);
2352                 }
2353                 must = prog->anchored_utf8;
2354             }
2355             else {
2356                 if (! prog->anchored_substr) {
2357                     if (! to_byte_substr(prog)) {
2358                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2359                     }
2360                 }
2361                 must = prog->anchored_substr;
2362             }
2363             back_max = back_min = prog->anchored_offset;
2364         } else {
2365             if (utf8_target) {
2366                 if (! prog->float_utf8) {
2367                     to_utf8_substr(prog);
2368                 }
2369                 must = prog->float_utf8;
2370             }
2371             else {
2372                 if (! prog->float_substr) {
2373                     if (! to_byte_substr(prog)) {
2374                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2375                     }
2376                 }
2377                 must = prog->float_substr;
2378             }
2379             back_max = prog->float_max_offset;
2380             back_min = prog->float_min_offset;
2381         }
2382
2383         if (back_min<0) {
2384             last = strend;
2385         } else {
2386             last = HOP3c(strend,        /* Cannot start after this */
2387                   -(I32)(CHR_SVLEN(must)
2388                          - (SvTAIL(must) != 0) + back_min), strbeg);
2389         }
2390         if (s > PL_bostr)
2391             last1 = HOPc(s, -1);
2392         else
2393             last1 = s - 1;      /* bogus */
2394
2395         /* XXXX check_substr already used to find "s", can optimize if
2396            check_substr==must. */
2397         scream_pos = -1;
2398         dontbother = end_shift;
2399         strend = HOPc(strend, -dontbother);
2400         while ( (s <= last) &&
2401                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2402                                   (unsigned char*)strend, must,
2403                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2404             DEBUG_EXECUTE_r( did_match = 1 );
2405             if (HOPc(s, -back_max) > last1) {
2406                 last1 = HOPc(s, -back_min);
2407                 s = HOPc(s, -back_max);
2408             }
2409             else {
2410                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2411
2412                 last1 = HOPc(s, -back_min);
2413                 s = t;
2414             }
2415             if (utf8_target) {
2416                 while (s <= last1) {
2417                     if (regtry(&reginfo, &s))
2418                         goto got_it;
2419                     if (s >= last1) {
2420                         s++; /* to break out of outer loop */
2421                         break;
2422                     }
2423                     s += UTF8SKIP(s);
2424                 }
2425             }
2426             else {
2427                 while (s <= last1) {
2428                     if (regtry(&reginfo, &s))
2429                         goto got_it;
2430                     s++;
2431                 }
2432             }
2433         }
2434         DEBUG_EXECUTE_r(if (!did_match) {
2435             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2436                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2437             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2438                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2439                                ? "anchored" : "floating"),
2440                 quoted, RE_SV_TAIL(must));
2441         });
2442         goto phooey;
2443     }
2444     else if ( (c = progi->regstclass) ) {
2445         if (minlen) {
2446             const OPCODE op = OP(progi->regstclass);
2447             /* don't bother with what can't match */
2448             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2449                 strend = HOPc(strend, -(minlen - 1));
2450         }
2451         DEBUG_EXECUTE_r({
2452             SV * const prop = sv_newmortal();
2453             regprop(prog, prop, c);
2454             {
2455                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2456                     s,strend-s,60);
2457                 PerlIO_printf(Perl_debug_log,
2458                     "Matching stclass %.*s against %s (%d bytes)\n",
2459                     (int)SvCUR(prop), SvPVX_const(prop),
2460                      quoted, (int)(strend - s));
2461             }
2462         });
2463         if (find_byclass(prog, c, s, strend, &reginfo, reginfo.is_utf8_pat))
2464             goto got_it;
2465         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2466     }
2467     else {
2468         dontbother = 0;
2469         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2470             /* Trim the end. */
2471             char *last= NULL;
2472             SV* float_real;
2473             STRLEN len;
2474             const char *little;
2475
2476             if (utf8_target) {
2477                 if (! prog->float_utf8) {
2478                     to_utf8_substr(prog);
2479                 }
2480                 float_real = prog->float_utf8;
2481             }
2482             else {
2483                 if (! prog->float_substr) {
2484                     if (! to_byte_substr(prog)) {
2485                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2486                     }
2487                 }
2488                 float_real = prog->float_substr;
2489             }
2490
2491             little = SvPV_const(float_real, len);
2492             if (SvTAIL(float_real)) {
2493                     /* This means that float_real contains an artificial \n on
2494                      * the end due to the presence of something like this:
2495                      * /foo$/ where we can match both "foo" and "foo\n" at the
2496                      * end of the string.  So we have to compare the end of the
2497                      * string first against the float_real without the \n and
2498                      * then against the full float_real with the string.  We
2499                      * have to watch out for cases where the string might be
2500                      * smaller than the float_real or the float_real without
2501                      * the \n. */
2502                     char *checkpos= strend - len;
2503                     DEBUG_OPTIMISE_r(
2504                         PerlIO_printf(Perl_debug_log,
2505                             "%sChecking for float_real.%s\n",
2506                             PL_colors[4], PL_colors[5]));
2507                     if (checkpos + 1 < strbeg) {
2508                         /* can't match, even if we remove the trailing \n
2509                          * string is too short to match */
2510                         DEBUG_EXECUTE_r(
2511                             PerlIO_printf(Perl_debug_log,
2512                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2513                                 PL_colors[4], PL_colors[5]));
2514                         goto phooey;
2515                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2516                         /* can match, the end of the string matches without the
2517                          * "\n" */
2518                         last = checkpos + 1;
2519                     } else if (checkpos < strbeg) {
2520                         /* cant match, string is too short when the "\n" is
2521                          * included */
2522                         DEBUG_EXECUTE_r(
2523                             PerlIO_printf(Perl_debug_log,
2524                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2525                                 PL_colors[4], PL_colors[5]));
2526                         goto phooey;
2527                     } else if (!multiline) {
2528                         /* non multiline match, so compare with the "\n" at the
2529                          * end of the string */
2530                         if (memEQ(checkpos, little, len)) {
2531                             last= checkpos;
2532                         } else {
2533                             DEBUG_EXECUTE_r(
2534                                 PerlIO_printf(Perl_debug_log,
2535                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2536                                     PL_colors[4], PL_colors[5]));
2537                             goto phooey;
2538                         }
2539                     } else {
2540                         /* multiline match, so we have to search for a place
2541                          * where the full string is located */
2542                         goto find_last;
2543                     }
2544             } else {
2545                   find_last:
2546                     if (len)
2547                         last = rninstr(s, strend, little, little + len);
2548                     else
2549                         last = strend;  /* matching "$" */
2550             }
2551             if (!last) {
2552                 /* at one point this block contained a comment which was
2553                  * probably incorrect, which said that this was a "should not
2554                  * happen" case.  Even if it was true when it was written I am
2555                  * pretty sure it is not anymore, so I have removed the comment
2556                  * and replaced it with this one. Yves */
2557                 DEBUG_EXECUTE_r(
2558                     PerlIO_printf(Perl_debug_log,
2559                         "String does not contain required substring, cannot match.\n"
2560                     ));
2561                 goto phooey;
2562             }
2563             dontbother = strend - last + prog->float_min_offset;
2564         }
2565         if (minlen && (dontbother < minlen))
2566             dontbother = minlen - 1;
2567         strend -= dontbother;              /* this one's always in bytes! */
2568         /* We don't know much -- general case. */
2569         if (utf8_target) {
2570             for (;;) {
2571                 if (regtry(&reginfo, &s))
2572                     goto got_it;
2573                 if (s >= strend)
2574                     break;
2575                 s += UTF8SKIP(s);
2576             };
2577         }
2578         else {
2579             do {
2580                 if (regtry(&reginfo, &s))
2581                     goto got_it;
2582             } while (s++ < strend);
2583         }
2584     }
2585
2586     /* Failure. */
2587     goto phooey;
2588
2589 got_it:
2590     DEBUG_BUFFERS_r(
2591         if (swap)
2592             PerlIO_printf(Perl_debug_log,
2593                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2594                 PTR2UV(prog),
2595                 PTR2UV(swap)
2596             );
2597     );
2598     Safefree(swap);
2599
2600     if (PL_reg_state.re_state_eval_setup_done)
2601         restore_pos(aTHX_ prog);
2602     if (RXp_PAREN_NAMES(prog))
2603         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2604
2605     /* make sure $`, $&, $', and $digit will work later */
2606     if ( !(flags & REXEC_NOT_FIRST) ) {
2607         if (flags & REXEC_COPY_STR) {
2608 #ifdef PERL_ANY_COW
2609             if (SvCANCOW(sv)) {
2610                 if (DEBUG_C_TEST) {
2611                     PerlIO_printf(Perl_debug_log,
2612                                   "Copy on write: regexp capture, type %d\n",
2613                                   (int) SvTYPE(sv));
2614                 }
2615                 RX_MATCH_COPY_FREE(rx);
2616                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2617                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2618                 assert (SvPOKp(prog->saved_copy));
2619                 prog->sublen  = PL_regeol - strbeg;
2620                 prog->suboffset = 0;
2621                 prog->subcoffset = 0;
2622             } else
2623 #endif
2624             {
2625                 I32 min = 0;
2626                 I32 max = PL_regeol - strbeg;
2627                 I32 sublen;
2628
2629                 if (    (flags & REXEC_COPY_SKIP_POST)
2630                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2631                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2632                 ) { /* don't copy $' part of string */
2633                     U32 n = 0;
2634                     max = -1;
2635                     /* calculate the right-most part of the string covered
2636                      * by a capture. Due to look-ahead, this may be to
2637                      * the right of $&, so we have to scan all captures */
2638                     while (n <= prog->lastparen) {
2639                         if (prog->offs[n].end > max)
2640                             max = prog->offs[n].end;
2641                         n++;
2642                     }
2643                     if (max == -1)
2644                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2645                                 ? prog->offs[0].start
2646                                 : 0;
2647                     assert(max >= 0 && max <= PL_regeol - strbeg);
2648                 }
2649
2650                 if (    (flags & REXEC_COPY_SKIP_PRE)
2651                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2652                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2653                 ) { /* don't copy $` part of string */
2654                     U32 n = 0;
2655                     min = max;
2656                     /* calculate the left-most part of the string covered
2657                      * by a capture. Due to look-behind, this may be to
2658                      * the left of $&, so we have to scan all captures */
2659                     while (min && n <= prog->lastparen) {
2660                         if (   prog->offs[n].start != -1
2661                             && prog->offs[n].start < min)
2662                         {
2663                             min = prog->offs[n].start;
2664                         }
2665                         n++;
2666                     }
2667                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2668                         && min >  prog->offs[0].end
2669                     )
2670                         min = prog->offs[0].end;
2671
2672                 }
2673
2674                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2675                 sublen = max - min;
2676
2677                 if (RX_MATCH_COPIED(rx)) {
2678                     if (sublen > prog->sublen)
2679                         prog->subbeg =
2680                                 (char*)saferealloc(prog->subbeg, sublen+1);
2681                 }
2682                 else
2683                     prog->subbeg = (char*)safemalloc(sublen+1);
2684                 Copy(strbeg + min, prog->subbeg, sublen, char);
2685                 prog->subbeg[sublen] = '\0';
2686                 prog->suboffset = min;
2687                 prog->sublen = sublen;
2688                 RX_MATCH_COPIED_on(rx);
2689             }
2690             prog->subcoffset = prog->suboffset;
2691             if (prog->suboffset && utf8_target) {
2692                 /* Convert byte offset to chars.
2693                  * XXX ideally should only compute this if @-/@+
2694                  * has been seen, a la PL_sawampersand ??? */
2695
2696                 /* If there's a direct correspondence between the
2697                  * string which we're matching and the original SV,
2698                  * then we can use the utf8 len cache associated with
2699                  * the SV. In particular, it means that under //g,
2700                  * sv_pos_b2u() will use the previously cached
2701                  * position to speed up working out the new length of
2702                  * subcoffset, rather than counting from the start of
2703                  * the string each time. This stops
2704                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2705                  * from going quadratic */
2706                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2707                     sv_pos_b2u(sv, &(prog->subcoffset));
2708                 else
2709                     prog->subcoffset = utf8_length((U8*)strbeg,
2710                                         (U8*)(strbeg+prog->suboffset));
2711             }
2712         }
2713         else {
2714             RX_MATCH_COPY_FREE(rx);
2715             prog->subbeg = strbeg;
2716             prog->suboffset = 0;
2717             prog->subcoffset = 0;
2718             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2719         }
2720     }
2721
2722     return 1;
2723
2724 phooey:
2725     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2726                           PL_colors[4], PL_colors[5]));
2727     if (PL_reg_state.re_state_eval_setup_done)
2728         restore_pos(aTHX_ prog);
2729     if (swap) {
2730         /* we failed :-( roll it back */
2731         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2732             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2733             PTR2UV(prog),
2734             PTR2UV(prog->offs),
2735             PTR2UV(swap)
2736         ));
2737         Safefree(prog->offs);
2738         prog->offs = swap;
2739     }
2740     return 0;
2741 }
2742
2743
2744 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2745  * Do inc before dec, in case old and new rex are the same */
2746 #define SET_reg_curpm(Re2) \
2747     if (PL_reg_state.re_state_eval_setup_done) {    \
2748         (void)ReREFCNT_inc(Re2);                    \
2749         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2750         PM_SETRE((PL_reg_curpm), (Re2));            \
2751     }
2752
2753
2754 /*
2755  - regtry - try match at specific point
2756  */
2757 STATIC I32                      /* 0 failure, 1 success */
2758 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2759 {
2760     dVAR;
2761     CHECKPOINT lastcp;
2762     REGEXP *const rx = reginfo->prog;
2763     regexp *const prog = ReANY(rx);
2764     I32 result;
2765     RXi_GET_DECL(prog,progi);
2766     GET_RE_DEBUG_FLAGS_DECL;
2767
2768     PERL_ARGS_ASSERT_REGTRY;
2769
2770     reginfo->cutpoint=NULL;
2771
2772     if ((prog->extflags & RXf_EVAL_SEEN)
2773         && !PL_reg_state.re_state_eval_setup_done)
2774     {
2775         MAGIC *mg;
2776
2777         PL_reg_state.re_state_eval_setup_done = TRUE;
2778         if (reginfo->sv) {
2779             /* Make $_ available to executed code. */
2780             if (reginfo->sv != DEFSV) {
2781                 SAVE_DEFSV;
2782                 DEFSV_set(reginfo->sv);
2783             }
2784
2785             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2786                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2787                 /* prepare for quick setting of pos */
2788 #ifdef PERL_OLD_COPY_ON_WRITE
2789                 if (SvIsCOW(reginfo->sv))
2790                     sv_force_normal_flags(reginfo->sv, 0);
2791 #endif
2792                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2793                                  &PL_vtbl_mglob, NULL, 0);
2794                 mg->mg_len = -1;
2795             }
2796             PL_reg_magic    = mg;
2797             PL_reg_oldpos   = mg->mg_len;
2798             SAVEDESTRUCTOR_X(restore_pos, prog);
2799         }
2800         if (!PL_reg_curpm) {
2801             Newxz(PL_reg_curpm, 1, PMOP);
2802 #ifdef USE_ITHREADS
2803             {
2804                 SV* const repointer = &PL_sv_undef;
2805                 /* this regexp is also owned by the new PL_reg_curpm, which
2806                    will try to free it.  */
2807                 av_push(PL_regex_padav, repointer);
2808                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2809                 PL_regex_pad = AvARRAY(PL_regex_padav);
2810             }
2811 #endif
2812         }
2813         SET_reg_curpm(rx);
2814         PL_reg_oldcurpm = PL_curpm;
2815         PL_curpm = PL_reg_curpm;
2816         if (RXp_MATCH_COPIED(prog)) {
2817             /*  Here is a serious problem: we cannot rewrite subbeg,
2818                 since it may be needed if this match fails.  Thus
2819                 $` inside (?{}) could fail... */
2820             PL_reg_oldsaved = prog->subbeg;
2821             PL_reg_oldsavedlen = prog->sublen;
2822             PL_reg_oldsavedoffset = prog->suboffset;
2823             PL_reg_oldsavedcoffset = prog->suboffset;
2824 #ifdef PERL_ANY_COW
2825             PL_nrs = prog->saved_copy;
2826 #endif
2827             RXp_MATCH_COPIED_off(prog);
2828         }
2829         else
2830             PL_reg_oldsaved = NULL;
2831         prog->subbeg = PL_bostr;
2832         prog->suboffset = 0;
2833         prog->subcoffset = 0;
2834         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2835     }
2836 #ifdef DEBUGGING
2837     PL_reg_starttry = *startposp;
2838 #endif
2839     prog->offs[0].start = *startposp - PL_bostr;
2840     prog->lastparen = 0;
2841     prog->lastcloseparen = 0;
2842
2843     /* XXXX What this code is doing here?!!!  There should be no need
2844        to do this again and again, prog->lastparen should take care of
2845        this!  --ilya*/
2846
2847     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2848      * Actually, the code in regcppop() (which Ilya may be meaning by
2849      * prog->lastparen), is not needed at all by the test suite
2850      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2851      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2852      * Meanwhile, this code *is* needed for the
2853      * above-mentioned test suite tests to succeed.  The common theme
2854      * on those tests seems to be returning null fields from matches.
2855      * --jhi updated by dapm */
2856 #if 1
2857     if (prog->nparens) {
2858         regexp_paren_pair *pp = prog->offs;
2859         I32 i;
2860         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2861             ++pp;
2862             pp->start = -1;
2863             pp->end = -1;
2864         }
2865     }
2866 #endif
2867     REGCP_SET(lastcp);
2868     result = regmatch(reginfo, *startposp, progi->program + 1);
2869     if (result != -1) {
2870         prog->offs[0].end = result;
2871         return 1;
2872     }
2873     if (reginfo->cutpoint)
2874         *startposp= reginfo->cutpoint;
2875     REGCP_UNWIND(lastcp);
2876     return 0;
2877 }
2878
2879
2880 #define sayYES goto yes
2881 #define sayNO goto no
2882 #define sayNO_SILENT goto no_silent
2883
2884 /* we dont use STMT_START/END here because it leads to
2885    "unreachable code" warnings, which are bogus, but distracting. */
2886 #define CACHEsayNO \
2887     if (ST.cache_mask) \
2888        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2889     sayNO
2890
2891 /* this is used to determine how far from the left messages like
2892    'failed...' are printed. It should be set such that messages
2893    are inline with the regop output that created them.
2894 */
2895 #define REPORT_CODE_OFF 32
2896
2897
2898 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2899 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2900 #define CHRTEST_NOT_A_CP_1 -999
2901 #define CHRTEST_NOT_A_CP_2 -998
2902
2903 #define SLAB_FIRST(s) (&(s)->states[0])
2904 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2905
2906 /* grab a new slab and return the first slot in it */
2907
2908 STATIC regmatch_state *
2909 S_push_slab(pTHX)
2910 {
2911 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2912     dMY_CXT;
2913 #endif
2914     regmatch_slab *s = PL_regmatch_slab->next;
2915     if (!s) {
2916         Newx(s, 1, regmatch_slab);
2917         s->prev = PL_regmatch_slab;
2918         s->next = NULL;
2919         PL_regmatch_slab->next = s;
2920     }
2921     PL_regmatch_slab = s;
2922     return SLAB_FIRST(s);
2923 }
2924
2925
2926 /* push a new state then goto it */
2927
2928 #define PUSH_STATE_GOTO(state, node, input) \
2929     pushinput = input; \
2930     scan = node; \
2931     st->resume_state = state; \
2932     goto push_state;
2933
2934 /* push a new state with success backtracking, then goto it */
2935
2936 #define PUSH_YES_STATE_GOTO(state, node, input) \
2937     pushinput = input; \
2938     scan = node; \
2939     st->resume_state = state; \
2940     goto push_yes_state;
2941
2942
2943
2944
2945 /*
2946
2947 regmatch() - main matching routine
2948
2949 This is basically one big switch statement in a loop. We execute an op,
2950 set 'next' to point the next op, and continue. If we come to a point which
2951 we may need to backtrack to on failure such as (A|B|C), we push a
2952 backtrack state onto the backtrack stack. On failure, we pop the top
2953 state, and re-enter the loop at the state indicated. If there are no more
2954 states to pop, we return failure.
2955
2956 Sometimes we also need to backtrack on success; for example /A+/, where
2957 after successfully matching one A, we need to go back and try to
2958 match another one; similarly for lookahead assertions: if the assertion
2959 completes successfully, we backtrack to the state just before the assertion
2960 and then carry on.  In these cases, the pushed state is marked as
2961 'backtrack on success too'. This marking is in fact done by a chain of
2962 pointers, each pointing to the previous 'yes' state. On success, we pop to
2963 the nearest yes state, discarding any intermediate failure-only states.
2964 Sometimes a yes state is pushed just to force some cleanup code to be
2965 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2966 it to free the inner regex.
2967
2968 Note that failure backtracking rewinds the cursor position, while
2969 success backtracking leaves it alone.
2970
2971 A pattern is complete when the END op is executed, while a subpattern
2972 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2973 ops trigger the "pop to last yes state if any, otherwise return true"
2974 behaviour.
2975
2976 A common convention in this function is to use A and B to refer to the two
2977 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2978 the subpattern to be matched possibly multiple times, while B is the entire
2979 rest of the pattern. Variable and state names reflect this convention.
2980
2981 The states in the main switch are the union of ops and failure/success of
2982 substates associated with with that op.  For example, IFMATCH is the op
2983 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2984 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2985 successfully matched A and IFMATCH_A_fail is a state saying that we have
2986 just failed to match A. Resume states always come in pairs. The backtrack
2987 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2988 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2989 on success or failure.
2990
2991 The struct that holds a backtracking state is actually a big union, with
2992 one variant for each major type of op. The variable st points to the
2993 top-most backtrack struct. To make the code clearer, within each
2994 block of code we #define ST to alias the relevant union.
2995
2996 Here's a concrete example of a (vastly oversimplified) IFMATCH
2997 implementation:
2998
2999     switch (state) {
3000     ....
3001
3002 #define ST st->u.ifmatch
3003
3004     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3005         ST.foo = ...; // some state we wish to save
3006         ...
3007         // push a yes backtrack state with a resume value of
3008         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3009         // first node of A:
3010         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3011         // NOTREACHED
3012
3013     case IFMATCH_A: // we have successfully executed A; now continue with B
3014         next = B;
3015         bar = ST.foo; // do something with the preserved value
3016         break;
3017
3018     case IFMATCH_A_fail: // A failed, so the assertion failed
3019         ...;   // do some housekeeping, then ...
3020         sayNO; // propagate the failure
3021
3022 #undef ST
3023
3024     ...
3025     }
3026
3027 For any old-timers reading this who are familiar with the old recursive
3028 approach, the code above is equivalent to:
3029
3030     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3031     {
3032         int foo = ...
3033         ...
3034         if (regmatch(A)) {
3035             next = B;
3036             bar = foo;
3037             break;
3038         }
3039         ...;   // do some housekeeping, then ...
3040         sayNO; // propagate the failure
3041     }
3042
3043 The topmost backtrack state, pointed to by st, is usually free. If you
3044 want to claim it, populate any ST.foo fields in it with values you wish to
3045 save, then do one of
3046
3047         PUSH_STATE_GOTO(resume_state, node, newinput);
3048         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3049
3050 which sets that backtrack state's resume value to 'resume_state', pushes a
3051 new free entry to the top of the backtrack stack, then goes to 'node'.
3052 On backtracking, the free slot is popped, and the saved state becomes the
3053 new free state. An ST.foo field in this new top state can be temporarily
3054 accessed to retrieve values, but once the main loop is re-entered, it
3055 becomes available for reuse.
3056
3057 Note that the depth of the backtrack stack constantly increases during the
3058 left-to-right execution of the pattern, rather than going up and down with
3059 the pattern nesting. For example the stack is at its maximum at Z at the
3060 end of the pattern, rather than at X in the following:
3061
3062     /(((X)+)+)+....(Y)+....Z/
3063
3064 The only exceptions to this are lookahead/behind assertions and the cut,
3065 (?>A), which pop all the backtrack states associated with A before
3066 continuing.
3067
3068 Backtrack state structs are allocated in slabs of about 4K in size.
3069 PL_regmatch_state and st always point to the currently active state,
3070 and PL_regmatch_slab points to the slab currently containing
3071 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3072 allocated, and is never freed until interpreter destruction. When the slab
3073 is full, a new one is allocated and chained to the end. At exit from
3074 regmatch(), slabs allocated since entry are freed.
3075
3076 */
3077
3078
3079 #define DEBUG_STATE_pp(pp)                                  \
3080     DEBUG_STATE_r({                                         \
3081         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3082         PerlIO_printf(Perl_debug_log,                       \
3083             "    %*s"pp" %s%s%s%s%s\n",                     \
3084             depth*2, "",                                    \
3085             PL_reg_name[st->resume_state],                     \
3086             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3087             ((st==yes_state) ? "Y" : ""),                   \
3088             ((st==mark_state) ? "M" : ""),                  \
3089             ((st==yes_state||st==mark_state) ? "]" : "")    \
3090         );                                                  \
3091     });
3092
3093
3094 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3095
3096 #ifdef DEBUGGING
3097
3098 STATIC void
3099 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3100     const char *start, const char *end, const char *blurb)
3101 {
3102     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3103
3104     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3105
3106     if (!PL_colorset)
3107             reginitcolors();
3108     {
3109         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3110             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3111
3112         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3113             start, end - start, 60);
3114
3115         PerlIO_printf(Perl_debug_log,
3116             "%s%s REx%s %s against %s\n",
3117                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3118
3119         if (utf8_target||utf8_pat)
3120             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3121                 utf8_pat ? "pattern" : "",
3122                 utf8_pat && utf8_target ? " and " : "",
3123                 utf8_target ? "string" : ""
3124             );
3125     }
3126 }
3127
3128 STATIC void
3129 S_dump_exec_pos(pTHX_ const char *locinput,
3130                       const regnode *scan,
3131                       const char *loc_regeol,
3132                       const char *loc_bostr,
3133                       const char *loc_reg_starttry,
3134                       const bool utf8_target)
3135 {
3136     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3137     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3138     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3139     /* The part of the string before starttry has one color
3140        (pref0_len chars), between starttry and current
3141        position another one (pref_len - pref0_len chars),
3142        after the current position the third one.
3143        We assume that pref0_len <= pref_len, otherwise we
3144        decrease pref0_len.  */
3145     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3146         ? (5 + taill) - l : locinput - loc_bostr;
3147     int pref0_len;
3148
3149     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3150
3151     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3152         pref_len++;
3153     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3154     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3155         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3156               ? (5 + taill) - pref_len : loc_regeol - locinput);
3157     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3158         l--;
3159     if (pref0_len < 0)
3160         pref0_len = 0;
3161     if (pref0_len > pref_len)
3162         pref0_len = pref_len;
3163     {
3164         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3165
3166         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3167             (locinput - pref_len),pref0_len, 60, 4, 5);
3168
3169         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3170                     (locinput - pref_len + pref0_len),
3171                     pref_len - pref0_len, 60, 2, 3);
3172
3173         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3174                     locinput, loc_regeol - locinput, 10, 0, 1);
3175
3176         const STRLEN tlen=len0+len1+len2;
3177         PerlIO_printf(Perl_debug_log,
3178                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3179                     (IV)(locinput - loc_bostr),
3180                     len0, s0,
3181                     len1, s1,
3182                     (docolor ? "" : "> <"),
3183                     len2, s2,
3184                     (int)(tlen > 19 ? 0 :  19 - tlen),
3185                     "");
3186     }
3187 }
3188
3189 #endif
3190
3191 /* reg_check_named_buff_matched()
3192  * Checks to see if a named buffer has matched. The data array of
3193  * buffer numbers corresponding to the buffer is expected to reside
3194  * in the regexp->data->data array in the slot stored in the ARG() of
3195  * node involved. Note that this routine doesn't actually care about the
3196  * name, that information is not preserved from compilation to execution.
3197  * Returns the index of the leftmost defined buffer with the given name
3198  * or 0 if non of the buffers matched.
3199  */
3200 STATIC I32
3201 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3202 {
3203     I32 n;
3204     RXi_GET_DECL(rex,rexi);
3205     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3206     I32 *nums=(I32*)SvPVX(sv_dat);
3207
3208     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3209
3210     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3211         if ((I32)rex->lastparen >= nums[n] &&
3212             rex->offs[nums[n]].end != -1)
3213         {
3214             return nums[n];
3215         }
3216     }
3217     return 0;
3218 }
3219
3220
3221 /* free all slabs above current one  - called during LEAVE_SCOPE */
3222
3223 STATIC void
3224 S_clear_backtrack_stack(pTHX_ void *p)
3225 {
3226     regmatch_slab *s = PL_regmatch_slab->next;
3227     PERL_UNUSED_ARG(p);
3228
3229     if (!s)
3230         return;
3231     PL_regmatch_slab->next = NULL;
3232     while (s) {
3233         regmatch_slab * const osl = s;
3234         s = s->next;
3235         Safefree(osl);
3236     }
3237 }
3238 static bool
3239 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3240         U8* c1_utf8, int *c2p, U8* c2_utf8, bool is_utf8_pat)
3241 {
3242     /* This function determines if there are one or two characters that match
3243      * the first character of the passed-in EXACTish node <text_node>, and if
3244      * so, returns them in the passed-in pointers.
3245      *
3246      * If it determines that no possible character in the target string can
3247      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3248      * the first character in <text_node> requires UTF-8 to represent, and the
3249      * target string isn't in UTF-8.)
3250      *
3251      * If there are more than two characters that could match the beginning of
3252      * <text_node>, or if more context is required to determine a match or not,
3253      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3254      *
3255      * The motiviation behind this function is to allow the caller to set up
3256      * tight loops for matching.  If <text_node> is of type EXACT, there is
3257      * only one possible character that can match its first character, and so
3258      * the situation is quite simple.  But things get much more complicated if
3259      * folding is involved.  It may be that the first character of an EXACTFish
3260      * node doesn't participate in any possible fold, e.g., punctuation, so it
3261      * can be matched only by itself.  The vast majority of characters that are
3262      * in folds match just two things, their lower and upper-case equivalents.
3263      * But not all are like that; some have multiple possible matches, or match
3264      * sequences of more than one character.  This function sorts all that out.
3265      *
3266      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3267      * loop of trying to match A*, we know we can't exit where the thing
3268      * following it isn't a B.  And something can't be a B unless it is the
3269      * beginning of B.  By putting a quick test for that beginning in a tight
3270      * loop, we can rule out things that can't possibly be B without having to
3271      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3272      * character, we can make a tight loop matching A*, using the outputs of
3273      * this function.
3274      *
3275      * If the target string to match isn't in UTF-8, and there aren't
3276      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3277      * the one or two possible octets (which are characters in this situation)
3278      * that can match.  In all cases, if there is only one character that can
3279      * match, *<c1p> and *<c2p> will be identical.
3280      *
3281      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3282      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3283      * can match the beginning of <text_node>.  They should be declared with at
3284      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3285      * undefined what these contain.)  If one or both of the buffers are
3286      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3287      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3288      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3289      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3290      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3291
3292     const bool utf8_target = PL_reg_match_utf8;
3293
3294     UV c1 = CHRTEST_NOT_A_CP_1;
3295     UV c2 = CHRTEST_NOT_A_CP_2;
3296     bool use_chrtest_void = FALSE;
3297
3298     /* Used when we have both utf8 input and utf8 output, to avoid converting
3299      * to/from code points */
3300     bool utf8_has_been_setup = FALSE;
3301
3302     dVAR;
3303
3304     U8 *pat = (U8*)STRING(text_node);
3305
3306     if (OP(text_node) == EXACT) {
3307
3308         /* In an exact node, only one thing can be matched, that first
3309          * character.  If both the pat and the target are UTF-8, we can just
3310          * copy the input to the output, avoiding finding the code point of
3311          * that character */
3312         if (!is_utf8_pat) {
3313             c2 = c1 = *pat;
3314         }
3315         else if (utf8_target) {
3316             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3317             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3318             utf8_has_been_setup = TRUE;
3319         }
3320         else {
3321             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3322         }
3323     }
3324     else /* an EXACTFish node */
3325          if ((is_utf8_pat
3326                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3327                                                     pat + STR_LEN(text_node)))
3328              || (!is_utf8_pat
3329                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3330                                                     pat + STR_LEN(text_node))))
3331     {
3332         /* Multi-character folds require more context to sort out.  Also
3333          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3334          * handled outside this routine */
3335         use_chrtest_void = TRUE;
3336     }
3337     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3338         c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3339         if (c1 > 256) {
3340             /* Load the folds hash, if not already done */
3341             SV** listp;
3342             if (! PL_utf8_foldclosures) {
3343                 if (! PL_utf8_tofold) {
3344                     U8 dummy[UTF8_MAXBYTES+1];
3345
3346                     /* Force loading this by folding an above-Latin1 char */
3347                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3348                     assert(PL_utf8_tofold); /* Verify that worked */
3349                 }
3350                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3351             }
3352
3353             /* The fold closures data structure is a hash with the keys being
3354              * the UTF-8 of every character that is folded to, like 'k', and
3355              * the values each an array of all code points that fold to its
3356              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3357              * not included */
3358             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3359                                      (char *) pat,
3360                                      UTF8SKIP(pat),
3361                                      FALSE))))
3362             {
3363                 /* Not found in the hash, therefore there are no folds
3364                  * containing it, so there is only a single character that
3365                  * could match */
3366                 c2 = c1;
3367             }
3368             else {  /* Does participate in folds */
3369                 AV* list = (AV*) *listp;
3370                 if (av_len(list) != 1) {
3371
3372                     /* If there aren't exactly two folds to this, it is outside
3373                      * the scope of this function */
3374                     use_chrtest_void = TRUE;
3375                 }
3376                 else {  /* There are two.  Get them */
3377                     SV** c_p = av_fetch(list, 0, FALSE);
3378                     if (c_p == NULL) {
3379                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3380                     }
3381                     c1 = SvUV(*c_p);
3382
3383                     c_p = av_fetch(list, 1, FALSE);
3384                     if (c_p == NULL) {
3385                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3386                     }
3387                     c2 = SvUV(*c_p);
3388
3389                     /* Folds that cross the 255/256 boundary are forbidden if
3390                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3391                      * pattern character is above 256, and its only other match
3392                      * is below 256, the only legal match will be to itself.
3393                      * We have thrown away the original, so have to compute
3394                      * which is the one above 255 */
3395                     if ((c1 < 256) != (c2 < 256)) {
3396                         if (OP(text_node) == EXACTFL
3397                             || (OP(text_node) == EXACTFA
3398                                 && (isASCII(c1) || isASCII(c2))))
3399                         {
3400                             if (c1 < 256) {
3401                                 c1 = c2;
3402                             }
3403                             else {
3404                                 c2 = c1;
3405                             }
3406                         }
3407                     }
3408                 }
3409             }
3410         }
3411         else /* Here, c1 is < 255 */
3412              if (utf8_target
3413                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3414                  && OP(text_node) != EXACTFL
3415                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3416         {
3417             /* Here, there could be something above Latin1 in the target which
3418              * folds to this character in the pattern.  All such cases except
3419              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3420              * involved in their folds, so are outside the scope of this
3421              * function */
3422             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3423                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3424             }
3425             else {
3426                 use_chrtest_void = TRUE;
3427             }
3428         }
3429         else { /* Here nothing above Latin1 can fold to the pattern character */
3430             switch (OP(text_node)) {
3431
3432                 case EXACTFL:   /* /l rules */
3433                     c2 = PL_fold_locale[c1];
3434                     break;
3435
3436                 case EXACTF:
3437                     if (! utf8_target) {    /* /d rules */
3438                         c2 = PL_fold[c1];
3439                         break;
3440                     }
3441                     /* FALLTHROUGH */
3442                     /* /u rules for all these.  This happens to work for
3443                      * EXACTFA as nothing in Latin1 folds to ASCII */
3444                 case EXACTFA:
3445                 case EXACTFU_TRICKYFOLD:
3446                 case EXACTFU_SS:
3447                 case EXACTFU:
3448                     c2 = PL_fold_latin1[c1];
3449                     break;
3450
3451                 default:
3452                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3453                     assert(0); /* NOTREACHED */
3454             }
3455         }
3456     }
3457
3458     /* Here have figured things out.  Set up the returns */
3459     if (use_chrtest_void) {
3460         *c2p = *c1p = CHRTEST_VOID;
3461     }
3462     else if (utf8_target) {
3463         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3464             uvchr_to_utf8(c1_utf8, c1);
3465             uvchr_to_utf8(c2_utf8, c2);
3466         }
3467
3468         /* Invariants are stored in both the utf8 and byte outputs; Use
3469          * negative numbers otherwise for the byte ones.  Make sure that the
3470          * byte ones are the same iff the utf8 ones are the same */
3471         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3472         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3473                 ? *c2_utf8
3474                 : (c1 == c2)
3475                   ? CHRTEST_NOT_A_CP_1
3476                   : CHRTEST_NOT_A_CP_2;
3477     }
3478     else if (c1 > 255) {
3479        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3480                            can represent */
3481            return FALSE;
3482        }
3483
3484        *c1p = *c2p = c2;    /* c2 is the only representable value */
3485     }
3486     else {  /* c1 is representable; see about c2 */
3487        *c1p = c1;
3488        *c2p = (c2 < 256) ? c2 : c1;
3489     }
3490
3491     return TRUE;
3492 }
3493
3494 /* returns -1 on failure, $+[0] on success */
3495 STATIC I32
3496 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3497 {
3498 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3499     dMY_CXT;
3500 #endif
3501     dVAR;
3502     const bool utf8_target = PL_reg_match_utf8;
3503     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3504     REGEXP *rex_sv = reginfo->prog;
3505     regexp *rex = ReANY(rex_sv);
3506     RXi_GET_DECL(rex,rexi);
3507     I32 oldsave;
3508     /* the current state. This is a cached copy of PL_regmatch_state */
3509     regmatch_state *st;
3510     /* cache heavy used fields of st in registers */
3511     regnode *scan;
3512     regnode *next;
3513     U32 n = 0;  /* general value; init to avoid compiler warning */
3514     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3515     char *locinput = startpos;
3516     char *pushinput; /* where to continue after a PUSH */
3517     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3518
3519     bool result = 0;        /* return value of S_regmatch */
3520     int depth = 0;          /* depth of backtrack stack */
3521     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3522     const U32 max_nochange_depth =
3523         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3524         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3525     regmatch_state *yes_state = NULL; /* state to pop to on success of
3526                                                             subpattern */
3527     /* mark_state piggy backs on the yes_state logic so that when we unwind
3528        the stack on success we can update the mark_state as we go */
3529     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3530     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3531     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3532     U32 state_num;
3533     bool no_final = 0;      /* prevent failure from backtracking? */
3534     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3535     char *startpoint = locinput;
3536     SV *popmark = NULL;     /* are we looking for a mark? */
3537     SV *sv_commit = NULL;   /* last mark name seen in failure */
3538     SV *sv_yes_mark = NULL; /* last mark name we have seen
3539                                during a successful match */
3540     U32 lastopen = 0;       /* last open we saw */
3541     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3542     SV* const oreplsv = GvSV(PL_replgv);
3543     /* these three flags are set by various ops to signal information to
3544      * the very next op. They have a useful lifetime of exactly one loop
3545      * iteration, and are not preserved or restored by state pushes/pops
3546      */
3547     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3548     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3549     int logical = 0;        /* the following EVAL is:
3550                                 0: (?{...})
3551                                 1: (?(?{...})X|Y)
3552                                 2: (??{...})
3553                                or the following IFMATCH/UNLESSM is:
3554                                 false: plain (?=foo)
3555                                 true:  used as a condition: (?(?=foo))
3556                             */
3557     PAD* last_pad = NULL;
3558     dMULTICALL;
3559     I32 gimme = G_SCALAR;
3560     CV *caller_cv = NULL;       /* who called us */
3561     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3562     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3563     U32 maxopenparen = 0;       /* max '(' index seen so far */
3564     int to_complement;  /* Invert the result? */
3565     _char_class_number classnum;
3566     bool is_utf8_pat = reginfo->is_utf8_pat;
3567
3568 #ifdef DEBUGGING
3569     GET_RE_DEBUG_FLAGS_DECL;
3570 #endif
3571
3572     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3573     multicall_oldcatch = 0;
3574     multicall_cv = NULL;
3575     cx = NULL;
3576     PERL_UNUSED_VAR(multicall_cop);
3577     PERL_UNUSED_VAR(newsp);
3578
3579
3580     PERL_ARGS_ASSERT_REGMATCH;
3581
3582     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3583             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3584     }));
3585     /* on first ever call to regmatch, allocate first slab */
3586     if (!PL_regmatch_slab) {
3587         Newx(PL_regmatch_slab, 1, regmatch_slab);
3588         PL_regmatch_slab->prev = NULL;
3589         PL_regmatch_slab->next = NULL;
3590         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3591     }
3592
3593     oldsave = PL_savestack_ix;
3594     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3595     SAVEVPTR(PL_regmatch_slab);
3596     SAVEVPTR(PL_regmatch_state);
3597
3598     /* grab next free state slot */
3599     st = ++PL_regmatch_state;
3600     if (st >  SLAB_LAST(PL_regmatch_slab))
3601         st = PL_regmatch_state = S_push_slab(aTHX);
3602
3603     /* Note that nextchr is a byte even in UTF */
3604     SET_nextchr;
3605     scan = prog;
3606     while (scan != NULL) {
3607
3608         DEBUG_EXECUTE_r( {
3609             SV * const prop = sv_newmortal();
3610             regnode *rnext=regnext(scan);
3611             DUMP_EXEC_POS( locinput, scan, utf8_target );
3612             regprop(rex, prop, scan);
3613
3614             PerlIO_printf(Perl_debug_log,
3615                     "%3"IVdf":%*s%s(%"IVdf")\n",
3616                     (IV)(scan - rexi->program), depth*2, "",
3617                     SvPVX_const(prop),
3618                     (PL_regkind[OP(scan)] == END || !rnext) ?
3619                         0 : (IV)(rnext - rexi->program));
3620         });
3621
3622         next = scan + NEXT_OFF(scan);
3623         if (next == scan)
3624             next = NULL;
3625         state_num = OP(scan);
3626
3627         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3628       reenter_switch:
3629         to_complement = 0;
3630
3631         SET_nextchr;
3632         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3633
3634         switch (state_num) {
3635         case BOL: /*  /^../  */
3636             if (locinput == PL_bostr)
3637             {
3638                 /* reginfo->till = reginfo->bol; */
3639                 break;
3640             }
3641             sayNO;
3642
3643         case MBOL: /*  /^../m  */
3644             if (locinput == PL_bostr ||
3645                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3646             {
3647                 break;
3648             }
3649             sayNO;
3650
3651         case SBOL: /*  /^../s  */
3652             if (locinput == PL_bostr)
3653                 break;
3654             sayNO;
3655
3656         case GPOS: /*  \G  */
3657             if (locinput == reginfo->ganch)
3658                 break;
3659             sayNO;
3660
3661         case KEEPS: /*   \K  */
3662             /* update the startpoint */
3663             st->u.keeper.val = rex->offs[0].start;
3664             rex->offs[0].start = locinput - PL_bostr;
3665             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3666             assert(0); /*NOTREACHED*/
3667         case KEEPS_next_fail:
3668             /* rollback the start point change */
3669             rex->offs[0].start = st->u.keeper.val;
3670             sayNO_SILENT;
3671             assert(0); /*NOTREACHED*/
3672
3673         case EOL: /* /..$/  */
3674                 goto seol;
3675
3676         case MEOL: /* /..$/m  */
3677             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3678                 sayNO;
3679             break;
3680
3681         case SEOL: /* /..$/s  */
3682           seol:
3683             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3684                 sayNO;
3685             if (PL_regeol - locinput > 1)
3686                 sayNO;
3687             break;
3688
3689         case EOS: /*  \z  */
3690             if (!NEXTCHR_IS_EOS)
3691                 sayNO;
3692             break;
3693
3694         case SANY: /*  /./s  */
3695             if (NEXTCHR_IS_EOS)
3696                 sayNO;
3697             goto increment_locinput;
3698
3699         case CANY: /*  \C  */
3700             if (NEXTCHR_IS_EOS)
3701                 sayNO;
3702             locinput++;
3703             break;
3704
3705         case REG_ANY: /*  /./  */
3706             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3707                 sayNO;
3708             goto increment_locinput;
3709
3710
3711 #undef  ST
3712 #define ST st->u.trie
3713         case TRIEC: /* (ab|cd) with known charclass */
3714             /* In this case the charclass data is available inline so
3715                we can fail fast without a lot of extra overhead.
3716              */
3717             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3718                 DEBUG_EXECUTE_r(
3719                     PerlIO_printf(Perl_debug_log,
3720                               "%*s  %sfailed to match trie start class...%s\n",
3721                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3722                 );
3723                 sayNO_SILENT;
3724                 assert(0); /* NOTREACHED */
3725             }
3726             /* FALL THROUGH */
3727         case TRIE:  /* (ab|cd)  */
3728             /* the basic plan of execution of the trie is:
3729              * At the beginning, run though all the states, and
3730              * find the longest-matching word. Also remember the position
3731              * of the shortest matching word. For example, this pattern:
3732              *    1  2 3 4    5
3733              *    ab|a|x|abcd|abc
3734              * when matched against the string "abcde", will generate
3735              * accept states for all words except 3, with the longest
3736              * matching word being 4, and the shortest being 2 (with
3737              * the position being after char 1 of the string).
3738              *
3739              * Then for each matching word, in word order (i.e. 1,2,4,5),
3740              * we run the remainder of the pattern; on each try setting
3741              * the current position to the character following the word,
3742              * returning to try the next word on failure.
3743              *
3744              * We avoid having to build a list of words at runtime by
3745              * using a compile-time structure, wordinfo[].prev, which
3746              * gives, for each word, the previous accepting word (if any).
3747              * In the case above it would contain the mappings 1->2, 2->0,
3748              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3749              * the longest word (4 above), a list of all words, by
3750              * following the list of prev pointers; this gives us the
3751              * unordered list 4,5,1,2. Then given the current word we have
3752              * just tried, we can go through the list and find the
3753              * next-biggest word to try (so if we just failed on word 2,
3754              * the next in the list is 4).
3755              *
3756              * Since at runtime we don't record the matching position in
3757              * the string for each word, we have to work that out for
3758              * each word we're about to process. The wordinfo table holds
3759              * the character length of each word; given that we recorded
3760              * at the start: the position of the shortest word and its
3761              * length in chars, we just need to move the pointer the
3762              * difference between the two char lengths. Depending on
3763              * Unicode status and folding, that's cheap or expensive.
3764              *
3765              * This algorithm is optimised for the case where are only a
3766              * small number of accept states, i.e. 0,1, or maybe 2.
3767              * With lots of accepts states, and having to try all of them,
3768              * it becomes quadratic on number of accept states to find all
3769              * the next words.
3770              */
3771
3772             {
3773                 /* what type of TRIE am I? (utf8 makes this contextual) */
3774                 DECL_TRIE_TYPE(scan);
3775
3776                 /* what trie are we using right now */
3777                 reg_trie_data * const trie
3778                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3779                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3780                 U32 state = trie->startstate;
3781
3782                 if (   trie->bitmap
3783                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3784                 {
3785                     if (trie->states[ state ].wordnum) {
3786                          DEBUG_EXECUTE_r(
3787                             PerlIO_printf(Perl_debug_log,
3788                                           "%*s  %smatched empty string...%s\n",
3789                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3790                         );
3791                         if (!trie->jump)
3792                             break;
3793                     } else {
3794                         DEBUG_EXECUTE_r(
3795                             PerlIO_printf(Perl_debug_log,
3796                                           "%*s  %sfailed to match trie start class...%s\n",
3797                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3798                         );
3799                         sayNO_SILENT;
3800                    }
3801                 }
3802
3803             {
3804                 U8 *uc = ( U8* )locinput;
3805
3806                 STRLEN len = 0;
3807                 STRLEN foldlen = 0;
3808                 U8 *uscan = (U8*)NULL;
3809                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3810                 U32 charcount = 0; /* how many input chars we have matched */
3811                 U32 accepted = 0; /* have we seen any accepting states? */
3812
3813                 ST.jump = trie->jump;
3814                 ST.me = scan;
3815                 ST.firstpos = NULL;
3816                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3817                 ST.nextword = 0;
3818
3819                 /* fully traverse the TRIE; note the position of the
3820                    shortest accept state and the wordnum of the longest
3821                    accept state */
3822
3823                 while ( state && uc <= (U8*)PL_regeol ) {
3824                     U32 base = trie->states[ state ].trans.base;
3825                     UV uvc = 0;
3826                     U16 charid = 0;
3827                     U16 wordnum;
3828                     wordnum = trie->states[ state ].wordnum;
3829
3830                     if (wordnum) { /* it's an accept state */
3831                         if (!accepted) {
3832                             accepted = 1;
3833                             /* record first match position */
3834                             if (ST.longfold) {
3835                                 ST.firstpos = (U8*)locinput;
3836                                 ST.firstchars = 0;
3837                             }
3838                             else {
3839                                 ST.firstpos = uc;
3840                                 ST.firstchars = charcount;
3841                             }
3842                         }
3843                         if (!ST.nextword || wordnum < ST.nextword)
3844                             ST.nextword = wordnum;
3845                         ST.topword = wordnum;
3846                     }
3847
3848                     DEBUG_TRIE_EXECUTE_r({
3849                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3850                                 PerlIO_printf( Perl_debug_log,
3851                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3852                                     2+depth * 2, "", PL_colors[4],
3853                                     (UV)state, (accepted ? 'Y' : 'N'));
3854                     });
3855
3856                     /* read a char and goto next state */
3857                     if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3858                         I32 offset;
3859                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3860                                              uscan, len, uvc, charid, foldlen,
3861                                              foldbuf, uniflags);
3862                         charcount++;
3863                         if (foldlen>0)
3864                             ST.longfold = TRUE;
3865                         if (charid &&
3866                              ( ((offset =
3867                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3868
3869                              && ((U32)offset < trie->lasttrans)
3870                              && trie->trans[offset].check == state)
3871                         {
3872                             state = trie->trans[offset].next;
3873                         }
3874                         else {
3875                             state = 0;
3876                         }
3877                         uc += len;
3878
3879                     }
3880                     else {
3881                         state = 0;
3882                     }
3883                     DEBUG_TRIE_EXECUTE_r(
3884                         PerlIO_printf( Perl_debug_log,
3885                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3886                             charid, uvc, (UV)state, PL_colors[5] );
3887                     );
3888                 }
3889                 if (!accepted)
3890                    sayNO;
3891
3892                 /* calculate total number of accept states */
3893                 {
3894                     U16 w = ST.topword;
3895                     accepted = 0;
3896                     while (w) {
3897                         w = trie->wordinfo[w].prev;
3898                         accepted++;
3899                     }
3900                     ST.accepted = accepted;
3901                 }
3902
3903                 DEBUG_EXECUTE_r(
3904                     PerlIO_printf( Perl_debug_log,
3905                         "%*s  %sgot %"IVdf" possible matches%s\n",
3906                         REPORT_CODE_OFF + depth * 2, "",
3907                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3908                 );
3909                 goto trie_first_try; /* jump into the fail handler */
3910             }}
3911             assert(0); /* NOTREACHED */
3912
3913         case TRIE_next_fail: /* we failed - try next alternative */
3914         {
3915             U8 *uc;
3916             if ( ST.jump) {
3917                 REGCP_UNWIND(ST.cp);
3918                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3919             }
3920             if (!--ST.accepted) {
3921                 DEBUG_EXECUTE_r({
3922                     PerlIO_printf( Perl_debug_log,
3923                         "%*s  %sTRIE failed...%s\n",
3924                         REPORT_CODE_OFF+depth*2, "",
3925                         PL_colors[4],
3926                         PL_colors[5] );
3927                 });
3928                 sayNO_SILENT;
3929             }
3930             {
3931                 /* Find next-highest word to process.  Note that this code
3932                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3933                 U16 min = 0;
3934                 U16 word;
3935                 U16 const nextword = ST.nextword;
3936                 reg_trie_wordinfo * const wordinfo
3937                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3938                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3939                     if (word > nextword && (!min || word < min))
3940                         min = word;
3941                 }
3942                 ST.nextword = min;
3943             }
3944
3945           trie_first_try:
3946             if (do_cutgroup) {
3947                 do_cutgroup = 0;
3948                 no_final = 0;
3949             }
3950
3951             if ( ST.jump) {
3952                 ST.lastparen = rex->lastparen;
3953                 ST.lastcloseparen = rex->lastcloseparen;
3954                 REGCP_SET(ST.cp);
3955             }
3956
3957             /* find start char of end of current word */
3958             {
3959                 U32 chars; /* how many chars to skip */
3960                 reg_trie_data * const trie
3961                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3962
3963                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3964                             >=  ST.firstchars);
3965                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3966                             - ST.firstchars;
3967                 uc = ST.firstpos;
3968
3969                 if (ST.longfold) {
3970                     /* the hard option - fold each char in turn and find
3971                      * its folded length (which may be different */
3972                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3973                     STRLEN foldlen;
3974                     STRLEN len;
3975                     UV uvc;
3976                     U8 *uscan;
3977
3978                     while (chars) {
3979                         if (utf8_target) {
3980                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3981                                                     uniflags);
3982                             uc += len;
3983                         }
3984                         else {
3985                             uvc = *uc;
3986                             uc++;
3987                         }
3988                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3989                         uscan = foldbuf;
3990                         while (foldlen) {
3991                             if (!--chars)
3992                                 break;
3993                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3994                                             uniflags);
3995                             uscan += len;
3996                             foldlen -= len;
3997                         }
3998                     }
3999                 }
4000                 else {
4001                     if (utf8_target)
4002                         while (chars--)
4003                             uc += UTF8SKIP(uc);
4004                     else
4005                         uc += chars;
4006                 }
4007             }
4008
4009             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4010                             ? ST.jump[ST.nextword]
4011                             : NEXT_OFF(ST.me));
4012
4013             DEBUG_EXECUTE_r({
4014                 PerlIO_printf( Perl_debug_log,
4015                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4016                     REPORT_CODE_OFF+depth*2, "",
4017                     PL_colors[4],
4018                     ST.nextword,
4019                     PL_colors[5]
4020                     );
4021             });
4022
4023             if (ST.accepted > 1 || has_cutgroup) {
4024                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4025                 assert(0); /* NOTREACHED */
4026             }
4027             /* only one choice left - just continue */
4028             DEBUG_EXECUTE_r({
4029                 AV *const trie_words
4030                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4031                 SV ** const tmp = av_fetch( trie_words,
4032                     ST.nextword-1, 0 );
4033                 SV *sv= tmp ? sv_newmortal() : NULL;
4034
4035                 PerlIO_printf( Perl_debug_log,
4036                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4037                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4038                     ST.nextword,
4039                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4040                             PL_colors[0], PL_colors[1],
4041                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4042                         )
4043                     : "not compiled under -Dr",
4044                     PL_colors[5] );
4045             });
4046
4047             locinput = (char*)uc;
4048             continue; /* execute rest of RE */
4049             assert(0); /* NOTREACHED */
4050         }
4051 #undef  ST
4052
4053         case EXACT: {            /*  /abc/        */
4054             char *s = STRING(scan);
4055             ln = STR_LEN(scan);
4056             if (utf8_target != is_utf8_pat) {
4057                 /* The target and the pattern have differing utf8ness. */
4058                 char *l = locinput;
4059                 const char * const e = s + ln;
4060
4061                 if (utf8_target) {
4062                     /* The target is utf8, the pattern is not utf8.
4063                      * Above-Latin1 code points can't match the pattern;
4064                      * invariants match exactly, and the other Latin1 ones need
4065                      * to be downgraded to a single byte in order to do the
4066                      * comparison.  (If we could be confident that the target
4067                      * is not malformed, this could be refactored to have fewer
4068                      * tests by just assuming that if the first bytes match, it
4069                      * is an invariant, but there are tests in the test suite
4070                      * dealing with (??{...}) which violate this) */
4071                     while (s < e) {
4072                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4073                             sayNO;
4074                         }
4075                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4076                             if (*l != *s) {
4077                                 sayNO;
4078                             }
4079                             l++;
4080                         }
4081                         else {
4082                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4083                                 sayNO;
4084                             }
4085                             l += 2;
4086                         }
4087                         s++;
4088                     }
4089                 }
4090                 else {
4091                     /* The target is not utf8, the pattern is utf8. */
4092                     while (s < e) {
4093                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4094                         {
4095                             sayNO;
4096                         }
4097                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4098                             if (*s != *l) {
4099                                 sayNO;
4100                             }
4101                             s++;
4102                         }
4103                         else {
4104                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4105                                 sayNO;
4106                             }
4107                             s += 2;
4108                         }
4109                         l++;
4110                     }
4111                 }
4112                 locinput = l;
4113             }
4114             else {
4115                 /* The target and the pattern have the same utf8ness. */
4116                 /* Inline the first character, for speed. */
4117                 if (PL_regeol - locinput < ln
4118                     || UCHARAT(s) != nextchr
4119                     || (ln > 1 && memNE(s, locinput, ln)))
4120                 {
4121                     sayNO;
4122                 }
4123                 locinput += ln;
4124             }
4125             break;
4126             }
4127
4128         case EXACTFL: {          /*  /abc/il      */
4129             re_fold_t folder;
4130             const U8 * fold_array;
4131             const char * s;
4132             U32 fold_utf8_flags;
4133
4134             RX_MATCH_TAINTED_on(reginfo->prog);
4135             folder = foldEQ_locale;
4136             fold_array = PL_fold_locale;
4137             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4138             goto do_exactf;
4139
4140         case EXACTFU_SS:         /*  /\x{df}/iu   */
4141         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4142         case EXACTFU:            /*  /abc/iu      */
4143             folder = foldEQ_latin1;
4144             fold_array = PL_fold_latin1;
4145             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4146             goto do_exactf;
4147
4148         case EXACTFA:            /*  /abc/iaa     */
4149             folder = foldEQ_latin1;
4150             fold_array = PL_fold_latin1;
4151             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4152             goto do_exactf;
4153
4154         case EXACTF:             /*  /abc/i       */
4155             folder = foldEQ;
4156             fold_array = PL_fold;
4157             fold_utf8_flags = 0;
4158
4159           do_exactf:
4160             s = STRING(scan);
4161             ln = STR_LEN(scan);
4162
4163             if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
4164               /* Either target or the pattern are utf8, or has the issue where
4165                * the fold lengths may differ. */
4166                 const char * const l = locinput;
4167                 char *e = PL_regeol;
4168
4169                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4170                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4171                 {
4172                     sayNO;
4173                 }
4174                 locinput = e;
4175                 break;
4176             }
4177
4178             /* Neither the target nor the pattern are utf8 */
4179             if (UCHARAT(s) != nextchr
4180                 && !NEXTCHR_IS_EOS
4181                 && UCHARAT(s) != fold_array[nextchr])
4182             {
4183                 sayNO;
4184             }
4185             if (PL_regeol - locinput < ln)
4186                 sayNO;
4187             if (ln > 1 && ! folder(s, locinput, ln))
4188                 sayNO;
4189             locinput += ln;
4190             break;
4191         }
4192
4193         /* XXX Could improve efficiency by separating these all out using a
4194          * macro or in-line function.  At that point regcomp.c would no longer
4195          * have to set the FLAGS fields of these */
4196         case BOUNDL:  /*  /\b/l  */
4197         case NBOUNDL: /*  /\B/l  */
4198             RX_MATCH_TAINTED_on(reginfo->prog);
4199             /* FALL THROUGH */
4200         case BOUND:   /*  /\b/   */
4201         case BOUNDU:  /*  /\b/u  */
4202         case BOUNDA:  /*  /\b/a  */
4203         case NBOUND:  /*  /\B/   */
4204         case NBOUNDU: /*  /\B/u  */
4205         case NBOUNDA: /*  /\B/a  */
4206             /* was last char in word? */
4207             if (utf8_target
4208                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4209                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4210             {
4211                 if (locinput == PL_bostr)
4212                     ln = '\n';
4213                 else {
4214                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4215
4216                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4217                 }
4218                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4219                     ln = isWORDCHAR_uni(ln);
4220                     if (NEXTCHR_IS_EOS)
4221                         n = 0;
4222                     else {
4223                         LOAD_UTF8_CHARCLASS_ALNUM();
4224                         n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4225                                                                 utf8_target);
4226                     }
4227                 }
4228                 else {
4229                     ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
4230                     n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4231                 }
4232             }
4233             else {
4234
4235                 /* Here the string isn't utf8, or is utf8 and only ascii
4236                  * characters are to match \w.  In the latter case looking at
4237                  * the byte just prior to the current one may be just the final
4238                  * byte of a multi-byte character.  This is ok.  There are two
4239                  * cases:
4240                  * 1) it is a single byte character, and then the test is doing
4241                  *      just what it's supposed to.
4242                  * 2) it is a multi-byte character, in which case the final
4243                  *      byte is never mistakable for ASCII, and so the test
4244                  *      will say it is not a word character, which is the
4245                  *      correct answer. */
4246                 ln = (locinput != PL_bostr) ?
4247                     UCHARAT(locinput - 1) : '\n';
4248                 switch (FLAGS(scan)) {
4249                     case REGEX_UNICODE_CHARSET:
4250                         ln = isWORDCHAR_L1(ln);
4251                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4252                         break;
4253                     case REGEX_LOCALE_CHARSET:
4254                         ln = isWORDCHAR_LC(ln);
4255                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4256                         break;
4257                     case REGEX_DEPENDS_CHARSET:
4258                         ln = isWORDCHAR(ln);
4259                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4260                         break;
4261                     case REGEX_ASCII_RESTRICTED_CHARSET:
4262                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4263                         ln = isWORDCHAR_A(ln);
4264                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4265                         break;
4266                     default:
4267                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4268                         break;
4269                 }
4270             }
4271             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4272              * regcomp.sym */
4273             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4274                     sayNO;
4275             break;
4276
4277         case ANYOF:  /*  /[abc]/       */
4278         case ANYOF_WARN_SUPER:
4279             if (NEXTCHR_IS_EOS)
4280                 sayNO;
4281             if (utf8_target) {
4282                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4283                     sayNO;
4284                 locinput += UTF8SKIP(locinput);
4285             }
4286             else {
4287                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4288                     sayNO;
4289                 locinput++;
4290             }
4291             break;
4292
4293         /* The argument (FLAGS) to all the POSIX node types is the class number
4294          * */
4295
4296         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4297             to_complement = 1;
4298             /* FALLTHROUGH */
4299
4300         case POSIXL:    /* \w or [:punct:] etc. under /l */
4301             if (NEXTCHR_IS_EOS)
4302                 sayNO;
4303
4304             /* The locale hasn't influenced the outcome before this, so defer
4305              * tainting until now */
4306             RX_MATCH_TAINTED_on(reginfo->prog);
4307
4308             /* Use isFOO_lc() for characters within Latin1.  (Note that
4309              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4310              * wouldn't be invariant) */
4311             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4312                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4313                     sayNO;
4314                 }
4315             }
4316             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4317                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4318                                         (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
4319                                                             *(locinput + 1))))))
4320                 {
4321                     sayNO;
4322                 }
4323             }
4324             else { /* Here, must be an above Latin-1 code point */
4325                 goto utf8_posix_not_eos;
4326             }
4327
4328             /* Here, must be utf8 */
4329             locinput += UTF8SKIP(locinput);
4330             break;
4331
4332         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4333             to_complement = 1;
4334             /* FALLTHROUGH */
4335
4336         case POSIXD:    /* \w or [:punct:] etc. under /d */
4337             if (utf8_target) {
4338                 goto utf8_posix;
4339             }
4340             goto posixa;
4341
4342         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4343
4344             if (NEXTCHR_IS_EOS) {
4345                 sayNO;
4346             }
4347
4348             /* All UTF-8 variants match */
4349             if (! UTF8_IS_INVARIANT(nextchr)) {
4350                 goto increment_locinput;
4351             }
4352
4353             to_complement = 1;
4354             /* FALLTHROUGH */
4355
4356         case POSIXA:    /* \w or [:punct:] etc. under /a */
4357
4358           posixa:
4359             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4360              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4361              * character is a single byte */
4362
4363             if (NEXTCHR_IS_EOS
4364                 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4365                                                             FLAGS(scan)))))
4366             {
4367                 sayNO;
4368             }
4369
4370             /* Here we are either not in utf8, or we matched a utf8-invariant,
4371              * so the next char is the next byte */
4372             locinput++;
4373             break;
4374
4375         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4376             to_complement = 1;
4377             /* FALLTHROUGH */
4378
4379         case POSIXU:    /* \w or [:punct:] etc. under /u */
4380           utf8_posix:
4381             if (NEXTCHR_IS_EOS) {
4382                 sayNO;
4383             }
4384           utf8_posix_not_eos:
4385
4386             /* Use _generic_isCC() for characters within Latin1.  (Note that
4387              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4388              * wouldn't be invariant) */
4389             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4390                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4391                                                            FLAGS(scan)))))
4392                 {
4393                     sayNO;
4394                 }
4395                 locinput++;
4396             }
4397             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4398                 if (! (to_complement
4399                        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
4400                                                                *(locinput + 1)),
4401                                               FLAGS(scan)))))
4402                 {
4403                     sayNO;
4404                 }
4405                 locinput += 2;
4406             }
4407             else {  /* Handle above Latin-1 code points */
4408                 classnum = (_char_class_number) FLAGS(scan);
4409                 if (classnum < _FIRST_NON_SWASH_CC) {
4410
4411                     /* Here, uses a swash to find such code points.  Load if if
4412                      * not done already */
4413                     if (! PL_utf8_swash_ptrs[classnum]) {
4414                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4415                         PL_utf8_swash_ptrs[classnum]
4416                                 = _core_swash_init("utf8",
4417                                         swash_property_names[classnum],
4418                                         &PL_sv_undef, 1, 0, NULL, &flags);
4419                     }
4420                     if (! (to_complement
4421                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4422                                                (U8 *) locinput, TRUE))))
4423                     {
4424                         sayNO;
4425                     }
4426                 }
4427                 else {  /* Here, uses macros to find above Latin-1 code points */
4428                     switch (classnum) {
4429                         case _CC_ENUM_SPACE:    /* XXX would require separate
4430                                                    code if we revert the change
4431                                                    of \v matching this */
4432                         case _CC_ENUM_PSXSPC:
4433                             if (! (to_complement
4434                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
4435                             {
4436                                 sayNO;
4437                             }
4438                             break;
4439                         case _CC_ENUM_BLANK:
4440                             if (! (to_complement
4441                                             ^ cBOOL(is_HORIZWS_high(locinput))))
4442                             {
4443                                 sayNO;
4444                             }
4445                             break;
4446                         case _CC_ENUM_XDIGIT:
4447                             if (! (to_complement
4448                                             ^ cBOOL(is_XDIGIT_high(locinput))))
4449                             {
4450                                 sayNO;
4451                             }
4452                             break;
4453                         case _CC_ENUM_VERTSPACE:
4454                             if (! (to_complement
4455                                             ^ cBOOL(is_VERTWS_high(locinput))))
4456                             {
4457                                 sayNO;
4458                             }
4459                             break;
4460                         default:    /* The rest, e.g. [:cntrl:], can't match
4461                                        above Latin1 */
4462                             if (! to_complement) {
4463                                 sayNO;
4464                             }
4465                             break;
4466                     }
4467                 }
4468                 locinput += UTF8SKIP(locinput);
4469             }
4470             break;
4471
4472         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4473                        a Unicode extended Grapheme Cluster */
4474             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4475               extended Grapheme Cluster is:
4476
4477             CR LF
4478             | Prepend* Begin Extend*
4479             | .
4480
4481             Begin is:           ( Special_Begin | ! Control )
4482             Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4483             Extend is:          ( Grapheme_Extend | Spacing_Mark )
4484             Control is:         [ GCB_Control | CR | LF ]
4485             Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4486
4487                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4488                we can rewrite
4489
4490                    Begin is ( Regular_Begin + Special Begin )
4491
4492                It turns out that 98.4% of all Unicode code points match
4493                Regular_Begin.  Doing it this way eliminates a table match in
4494                the previous implementation for almost all Unicode code points.
4495
4496                There is a subtlety with Prepend* which showed up in testing.
4497                Note that the Begin, and only the Begin is required in:
4498                 | Prepend* Begin Extend*
4499                Also, Begin contains '! Control'.  A Prepend must be a
4500                '!  Control', which means it must also be a Begin.  What it
4501                comes down to is that if we match Prepend* and then find no
4502                suitable Begin afterwards, that if we backtrack the last
4503                Prepend, that one will be a suitable Begin.
4504             */
4505
4506             if (NEXTCHR_IS_EOS)
4507                 sayNO;
4508             if  (! utf8_target) {
4509
4510                 /* Match either CR LF  or '.', as all the other possibilities
4511                  * require utf8 */
4512                 locinput++;         /* Match the . or CR */
4513                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4514                                        match the LF */
4515                     && locinput < PL_regeol
4516                     && UCHARAT(locinput) == '\n')
4517                 {
4518                     locinput++;
4519                 }
4520             }
4521             else {
4522
4523                 /* Utf8: See if is ( CR LF ); already know that locinput <
4524                  * PL_regeol, so locinput+1 is in bounds */
4525                 if ( nextchr == '\r' && locinput+1 < PL_regeol
4526                      && UCHARAT(locinput + 1) == '\n')
4527                 {
4528                     locinput += 2;
4529                 }
4530                 else {
4531                     STRLEN len;
4532
4533                     /* In case have to backtrack to beginning, then match '.' */
4534                     char *starting = locinput;
4535
4536                     /* In case have to backtrack the last prepend */
4537                     char *previous_prepend = NULL;
4538
4539                     LOAD_UTF8_CHARCLASS_GCB();
4540
4541                     /* Match (prepend)*   */
4542                     while (locinput < PL_regeol
4543                            && (len = is_GCB_Prepend_utf8(locinput)))
4544                     {
4545                         previous_prepend = locinput;
4546                         locinput += len;
4547                     }
4548
4549                     /* As noted above, if we matched a prepend character, but
4550                      * the next thing won't match, back off the last prepend we
4551                      * matched, as it is guaranteed to match the begin */
4552                     if (previous_prepend
4553                         && (locinput >=  PL_regeol
4554                             || (! swash_fetch(PL_utf8_X_regular_begin,
4555                                              (U8*)locinput, utf8_target)
4556                                  && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4557                         )
4558                     {
4559                         locinput = previous_prepend;
4560                     }
4561
4562                     /* Note that here we know PL_regeol > locinput, as we
4563                      * tested that upon input to this switch case, and if we
4564                      * moved locinput forward, we tested the result just above
4565                      * and it either passed, or we backed off so that it will
4566                      * now pass */
4567                     if (swash_fetch(PL_utf8_X_regular_begin,
4568                                     (U8*)locinput, utf8_target)) {
4569                         locinput += UTF8SKIP(locinput);
4570                     }
4571                     else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4572
4573                         /* Here did not match the required 'Begin' in the
4574                          * second term.  So just match the very first
4575                          * character, the '.' of the final term of the regex */
4576                         locinput = starting + UTF8SKIP(starting);
4577                         goto exit_utf8;
4578                     } else {
4579
4580                         /* Here is a special begin.  It can be composed of
4581                          * several individual characters.  One possibility is
4582                          * RI+ */
4583                         if ((len = is_GCB_RI_utf8(locinput))) {
4584                             locinput += len;
4585                             while (locinput < PL_regeol
4586                                    && (len = is_GCB_RI_utf8(locinput)))
4587                             {
4588                                 locinput += len;
4589                             }
4590                         } else if ((len = is_GCB_T_utf8(locinput))) {
4591                             /* Another possibility is T+ */
4592                             locinput += len;
4593                             while (locinput < PL_regeol
4594                                 && (len = is_GCB_T_utf8(locinput)))
4595                             {
4596                                 locinput += len;
4597                             }
4598                         } else {
4599
4600                             /* Here, neither RI+ nor T+; must be some other
4601                              * Hangul.  That means it is one of the others: L,
4602                              * LV, LVT or V, and matches:
4603                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4604
4605                             /* Match L*           */
4606                             while (locinput < PL_regeol
4607                                    && (len = is_GCB_L_utf8(locinput)))
4608                             {
4609                                 locinput += len;
4610                             }
4611
4612                             /* Here, have exhausted L*.  If the next character
4613                              * is not an LV, LVT nor V, it means we had to have
4614                              * at least one L, so matches L+ in the original
4615                              * equation, we have a complete hangul syllable.
4616                              * Are done. */
4617
4618                             if (locinput < PL_regeol
4619                                 && is_GCB_LV_LVT_V_utf8(locinput))
4620                             {
4621                                 /* Otherwise keep going.  Must be LV, LVT or V.
4622                                  * See if LVT, by first ruling out V, then LV */
4623                                 if (! is_GCB_V_utf8(locinput)
4624                                         /* All but every TCount one is LV */
4625                                     && (valid_utf8_to_uvchr((U8 *) locinput,
4626                                                                          NULL)
4627                                                                         - SBASE)
4628                                         % TCount != 0)
4629                                 {
4630                                     locinput += UTF8SKIP(locinput);
4631                                 } else {
4632
4633                                     /* Must be  V or LV.  Take it, then match
4634                                      * V*     */
4635                                     locinput += UTF8SKIP(locinput);
4636                                     while (locinput < PL_regeol
4637                                            && (len = is_GCB_V_utf8(locinput)))
4638                                     {
4639                                         locinput += len;
4640                                     }
4641                                 }
4642
4643                                 /* And any of LV, LVT, or V can be followed
4644                                  * by T*            */
4645                                 while (locinput < PL_regeol
4646                                        && (len = is_GCB_T_utf8(locinput)))
4647                                 {
4648                                     locinput += len;
4649                                 }
4650                             }
4651                         }
4652                     }
4653
4654                     /* Match any extender */
4655                     while (locinput < PL_regeol
4656                             && swash_fetch(PL_utf8_X_extend,
4657                                             (U8*)locinput, utf8_target))
4658                     {
4659                         locinput += UTF8SKIP(locinput);
4660                     }
4661                 }
4662             exit_utf8:
4663                 if (locinput > PL_regeol) sayNO;
4664             }
4665             break;
4666
4667         case NREFFL:  /*  /\g{name}/il  */
4668         {   /* The capture buffer cases.  The ones beginning with N for the
4669                named buffers just convert to the equivalent numbered and
4670                pretend they were called as the corresponding numbered buffer
4671                op.  */
4672             /* don't initialize these in the declaration, it makes C++
4673                unhappy */
4674             char *s;
4675             char type;
4676             re_fold_t folder;
4677             const U8 *fold_array;
4678             UV utf8_fold_flags;
4679
4680             RX_MATCH_TAINTED_on(reginfo->prog);
4681             folder = foldEQ_locale;
4682             fold_array = PL_fold_locale;
4683             type = REFFL;
4684             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4685             goto do_nref;
4686
4687         case NREFFA:  /*  /\g{name}/iaa  */
4688             folder = foldEQ_latin1;
4689             fold_array = PL_fold_latin1;
4690             type = REFFA;
4691             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4692             goto do_nref;
4693
4694         case NREFFU:  /*  /\g{name}/iu  */
4695             folder = foldEQ_latin1;
4696             fold_array = PL_fold_latin1;
4697             type = REFFU;
4698             utf8_fold_flags = 0;
4699             goto do_nref;
4700
4701         case NREFF:  /*  /\g{name}/i  */
4702             folder = foldEQ;
4703             fold_array = PL_fold;
4704             type = REFF;
4705             utf8_fold_flags = 0;
4706             goto do_nref;
4707
4708         case NREF:  /*  /\g{name}/   */
4709             type = REF;
4710             folder = NULL;
4711             fold_array = NULL;
4712             utf8_fold_flags = 0;
4713           do_nref:
4714
4715             /* For the named back references, find the corresponding buffer
4716              * number */
4717             n = reg_check_named_buff_matched(rex,scan);
4718
4719             if ( ! n ) {
4720                 sayNO;
4721             }
4722             goto do_nref_ref_common;
4723
4724         case REFFL:  /*  /\1/il  */
4725             RX_MATCH_TAINTED_on(reginfo->prog);
4726             folder = foldEQ_locale;
4727             fold_array = PL_fold_locale;
4728             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4729             goto do_ref;
4730
4731         case REFFA:  /*  /\1/iaa  */
4732             folder = foldEQ_latin1;
4733             fold_array = PL_fold_latin1;
4734             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4735             goto do_ref;
4736
4737         case REFFU:  /*  /\1/iu  */
4738             folder = foldEQ_latin1;
4739             fold_array = PL_fold_latin1;
4740             utf8_fold_flags = 0;
4741             goto do_ref;
4742
4743         case REFF:  /*  /\1/i  */
4744             folder = foldEQ;
4745             fold_array = PL_fold;
4746             utf8_fold_flags = 0;
4747             goto do_ref;
4748
4749         case REF:  /*  /\1/    */
4750             folder = NULL;
4751             fold_array = NULL;
4752             utf8_fold_flags = 0;
4753
4754           do_ref:
4755             type = OP(scan);
4756             n = ARG(scan);  /* which paren pair */
4757
4758           do_nref_ref_common:
4759             ln = rex->offs[n].start;
4760             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4761             if (rex->lastparen < n || ln == -1)
4762                 sayNO;                  /* Do not match unless seen CLOSEn. */
4763             if (ln == rex->offs[n].end)
4764                 break;
4765
4766             s = PL_bostr + ln;
4767             if (type != REF     /* REF can do byte comparison */
4768                 && (utf8_target || type == REFFU))
4769             { /* XXX handle REFFL better */
4770                 char * limit = PL_regeol;
4771
4772                 /* This call case insensitively compares the entire buffer
4773                     * at s, with the current input starting at locinput, but
4774                     * not going off the end given by PL_regeol, and returns in
4775                     * <limit> upon success, how much of the current input was
4776                     * matched */
4777                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4778                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4779                 {
4780                     sayNO;
4781                 }
4782                 locinput = limit;
4783                 break;
4784             }
4785
4786             /* Not utf8:  Inline the first character, for speed. */
4787             if (!NEXTCHR_IS_EOS &&
4788                 UCHARAT(s) != nextchr &&
4789                 (type == REF ||
4790                  UCHARAT(s) != fold_array[nextchr]))
4791                 sayNO;
4792             ln = rex->offs[n].end - ln;
4793             if (locinput + ln > PL_regeol)
4794                 sayNO;
4795             if (ln > 1 && (type == REF
4796                            ? memNE(s, locinput, ln)
4797                            : ! folder(s, locinput, ln)))
4798                 sayNO;
4799             locinput += ln;
4800             break;
4801         }
4802
4803         case NOTHING: /* null op; e.g. the 'nothing' following
4804                        * the '*' in m{(a+|b)*}' */
4805             break;
4806         case TAIL: /* placeholder while compiling (A|B|C) */
4807             break;
4808
4809         case BACK: /* ??? doesn't appear to be used ??? */
4810             break;
4811
4812 #undef  ST
4813 #define ST st->u.eval
4814         {
4815             SV *ret;
4816             REGEXP *re_sv;
4817             regexp *re;
4818             regexp_internal *rei;
4819             regnode *startpoint;
4820
4821         case GOSTART: /*  (?R)  */
4822         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4823             if (cur_eval && cur_eval->locinput==locinput) {
4824                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4825                     Perl_croak(aTHX_ "Infinite recursion in regex");
4826                 if ( ++nochange_depth > max_nochange_depth )
4827                     Perl_croak(aTHX_
4828                         "Pattern subroutine nesting without pos change"
4829                         " exceeded limit in regex");
4830             } else {
4831                 nochange_depth = 0;
4832             }
4833             re_sv = rex_sv;
4834             re = rex;
4835             rei = rexi;
4836             if (OP(scan)==GOSUB) {
4837                 startpoint = scan + ARG2L(scan);
4838                 ST.close_paren = ARG(scan);
4839             } else {
4840                 startpoint = rei->program+1;
4841                 ST.close_paren = 0;
4842             }
4843             goto eval_recurse_doit;
4844             assert(0); /* NOTREACHED */
4845
4846         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4847             if (cur_eval && cur_eval->locinput==locinput) {
4848                 if ( ++nochange_depth > max_nochange_depth )
4849                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4850             } else {
4851                 nochange_depth = 0;
4852             }
4853             {
4854                 /* execute the code in the {...} */
4855
4856                 dSP;
4857                 IV before;
4858                 OP * const oop = PL_op;
4859                 COP * const ocurcop = PL_curcop;
4860                 OP *nop;
4861                 char *saved_regeol = PL_regeol;
4862                 struct re_save_state saved_state;
4863                 CV *newcv;
4864
4865                 /* save *all* paren positions */
4866                 regcppush(rex, 0, maxopenparen);
4867                 REGCP_SET(runops_cp);
4868
4869                 /* To not corrupt the existing regex state while executing the
4870                  * eval we would normally put it on the save stack, like with
4871                  * save_re_context. However, re-evals have a weird scoping so we
4872                  * can't just add ENTER/LEAVE here. With that, things like
4873                  *
4874                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4875                  *
4876                  * would break, as they expect the localisation to be unwound
4877                  * only when the re-engine backtracks through the bit that
4878                  * localised it.
4879                  *
4880                  * What we do instead is just saving the state in a local c
4881                  * variable.
4882                  */
4883                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4884
4885                 PL_reg_state.re_reparsing = FALSE;
4886
4887                 if (!caller_cv)
4888                     caller_cv = find_runcv(NULL);
4889
4890                 n = ARG(scan);
4891
4892                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4893                     newcv = (ReANY(
4894                                                 (REGEXP*)(rexi->data->data[n])
4895                                             ))->qr_anoncv
4896                                         ;
4897                     nop = (OP*)rexi->data->data[n+1];
4898                 }
4899                 else if (rexi->data->what[n] == 'l') { /* literal code */
4900                     newcv = caller_cv;
4901                     nop = (OP*)rexi->data->data[n];
4902                     assert(CvDEPTH(newcv));
4903                 }
4904                 else {
4905                     /* literal with own CV */
4906                     assert(rexi->data->what[n] == 'L');
4907                     newcv = rex->qr_anoncv;
4908                     nop = (OP*)rexi->data->data[n];
4909                 }
4910
4911                 /* normally if we're about to execute code from the same
4912                  * CV that we used previously, we just use the existing
4913                  * CX stack entry. However, its possible that in the
4914                  * meantime we may have backtracked, popped from the save
4915                  * stack, and undone the SAVECOMPPAD(s) associated with
4916                  * PUSH_MULTICALL; in which case PL_comppad no longer
4917                  * points to newcv's pad. */
4918                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4919                 {
4920                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4921                     if (last_pushed_cv) {
4922                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4923                     }
4924                     else {
4925                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4926                     }
4927                     last_pushed_cv = newcv;
4928                 }
4929                 else {
4930                     /* these assignments are just to silence compiler
4931                      * warnings */
4932                     multicall_cop = NULL;
4933                     newsp = NULL;
4934                 }
4935                 last_pad = PL_comppad;
4936
4937                 /* the initial nextstate you would normally execute
4938                  * at the start of an eval (which would cause error
4939                  * messages to come from the eval), may be optimised
4940                  * away from the execution path in the regex code blocks;
4941                  * so manually set PL_curcop to it initially */
4942                 {
4943                     OP *o = cUNOPx(nop)->op_first;
4944                     assert(o->op_type == OP_NULL);
4945                     if (o->op_targ == OP_SCOPE) {
4946                         o = cUNOPo->op_first;
4947                     }
4948                     else {
4949                         assert(o->op_targ == OP_LEAVE);
4950                         o = cUNOPo->op_first;
4951                         assert(o->op_type == OP_ENTER);
4952                         o = o->op_sibling;
4953                     }
4954
4955                     if (o->op_type != OP_STUB) {
4956                         assert(    o->op_type == OP_NEXTSTATE
4957                                 || o->op_type == OP_DBSTATE
4958                                 || (o->op_type == OP_NULL
4959                                     &&  (  o->op_targ == OP_NEXTSTATE
4960                                         || o->op_targ == OP_DBSTATE
4961                                         )
4962                                     )
4963                         );
4964                         PL_curcop = (COP*)o;
4965                     }
4966                 }
4967                 nop = nop->op_next;
4968
4969                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4970                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4971
4972                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4973
4974                 if (sv_yes_mark) {
4975                     SV *sv_mrk = get_sv("REGMARK", 1);
4976                     sv_setsv(sv_mrk, sv_yes_mark);
4977                 }
4978
4979                 /* we don't use MULTICALL here as we want to call the
4980                  * first op of the block of interest, rather than the
4981                  * first op of the sub */
4982                 before = (IV)(SP-PL_stack_base);
4983                 PL_op = nop;
4984                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4985                 SPAGAIN;
4986                 if ((IV)(SP-PL_stack_base) == before)
4987                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4988                 else {
4989                     ret = POPs;
4990                     PUTBACK;
4991                 }
4992
4993                 /* before restoring everything, evaluate the returned
4994                  * value, so that 'uninit' warnings don't use the wrong
4995                  * PL_op or pad. Also need to process any magic vars
4996                  * (e.g. $1) *before* parentheses are restored */
4997
4998                 PL_op = NULL;
4999
5000                 re_sv = NULL;
5001                 if (logical == 0)        /*   (?{})/   */
5002                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5003                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
5004                     sw = cBOOL(SvTRUE(ret));
5005                     logical = 0;
5006                 }
5007                 else {                   /*  /(??{})  */
5008                     /*  if its overloaded, let the regex compiler handle
5009                      *  it; otherwise extract regex, or stringify  */
5010                     if (!SvAMAGIC(ret)) {
5011                         SV *sv = ret;
5012                         if (SvROK(sv))
5013                             sv = SvRV(sv);
5014                         if (SvTYPE(sv) == SVt_REGEXP)
5015                             re_sv = (REGEXP*) sv;
5016                         else if (SvSMAGICAL(sv)) {
5017                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5018                             if (mg)
5019                                 re_sv = (REGEXP *) mg->mg_obj;
5020                         }
5021
5022                         /* force any magic, undef warnings here */
5023                         if (!re_sv) {
5024                             ret = sv_mortalcopy(ret);
5025                             (void) SvPV_force_nolen(ret);
5026                         }
5027                     }
5028
5029                 }
5030
5031                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
5032
5033                 /* *** Note that at this point we don't restore
5034                  * PL_comppad, (or pop the CxSUB) on the assumption it may
5035                  * be used again soon. This is safe as long as nothing
5036                  * in the regexp code uses the pad ! */
5037                 PL_op = oop;
5038                 PL_curcop = ocurcop;
5039                 PL_regeol = saved_regeol;
5040                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5041
5042                 if (logical != 2)
5043                     break;
5044             }
5045
5046                 /* only /(??{})/  from now on */
5047                 logical = 0;
5048                 {
5049                     /* extract RE object from returned value; compiling if
5050                      * necessary */
5051
5052                     if (re_sv) {
5053                         re_sv = reg_temp_copy(NULL, re_sv);
5054                     }
5055                     else {
5056                         U32 pm_flags = 0;
5057
5058                         if (SvUTF8(ret) && IN_BYTES) {
5059                             /* In use 'bytes': make a copy of the octet
5060                              * sequence, but without the flag on */
5061                             STRLEN len;
5062                             const char *const p = SvPV(ret, len);
5063                             ret = newSVpvn_flags(p, len, SVs_TEMP);
5064                         }
5065                         if (rex->intflags & PREGf_USE_RE_EVAL)
5066                             pm_flags |= PMf_USE_RE_EVAL;
5067
5068                         /* if we got here, it should be an engine which
5069                          * supports compiling code blocks and stuff */
5070                         assert(rex->engine && rex->engine->op_comp);
5071                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5072                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5073                                     rex->engine, NULL, NULL,
5074                                     /* copy /msix etc to inner pattern */
5075                                     scan->flags,
5076                                     pm_flags);
5077
5078                         if (!(SvFLAGS(ret)
5079                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5080                                  | SVs_GMG))) {
5081                             /* This isn't a first class regexp. Instead, it's
5082                                caching a regexp onto an existing, Perl visible
5083                                scalar.  */
5084                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5085                         }
5086                         /* safe to do now that any $1 etc has been
5087                          * interpolated into the new pattern string and
5088                          * compiled */
5089                         S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5090                     }
5091                     SAVEFREESV(re_sv);
5092                     re = ReANY(re_sv);
5093                 }
5094                 RXp_MATCH_COPIED_off(re);
5095                 re->subbeg = rex->subbeg;
5096                 re->sublen = rex->sublen;
5097                 re->suboffset = rex->suboffset;
5098                 re->subcoffset = rex->subcoffset;
5099                 rei = RXi_GET(re);
5100                 DEBUG_EXECUTE_r(
5101                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
5102                         "Matching embedded");
5103                 );
5104                 startpoint = rei->program + 1;
5105                 ST.close_paren = 0; /* only used for GOSUB */
5106
5107         eval_recurse_doit: /* Share code with GOSUB below this line */
5108                 /* run the pattern returned from (??{...}) */
5109
5110                 /* Save *all* the positions. */
5111                 ST.cp = regcppush(rex, 0, maxopenparen);
5112                 REGCP_SET(ST.lastcp);
5113
5114                 re->lastparen = 0;
5115                 re->lastcloseparen = 0;
5116
5117                 maxopenparen = 0;
5118
5119                 /* XXXX This is too dramatic a measure... */
5120                 PL_reg_maxiter = 0;
5121
5122                 ST.saved_utf8_pat = is_utf8_pat;
5123                 is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5124
5125                 ST.prev_rex = rex_sv;
5126                 ST.prev_curlyx = cur_curlyx;
5127                 rex_sv = re_sv;
5128                 SET_reg_curpm(rex_sv);
5129                 rex = re;
5130                 rexi = rei;
5131                 cur_curlyx = NULL;
5132                 ST.B = next;
5133                 ST.prev_eval = cur_eval;
5134                 cur_eval = st;
5135                 /* now continue from first node in postoned RE */
5136                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5137                 assert(0); /* NOTREACHED */
5138         }
5139
5140         case EVAL_AB: /* cleanup after a successful (??{A})B */
5141             /* note: this is called twice; first after popping B, then A */
5142             is_utf8_pat = ST.saved_utf8_pat;
5143             rex_sv = ST.prev_rex;
5144             SET_reg_curpm(rex_sv);
5145             rex = ReANY(rex_sv);
5146             rexi = RXi_GET(rex);
5147             regcpblow(ST.cp);
5148             cur_eval = ST.prev_eval;
5149             cur_curlyx = ST.prev_curlyx;
5150
5151             /* XXXX This is too dramatic a measure... */
5152             PL_reg_maxiter = 0;
5153             if ( nochange_depth )
5154                 nochange_depth--;
5155             sayYES;
5156
5157
5158         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5159             /* note: this is called twice; first after popping B, then A */
5160             is_utf8_pat = ST.saved_utf8_pat;
5161             rex_sv = ST.prev_rex;
5162             SET_reg_curpm(rex_sv);
5163             rex = ReANY(rex_sv);
5164             rexi = RXi_GET(rex);
5165
5166             REGCP_UNWIND(ST.lastcp);
5167             regcppop(rex, &maxopenparen);
5168             cur_eval = ST.prev_eval;
5169             cur_curlyx = ST.prev_curlyx;
5170             /* XXXX This is too dramatic a measure... */
5171             PL_reg_maxiter = 0;
5172             if ( nochange_depth )
5173                 nochange_depth--;
5174             sayNO_SILENT;
5175 #undef ST
5176
5177         case OPEN: /*  (  */
5178             n = ARG(scan);  /* which paren pair */
5179             rex->offs[n].start_tmp = locinput - PL_bostr;
5180             if (n > maxopenparen)
5181                 maxopenparen = n;
5182             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5183                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5184                 PTR2UV(rex),
5185                 PTR2UV(rex->offs),
5186                 (UV)n,
5187                 (IV)rex->offs[n].start_tmp,
5188                 (UV)maxopenparen
5189             ));
5190             lastopen = n;
5191             break;
5192
5193 /* XXX really need to log other places start/end are set too */
5194 #define CLOSE_CAPTURE \
5195     rex->offs[n].start = rex->offs[n].start_tmp; \
5196     rex->offs[n].end = locinput - PL_bostr; \
5197     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5198         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5199         PTR2UV(rex), \
5200         PTR2UV(rex->offs), \
5201         (UV)n, \
5202         (IV)rex->offs[n].start, \
5203         (IV)rex->offs[n].end \
5204     ))
5205
5206         case CLOSE:  /*  )  */
5207             n = ARG(scan);  /* which paren pair */
5208             CLOSE_CAPTURE;
5209             if (n > rex->lastparen)
5210                 rex->lastparen = n;
5211             rex->lastcloseparen = n;
5212             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5213                 goto fake_end;
5214             }
5215             break;
5216
5217         case ACCEPT:  /*  (*ACCEPT)  */
5218             if (ARG(scan)){
5219                 regnode *cursor;
5220                 for (cursor=scan;
5221                      cursor && OP(cursor)!=END;
5222                      cursor=regnext(cursor))
5223                 {
5224                     if ( OP(cursor)==CLOSE ){
5225                         n = ARG(cursor);
5226                         if ( n <= lastopen ) {
5227                             CLOSE_CAPTURE;
5228                             if (n > rex->lastparen)
5229                                 rex->lastparen = n;
5230                             rex->lastcloseparen = n;
5231                             if ( n == ARG(scan) || (cur_eval &&
5232                                 cur_eval->u.eval.close_paren == n))
5233                                 break;
5234                         }
5235                     }
5236                 }
5237             }
5238             goto fake_end;
5239             /*NOTREACHED*/
5240
5241         case GROUPP:  /*  (?(1))  */
5242             n = ARG(scan);  /* which paren pair */
5243             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5244             break;
5245
5246         case NGROUPP:  /*  (?(<name>))  */
5247             /* reg_check_named_buff_matched returns 0 for no match */
5248             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5249             break;
5250
5251         case INSUBP:   /*  (?(R))  */
5252             n = ARG(scan);
5253             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5254             break;
5255
5256         case DEFINEP:  /*  (?(DEFINE))  */
5257             sw = 0;
5258             break;
5259
5260         case IFTHEN:   /*  (?(cond)A|B)  */
5261             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
5262             if (sw)
5263                 next = NEXTOPER(NEXTOPER(scan));
5264             else {
5265                 next = scan + ARG(scan);
5266                 if (OP(next) == IFTHEN) /* Fake one. */
5267                     next = NEXTOPER(NEXTOPER(next));
5268             }
5269             break;
5270
5271         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5272             logical = scan->flags;
5273             break;
5274
5275 /*******************************************************************
5276
5277 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5278 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5279 STAR/PLUS/CURLY/CURLYN are used instead.)
5280
5281 A*B is compiled as <CURLYX><A><WHILEM><B>
5282
5283 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5284 state, which contains the current count, initialised to -1. It also sets
5285 cur_curlyx to point to this state, with any previous value saved in the
5286 state block.
5287
5288 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5289 since the pattern may possibly match zero times (i.e. it's a while {} loop
5290 rather than a do {} while loop).
5291
5292 Each entry to WHILEM represents a successful match of A. The count in the
5293 CURLYX block is incremented, another WHILEM state is pushed, and execution
5294 passes to A or B depending on greediness and the current count.
5295
5296 For example, if matching against the string a1a2a3b (where the aN are
5297 substrings that match /A/), then the match progresses as follows: (the
5298 pushed states are interspersed with the bits of strings matched so far):
5299
5300     <CURLYX cnt=-1>
5301     <CURLYX cnt=0><WHILEM>
5302     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5303     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5304     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5305     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5306
5307 (Contrast this with something like CURLYM, which maintains only a single
5308 backtrack state:
5309
5310     <CURLYM cnt=0> a1
5311     a1 <CURLYM cnt=1> a2
5312     a1 a2 <CURLYM cnt=2> a3
5313     a1 a2 a3 <CURLYM cnt=3> b
5314 )
5315
5316 Each WHILEM state block marks a point to backtrack to upon partial failure
5317 of A or B, and also contains some minor state data related to that
5318 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5319 overall state, such as the count, and pointers to the A and B ops.
5320
5321 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5322 must always point to the *current* CURLYX block, the rules are:
5323
5324 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5325 and set cur_curlyx to point the new block.
5326
5327 When popping the CURLYX block after a successful or unsuccessful match,
5328 restore the previous cur_curlyx.
5329
5330 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5331 to the outer one saved in the CURLYX block.
5332
5333 When popping the WHILEM block after a successful or unsuccessful B match,
5334 restore the previous cur_curlyx.
5335
5336 Here's an example for the pattern (AI* BI)*BO
5337 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5338
5339 cur_
5340 curlyx backtrack stack
5341 ------ ---------------
5342 NULL
5343 CO     <CO prev=NULL> <WO>
5344 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5345 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5346 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5347
5348 At this point the pattern succeeds, and we work back down the stack to
5349 clean up, restoring as we go:
5350
5351 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5352 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5353 CO     <CO prev=NULL> <WO>
5354 NULL
5355
5356 *******************************************************************/
5357
5358 #define ST st->u.curlyx
5359
5360         case CURLYX:    /* start of /A*B/  (for complex A) */
5361         {
5362             /* No need to save/restore up to this paren */
5363             I32 parenfloor = scan->flags;
5364
5365             assert(next); /* keep Coverity happy */
5366             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5367                 next += ARG(next);
5368
5369             /* XXXX Probably it is better to teach regpush to support
5370                parenfloor > maxopenparen ... */
5371             if (parenfloor > (I32)rex->lastparen)
5372                 parenfloor = rex->lastparen; /* Pessimization... */
5373
5374             ST.prev_curlyx= cur_curlyx;
5375             cur_curlyx = st;
5376             ST.cp = PL_savestack_ix;
5377
5378             /* these fields contain the state of the current curly.
5379              * they are accessed by subsequent WHILEMs */
5380             ST.parenfloor = parenfloor;
5381             ST.me = scan;
5382             ST.B = next;
5383             ST.minmod = minmod;
5384             minmod = 0;
5385             ST.count = -1;      /* this will be updated by WHILEM */
5386             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5387
5388             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5389             assert(0); /* NOTREACHED */
5390         }
5391
5392         case CURLYX_end: /* just finished matching all of A*B */
5393             cur_curlyx = ST.prev_curlyx;
5394             sayYES;
5395             assert(0); /* NOTREACHED */
5396
5397         case CURLYX_end_fail: /* just failed to match all of A*B */
5398             regcpblow(ST.cp);
5399             cur_curlyx = ST.prev_curlyx;
5400             sayNO;
5401             assert(0); /* NOTREACHED */
5402
5403
5404 #undef ST
5405 #define ST st->u.whilem
5406
5407         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5408         {
5409             /* see the discussion above about CURLYX/WHILEM */
5410             I32 n;
5411             int min = ARG1(cur_curlyx->u.curlyx.me);
5412             int max = ARG2(cur_curlyx->u.curlyx.me);
5413             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5414
5415             assert(cur_curlyx); /* keep Coverity happy */
5416             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5417             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5418             ST.cache_offset = 0;
5419             ST.cache_mask = 0;
5420
5421
5422             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5423                   "%*s  whilem: matched %ld out of %d..%d\n",
5424                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5425             );
5426
5427             /* First just match a string of min A's. */
5428
5429             if (n < min) {
5430                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5431                                     maxopenparen);
5432                 cur_curlyx->u.curlyx.lastloc = locinput;
5433                 REGCP_SET(ST.lastcp);
5434
5435                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5436                 assert(0); /* NOTREACHED */
5437             }
5438
5439             /* If degenerate A matches "", assume A done. */
5440
5441             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5442                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5443                    "%*s  whilem: empty match detected, trying continuation...\n",
5444                    REPORT_CODE_OFF+depth*2, "")
5445                 );
5446                 goto do_whilem_B_max;
5447             }
5448
5449             /* super-linear cache processing */
5450
5451             if (scan->flags) {
5452
5453                 if (!PL_reg_maxiter) {
5454                     /* start the countdown: Postpone detection until we
5455                      * know the match is not *that* much linear. */
5456                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5457                     /* possible overflow for long strings and many CURLYX's */
5458                     if (PL_reg_maxiter < 0)
5459                         PL_reg_maxiter = I32_MAX;
5460                     PL_reg_leftiter = PL_reg_maxiter;
5461                 }
5462
5463                 if (PL_reg_leftiter-- == 0) {
5464                     /* initialise cache */
5465                     const I32 size = (PL_reg_maxiter + 7)/8;
5466                     if (PL_reg_poscache) {
5467                         if ((I32)PL_reg_poscache_size < size) {
5468                             Renew(PL_reg_poscache, size, char);
5469                             PL_reg_poscache_size = size;
5470                         }
5471                         Zero(PL_reg_poscache, size, char);
5472                     }
5473                     else {
5474                         PL_reg_poscache_size = size;
5475                         Newxz(PL_reg_poscache, size, char);
5476                     }
5477                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5478       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5479                               PL_colors[4], PL_colors[5])
5480                     );
5481                 }
5482
5483                 if (PL_reg_leftiter < 0) {
5484                     /* have we already failed at this position? */
5485                     I32 offset, mask;
5486                     offset  = (scan->flags & 0xf) - 1
5487                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5488                     mask    = 1 << (offset % 8);
5489                     offset /= 8;
5490                     if (PL_reg_poscache[offset] & mask) {
5491                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5492                             "%*s  whilem: (cache) already tried at this position...\n",
5493                             REPORT_CODE_OFF+depth*2, "")
5494                         );
5495                         sayNO; /* cache records failure */
5496                     }
5497                     ST.cache_offset = offset;
5498                     ST.cache_mask   = mask;
5499                 }
5500             }
5501
5502             /* Prefer B over A for minimal matching. */
5503
5504             if (cur_curlyx->u.curlyx.minmod) {
5505                 ST.save_curlyx = cur_curlyx;
5506                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5507                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5508                             maxopenparen);
5509                 REGCP_SET(ST.lastcp);
5510                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5511                                     locinput);
5512                 assert(0); /* NOTREACHED */
5513             }
5514
5515             /* Prefer A over B for maximal matching. */
5516
5517             if (n < max) { /* More greed allowed? */
5518                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5519                             maxopenparen);
5520                 cur_curlyx->u.curlyx.lastloc = locinput;
5521                 REGCP_SET(ST.lastcp);
5522                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5523                 assert(0); /* NOTREACHED */
5524             }
5525             goto do_whilem_B_max;
5526         }
5527         assert(0); /* NOTREACHED */
5528
5529         case WHILEM_B_min: /* just matched B in a minimal match */
5530         case WHILEM_B_max: /* just matched B in a maximal match */
5531             cur_curlyx = ST.save_curlyx;
5532             sayYES;
5533             assert(0); /* NOTREACHED */
5534
5535         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5536             cur_curlyx = ST.save_curlyx;
5537             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5538             cur_curlyx->u.curlyx.count--;
5539             CACHEsayNO;
5540             assert(0); /* NOTREACHED */
5541
5542         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5543             /* FALL THROUGH */
5544         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5545             REGCP_UNWIND(ST.lastcp);
5546             regcppop(rex, &maxopenparen);
5547             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5548             cur_curlyx->u.curlyx.count--;
5549             CACHEsayNO;
5550             assert(0); /* NOTREACHED */
5551
5552         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5553             REGCP_UNWIND(ST.lastcp);
5554             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5555             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5556                 "%*s  whilem: failed, trying continuation...\n",
5557                 REPORT_CODE_OFF+depth*2, "")
5558             );
5559           do_whilem_B_max:
5560             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5561                 && ckWARN(WARN_REGEXP)
5562                 && !reginfo->warned)
5563             {
5564                 reginfo->warned = TRUE;
5565                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5566                      "Complex regular subexpression recursion limit (%d) "
5567                      "exceeded",
5568                      REG_INFTY - 1);
5569             }
5570
5571             /* now try B */
5572             ST.save_curlyx = cur_curlyx;
5573             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5574             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5575                                 locinput);
5576             assert(0); /* NOTREACHED */
5577
5578         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5579             cur_curlyx = ST.save_curlyx;
5580             REGCP_UNWIND(ST.lastcp);
5581             regcppop(rex, &maxopenparen);
5582
5583             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5584                 /* Maximum greed exceeded */
5585                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5586                     && ckWARN(WARN_REGEXP)
5587                     && !reginfo->warned)
5588                 {
5589                     reginfo->warned     = TRUE;
5590                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5591                         "Complex regular subexpression recursion "
5592                         "limit (%d) exceeded",
5593                         REG_INFTY - 1);
5594                 }
5595                 cur_curlyx->u.curlyx.count--;
5596                 CACHEsayNO;
5597             }
5598
5599             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5600                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5601             );
5602             /* Try grabbing another A and see if it helps. */
5603             cur_curlyx->u.curlyx.lastloc = locinput;
5604             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5605                             maxopenparen);
5606             REGCP_SET(ST.lastcp);
5607             PUSH_STATE_GOTO(WHILEM_A_min,
5608                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5609                 locinput);
5610             assert(0); /* NOTREACHED */
5611
5612 #undef  ST
5613 #define ST st->u.branch
5614
5615         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5616             next = scan + ARG(scan);
5617             if (next == scan)
5618                 next = NULL;
5619             scan = NEXTOPER(scan);
5620             /* FALL THROUGH */
5621
5622         case BRANCH:        /*  /(...|A|...)/ */
5623             scan = NEXTOPER(scan); /* scan now points to inner node */
5624             ST.lastparen = rex->lastparen;
5625             ST.lastcloseparen = rex->lastcloseparen;
5626             ST.next_branch = next;
5627             REGCP_SET(ST.cp);
5628
5629             /* Now go into the branch */
5630             if (has_cutgroup) {
5631                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5632             } else {
5633                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5634             }
5635             assert(0); /* NOTREACHED */
5636
5637         case CUTGROUP:  /*  /(*THEN)/  */
5638             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5639                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5640             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5641             assert(0); /* NOTREACHED */
5642
5643         case CUTGROUP_next_fail:
5644             do_cutgroup = 1;
5645             no_final = 1;
5646             if (st->u.mark.mark_name)
5647                 sv_commit = st->u.mark.mark_name;
5648             sayNO;
5649             assert(0); /* NOTREACHED */
5650
5651         case BRANCH_next:
5652             sayYES;
5653             assert(0); /* NOTREACHED */
5654
5655         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5656             if (do_cutgroup) {
5657                 do_cutgroup = 0;
5658                 no_final = 0;
5659             }
5660             REGCP_UNWIND(ST.cp);
5661             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5662             scan = ST.next_branch;
5663             /* no more branches? */
5664             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5665                 DEBUG_EXECUTE_r({
5666                     PerlIO_printf( Perl_debug_log,
5667                         "%*s  %sBRANCH failed...%s\n",
5668                         REPORT_CODE_OFF+depth*2, "",
5669                         PL_colors[4],
5670                         PL_colors[5] );
5671                 });
5672                 sayNO_SILENT;
5673             }
5674             continue; /* execute next BRANCH[J] op */
5675             assert(0); /* NOTREACHED */
5676
5677         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5678             minmod = 1;
5679             break;
5680
5681 #undef  ST
5682 #define ST st->u.curlym
5683
5684         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5685
5686             /* This is an optimisation of CURLYX that enables us to push
5687              * only a single backtracking state, no matter how many matches
5688              * there are in {m,n}. It relies on the pattern being constant
5689              * length, with no parens to influence future backrefs
5690              */
5691
5692             ST.me = scan;
5693             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5694
5695             ST.lastparen      = rex->lastparen;
5696             ST.lastcloseparen = rex->lastcloseparen;
5697
5698             /* if paren positive, emulate an OPEN/CLOSE around A */
5699             if (ST.me->flags) {
5700                 U32 paren = ST.me->flags;
5701                 if (paren > maxopenparen)
5702                     maxopenparen = paren;
5703                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5704             }
5705             ST.A = scan;
5706             ST.B = next;
5707             ST.alen = 0;
5708             ST.count = 0;
5709             ST.minmod = minmod;
5710             minmod = 0;
5711             ST.c1 = CHRTEST_UNINIT;
5712             REGCP_SET(ST.cp);
5713
5714             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5715                 goto curlym_do_B;
5716
5717           curlym_do_A: /* execute the A in /A{m,n}B/  */
5718             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5719             assert(0); /* NOTREACHED */
5720
5721         case CURLYM_A: /* we've just matched an A */
5722             ST.count++;
5723             /* after first match, determine A's length: u.curlym.alen */
5724             if (ST.count == 1) {
5725                 if (PL_reg_match_utf8) {
5726                     char *s = st->locinput;
5727                     while (s < locinput) {
5728                         ST.alen++;
5729                         s += UTF8SKIP(s);
5730                     }
5731                 }
5732                 else {
5733                     ST.alen = locinput - st->locinput;
5734                 }
5735                 if (ST.alen == 0)
5736                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5737             }
5738             DEBUG_EXECUTE_r(
5739                 PerlIO_printf(Perl_debug_log,
5740                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5741                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5742                           (IV) ST.count, (IV)ST.alen)
5743             );
5744
5745             if (cur_eval && cur_eval->u.eval.close_paren &&
5746                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5747                 goto fake_end;
5748
5749             {
5750                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5751                 if ( max == REG_INFTY || ST.count < max )
5752                     goto curlym_do_A; /* try to match another A */
5753             }
5754             goto curlym_do_B; /* try to match B */
5755
5756         case CURLYM_A_fail: /* just failed to match an A */
5757             REGCP_UNWIND(ST.cp);
5758
5759             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5760                 || (cur_eval && cur_eval->u.eval.close_paren &&
5761                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5762                 sayNO;
5763
5764           curlym_do_B: /* execute the B in /A{m,n}B/  */
5765             if (ST.c1 == CHRTEST_UNINIT) {
5766                 /* calculate c1 and c2 for possible match of 1st char
5767                  * following curly */
5768                 ST.c1 = ST.c2 = CHRTEST_VOID;
5769                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5770                     regnode *text_node = ST.B;
5771                     if (! HAS_TEXT(text_node))
5772                         FIND_NEXT_IMPT(text_node);
5773                     /* this used to be
5774
5775                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5776
5777                         But the former is redundant in light of the latter.
5778
5779                         if this changes back then the macro for
5780                         IS_TEXT and friends need to change.
5781                      */
5782                     if (PL_regkind[OP(text_node)] == EXACT) {
5783                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5784                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5785                            is_utf8_pat))
5786                         {
5787                             sayNO;
5788                         }
5789                     }
5790                 }
5791             }
5792
5793             DEBUG_EXECUTE_r(
5794                 PerlIO_printf(Perl_debug_log,
5795                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5796                     (int)(REPORT_CODE_OFF+(depth*2)),
5797                     "", (IV)ST.count)
5798                 );
5799             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5800                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5801                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5802                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5803                     {
5804                         /* simulate B failing */
5805                         DEBUG_OPTIMISE_r(
5806                             PerlIO_printf(Perl_debug_log,
5807                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5808                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5809                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5810                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5811                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5812                         );
5813                         state_num = CURLYM_B_fail;
5814                         goto reenter_switch;
5815                     }
5816                 }
5817                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5818                     /* simulate B failing */
5819                     DEBUG_OPTIMISE_r(
5820                         PerlIO_printf(Perl_debug_log,
5821                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5822                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5823                             (int) nextchr, ST.c1, ST.c2)
5824                     );
5825                     state_num = CURLYM_B_fail;
5826                     goto reenter_switch;
5827                 }
5828             }
5829
5830             if (ST.me->flags) {
5831                 /* emulate CLOSE: mark current A as captured */
5832                 I32 paren = ST.me->flags;
5833                 if (ST.count) {
5834                     rex->offs[paren].start
5835                         = HOPc(locinput, -ST.alen) - PL_bostr;
5836                     rex->offs[paren].end = locinput - PL_bostr;
5837                     if ((U32)paren > rex->lastparen)
5838                         rex->lastparen = paren;
5839                     rex->lastcloseparen = paren;
5840                 }
5841                 else
5842                     rex->offs[paren].end = -1;
5843                 if (cur_eval && cur_eval->u.eval.close_paren &&
5844                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5845                 {
5846                     if (ST.count)
5847                         goto fake_end;
5848                     else
5849                         sayNO;
5850                 }
5851             }
5852
5853             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5854             assert(0); /* NOTREACHED */
5855
5856         case CURLYM_B_fail: /* just failed to match a B */
5857             REGCP_UNWIND(ST.cp);
5858             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5859             if (ST.minmod) {
5860                 I32 max = ARG2(ST.me);
5861                 if (max != REG_INFTY && ST.count == max)
5862                     sayNO;
5863                 goto curlym_do_A; /* try to match a further A */
5864             }
5865             /* backtrack one A */
5866             if (ST.count == ARG1(ST.me) /* min */)
5867                 sayNO;
5868             ST.count--;
5869             SET_locinput(HOPc(locinput, -ST.alen));
5870             goto curlym_do_B; /* try to match B */
5871
5872 #undef ST
5873 #define ST st->u.curly
5874
5875 #define CURLY_SETPAREN(paren, success) \
5876     if (paren) { \
5877         if (success) { \
5878             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5879             rex->offs[paren].end = locinput - PL_bostr; \
5880             if (paren > rex->lastparen) \
5881                 rex->lastparen = paren; \
5882             rex->lastcloseparen = paren; \
5883         } \
5884         else { \
5885             rex->offs[paren].end = -1; \
5886             rex->lastparen      = ST.lastparen; \
5887             rex->lastcloseparen = ST.lastcloseparen; \
5888         } \
5889     }
5890
5891         case STAR:              /*  /A*B/ where A is width 1 char */
5892             ST.paren = 0;
5893             ST.min = 0;
5894             ST.max = REG_INFTY;
5895             scan = NEXTOPER(scan);
5896             goto repeat;
5897
5898         case PLUS:              /*  /A+B/ where A is width 1 char */
5899             ST.paren = 0;
5900             ST.min = 1;
5901             ST.max = REG_INFTY;
5902             scan = NEXTOPER(scan);
5903             goto repeat;
5904
5905         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5906             ST.paren = scan->flags;     /* Which paren to set */
5907             ST.lastparen      = rex->lastparen;
5908             ST.lastcloseparen = rex->lastcloseparen;
5909             if (ST.paren > maxopenparen)
5910                 maxopenparen = ST.paren;
5911             ST.min = ARG1(scan);  /* min to match */
5912             ST.max = ARG2(scan);  /* max to match */
5913             if (cur_eval && cur_eval->u.eval.close_paren &&
5914                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5915                 ST.min=1;
5916                 ST.max=1;
5917             }
5918             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5919             goto repeat;
5920
5921         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5922             ST.paren = 0;
5923             ST.min = ARG1(scan);  /* min to match */
5924             ST.max = ARG2(scan);  /* max to match */
5925             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5926           repeat:
5927             /*
5928             * Lookahead to avoid useless match attempts
5929             * when we know what character comes next.
5930             *
5931             * Used to only do .*x and .*?x, but now it allows
5932             * for )'s, ('s and (?{ ... })'s to be in the way
5933             * of the quantifier and the EXACT-like node.  -- japhy
5934             */
5935
5936             assert(ST.min <= ST.max);
5937             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5938                 ST.c1 = ST.c2 = CHRTEST_VOID;
5939             }
5940             else {
5941                 regnode *text_node = next;
5942
5943                 if (! HAS_TEXT(text_node))
5944                     FIND_NEXT_IMPT(text_node);
5945
5946                 if (! HAS_TEXT(text_node))
5947                     ST.c1 = ST.c2 = CHRTEST_VOID;
5948                 else {
5949                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5950                         ST.c1 = ST.c2 = CHRTEST_VOID;
5951                     }
5952                     else {
5953
5954                     /*  Currently we only get here when
5955
5956                         PL_rekind[OP(text_node)] == EXACT
5957
5958                         if this changes back then the macro for IS_TEXT and
5959                         friends need to change. */
5960                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5961                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5962                            is_utf8_pat))
5963                         {
5964                             sayNO;
5965                         }
5966                     }
5967                 }
5968             }
5969
5970             ST.A = scan;
5971             ST.B = next;
5972             if (minmod) {
5973                 char *li = locinput;
5974                 minmod = 0;
5975                 if (ST.min &&
5976                         regrepeat(rex, &li, ST.A, ST.min, depth, is_utf8_pat)
5977                             < ST.min)
5978                     sayNO;
5979                 SET_locinput(li);
5980                 ST.count = ST.min;
5981                 REGCP_SET(ST.cp);
5982                 if (ST.c1 == CHRTEST_VOID)
5983                     goto curly_try_B_min;
5984
5985                 ST.oldloc = locinput;
5986
5987                 /* set ST.maxpos to the furthest point along the
5988                  * string that could possibly match */
5989                 if  (ST.max == REG_INFTY) {
5990                     ST.maxpos = PL_regeol - 1;
5991                     if (utf8_target)
5992                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5993                             ST.maxpos--;
5994                 }
5995                 else if (utf8_target) {
5996                     int m = ST.max - ST.min;
5997                     for (ST.maxpos = locinput;
5998                          m >0 && ST.maxpos < PL_regeol; m--)
5999                         ST.maxpos += UTF8SKIP(ST.maxpos);
6000                 }
6001                 else {
6002                     ST.maxpos = locinput + ST.max - ST.min;
6003                     if (ST.maxpos >= PL_regeol)
6004                         ST.maxpos = PL_regeol - 1;
6005                 }
6006                 goto curly_try_B_min_known;
6007
6008             }
6009             else {
6010                 /* avoid taking address of locinput, so it can remain
6011                  * a register var */
6012                 char *li = locinput;
6013                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth,
6014                                         is_utf8_pat);
6015                 if (ST.count < ST.min)
6016                     sayNO;
6017                 SET_locinput(li);
6018                 if ((ST.count > ST.min)
6019                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6020                 {
6021                     /* A{m,n} must come at the end of the string, there's
6022                      * no point in backing off ... */
6023                     ST.min = ST.count;
6024                     /* ...except that $ and \Z can match before *and* after
6025                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6026                        We may back off by one in this case. */
6027                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6028                         ST.min--;
6029                 }
6030                 REGCP_SET(ST.cp);
6031                 goto curly_try_B_max;
6032             }
6033             assert(0); /* NOTREACHED */
6034
6035
6036         case CURLY_B_min_known_fail:
6037             /* failed to find B in a non-greedy match where c1,c2 valid */
6038
6039             REGCP_UNWIND(ST.cp);
6040             if (ST.paren) {
6041                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6042             }
6043             /* Couldn't or didn't -- move forward. */
6044             ST.oldloc = locinput;
6045             if (utf8_target)
6046                 locinput += UTF8SKIP(locinput);
6047             else
6048                 locinput++;
6049             ST.count++;
6050           curly_try_B_min_known:
6051              /* find the next place where 'B' could work, then call B */
6052             {
6053                 int n;
6054                 if (utf8_target) {
6055                     n = (ST.oldloc == locinput) ? 0 : 1;
6056                     if (ST.c1 == ST.c2) {
6057                         /* set n to utf8_distance(oldloc, locinput) */
6058                         while (locinput <= ST.maxpos
6059                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6060                         {
6061                             locinput += UTF8SKIP(locinput);
6062                             n++;
6063                         }
6064                     }
6065                     else {
6066                         /* set n to utf8_distance(oldloc, locinput) */
6067                         while (locinput <= ST.maxpos
6068                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6069                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6070                         {
6071                             locinput += UTF8SKIP(locinput);
6072                             n++;
6073                         }
6074                     }
6075                 }
6076                 else {  /* Not utf8_target */
6077                     if (ST.c1 == ST.c2) {
6078                         while (locinput <= ST.maxpos &&
6079                                UCHARAT(locinput) != ST.c1)
6080                             locinput++;
6081                     }
6082                     else {
6083                         while (locinput <= ST.maxpos
6084                                && UCHARAT(locinput) != ST.c1
6085                                && UCHARAT(locinput) != ST.c2)
6086                             locinput++;
6087                     }
6088                     n = locinput - ST.oldloc;
6089                 }
6090                 if (locinput > ST.maxpos)
6091                     sayNO;
6092                 if (n) {
6093                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6094                      * at b; check that everything between oldloc and
6095                      * locinput matches */
6096                     char *li = ST.oldloc;
6097                     ST.count += n;
6098                     if (regrepeat(rex, &li, ST.A, n, depth, is_utf8_pat) < n)
6099                         sayNO;
6100                     assert(n == REG_INFTY || locinput == li);
6101                 }
6102                 CURLY_SETPAREN(ST.paren, ST.count);
6103                 if (cur_eval && cur_eval->u.eval.close_paren &&
6104                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6105                     goto fake_end;
6106                 }
6107                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6108             }
6109             assert(0); /* NOTREACHED */
6110
6111
6112         case CURLY_B_min_fail:
6113             /* failed to find B in a non-greedy match where c1,c2 invalid */
6114
6115             REGCP_UNWIND(ST.cp);
6116             if (ST.paren) {
6117                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6118             }
6119             /* failed -- move forward one */
6120             {
6121                 char *li = locinput;
6122                 if (!regrepeat(rex, &li, ST.A, 1, depth, is_utf8_pat)) {
6123                     sayNO;
6124                 }
6125                 locinput = li;
6126             }
6127             {
6128                 ST.count++;
6129                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6130                         ST.count > 0)) /* count overflow ? */
6131                 {
6132                   curly_try_B_min:
6133                     CURLY_SETPAREN(ST.paren, ST.count);
6134                     if (cur_eval && cur_eval->u.eval.close_paren &&
6135                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6136                         goto fake_end;
6137                     }
6138                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6139                 }
6140             }
6141             sayNO;
6142             assert(0); /* NOTREACHED */
6143
6144
6145         curly_try_B_max:
6146             /* a successful greedy match: now try to match B */
6147             if (cur_eval && cur_eval->u.eval.close_paren &&
6148                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6149                 goto fake_end;
6150             }
6151             {
6152                 bool could_match = locinput < PL_regeol;
6153
6154                 /* If it could work, try it. */
6155                 if (ST.c1 != CHRTEST_VOID && could_match) {
6156                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6157                     {
6158                         could_match = memEQ(locinput,
6159                                             ST.c1_utf8,
6160                                             UTF8SKIP(locinput))
6161                                     || memEQ(locinput,
6162                                              ST.c2_utf8,
6163                                              UTF8SKIP(locinput));
6164                     }
6165                     else {
6166                         could_match = UCHARAT(locinput) == ST.c1
6167                                       || UCHARAT(locinput) == ST.c2;
6168                     }
6169                 }
6170                 if (ST.c1 == CHRTEST_VOID || could_match) {
6171                     CURLY_SETPAREN(ST.paren, ST.count);
6172                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6173                     assert(0); /* NOTREACHED */
6174                 }
6175             }
6176             /* FALL THROUGH */
6177
6178         case CURLY_B_max_fail:
6179             /* failed to find B in a greedy match */
6180
6181             REGCP_UNWIND(ST.cp);
6182             if (ST.paren) {
6183                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6184             }
6185             /*  back up. */
6186             if (--ST.count < ST.min)
6187                 sayNO;
6188             locinput = HOPc(locinput, -1);
6189             goto curly_try_B_max;
6190
6191 #undef ST
6192
6193         case END: /*  last op of main pattern  */
6194             fake_end:
6195             if (cur_eval) {
6196                 /* we've just finished A in /(??{A})B/; now continue with B */
6197                 st->u.eval.saved_utf8_pat = is_utf8_pat;
6198                 is_utf8_pat = cur_eval->u.eval.saved_utf8_pat;
6199
6200                 st->u.eval.prev_rex = rex_sv;           /* inner */
6201
6202                 /* Save *all* the positions. */
6203                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6204                 rex_sv = cur_eval->u.eval.prev_rex;
6205                 SET_reg_curpm(rex_sv);
6206                 rex = ReANY(rex_sv);
6207                 rexi = RXi_GET(rex);
6208                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6209
6210                 REGCP_SET(st->u.eval.lastcp);
6211
6212                 /* Restore parens of the outer rex without popping the
6213                  * savestack */
6214                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6215                                         &maxopenparen);
6216
6217                 st->u.eval.prev_eval = cur_eval;
6218                 cur_eval = cur_eval->u.eval.prev_eval;
6219                 DEBUG_EXECUTE_r(
6220                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6221                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6222                 if ( nochange_depth )
6223                     nochange_depth--;
6224
6225                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6226                                     locinput); /* match B */
6227             }
6228
6229             if (locinput < reginfo->till) {
6230                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6231                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6232                                       PL_colors[4],
6233                                       (long)(locinput - PL_reg_starttry),
6234                                       (long)(reginfo->till - PL_reg_starttry),
6235                                       PL_colors[5]));
6236
6237                 sayNO_SILENT;           /* Cannot match: too short. */
6238             }
6239             sayYES;                     /* Success! */
6240
6241         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6242             DEBUG_EXECUTE_r(
6243             PerlIO_printf(Perl_debug_log,
6244                 "%*s  %ssubpattern success...%s\n",
6245                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6246             sayYES;                     /* Success! */
6247
6248 #undef  ST
6249 #define ST st->u.ifmatch
6250
6251         {
6252             char *newstart;
6253
6254         case SUSPEND:   /* (?>A) */
6255             ST.wanted = 1;
6256             newstart = locinput;
6257             goto do_ifmatch;
6258
6259         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6260             ST.wanted = 0;
6261             goto ifmatch_trivial_fail_test;
6262
6263         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6264             ST.wanted = 1;
6265           ifmatch_trivial_fail_test:
6266             if (scan->flags) {
6267                 char * const s = HOPBACKc(locinput, scan->flags);
6268                 if (!s) {
6269                     /* trivial fail */
6270                     if (logical) {
6271                         logical = 0;
6272                         sw = 1 - cBOOL(ST.wanted);
6273                     }
6274                     else if (ST.wanted)
6275                         sayNO;
6276                     next = scan + ARG(scan);
6277                     if (next == scan)
6278                         next = NULL;
6279                     break;
6280                 }
6281                 newstart = s;
6282             }
6283             else
6284                 newstart = locinput;
6285
6286           do_ifmatch:
6287             ST.me = scan;
6288             ST.logical = logical;
6289             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6290
6291             /* execute body of (?...A) */
6292             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6293             assert(0); /* NOTREACHED */
6294         }
6295
6296         case IFMATCH_A_fail: /* body of (?...A) failed */
6297             ST.wanted = !ST.wanted;
6298             /* FALL THROUGH */
6299
6300         case IFMATCH_A: /* body of (?...A) succeeded */
6301             if (ST.logical) {
6302                 sw = cBOOL(ST.wanted);
6303             }
6304             else if (!ST.wanted)
6305                 sayNO;
6306
6307             if (OP(ST.me) != SUSPEND) {
6308                 /* restore old position except for (?>...) */
6309                 locinput = st->locinput;
6310             }
6311             scan = ST.me + ARG(ST.me);
6312             if (scan == ST.me)
6313                 scan = NULL;
6314             continue; /* execute B */
6315
6316 #undef ST
6317
6318         case LONGJMP: /*  alternative with many branches compiles to
6319                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6320             next = scan + ARG(scan);
6321             if (next == scan)
6322                 next = NULL;
6323             break;
6324
6325         case COMMIT:  /*  (*COMMIT)  */
6326             reginfo->cutpoint = PL_regeol;
6327             /* FALLTHROUGH */
6328
6329         case PRUNE:   /*  (*PRUNE)   */
6330             if (!scan->flags)
6331                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6332             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6333             assert(0); /* NOTREACHED */
6334
6335         case COMMIT_next_fail:
6336             no_final = 1;
6337             /* FALLTHROUGH */
6338
6339         case OPFAIL:   /* (*FAIL)  */
6340             sayNO;
6341             assert(0); /* NOTREACHED */
6342
6343 #define ST st->u.mark
6344         case MARKPOINT: /*  (*MARK:foo)  */
6345             ST.prev_mark = mark_state;
6346             ST.mark_name = sv_commit = sv_yes_mark
6347                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6348             mark_state = st;
6349             ST.mark_loc = locinput;
6350             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6351             assert(0); /* NOTREACHED */
6352
6353         case MARKPOINT_next:
6354             mark_state = ST.prev_mark;
6355             sayYES;
6356             assert(0); /* NOTREACHED */
6357
6358         case MARKPOINT_next_fail:
6359             if (popmark && sv_eq(ST.mark_name,popmark))
6360             {
6361                 if (ST.mark_loc > startpoint)
6362                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6363                 popmark = NULL; /* we found our mark */
6364                 sv_commit = ST.mark_name;
6365
6366                 DEBUG_EXECUTE_r({
6367                         PerlIO_printf(Perl_debug_log,
6368                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6369                             REPORT_CODE_OFF+depth*2, "",
6370                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6371                 });
6372             }
6373             mark_state = ST.prev_mark;
6374             sv_yes_mark = mark_state ?
6375                 mark_state->u.mark.mark_name : NULL;
6376             sayNO;
6377             assert(0); /* NOTREACHED */
6378
6379         case SKIP:  /*  (*SKIP)  */
6380             if (scan->flags) {
6381                 /* (*SKIP) : if we fail we cut here*/
6382                 ST.mark_name = NULL;
6383                 ST.mark_loc = locinput;
6384                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6385             } else {
6386                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6387                    otherwise do nothing.  Meaning we need to scan
6388                  */
6389                 regmatch_state *cur = mark_state;
6390                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6391
6392                 while (cur) {
6393                     if ( sv_eq( cur->u.mark.mark_name,
6394                                 find ) )
6395                     {
6396                         ST.mark_name = find;
6397                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6398                     }
6399                     cur = cur->u.mark.prev_mark;
6400                 }
6401             }
6402             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6403             break;
6404
6405         case SKIP_next_fail:
6406             if (ST.mark_name) {
6407                 /* (*CUT:NAME) - Set up to search for the name as we
6408                    collapse the stack*/
6409                 popmark = ST.mark_name;
6410             } else {
6411                 /* (*CUT) - No name, we cut here.*/
6412                 if (ST.mark_loc > startpoint)
6413                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6414                 /* but we set sv_commit to latest mark_name if there
6415                    is one so they can test to see how things lead to this
6416                    cut */
6417                 if (mark_state)
6418                     sv_commit=mark_state->u.mark.mark_name;
6419             }
6420             no_final = 1;
6421             sayNO;
6422             assert(0); /* NOTREACHED */
6423 #undef ST
6424
6425         case LNBREAK: /* \R */
6426             if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6427                 locinput += n;
6428             } else
6429                 sayNO;
6430             break;
6431
6432         default:
6433             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6434                           PTR2UV(scan), OP(scan));
6435             Perl_croak(aTHX_ "regexp memory corruption");
6436
6437         /* this is a point to jump to in order to increment
6438          * locinput by one character */
6439         increment_locinput:
6440             assert(!NEXTCHR_IS_EOS);
6441             if (utf8_target) {
6442                 locinput += PL_utf8skip[nextchr];
6443                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6444                 if (locinput > PL_regeol)
6445                     sayNO;
6446             }
6447             else
6448                 locinput++;
6449             break;
6450
6451         } /* end switch */
6452
6453         /* switch break jumps here */
6454         scan = next; /* prepare to execute the next op and ... */
6455         continue;    /* ... jump back to the top, reusing st */
6456         assert(0); /* NOTREACHED */
6457
6458       push_yes_state:
6459         /* push a state that backtracks on success */
6460         st->u.yes.prev_yes_state = yes_state;
6461         yes_state = st;
6462         /* FALL THROUGH */
6463       push_state:
6464         /* push a new regex state, then continue at scan  */
6465         {
6466             regmatch_state *newst;
6467
6468             DEBUG_STACK_r({
6469                 regmatch_state *cur = st;
6470                 regmatch_state *curyes = yes_state;
6471                 int curd = depth;
6472                 regmatch_slab *slab = PL_regmatch_slab;
6473                 for (;curd > -1;cur--,curd--) {
6474                     if (cur < SLAB_FIRST(slab)) {
6475                         slab = slab->prev;
6476                         cur = SLAB_LAST(slab);
6477                     }
6478                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6479                         REPORT_CODE_OFF + 2 + depth * 2,"",
6480                         curd, PL_reg_name[cur->resume_state],
6481                         (curyes == cur) ? "yes" : ""
6482                     );
6483                     if (curyes == cur)
6484                         curyes = cur->u.yes.prev_yes_state;
6485                 }
6486             } else
6487                 DEBUG_STATE_pp("push")
6488             );
6489             depth++;
6490             st->locinput = locinput;
6491             newst = st+1;
6492             if (newst >  SLAB_LAST(PL_regmatch_slab))
6493                 newst = S_push_slab(aTHX);
6494             PL_regmatch_state = newst;
6495
6496             locinput = pushinput;
6497             st = newst;
6498             continue;
6499             assert(0); /* NOTREACHED */
6500         }
6501     }
6502
6503     /*
6504     * We get here only if there's trouble -- normally "case END" is
6505     * the terminating point.
6506     */
6507     Perl_croak(aTHX_ "corrupted regexp pointers");
6508     /*NOTREACHED*/
6509     sayNO;
6510
6511 yes:
6512     if (yes_state) {
6513         /* we have successfully completed a subexpression, but we must now
6514          * pop to the state marked by yes_state and continue from there */
6515         assert(st != yes_state);
6516 #ifdef DEBUGGING
6517         while (st != yes_state) {
6518             st--;
6519             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6520                 PL_regmatch_slab = PL_regmatch_slab->prev;
6521                 st = SLAB_LAST(PL_regmatch_slab);
6522             }
6523             DEBUG_STATE_r({
6524                 if (no_final) {
6525                     DEBUG_STATE_pp("pop (no final)");
6526                 } else {
6527                     DEBUG_STATE_pp("pop (yes)");
6528                 }
6529             });
6530             depth--;
6531         }
6532 #else
6533         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6534             || yes_state > SLAB_LAST(PL_regmatch_slab))
6535         {
6536             /* not in this slab, pop slab */
6537             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6538             PL_regmatch_slab = PL_regmatch_slab->prev;
6539             st = SLAB_LAST(PL_regmatch_slab);
6540         }
6541         depth -= (st - yes_state);
6542 #endif
6543         st = yes_state;
6544         yes_state = st->u.yes.prev_yes_state;
6545         PL_regmatch_state = st;
6546
6547         if (no_final)
6548             locinput= st->locinput;
6549         state_num = st->resume_state + no_final;
6550         goto reenter_switch;
6551     }
6552
6553     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6554                           PL_colors[4], PL_colors[5]));
6555
6556     if (PL_reg_state.re_state_eval_setup_done) {
6557         /* each successfully executed (?{...}) block does the equivalent of
6558          *   local $^R = do {...}
6559          * When popping the save stack, all these locals would be undone;
6560          * bypass this by setting the outermost saved $^R to the latest
6561          * value */
6562         if (oreplsv != GvSV(PL_replgv))
6563             sv_setsv(oreplsv, GvSV(PL_replgv));
6564     }
6565     result = 1;
6566     goto final_exit;
6567
6568 no:
6569     DEBUG_EXECUTE_r(
6570         PerlIO_printf(Perl_debug_log,
6571             "%*s  %sfailed...%s\n",
6572             REPORT_CODE_OFF+depth*2, "",
6573             PL_colors[4], PL_colors[5])
6574         );
6575
6576 no_silent:
6577     if (no_final) {
6578         if (yes_state) {
6579             goto yes;
6580         } else {
6581             goto final_exit;
6582         }
6583     }
6584     if (depth) {
6585         /* there's a previous state to backtrack to */
6586         st--;
6587         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6588             PL_regmatch_slab = PL_regmatch_slab->prev;
6589             st = SLAB_LAST(PL_regmatch_slab);
6590         }
6591         PL_regmatch_state = st;
6592         locinput= st->locinput;
6593
6594         DEBUG_STATE_pp("pop");
6595         depth--;
6596         if (yes_state == st)
6597             yes_state = st->u.yes.prev_yes_state;
6598
6599         state_num = st->resume_state + 1; /* failure = success + 1 */
6600         goto reenter_switch;
6601     }
6602     result = 0;
6603
6604   final_exit:
6605     if (rex->intflags & PREGf_VERBARG_SEEN) {
6606         SV *sv_err = get_sv("REGERROR", 1);
6607         SV *sv_mrk = get_sv("REGMARK", 1);
6608         if (result) {
6609             sv_commit = &PL_sv_no;
6610             if (!sv_yes_mark)
6611                 sv_yes_mark = &PL_sv_yes;
6612         } else {
6613             if (!sv_commit)
6614                 sv_commit = &PL_sv_yes;
6615             sv_yes_mark = &PL_sv_no;
6616         }
6617         sv_setsv(sv_err, sv_commit);
6618         sv_setsv(sv_mrk, sv_yes_mark);
6619     }
6620
6621
6622     if (last_pushed_cv) {
6623         dSP;
6624         POP_MULTICALL;
6625         PERL_UNUSED_VAR(SP);
6626     }
6627
6628     /* clean up; in particular, free all slabs above current one */
6629     LEAVE_SCOPE(oldsave);
6630
6631     assert(!result ||  locinput - PL_bostr >= 0);
6632     return result ?  locinput - PL_bostr : -1;
6633 }
6634
6635 /*
6636  - regrepeat - repeatedly match something simple, report how many
6637  *
6638  * What 'simple' means is a node which can be the operand of a quantifier like
6639  * '+', or {1,3}
6640  *
6641  * startposp - pointer a pointer to the start position.  This is updated
6642  *             to point to the byte following the highest successful
6643  *             match.
6644  * p         - the regnode to be repeatedly matched against.
6645  * max       - maximum number of things to match.
6646  * depth     - (for debugging) backtracking depth.
6647  */
6648 STATIC I32
6649 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
6650                 I32 max, int depth, bool is_utf8_pat)
6651 {
6652     dVAR;
6653     char *scan;     /* Pointer to current position in target string */
6654     I32 c;
6655     char *loceol = PL_regeol;   /* local version */
6656     I32 hardcount = 0;  /* How many matches so far */
6657     bool utf8_target = PL_reg_match_utf8;
6658     int to_complement = 0;  /* Invert the result? */
6659     UV utf8_flags;
6660     _char_class_number classnum;
6661 #ifndef DEBUGGING
6662     PERL_UNUSED_ARG(depth);
6663 #endif
6664
6665     PERL_ARGS_ASSERT_REGREPEAT;
6666
6667     scan = *startposp;
6668     if (max == REG_INFTY)
6669         max = I32_MAX;
6670     else if (! utf8_target && scan + max < loceol)
6671         loceol = scan + max;
6672
6673     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6674      * to the maximum of how far we should go in it (leaving it set to the real
6675      * end, if the maximum permissible would take us beyond that).  This allows
6676      * us to make the loop exit condition that we haven't gone past <loceol> to
6677      * also mean that we haven't exceeded the max permissible count, saving a
6678      * test each time through the loop.  But it assumes that the OP matches a
6679      * single byte, which is true for most of the OPs below when applied to a
6680      * non-UTF-8 target.  Those relatively few OPs that don't have this
6681      * characteristic will have to compensate.
6682      *
6683      * There is no adjustment for UTF-8 targets, as the number of bytes per
6684      * character varies.  OPs will have to test both that the count is less
6685      * than the max permissible (using <hardcount> to keep track), and that we
6686      * are still within the bounds of the string (using <loceol>.  A few OPs
6687      * match a single byte no matter what the encoding.  They can omit the max
6688      * test if, for the UTF-8 case, they do the adjustment that was skipped
6689      * above.
6690      *
6691      * Thus, the code above sets things up for the common case; and exceptional
6692      * cases need extra work; the common case is to make sure <scan> doesn't
6693      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6694      * count doesn't exceed the maximum permissible */
6695
6696     switch (OP(p)) {
6697     case REG_ANY:
6698         if (utf8_target) {
6699             while (scan < loceol && hardcount < max && *scan != '\n') {
6700                 scan += UTF8SKIP(scan);
6701                 hardcount++;
6702             }
6703         } else {
6704             while (scan < loceol && *scan != '\n')
6705                 scan++;
6706         }
6707         break;
6708     case SANY:
6709         if (utf8_target) {
6710             while (scan < loceol && hardcount < max) {
6711                 scan += UTF8SKIP(scan);
6712                 hardcount++;
6713             }
6714         }
6715         else
6716             scan = loceol;
6717         break;
6718     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6719         if (utf8_target && scan + max < loceol) {
6720
6721             /* <loceol> hadn't been adjusted in the UTF-8 case */
6722             scan +=  max;
6723         }
6724         else {
6725             scan = loceol;
6726         }
6727         break;
6728     case EXACT:
6729         assert(STR_LEN(p) == is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6730
6731         c = (U8)*STRING(p);
6732
6733         /* Can use a simple loop if the pattern char to match on is invariant
6734          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6735          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6736          * true iff it doesn't matter if the argument is in UTF-8 or not */
6737         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! is_utf8_pat)) {
6738             if (utf8_target && scan + max < loceol) {
6739                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6740                  * since here, to match at all, 1 char == 1 byte */
6741                 loceol = scan + max;
6742             }
6743             while (scan < loceol && UCHARAT(scan) == c) {
6744                 scan++;
6745             }
6746         }
6747         else if (is_utf8_pat) {
6748             if (utf8_target) {
6749                 STRLEN scan_char_len;
6750
6751                 /* When both target and pattern are UTF-8, we have to do
6752                  * string EQ */
6753                 while (hardcount < max
6754                        && scan < loceol
6755                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6756                        && memEQ(scan, STRING(p), scan_char_len))
6757                 {
6758                     scan += scan_char_len;
6759                     hardcount++;
6760                 }
6761             }
6762             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6763
6764                 /* Target isn't utf8; convert the character in the UTF-8
6765                  * pattern to non-UTF8, and do a simple loop */
6766                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6767                 while (scan < loceol && UCHARAT(scan) == c) {
6768                     scan++;
6769                 }
6770             } /* else pattern char is above Latin1, can't possibly match the
6771                  non-UTF-8 target */
6772         }
6773         else {
6774
6775             /* Here, the string must be utf8; pattern isn't, and <c> is
6776              * different in utf8 than not, so can't compare them directly.
6777              * Outside the loop, find the two utf8 bytes that represent c, and
6778              * then look for those in sequence in the utf8 string */
6779             U8 high = UTF8_TWO_BYTE_HI(c);
6780             U8 low = UTF8_TWO_BYTE_LO(c);
6781
6782             while (hardcount < max
6783                     && scan + 1 < loceol
6784                     && UCHARAT(scan) == high
6785                     && UCHARAT(scan + 1) == low)
6786             {
6787                 scan += 2;
6788                 hardcount++;
6789             }
6790         }
6791         break;
6792
6793     case EXACTFA:
6794         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6795         goto do_exactf;
6796
6797     case EXACTFL:
6798         RXp_MATCH_TAINTED_on(prog);
6799         utf8_flags = FOLDEQ_UTF8_LOCALE;
6800         goto do_exactf;
6801
6802     case EXACTF:
6803             utf8_flags = 0;
6804             goto do_exactf;
6805
6806     case EXACTFU_SS:
6807     case EXACTFU_TRICKYFOLD:
6808     case EXACTFU:
6809         utf8_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6810
6811     do_exactf: {
6812         int c1, c2;
6813         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6814
6815         assert(STR_LEN(p) == is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6816
6817         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6818                                         is_utf8_pat))
6819         {
6820             if (c1 == CHRTEST_VOID) {
6821                 /* Use full Unicode fold matching */
6822                 char *tmpeol = PL_regeol;
6823                 STRLEN pat_len = is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
6824                 while (hardcount < max
6825                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6826                                              STRING(p), NULL, pat_len,
6827                                              is_utf8_pat, utf8_flags))
6828                 {
6829                     scan = tmpeol;
6830                     tmpeol = PL_regeol;
6831                     hardcount++;
6832                 }
6833             }
6834             else if (utf8_target) {
6835                 if (c1 == c2) {
6836                     while (scan < loceol
6837                            && hardcount < max
6838                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6839                     {
6840                         scan += UTF8SKIP(scan);
6841                         hardcount++;
6842                     }
6843                 }
6844                 else {
6845                     while (scan < loceol
6846                            && hardcount < max
6847                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6848                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6849                     {
6850                         scan += UTF8SKIP(scan);
6851                         hardcount++;
6852                     }
6853                 }
6854             }
6855             else if (c1 == c2) {
6856                 while (scan < loceol && UCHARAT(scan) == c1) {
6857                     scan++;
6858                 }
6859             }
6860             else {
6861                 while (scan < loceol &&
6862                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6863                 {
6864                     scan++;
6865                 }
6866             }
6867         }
6868         break;
6869     }
6870     case ANYOF:
6871     case ANYOF_WARN_SUPER:
6872         if (utf8_target) {
6873             while (hardcount < max
6874                    && scan < loceol
6875                    && reginclass(prog, p, (U8*)scan, utf8_target))
6876             {
6877                 scan += UTF8SKIP(scan);
6878                 hardcount++;
6879             }
6880         } else {
6881             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6882                 scan++;
6883         }
6884         break;
6885
6886     /* The argument (FLAGS) to all the POSIX node types is the class number */
6887
6888     case NPOSIXL:
6889         to_complement = 1;
6890         /* FALLTHROUGH */
6891
6892     case POSIXL:
6893         RXp_MATCH_TAINTED_on(prog);
6894         if (! utf8_target) {
6895             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
6896                                                                    *scan)))
6897             {
6898                 scan++;
6899             }
6900         } else {
6901             while (hardcount < max && scan < loceol
6902                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
6903                                                                   (U8 *) scan)))
6904             {
6905                 scan += UTF8SKIP(scan);
6906                 hardcount++;
6907             }
6908         }
6909         break;
6910
6911     case POSIXD:
6912         if (utf8_target) {
6913             goto utf8_posix;
6914         }
6915         /* FALLTHROUGH */
6916
6917     case POSIXA:
6918         if (utf8_target && scan + max < loceol) {
6919
6920             /* We didn't adjust <loceol> at the beginning of this routine
6921              * because is UTF-8, but it is actually ok to do so, since here, to
6922              * match, 1 char == 1 byte. */
6923             loceol = scan + max;
6924         }
6925         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6926             scan++;
6927         }
6928         break;
6929
6930     case NPOSIXD:
6931         if (utf8_target) {
6932             to_complement = 1;
6933             goto utf8_posix;
6934         }
6935         /* FALL THROUGH */
6936
6937     case NPOSIXA:
6938         if (! utf8_target) {
6939             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6940                 scan++;
6941             }
6942         }
6943         else {
6944
6945             /* The complement of something that matches only ASCII matches all
6946              * UTF-8 variant code points, plus everything in ASCII that isn't
6947              * in the class. */
6948             while (hardcount < max && scan < loceol
6949                    && (! UTF8_IS_INVARIANT(*scan)
6950                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
6951             {
6952                 scan += UTF8SKIP(scan);
6953                 hardcount++;
6954             }
6955         }
6956         break;
6957
6958     case NPOSIXU:
6959         to_complement = 1;
6960         /* FALLTHROUGH */
6961
6962     case POSIXU:
6963         if (! utf8_target) {
6964             while (scan < loceol && to_complement
6965                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
6966             {
6967                 scan++;
6968             }
6969         }
6970         else {
6971       utf8_posix:
6972             classnum = (_char_class_number) FLAGS(p);
6973             if (classnum < _FIRST_NON_SWASH_CC) {
6974
6975                 /* Here, a swash is needed for above-Latin1 code points.
6976                  * Process as many Latin1 code points using the built-in rules.
6977                  * Go to another loop to finish processing upon encountering
6978                  * the first Latin1 code point.  We could do that in this loop
6979                  * as well, but the other way saves having to test if the swash
6980                  * has been loaded every time through the loop: extra space to
6981                  * save a test. */
6982                 while (hardcount < max && scan < loceol) {
6983                     if (UTF8_IS_INVARIANT(*scan)) {
6984                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
6985                                                                    classnum))))
6986                         {
6987                             break;
6988                         }
6989                         scan++;
6990                     }
6991                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
6992                         if (! (to_complement
6993                               ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
6994                                                                    *(scan + 1)),
6995                                                     classnum))))
6996                         {
6997                             break;
6998                         }
6999                         scan += 2;
7000                     }
7001                     else {
7002                         goto found_above_latin1;
7003                     }
7004
7005                     hardcount++;
7006                 }
7007             }
7008             else {
7009                 /* For these character classes, the knowledge of how to handle
7010                  * every code point is compiled in to Perl via a macro.  This
7011                  * code is written for making the loops as tight as possible.
7012                  * It could be refactored to save space instead */
7013                 switch (classnum) {
7014                     case _CC_ENUM_SPACE:    /* XXX would require separate code
7015                                                if we revert the change of \v
7016                                                matching this */
7017                         /* FALL THROUGH */
7018                     case _CC_ENUM_PSXSPC:
7019                         while (hardcount < max
7020                                && scan < loceol
7021                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7022                         {
7023                             scan += UTF8SKIP(scan);
7024                             hardcount++;
7025                         }
7026                         break;
7027                     case _CC_ENUM_BLANK:
7028                         while (hardcount < max
7029                                && scan < loceol
7030                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7031                         {
7032                             scan += UTF8SKIP(scan);
7033                             hardcount++;
7034                         }
7035                         break;
7036                     case _CC_ENUM_XDIGIT:
7037                         while (hardcount < max
7038                                && scan < loceol
7039                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7040                         {
7041                             scan += UTF8SKIP(scan);
7042                             hardcount++;
7043                         }
7044                         break;
7045                     case _CC_ENUM_VERTSPACE:
7046                         while (hardcount < max
7047                                && scan < loceol
7048                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7049                         {
7050                             scan += UTF8SKIP(scan);
7051                             hardcount++;
7052                         }
7053                         break;
7054                     case _CC_ENUM_CNTRL:
7055                         while (hardcount < max
7056                                && scan < loceol
7057                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7058                         {
7059                             scan += UTF8SKIP(scan);
7060                             hardcount++;
7061                         }
7062                         break;
7063                     default:
7064                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7065                 }
7066             }
7067         }
7068         break;
7069
7070       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7071
7072         /* Load the swash if not already present */
7073         if (! PL_utf8_swash_ptrs[classnum]) {
7074             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7075             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7076                                         "utf8", swash_property_names[classnum],
7077                                         &PL_sv_undef, 1, 0, NULL, &flags);
7078         }
7079
7080         while (hardcount < max && scan < loceol
7081                && to_complement ^ cBOOL(_generic_utf8(
7082                                        classnum,
7083                                        scan,
7084                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
7085                                                    (U8 *) scan,
7086                                                    TRUE))))
7087         {
7088             scan += UTF8SKIP(scan);
7089             hardcount++;
7090         }
7091         break;
7092
7093     case LNBREAK:
7094         if (utf8_target) {
7095             while (hardcount < max && scan < loceol &&
7096                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7097                 scan += c;
7098                 hardcount++;
7099             }
7100         } else {
7101             /* LNBREAK can match one or two latin chars, which is ok, but we
7102              * have to use hardcount in this situation, and throw away the
7103              * adjustment to <loceol> done before the switch statement */
7104             loceol = PL_regeol;
7105             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7106                 scan+=c;
7107                 hardcount++;
7108             }
7109         }
7110         break;
7111
7112     case BOUND:
7113     case BOUNDA:
7114     case BOUNDL:
7115     case BOUNDU:
7116     case EOS:
7117     case GPOS:
7118     case KEEPS:
7119     case NBOUND:
7120     case NBOUNDA:
7121     case NBOUNDL:
7122     case NBOUNDU:
7123     case OPFAIL:
7124     case SBOL:
7125     case SEOL:
7126         /* These are all 0 width, so match right here or not at all. */
7127         break;
7128
7129     default:
7130         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7131         assert(0); /* NOTREACHED */
7132
7133     }
7134
7135     if (hardcount)
7136         c = hardcount;
7137     else
7138         c = scan - *startposp;
7139     *startposp = scan;
7140
7141     DEBUG_r({
7142         GET_RE_DEBUG_FLAGS_DECL;
7143         DEBUG_EXECUTE_r({
7144             SV * const prop = sv_newmortal();
7145             regprop(prog, prop, p);
7146             PerlIO_printf(Perl_debug_log,
7147                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7148                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7149         });
7150     });
7151
7152     return(c);
7153 }
7154
7155
7156 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7157 /*
7158 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7159 create a copy so that changes the caller makes won't change the shared one.
7160 If <altsvp> is non-null, will return NULL in it, for back-compat.
7161  */
7162 SV *
7163 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7164 {
7165     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7166
7167     if (altsvp) {
7168         *altsvp = NULL;
7169     }
7170
7171     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7172 }
7173 #endif
7174
7175 STATIC SV *
7176 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7177 {
7178     /* Returns the swash for the input 'node' in the regex 'prog'.
7179      * If <doinit> is true, will attempt to create the swash if not already
7180      *    done.
7181      * If <listsvp> is non-null, will return the swash initialization string in
7182      *    it.
7183      * Tied intimately to how regcomp.c sets up the data structure */
7184
7185     dVAR;
7186     SV *sw  = NULL;
7187     SV *si  = NULL;
7188     SV*  invlist = NULL;
7189
7190     RXi_GET_DECL(prog,progi);
7191     const struct reg_data * const data = prog ? progi->data : NULL;
7192
7193     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7194
7195     assert(ANYOF_NONBITMAP(node));
7196
7197     if (data && data->count) {
7198         const U32 n = ARG(node);
7199
7200         if (data->what[n] == 's') {
7201             SV * const rv = MUTABLE_SV(data->data[n]);
7202             AV * const av = MUTABLE_AV(SvRV(rv));
7203             SV **const ary = AvARRAY(av);
7204             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7205
7206             si = *ary;  /* ary[0] = the string to initialize the swash with */
7207
7208             /* Elements 2 and 3 are either both present or both absent. [2] is
7209              * any inversion list generated at compile time; [3] indicates if
7210              * that inversion list has any user-defined properties in it. */
7211             if (av_len(av) >= 2) {
7212                 invlist = ary[2];
7213                 if (SvUV(ary[3])) {
7214                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7215                 }
7216             }
7217             else {
7218                 invlist = NULL;
7219             }
7220
7221             /* Element [1] is reserved for the set-up swash.  If already there,
7222              * return it; if not, create it and store it there */
7223             if (SvROK(ary[1])) {
7224                 sw = ary[1];
7225             }
7226             else if (si && doinit) {
7227
7228                 sw = _core_swash_init("utf8", /* the utf8 package */
7229                                       "", /* nameless */
7230                                       si,
7231                                       1, /* binary */
7232                                       0, /* not from tr/// */
7233                                       invlist,
7234                                       &swash_init_flags);
7235                 (void)av_store(av, 1, sw);
7236             }
7237         }
7238     }
7239
7240     if (listsvp) {
7241         SV* matches_string = newSVpvn("", 0);
7242
7243         /* Use the swash, if any, which has to have incorporated into it all
7244          * possibilities */
7245         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7246             && (si && si != &PL_sv_undef))
7247         {
7248
7249             /* If no swash, use the input initialization string, if available */
7250             sv_catsv(matches_string, si);
7251         }
7252
7253         /* Add the inversion list to whatever we have.  This may have come from
7254          * the swash, or from an input parameter */
7255         if (invlist) {
7256             sv_catsv(matches_string, _invlist_contents(invlist));
7257         }
7258         *listsvp = matches_string;
7259     }
7260
7261     return sw;
7262 }
7263
7264 /*
7265  - reginclass - determine if a character falls into a character class
7266
7267   n is the ANYOF regnode
7268   p is the target string
7269   utf8_target tells whether p is in UTF-8.
7270
7271   Returns true if matched; false otherwise.
7272
7273   Note that this can be a synthetic start class, a combination of various
7274   nodes, so things you think might be mutually exclusive, such as locale,
7275   aren't.  It can match both locale and non-locale
7276
7277  */
7278
7279 STATIC bool
7280 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7281 {
7282     dVAR;
7283     const char flags = ANYOF_FLAGS(n);
7284     bool match = FALSE;
7285     UV c = *p;
7286
7287     PERL_ARGS_ASSERT_REGINCLASS;
7288
7289     /* If c is not already the code point, get it.  Note that
7290      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7291     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7292         STRLEN c_len = 0;
7293         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7294                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7295                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7296                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7297                  * UTF8_ALLOW_FFFF */
7298         if (c_len == (STRLEN)-1)
7299             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7300     }
7301
7302     /* If this character is potentially in the bitmap, check it */
7303     if (c < 256) {
7304         if (ANYOF_BITMAP_TEST(n, c))
7305             match = TRUE;
7306         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7307                 && ! utf8_target
7308                 && ! isASCII(c))
7309         {
7310             match = TRUE;
7311         }
7312         else if (flags & ANYOF_LOCALE) {
7313             RXp_MATCH_TAINTED_on(prog);
7314
7315             if ((flags & ANYOF_LOC_FOLD)
7316                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7317             {
7318                 match = TRUE;
7319             }
7320             else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7321
7322                 /* The data structure is arranged so bits 0, 2, 4, ... are set
7323                  * if the class includes the Posix character class given by
7324                  * bit/2; and 1, 3, 5, ... are set if the class includes the
7325                  * complemented Posix class given by int(bit/2).  So we loop
7326                  * through the bits, each time changing whether we complement
7327                  * the result or not.  Suppose for the sake of illustration
7328                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7329                  * is set, it means there is a match for this ANYOF node if the
7330                  * character is in the class given by the expression (0 / 2 = 0
7331                  * = \w).  If it is in that class, isFOO_lc() will return 1,
7332                  * and since 'to_complement' is 0, the result will stay TRUE,
7333                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
7334                  * bit 1 is 1.  That means there is a match if the character
7335                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7336                  * but will on bit 1.  On the second iteration 'to_complement'
7337                  * will be 1, so the exclusive or will reverse things, so we
7338                  * are testing for \W.  On the third iteration, 'to_complement'
7339                  * will be 0, and we would be testing for \s; the fourth
7340                  * iteration would test for \S, etc.
7341                  *
7342                  * Note that this code assumes that all the classes are closed
7343                  * under folding.  For example, if a character matches \w, then
7344                  * its fold does too; and vice versa.  This should be true for
7345                  * any well-behaved locale for all the currently defined Posix
7346                  * classes, except for :lower: and :upper:, which are handled
7347                  * by the pseudo-class :cased: which matches if either of the
7348                  * other two does.  To get rid of this assumption, an outer
7349                  * loop could be used below to iterate over both the source
7350                  * character, and its fold (if different) */
7351
7352                 int count = 0;
7353                 int to_complement = 0;
7354                 while (count < ANYOF_MAX) {
7355                     if (ANYOF_CLASS_TEST(n, count)
7356                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7357                     {
7358                         match = TRUE;
7359                         break;
7360                     }
7361                     count++;
7362                     to_complement ^= 1;
7363                 }
7364             }
7365         }
7366     }
7367
7368     /* If the bitmap didn't (or couldn't) match, and something outside the
7369      * bitmap could match, try that.  Locale nodes specify completely the
7370      * behavior of code points in the bit map (otherwise, a utf8 target would
7371      * cause them to be treated as Unicode and not locale), except in
7372      * the very unlikely event when this node is a synthetic start class, which
7373      * could be a combination of locale and non-locale nodes.  So allow locale
7374      * to match for the synthetic start class, which will give a false
7375      * positive that will be resolved when the match is done again as not part
7376      * of the synthetic start class */
7377     if (!match) {
7378         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7379             match = TRUE;       /* Everything above 255 matches */
7380         }
7381         else if (ANYOF_NONBITMAP(n)
7382                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7383                      || (utf8_target
7384                          && (c >=256
7385                              || (! (flags & ANYOF_LOCALE))
7386                              || OP(n) == ANYOF_SYNTHETIC))))
7387         {
7388             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7389             if (sw) {
7390                 U8 * utf8_p;
7391                 if (utf8_target) {
7392                     utf8_p = (U8 *) p;
7393                 } else { /* Convert to utf8 */
7394                     STRLEN len = 1;
7395                     utf8_p = bytes_to_utf8(p, &len);
7396                 }
7397
7398                 if (swash_fetch(sw, utf8_p, TRUE)) {
7399                     match = TRUE;
7400                 }
7401
7402                 /* If we allocated a string above, free it */
7403                 if (! utf8_target) Safefree(utf8_p);
7404             }
7405         }
7406
7407         if (UNICODE_IS_SUPER(c)
7408             && OP(n) == ANYOF_WARN_SUPER
7409             && ckWARN_d(WARN_NON_UNICODE))
7410         {
7411             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7412                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7413         }
7414     }
7415
7416     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7417     return cBOOL(flags & ANYOF_INVERT) ^ match;
7418 }
7419
7420 STATIC U8 *
7421 S_reghop3(U8 *s, I32 off, const U8* lim)
7422 {
7423     /* return the position 'off' UTF-8 characters away from 's', forward if
7424      * 'off' >= 0, backwards if negative.  But don't go outside of position
7425      * 'lim', which better be < s  if off < 0 */
7426
7427     dVAR;
7428
7429     PERL_ARGS_ASSERT_REGHOP3;
7430
7431     if (off >= 0) {
7432         while (off-- && s < lim) {
7433             /* XXX could check well-formedness here */
7434             s += UTF8SKIP(s);
7435         }
7436     }
7437     else {
7438         while (off++ && s > lim) {
7439             s--;
7440             if (UTF8_IS_CONTINUED(*s)) {
7441                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7442                     s--;
7443             }
7444             /* XXX could check well-formedness here */
7445         }
7446     }
7447     return s;
7448 }
7449
7450 #ifdef XXX_dmq
7451 /* there are a bunch of places where we use two reghop3's that should
7452    be replaced with this routine. but since thats not done yet
7453    we ifdef it out - dmq
7454 */
7455 STATIC U8 *
7456 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7457 {
7458     dVAR;
7459
7460     PERL_ARGS_ASSERT_REGHOP4;
7461
7462     if (off >= 0) {
7463         while (off-- && s < rlim) {
7464             /* XXX could check well-formedness here */
7465             s += UTF8SKIP(s);
7466         }
7467     }
7468     else {
7469         while (off++ && s > llim) {
7470             s--;
7471             if (UTF8_IS_CONTINUED(*s)) {
7472                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7473                     s--;
7474             }
7475             /* XXX could check well-formedness here */
7476         }
7477     }
7478     return s;
7479 }
7480 #endif
7481
7482 STATIC U8 *
7483 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7484 {
7485     dVAR;
7486
7487     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7488
7489     if (off >= 0) {
7490         while (off-- && s < lim) {
7491             /* XXX could check well-formedness here */
7492             s += UTF8SKIP(s);
7493         }
7494         if (off >= 0)
7495             return NULL;
7496     }
7497     else {
7498         while (off++ && s > lim) {
7499             s--;
7500             if (UTF8_IS_CONTINUED(*s)) {
7501                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7502                     s--;
7503             }
7504             /* XXX could check well-formedness here */
7505         }
7506         if (off <= 0)
7507             return NULL;
7508     }
7509     return s;
7510 }
7511
7512 static void
7513 restore_pos(pTHX_ void *arg)
7514 {
7515     dVAR;
7516     regexp * const rex = (regexp *)arg;
7517     if (PL_reg_state.re_state_eval_setup_done) {
7518         if (PL_reg_oldsaved) {
7519             rex->subbeg = PL_reg_oldsaved;
7520             rex->sublen = PL_reg_oldsavedlen;
7521             rex->suboffset = PL_reg_oldsavedoffset;
7522             rex->subcoffset = PL_reg_oldsavedcoffset;
7523 #ifdef PERL_ANY_COW
7524             rex->saved_copy = PL_nrs;
7525 #endif
7526             RXp_MATCH_COPIED_on(rex);
7527         }
7528         PL_reg_magic->mg_len = PL_reg_oldpos;
7529         PL_reg_state.re_state_eval_setup_done = FALSE;
7530         PL_curpm = PL_reg_oldcurpm;
7531     }
7532 }
7533
7534 STATIC void
7535 S_to_utf8_substr(pTHX_ regexp *prog)
7536 {
7537     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7538      * on the converted value */
7539
7540     int i = 1;
7541
7542     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7543
7544     do {
7545         if (prog->substrs->data[i].substr
7546             && !prog->substrs->data[i].utf8_substr) {
7547             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7548             prog->substrs->data[i].utf8_substr = sv;
7549             sv_utf8_upgrade(sv);
7550             if (SvVALID(prog->substrs->data[i].substr)) {
7551                 if (SvTAIL(prog->substrs->data[i].substr)) {
7552                     /* Trim the trailing \n that fbm_compile added last
7553                        time.  */
7554                     SvCUR_set(sv, SvCUR(sv) - 1);
7555                     /* Whilst this makes the SV technically "invalid" (as its
7556                        buffer is no longer followed by "\0") when fbm_compile()
7557                        adds the "\n" back, a "\0" is restored.  */
7558                     fbm_compile(sv, FBMcf_TAIL);
7559                 } else
7560                     fbm_compile(sv, 0);
7561             }
7562             if (prog->substrs->data[i].substr == prog->check_substr)
7563                 prog->check_utf8 = sv;
7564         }
7565     } while (i--);
7566 }
7567
7568 STATIC bool
7569 S_to_byte_substr(pTHX_ regexp *prog)
7570 {
7571     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7572      * on the converted value; returns FALSE if can't be converted. */
7573
7574     dVAR;
7575     int i = 1;
7576
7577     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7578
7579     do {
7580         if (prog->substrs->data[i].utf8_substr
7581             && !prog->substrs->data[i].substr) {
7582             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7583             if (! sv_utf8_downgrade(sv, TRUE)) {
7584                 return FALSE;
7585             }
7586             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7587                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7588                     /* Trim the trailing \n that fbm_compile added last
7589                         time.  */
7590                     SvCUR_set(sv, SvCUR(sv) - 1);
7591                     fbm_compile(sv, FBMcf_TAIL);
7592                 } else
7593                     fbm_compile(sv, 0);
7594             }
7595             prog->substrs->data[i].substr = sv;
7596             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7597                 prog->check_substr = sv;
7598         }
7599     } while (i--);
7600
7601     return TRUE;
7602 }
7603
7604 /*
7605  * Local variables:
7606  * c-indentation-style: bsd
7607  * c-basic-offset: 4
7608  * indent-tabs-mode: nil
7609  * End:
7610  *
7611  * ex: set ts=8 sts=4 sw=4 et:
7612  */