orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86
  87 #ifdef PERL_IN_XSUB_RE
  88 #  include "re_comp.h"
  89 #else
  90 #  include "regcomp.h"
  91 #endif
  92
  93 #include "inline_invlist.c"
  94 #include "unicode_constants.h"
  95
  96 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  97
  98 #ifndef STATIC
  99 #define STATIC  static
 100 #endif
 101
 102 /* Valid for non-utf8 strings: avoids the reginclass
 103  * call if there are no complications: i.e., if everything matchable is
 104  * straight forward in the bitmap */
 105 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 106                                               : ANYOF_BITMAP_TEST(p,*(c)))
 107
 108 /*
 109  * Forwards.
 110  */
 111
 112 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 113 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 114
 115 #define HOPc(pos,off) \
 116         (char *)(reginfo->is_utf8_target \
 117             ? reghop3((U8*)pos, off, \
 118                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 119             : (U8*)(pos + off))
 120 #define HOPBACKc(pos, off) \
 121         (char*)(reginfo->is_utf8_target \
 122             ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 123             : (pos - off >= reginfo->strbeg)    \
 124                 ? (U8*)pos - off                \
 125                 : NULL)
 126
 127 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 128 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 129
 130
 131 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 132 #define NEXTCHR_IS_EOS (nextchr < 0)
 133
 134 #define SET_nextchr \
 135     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 136
 137 #define SET_locinput(p) \
 138     locinput = (p);  \
 139     SET_nextchr
 140
 141
 142 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START {            \
 143         if (!swash_ptr) {                                                     \
 144             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 145             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 146                                          1, 0, NULL, &flags);                 \
 147             assert(swash_ptr);                                                \
 148         }                                                                     \
 149     } STMT_END
 150
 151 /* If in debug mode, we test that a known character properly matches */
 152 #ifdef DEBUGGING
 153 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 154                                           property_name,                      \
 155                                           utf8_char_in_property)              \
 156         LOAD_UTF8_CHARCLASS(swash_ptr, property_name);                        \
 157         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 158 #else
 159 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 160                                           property_name,                      \
 161                                           utf8_char_in_property)              \
 162         LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
 163 #endif
 164
 165 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 166                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 167                                         swash_property_names[_CC_WORDCHAR],   \
 168                                         GREEK_SMALL_LETTER_IOTA_UTF8)
 169
 170 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 171     STMT_START {                                                              \
 172         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 173                                        "_X_regular_begin",                    \
 174                                        GREEK_SMALL_LETTER_IOTA_UTF8);         \
 175         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 176                                        "_X_extend",                           \
 177                                        COMBINING_GRAVE_ACCENT_UTF8);          \
 178     } STMT_END
 179
 180 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 181 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 182
 183 /* for use after a quantifier and before an EXACT-like node -- japhy */
 184 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 185  *
 186  * NOTE that *nothing* that affects backtracking should be in here, specifically
 187  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 188  * node that is in between two EXACT like nodes when ascertaining what the required
 189  * "follow" character is. This should probably be moved to regex compile time
 190  * although it may be done at run time beause of the REF possibility - more
 191  * investigation required. -- demerphq
 192 */
 193 #define JUMPABLE(rn) (      \
 194     OP(rn) == OPEN ||       \
 195     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 196     OP(rn) == EVAL ||   \
 197     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 198     OP(rn) == PLUS || OP(rn) == MINMOD || \
 199     OP(rn) == KEEPS || \
 200     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 201 )
 202 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 203
 204 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 205
 206 #if 0
 207 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 208    we don't need this definition. */
 209 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 210 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 211 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 212
 213 #else
 214 /* ... so we use this as its faster. */
 215 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 216 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 217 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 218 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 219
 220 #endif
 221
 222 /*
 223   Search for mandatory following text node; for lookahead, the text must
 224   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 225 */
 226 #define FIND_NEXT_IMPT(rn) STMT_START { \
 227     while (JUMPABLE(rn)) { \
 228         const OPCODE type = OP(rn); \
 229         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 230             rn = NEXTOPER(NEXTOPER(rn)); \
 231         else if (type == PLUS) \
 232             rn = NEXTOPER(rn); \
 233         else if (type == IFMATCH) \
 234             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 235         else rn += NEXT_OFF(rn); \
 236     } \
 237 } STMT_END
 238
 239 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 240  * These are for the pre-composed Hangul syllables, which are all in a
 241  * contiguous block and arranged there in such a way so as to facilitate
 242  * alorithmic determination of their characteristics.  As such, they don't need
 243  * a swash, but can be determined by simple arithmetic.  Almost all are
 244  * GCB=LVT, but every 28th one is a GCB=LV */
 245 #define SBASE 0xAC00    /* Start of block */
 246 #define SCount 11172    /* Length of block */
 247 #define TCount 28
 248
 249 #define SLAB_FIRST(s) (&(s)->states[0])
 250 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 251
 252 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 253 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 254 static regmatch_state * S_push_slab(pTHX);
 255
 256 #define REGCP_PAREN_ELEMS 3
 257 #define REGCP_OTHER_ELEMS 3
 258 #define REGCP_FRAME_ELEMS 1
 259 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 260  * are needed for the regexp context stack bookkeeping. */
 261
 262 STATIC CHECKPOINT
 263 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 264 {
 265     dVAR;
 266     const int retval = PL_savestack_ix;
 267     const int paren_elems_to_push =
 268                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 269     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 270     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 271     I32 p;
 272     GET_RE_DEBUG_FLAGS_DECL;
 273
 274     PERL_ARGS_ASSERT_REGCPPUSH;
 275
 276     if (paren_elems_to_push < 0)
 277         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 278                    paren_elems_to_push);
 279
 280     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 281         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 282                    " out of range (%lu-%ld)",
 283                    total_elems,
 284                    (unsigned long)maxopenparen,
 285                    (long)parenfloor);
 286
 287     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 288
 289     DEBUG_BUFFERS_r(
 290         if ((int)maxopenparen > (int)parenfloor)
 291             PerlIO_printf(Perl_debug_log,
 292                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 293                 PTR2UV(rex),
 294                 PTR2UV(rex->offs)
 295             );
 296     );
 297     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 298 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 299         SSPUSHINT(rex->offs[p].end);
 300         SSPUSHINT(rex->offs[p].start);
 301         SSPUSHINT(rex->offs[p].start_tmp);
 302         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 303             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 304             (UV)p,
 305             (IV)rex->offs[p].start,
 306             (IV)rex->offs[p].start_tmp,
 307             (IV)rex->offs[p].end
 308         ));
 309     }
 310 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 311     SSPUSHINT(maxopenparen);
 312     SSPUSHINT(rex->lastparen);
 313     SSPUSHINT(rex->lastcloseparen);
 314     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 315
 316     return retval;
 317 }
 318
 319 /* These are needed since we do not localize EVAL nodes: */
 320 #define REGCP_SET(cp)                                           \
 321     DEBUG_STATE_r(                                              \
 322             PerlIO_printf(Perl_debug_log,                       \
 323                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 324                 (IV)PL_savestack_ix));                          \
 325     cp = PL_savestack_ix
 326
 327 #define REGCP_UNWIND(cp)                                        \
 328     DEBUG_STATE_r(                                              \
 329         if (cp != PL_savestack_ix)                              \
 330             PerlIO_printf(Perl_debug_log,                       \
 331                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 332                 (IV)(cp), (IV)PL_savestack_ix));                \
 333     regcpblow(cp)
 334
 335 #define UNWIND_PAREN(lp, lcp)               \
 336     for (n = rex->lastparen; n > lp; n--)   \
 337         rex->offs[n].end = -1;              \
 338     rex->lastparen = n;                     \
 339     rex->lastcloseparen = lcp;
 340
 341
 342 STATIC void
 343 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 344 {
 345     dVAR;
 346     UV i;
 347     U32 paren;
 348     GET_RE_DEBUG_FLAGS_DECL;
 349
 350     PERL_ARGS_ASSERT_REGCPPOP;
 351
 352     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 353     i = SSPOPUV;
 354     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 355     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 356     rex->lastcloseparen = SSPOPINT;
 357     rex->lastparen = SSPOPINT;
 358     *maxopenparen_p = SSPOPINT;
 359
 360     i -= REGCP_OTHER_ELEMS;
 361     /* Now restore the parentheses context. */
 362     DEBUG_BUFFERS_r(
 363         if (i || rex->lastparen + 1 <= rex->nparens)
 364             PerlIO_printf(Perl_debug_log,
 365                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 366                 PTR2UV(rex),
 367                 PTR2UV(rex->offs)
 368             );
 369     );
 370     paren = *maxopenparen_p;
 371     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 372         I32 tmps;
 373         rex->offs[paren].start_tmp = SSPOPINT;
 374         rex->offs[paren].start = SSPOPINT;
 375         tmps = SSPOPINT;
 376         if (paren <= rex->lastparen)
 377             rex->offs[paren].end = tmps;
 378         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 379             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 380             (UV)paren,
 381             (IV)rex->offs[paren].start,
 382             (IV)rex->offs[paren].start_tmp,
 383             (IV)rex->offs[paren].end,
 384             (paren > rex->lastparen ? "(skipped)" : ""));
 385         );
 386         paren--;
 387     }
 388 #if 1
 389     /* It would seem that the similar code in regtry()
 390      * already takes care of this, and in fact it is in
 391      * a better location to since this code can #if 0-ed out
 392      * but the code in regtry() is needed or otherwise tests
 393      * requiring null fields (pat.t#187 and split.t#{13,14}
 394      * (as of patchlevel 7877)  will fail.  Then again,
 395      * this code seems to be necessary or otherwise
 396      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 397      * --jhi updated by dapm */
 398     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 399         if (i > *maxopenparen_p)
 400             rex->offs[i].start = -1;
 401         rex->offs[i].end = -1;
 402         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 403             "    \\%"UVuf": %s   ..-1 undeffing\n",
 404             (UV)i,
 405             (i > *maxopenparen_p) ? "-1" : "  "
 406         ));
 407     }
 408 #endif
 409 }
 410
 411 /* restore the parens and associated vars at savestack position ix,
 412  * but without popping the stack */
 413
 414 STATIC void
 415 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 416 {
 417     I32 tmpix = PL_savestack_ix;
 418     PL_savestack_ix = ix;
 419     regcppop(rex, maxopenparen_p);
 420     PL_savestack_ix = tmpix;
 421 }
 422
 423 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 424
 425 STATIC bool
 426 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 427 {
 428     /* Returns a boolean as to whether or not 'character' is a member of the
 429      * Posix character class given by 'classnum' that should be equivalent to a
 430      * value in the typedef '_char_class_number'.
 431      *
 432      * Ideally this could be replaced by a just an array of function pointers
 433      * to the C library functions that implement the macros this calls.
 434      * However, to compile, the precise function signatures are required, and
 435      * these may vary from platform to to platform.  To avoid having to figure
 436      * out what those all are on each platform, I (khw) am using this method,
 437      * which adds an extra layer of function call overhead (unless the C
 438      * optimizer strips it away).  But we don't particularly care about
 439      * performance with locales anyway. */
 440
 441     switch ((_char_class_number) classnum) {
 442         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 443         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 444         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 445         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 446         case _CC_ENUM_CASED:     return isLOWER_LC(character)
 447                                         || isUPPER_LC(character);
 448         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 449         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 450         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 451         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 452         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 453         case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 454         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 455         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 456         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 457         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 458         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 459         default:    /* VERTSPACE should never occur in locales */
 460             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 461     }
 462
 463     assert(0); /* NOTREACHED */
 464     return FALSE;
 465 }
 466
 467 STATIC bool
 468 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 469 {
 470     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 471      * 'character' is a member of the Posix character class given by 'classnum'
 472      * that should be equivalent to a value in the typedef
 473      * '_char_class_number'.
 474      *
 475      * This just calls isFOO_lc on the code point for the character if it is in
 476      * the range 0-255.  Outside that range, all characters avoid Unicode
 477      * rules, ignoring any locale.  So use the Unicode function if this class
 478      * requires a swash, and use the Unicode macro otherwise. */
 479
 480     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 481
 482     if (UTF8_IS_INVARIANT(*character)) {
 483         return isFOO_lc(classnum, *character);
 484     }
 485     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 486         return isFOO_lc(classnum,
 487                         TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
 488     }
 489
 490     if (classnum < _FIRST_NON_SWASH_CC) {
 491
 492         /* Initialize the swash unless done already */
 493         if (! PL_utf8_swash_ptrs[classnum]) {
 494             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 495             PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
 496                 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
 497         }
 498
 499         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 500                                  character,
 501                                  TRUE /* is UTF */ ));
 502     }
 503
 504     switch ((_char_class_number) classnum) {
 505         case _CC_ENUM_SPACE:
 506         case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 507
 508         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 509         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 510         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 511         default:                 return 0;  /* Things like CNTRL are always
 512                                                below 256 */
 513     }
 514
 515     assert(0); /* NOTREACHED */
 516     return FALSE;
 517 }
 518
 519 /*
 520  * pregexec and friends
 521  */
 522
 523 #ifndef PERL_IN_XSUB_RE
 524 /*
 525  - pregexec - match a regexp against a string
 526  */
 527 I32
 528 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 529          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 530 /* stringarg: the point in the string at which to begin matching */
 531 /* strend:    pointer to null at end of string */
 532 /* strbeg:    real beginning of string */
 533 /* minend:    end of match must be >= minend bytes after stringarg. */
 534 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 535  *            itself is accessed via the pointers above */
 536 /* nosave:    For optimizations. */
 537 {
 538     PERL_ARGS_ASSERT_PREGEXEC;
 539
 540     return
 541         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 542                       nosave ? 0 : REXEC_COPY_STR);
 543 }
 544 #endif
 545
 546 /*
 547  * Need to implement the following flags for reg_anch:
 548  *
 549  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 550  * USE_INTUIT_ML
 551  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 552  * INTUIT_AUTORITATIVE_ML
 553  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 554  * INTUIT_ONCE_ML
 555  *
 556  * Another flag for this function: SECOND_TIME (so that float substrs
 557  * with giant delta may be not rechecked).
 558  */
 559
 560 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 561    Otherwise, only SvCUR(sv) is used to get strbeg. */
 562
 563 /* XXXX Some places assume that there is a fixed substring.
 564         An update may be needed if optimizer marks as "INTUITable"
 565         RExen without fixed substrings.  Similarly, it is assumed that
 566         lengths of all the strings are no more than minlen, thus they
 567         cannot come from lookahead.
 568         (Or minlen should take into account lookahead.)
 569   NOTE: Some of this comment is not correct. minlen does now take account
 570   of lookahead/behind. Further research is required. -- demerphq
 571
 572 */
 573
 574 /* A failure to find a constant substring means that there is no need to make
 575    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 576    finding a substring too deep into the string means that fewer calls to
 577    regtry() should be needed.
 578
 579    REx compiler's optimizer found 4 possible hints:
 580         a) Anchored substring;
 581         b) Fixed substring;
 582         c) Whether we are anchored (beginning-of-line or \G);
 583         d) First node (of those at offset 0) which may distinguish positions;
 584    We use a)b)d) and multiline-part of c), and try to find a position in the
 585    string which does not contradict any of them.
 586  */
 587
 588 /* Most of decisions we do here should have been done at compile time.
 589    The nodes of the REx which we used for the search should have been
 590    deleted from the finite automaton. */
 591
 592 /* args:
 593  * rx:     the regex to match against
 594  * sv:     the SV being matched: only used for utf8 flag; the string
 595  *         itself is accessed via the pointers below. Note that on
 596  *         something like an overloaded SV, SvPOK(sv) may be false
 597  *         and the string pointers may point to something unrelated to
 598  *         the SV itself.
 599  * strbeg: real beginning of string
 600  * strpos: the point in the string at which to begin matching
 601  * strend: pointer to the byte following the last char of the string
 602  * flags   currently unused; set to 0
 603  * data:   currently unused; set to NULL
 604  */
 605
 606 char *
 607 Perl_re_intuit_start(pTHX_
 608                     REGEXP * const rx,
 609                     SV *sv,
 610                     const char * const strbeg,
 611                     char *strpos,
 612                     char *strend,
 613                     const U32 flags,
 614                     re_scream_pos_data *data)
 615 {
 616     dVAR;
 617     struct regexp *const prog = ReANY(rx);
 618     I32 start_shift = 0;
 619     /* Should be nonnegative! */
 620     I32 end_shift   = 0;
 621     char *s;
 622     SV *check;
 623     char *t;
 624     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 625     I32 ml_anch;
 626     char *other_last = NULL;    /* other substr checked before this */
 627     char *check_at = NULL;              /* check substr found at this pos */
 628     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 629     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 630     RXi_GET_DECL(prog,progi);
 631     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 632     regmatch_info *const reginfo = &reginfo_buf;
 633 #ifdef DEBUGGING
 634     const char * const i_strpos = strpos;
 635 #endif
 636     GET_RE_DEBUG_FLAGS_DECL;
 637
 638     PERL_ARGS_ASSERT_RE_INTUIT_START;
 639     PERL_UNUSED_ARG(flags);
 640     PERL_UNUSED_ARG(data);
 641
 642     /* CHR_DIST() would be more correct here but it makes things slow. */
 643     if (prog->minlen > strend - strpos) {
 644         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 645                               "String too short... [re_intuit_start]\n"));
 646         goto fail;
 647     }
 648
 649     reginfo->is_utf8_target = cBOOL(utf8_target);
 650     reginfo->info_aux = NULL;
 651     reginfo->strbeg = strbeg;
 652     reginfo->strend = strend;
 653     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 654     reginfo->intuit = 1;
 655     /* not actually used within intuit, but zero for safety anyway */
 656     reginfo->poscache_maxiter = 0;
 657
 658     if (utf8_target) {
 659         if (!prog->check_utf8 && prog->check_substr)
 660             to_utf8_substr(prog);
 661         check = prog->check_utf8;
 662     } else {
 663         if (!prog->check_substr && prog->check_utf8) {
 664             if (! to_byte_substr(prog)) {
 665                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 666             }
 667         }
 668         check = prog->check_substr;
 669     }
 670     if ((prog->extflags & RXf_ANCH)     /* Match at beg-of-str or after \n */
 671          && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */
 672     {
 673         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 674                      || ( (prog->extflags & RXf_ANCH_BOL)
 675                           && !multiline ) );    /* Check after \n? */
 676
 677         if (!ml_anch) {
 678           if (    !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 679                && (strpos != strbeg)) {
 680               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 681               goto fail;
 682           }
 683           if (prog->check_offset_min == prog->check_offset_max
 684               && !(prog->extflags & RXf_CANY_SEEN)
 685               && ! multiline)   /* /m can cause \n's to match that aren't
 686                                    accounted for in the string max length.
 687                                    See [perl #115242] */
 688           {
 689             /* Substring at constant offset from beg-of-str... */
 690             I32 slen;
 691
 692             s = HOP3c(strpos, prog->check_offset_min, strend);
 693
 694             if (SvTAIL(check)) {
 695                 slen = SvCUR(check);    /* >= 1 */
 696
 697                 if ( strend - s > slen || strend - s < slen - 1
 698                      || (strend - s == slen && strend[-1] != '\n')) {
 699                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 700                     goto fail_finish;
 701                 }
 702                 /* Now should match s[0..slen-2] */
 703                 slen--;
 704                 if (slen && (*SvPVX_const(check) != *s
 705                              || (slen > 1
 706                                  && memNE(SvPVX_const(check), s, slen)))) {
 707                   report_neq:
 708                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 709                     goto fail_finish;
 710                 }
 711             }
 712             else if (*SvPVX_const(check) != *s
 713                      || ((slen = SvCUR(check)) > 1
 714                          && memNE(SvPVX_const(check), s, slen)))
 715                 goto report_neq;
 716             check_at = s;
 717             goto success_at_start;
 718           }
 719         }
 720         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 721         s = strpos;
 722         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 723         end_shift = prog->check_end_shift;
 724
 725         if (!ml_anch) {
 726             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 727                                          - (SvTAIL(check) != 0);
 728             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 729
 730             if (end_shift < eshift)
 731                 end_shift = eshift;
 732         }
 733     }
 734     else {                              /* Can match at random position */
 735         ml_anch = 0;
 736         s = strpos;
 737         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 738         end_shift = prog->check_end_shift;
 739
 740         /* end shift should be non negative here */
 741     }
 742
 743 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 744     if (end_shift < 0)
 745         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 746                    (IV)end_shift, RX_PRECOMP(prog));
 747 #endif
 748
 749   restart:
 750     /* Find a possible match in the region s..strend by looking for
 751        the "check" substring in the region corrected by start/end_shift. */
 752
 753     {
 754         I32 srch_start_shift = start_shift;
 755         I32 srch_end_shift = end_shift;
 756         U8* start_point;
 757         U8* end_point;
 758         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 759             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 760             srch_start_shift = strbeg - s;
 761         }
 762     DEBUG_OPTIMISE_MORE_r({
 763         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 764             (IV)prog->check_offset_min,
 765             (IV)srch_start_shift,
 766             (IV)srch_end_shift,
 767             (IV)prog->check_end_shift);
 768     });
 769
 770         if (prog->extflags & RXf_CANY_SEEN) {
 771             start_point= (U8*)(s + srch_start_shift);
 772             end_point= (U8*)(strend - srch_end_shift);
 773         } else {
 774             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 775             end_point= HOP3(strend, -srch_end_shift, strbeg);
 776         }
 777         DEBUG_OPTIMISE_MORE_r({
 778             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 779                 (int)(end_point - start_point),
 780                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 781                 start_point);
 782         });
 783
 784         s = fbm_instr( start_point, end_point,
 785                       check, multiline ? FBMrf_MULTILINE : 0);
 786     }
 787     /* Update the count-of-usability, remove useless subpatterns,
 788         unshift s.  */
 789
 790     DEBUG_EXECUTE_r({
 791         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 792             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 793         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 794                           (s ? "Found" : "Did not find"),
 795             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 796                 ? "anchored" : "floating"),
 797             quoted,
 798             RE_SV_TAIL(check),
 799             (s ? " at offset " : "...\n") );
 800     });
 801
 802     if (!s)
 803         goto fail_finish;
 804     /* Finish the diagnostic message */
 805     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 806
 807     /* XXX dmq: first branch is for positive lookbehind...
 808        Our check string is offset from the beginning of the pattern.
 809        So we need to do any stclass tests offset forward from that
 810        point. I think. :-(
 811      */
 812
 813
 814
 815     check_at=s;
 816
 817
 818     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 819        Start with the other substr.
 820        XXXX no SCREAM optimization yet - and a very coarse implementation
 821        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 822                 *always* match.  Probably should be marked during compile...
 823        Probably it is right to do no SCREAM here...
 824      */
 825
 826     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 827                 : (prog->float_substr && prog->anchored_substr))
 828     {
 829         /* Take into account the "other" substring. */
 830         /* XXXX May be hopelessly wrong for UTF... */
 831         if (!other_last)
 832             other_last = strpos;
 833         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 834           do_other_anchored:
 835             {
 836                 char * const last = HOP3c(s, -start_shift, strbeg);
 837                 char *last1, *last2;
 838                 char * const saved_s = s;
 839                 SV* must;
 840
 841                 t = s - prog->check_offset_max;
 842                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 843                     && (!utf8_target
 844                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 845                             && t > strpos)))
 846                     NOOP;
 847                 else
 848                     t = strpos;
 849                 t = HOP3c(t, prog->anchored_offset, strend);
 850                 if (t < other_last)     /* These positions already checked */
 851                     t = other_last;
 852                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 853                 if (last < last1)
 854                     last1 = last;
 855                 /* XXXX It is not documented what units *_offsets are in.
 856                    We assume bytes, but this is clearly wrong.
 857                    Meaning this code needs to be carefully reviewed for errors.
 858                    dmq.
 859                   */
 860
 861                 /* On end-of-str: see comment below. */
 862                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 863                 if (must == &PL_sv_undef) {
 864                     s = (char*)NULL;
 865                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 866                 }
 867                 else
 868                     s = fbm_instr(
 869                         (unsigned char*)t,
 870                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 871                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 872                         must,
 873                         multiline ? FBMrf_MULTILINE : 0
 874                     );
 875                 DEBUG_EXECUTE_r({
 876                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 877                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 878                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 879                         (s ? "Found" : "Contradicts"),
 880                         quoted, RE_SV_TAIL(must));
 881                 });
 882
 883
 884                 if (!s) {
 885                     if (last1 >= last2) {
 886                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 887                                                 ", giving up...\n"));
 888                         goto fail_finish;
 889                     }
 890                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 891                         ", trying floating at offset %ld...\n",
 892                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 893                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 894                     s = HOP3c(last, 1, strend);
 895                     goto restart;
 896                 }
 897                 else {
 898                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 899                           (long)(s - i_strpos)));
 900                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 901                     other_last = HOP3c(s, 1, strend);
 902                     s = saved_s;
 903                     if (t == strpos)
 904                         goto try_at_start;
 905                     goto try_at_offset;
 906                 }
 907             }
 908         }
 909         else {          /* Take into account the floating substring. */
 910             char *last, *last1;
 911             char * const saved_s = s;
 912             SV* must;
 913
 914             t = HOP3c(s, -start_shift, strbeg);
 915             last1 = last =
 916                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 917             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 918                 last = HOP3c(t, prog->float_max_offset, strend);
 919             s = HOP3c(t, prog->float_min_offset, strend);
 920             if (s < other_last)
 921                 s = other_last;
 922  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 923             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 924             /* fbm_instr() takes into account exact value of end-of-str
 925                if the check is SvTAIL(ed).  Since false positives are OK,
 926                and end-of-str is not later than strend we are OK. */
 927             if (must == &PL_sv_undef) {
 928                 s = (char*)NULL;
 929                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 930             }
 931             else
 932                 s = fbm_instr((unsigned char*)s,
 933                               (unsigned char*)last + SvCUR(must)
 934                                   - (SvTAIL(must)!=0),
 935                               must, multiline ? FBMrf_MULTILINE : 0);
 936             DEBUG_EXECUTE_r({
 937                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 938                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 939                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 940                     (s ? "Found" : "Contradicts"),
 941                     quoted, RE_SV_TAIL(must));
 942             });
 943             if (!s) {
 944                 if (last1 == last) {
 945                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 946                                             ", giving up...\n"));
 947                     goto fail_finish;
 948                 }
 949                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 950                     ", trying anchored starting at offset %ld...\n",
 951                     (long)(saved_s + 1 - i_strpos)));
 952                 other_last = last;
 953                 s = HOP3c(t, 1, strend);
 954                 goto restart;
 955             }
 956             else {
 957                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 958                       (long)(s - i_strpos)));
 959                 other_last = s; /* Fix this later. --Hugo */
 960                 s = saved_s;
 961                 if (t == strpos)
 962                     goto try_at_start;
 963                 goto try_at_offset;
 964             }
 965         }
 966     }
 967
 968
 969     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 970
 971     DEBUG_OPTIMISE_MORE_r(
 972         PerlIO_printf(Perl_debug_log,
 973             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 974             (IV)prog->check_offset_min,
 975             (IV)prog->check_offset_max,
 976             (IV)(s-strpos),
 977             (IV)(t-strpos),
 978             (IV)(t-s),
 979             (IV)(strend-strpos)
 980         )
 981     );
 982
 983     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 984         && (!utf8_target
 985             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 986                  && t > strpos)))
 987     {
 988         /* Fixed substring is found far enough so that the match
 989            cannot start at strpos. */
 990       try_at_offset:
 991         if (ml_anch && t[-1] != '\n') {
 992             /* Eventually fbm_*() should handle this, but often
 993                anchored_offset is not 0, so this check will not be wasted. */
 994             /* XXXX In the code below we prefer to look for "^" even in
 995                presence of anchored substrings.  And we search even
 996                beyond the found float position.  These pessimizations
 997                are historical artefacts only.  */
 998           find_anchor:
 999             while (t < strend - prog->minlen) {
1000                 if (*t == '\n') {
1001                     if (t < check_at - prog->check_offset_min) {
1002                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1003                             /* Since we moved from the found position,
1004                                we definitely contradict the found anchored
1005                                substr.  Due to the above check we do not
1006                                contradict "check" substr.
1007                                Thus we can arrive here only if check substr
1008                                is float.  Redo checking for "other"=="fixed".
1009                              */
1010                             strpos = t + 1;
1011                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1012                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1013                             goto do_other_anchored;
1014                         }
1015                         /* We don't contradict the found floating substring. */
1016                         /* XXXX Why not check for STCLASS? */
1017                         s = t + 1;
1018                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1019                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1020                         goto set_useful;
1021                     }
1022                     /* Position contradicts check-string */
1023                     /* XXXX probably better to look for check-string
1024                        than for "\n", so one should lower the limit for t? */
1025                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1026                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1027                     other_last = strpos = s = t + 1;
1028                     goto restart;
1029                 }
1030                 t++;
1031             }
1032             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1033                         PL_colors[0], PL_colors[1]));
1034             goto fail_finish;
1035         }
1036         else {
1037             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1038                         PL_colors[0], PL_colors[1]));
1039         }
1040         s = t;
1041       set_useful:
1042         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1043     }
1044     else {
1045         /* The found string does not prohibit matching at strpos,
1046            - no optimization of calling REx engine can be performed,
1047            unless it was an MBOL and we are not after MBOL,
1048            or a future STCLASS check will fail this. */
1049       try_at_start:
1050         /* Even in this situation we may use MBOL flag if strpos is offset
1051            wrt the start of the string. */
1052         if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1053             /* May be due to an implicit anchor of m{.*foo}  */
1054             && !(prog->intflags & PREGf_IMPLICIT))
1055         {
1056             t = strpos;
1057             goto find_anchor;
1058         }
1059         DEBUG_EXECUTE_r( if (ml_anch)
1060             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1061                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1062         );
1063       success_at_start:
1064         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1065             && (utf8_target ? (
1066                 prog->check_utf8                /* Could be deleted already */
1067                 && --BmUSEFUL(prog->check_utf8) < 0
1068                 && (prog->check_utf8 == prog->float_utf8)
1069             ) : (
1070                 prog->check_substr              /* Could be deleted already */
1071                 && --BmUSEFUL(prog->check_substr) < 0
1072                 && (prog->check_substr == prog->float_substr)
1073             )))
1074         {
1075             /* If flags & SOMETHING - do not do it many times on the same match */
1076             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1077             /* XXX Does the destruction order has to change with utf8_target? */
1078             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1079             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1080             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1081             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1082             check = NULL;                       /* abort */
1083             s = strpos;
1084             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1085                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1086             if (prog->intflags & PREGf_IMPLICIT)
1087                 prog->extflags &= ~RXf_ANCH_MBOL;
1088             /* XXXX This is a remnant of the old implementation.  It
1089                     looks wasteful, since now INTUIT can use many
1090                     other heuristics. */
1091             prog->extflags &= ~RXf_USE_INTUIT;
1092             /* XXXX What other flags might need to be cleared in this branch? */
1093         }
1094         else
1095             s = strpos;
1096     }
1097
1098     /* Last resort... */
1099     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1100     /* trie stclasses are too expensive to use here, we are better off to
1101        leave it to regmatch itself */
1102     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1103         /* minlen == 0 is possible if regstclass is \b or \B,
1104            and the fixed substr is ''$.
1105            Since minlen is already taken into account, s+1 is before strend;
1106            accidentally, minlen >= 1 guaranties no false positives at s + 1
1107            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1108            regstclass does not come from lookahead...  */
1109         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1110            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1111         const U8* const str = (U8*)STRING(progi->regstclass);
1112         /* XXX this value could be pre-computed */
1113         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1114                     ?  (reginfo->is_utf8_pat
1115                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1116                         : STR_LEN(progi->regstclass))
1117                     : 1);
1118         char * endpos;
1119         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1120             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1121         else if (prog->float_substr || prog->float_utf8)
1122             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1123         else
1124             endpos= strend;
1125
1126         if (checked_upto < s)
1127            checked_upto = s;
1128         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1129                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1130
1131         t = s;
1132         s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1133                             reginfo);
1134         if (s) {
1135             checked_upto = s;
1136         } else {
1137 #ifdef DEBUGGING
1138             const char *what = NULL;
1139 #endif
1140             if (endpos == strend) {
1141                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1142                                 "Could not match STCLASS...\n") );
1143                 goto fail;
1144             }
1145             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1146                                    "This position contradicts STCLASS...\n") );
1147             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1148                 goto fail;
1149             checked_upto = HOPBACKc(endpos, start_shift);
1150             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1151                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1152             /* Contradict one of substrings */
1153             if (prog->anchored_substr || prog->anchored_utf8) {
1154                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1155                     DEBUG_EXECUTE_r( what = "anchored" );
1156                   hop_and_restart:
1157                     s = HOP3c(t, 1, strend);
1158                     if (s + start_shift + end_shift > strend) {
1159                         /* XXXX Should be taken into account earlier? */
1160                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1161                                                "Could not match STCLASS...\n") );
1162                         goto fail;
1163                     }
1164                     if (!check)
1165                         goto giveup;
1166                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1167                                 "Looking for %s substr starting at offset %ld...\n",
1168                                  what, (long)(s + start_shift - i_strpos)) );
1169                     goto restart;
1170                 }
1171                 /* Have both, check_string is floating */
1172                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1173                     goto retry_floating_check;
1174                 /* Recheck anchored substring, but not floating... */
1175                 s = check_at;
1176                 if (!check)
1177                     goto giveup;
1178                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1179                           "Looking for anchored substr starting at offset %ld...\n",
1180                           (long)(other_last - i_strpos)) );
1181                 goto do_other_anchored;
1182             }
1183             /* Another way we could have checked stclass at the
1184                current position only: */
1185             if (ml_anch) {
1186                 s = t = t + 1;
1187                 if (!check)
1188                     goto giveup;
1189                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1190                           "Looking for /%s^%s/m starting at offset %ld...\n",
1191                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1192                 goto try_at_offset;
1193             }
1194             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1195                 goto fail;
1196             /* Check is floating substring. */
1197           retry_floating_check:
1198             t = check_at - start_shift;
1199             DEBUG_EXECUTE_r( what = "floating" );
1200             goto hop_and_restart;
1201         }
1202         if (t != s) {
1203             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1204                         "By STCLASS: moving %ld --> %ld\n",
1205                                   (long)(t - i_strpos), (long)(s - i_strpos))
1206                    );
1207         }
1208         else {
1209             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1210                                   "Does not contradict STCLASS...\n");
1211                    );
1212         }
1213     }
1214   giveup:
1215     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1216                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1217                           PL_colors[5], (long)(s - i_strpos)) );
1218     return s;
1219
1220   fail_finish:                          /* Substring not found */
1221     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1222         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1223   fail:
1224     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1225                           PL_colors[4], PL_colors[5]));
1226     return NULL;
1227 }
1228
1229 #define DECL_TRIE_TYPE(scan) \
1230     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1231                     trie_type = ((scan->flags == EXACT) \
1232                               ? (utf8_target ? trie_utf8 : trie_plain) \
1233                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1234
1235 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1236 STMT_START {                               \
1237     STRLEN skiplen;                                                                 \
1238     switch (trie_type) {                                                            \
1239     case trie_utf8_fold:                                                            \
1240         if ( foldlen>0 ) {                                                          \
1241             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1242             foldlen -= len;                                                         \
1243             uscan += len;                                                           \
1244             len=0;                                                                  \
1245         } else {                                                                    \
1246             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1247             len = UTF8SKIP(uc);                                                     \
1248             skiplen = UNISKIP( uvc );                                               \
1249             foldlen -= skiplen;                                                     \
1250             uscan = foldbuf + skiplen;                                              \
1251         }                                                                           \
1252         break;                                                                      \
1253     case trie_latin_utf8_fold:                                                      \
1254         if ( foldlen>0 ) {                                                          \
1255             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1256             foldlen -= len;                                                         \
1257             uscan += len;                                                           \
1258             len=0;                                                                  \
1259         } else {                                                                    \
1260             len = 1;                                                                \
1261             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL);   \
1262             skiplen = UNISKIP( uvc );                                               \
1263             foldlen -= skiplen;                                                     \
1264             uscan = foldbuf + skiplen;                                              \
1265         }                                                                           \
1266         break;                                                                      \
1267     case trie_utf8:                                                                 \
1268         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1269         break;                                                                      \
1270     case trie_plain:                                                                \
1271         uvc = (UV)*uc;                                                              \
1272         len = 1;                                                                    \
1273     }                                                                               \
1274     if (uvc < 256) {                                                                \
1275         charid = trie->charmap[ uvc ];                                              \
1276     }                                                                               \
1277     else {                                                                          \
1278         charid = 0;                                                                 \
1279         if (widecharmap) {                                                          \
1280             SV** const svpp = hv_fetch(widecharmap,                                 \
1281                         (char*)&uvc, sizeof(UV), 0);                                \
1282             if (svpp)                                                               \
1283                 charid = (U16)SvIV(*svpp);                                          \
1284         }                                                                           \
1285     }                                                                               \
1286 } STMT_END
1287
1288 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1289 STMT_START {                                              \
1290     while (s <= e) {                                      \
1291         if ( (CoNd)                                       \
1292              && (ln == 1 || folder(s, pat_string, ln))    \
1293              && (reginfo->intuit || regtry(reginfo, &s)) )\
1294             goto got_it;                                  \
1295         s++;                                              \
1296     }                                                     \
1297 } STMT_END
1298
1299 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1300 STMT_START {                                          \
1301     while (s < strend) {                              \
1302         CoDe                                          \
1303         s += UTF8SKIP(s);                             \
1304     }                                                 \
1305 } STMT_END
1306
1307 #define REXEC_FBC_SCAN(CoDe)                          \
1308 STMT_START {                                          \
1309     while (s < strend) {                              \
1310         CoDe                                          \
1311         s++;                                          \
1312     }                                                 \
1313 } STMT_END
1314
1315 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1316 REXEC_FBC_UTF8_SCAN(                                  \
1317     if (CoNd) {                                       \
1318         if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1319             goto got_it;                              \
1320         else                                          \
1321             tmp = doevery;                            \
1322     }                                                 \
1323     else                                              \
1324         tmp = 1;                                      \
1325 )
1326
1327 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1328 REXEC_FBC_SCAN(                                       \
1329     if (CoNd) {                                       \
1330         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))  \
1331             goto got_it;                              \
1332         else                                          \
1333             tmp = doevery;                            \
1334     }                                                 \
1335     else                                              \
1336         tmp = 1;                                      \
1337 )
1338
1339 #define REXEC_FBC_TRYIT               \
1340 if ((reginfo->intuit || regtry(reginfo, &s))) \
1341     goto got_it
1342
1343 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1344     if (utf8_target) {                                             \
1345         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1346     }                                                          \
1347     else {                                                     \
1348         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1349     }
1350
1351 #define DUMP_EXEC_POS(li,s,doutf8) \
1352     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1353                 startpos, doutf8)
1354
1355
1356 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1357         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1358         tmp = TEST_NON_UTF8(tmp);                                              \
1359         REXEC_FBC_UTF8_SCAN(                                                   \
1360             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1361                 tmp = !tmp;                                                    \
1362                 IF_SUCCESS;                                                    \
1363             }                                                                  \
1364             else {                                                             \
1365                 IF_FAIL;                                                       \
1366             }                                                                  \
1367         );                                                                     \
1368
1369 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1370         if (s == reginfo->strbeg) {                                            \
1371             tmp = '\n';                                                        \
1372         }                                                                      \
1373         else {                                                                 \
1374             U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);          \
1375             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1376         }                                                                      \
1377         tmp = TeSt1_UtF8;                                                      \
1378         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1379         REXEC_FBC_UTF8_SCAN(                                                   \
1380             if (tmp == ! (TeSt2_UtF8)) { \
1381                 tmp = !tmp;                                                    \
1382                 IF_SUCCESS;                                                    \
1383             }                                                                  \
1384             else {                                                             \
1385                 IF_FAIL;                                                       \
1386             }                                                                  \
1387         );                                                                     \
1388
1389 /* The only difference between the BOUND and NBOUND cases is that
1390  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1391  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1392  * with the other one being empty */
1393 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1394     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1395
1396 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1397     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1398
1399 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1400     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1401
1402 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1403     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1404
1405
1406 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1407  * be passed in completely with the variable name being tested, which isn't
1408  * such a clean interface, but this is easier to read than it was before.  We
1409  * are looking for the boundary (or non-boundary between a word and non-word
1410  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1411  * must be different.  Find the "wordness" of the character just prior to this
1412  * one, and compare it with the wordness of this one.  If they differ, we have
1413  * a boundary.  At the beginning of the string, pretend that the previous
1414  * character was a new-line */
1415 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1416     if (utf8_target) {                                                         \
1417                 UTF8_CODE \
1418     }                                                                          \
1419     else {  /* Not utf8 */                                                     \
1420         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1421         tmp = TEST_NON_UTF8(tmp);                                              \
1422         REXEC_FBC_SCAN(                                                        \
1423             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1424                 tmp = !tmp;                                                    \
1425                 IF_SUCCESS;                                                    \
1426             }                                                                  \
1427             else {                                                             \
1428                 IF_FAIL;                                                       \
1429             }                                                                  \
1430         );                                                                     \
1431     }                                                                          \
1432     if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s)))           \
1433         goto got_it;
1434
1435 /* We know what class REx starts with.  Try to find this position... */
1436 /* if reginfo->intuit, its a dryrun */
1437 /* annoyingly all the vars in this routine have different names from their counterparts
1438    in regmatch. /grrr */
1439
1440 STATIC char *
1441 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1442     const char *strend, regmatch_info *reginfo)
1443 {
1444     dVAR;
1445     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1446     char *pat_string;   /* The pattern's exactish string */
1447     char *pat_end;          /* ptr to end char of pat_string */
1448     re_fold_t folder;   /* Function for computing non-utf8 folds */
1449     const U8 *fold_array;   /* array for folding ords < 256 */
1450     STRLEN ln;
1451     STRLEN lnc;
1452     U8 c1;
1453     U8 c2;
1454     char *e;
1455     I32 tmp = 1;        /* Scratch variable? */
1456     const bool utf8_target = reginfo->is_utf8_target;
1457     UV utf8_fold_flags = 0;
1458     const bool is_utf8_pat = reginfo->is_utf8_pat;
1459     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1460                                    with a result inverts that result, as 0^1 =
1461                                    1 and 1^1 = 0 */
1462     _char_class_number classnum;
1463
1464     RXi_GET_DECL(prog,progi);
1465
1466     PERL_ARGS_ASSERT_FIND_BYCLASS;
1467
1468     /* We know what class it must start with. */
1469     switch (OP(c)) {
1470     case ANYOF:
1471     case ANYOF_SYNTHETIC:
1472     case ANYOF_WARN_SUPER:
1473         if (utf8_target) {
1474             REXEC_FBC_UTF8_CLASS_SCAN(
1475                       reginclass(prog, c, (U8*)s, utf8_target));
1476         }
1477         else {
1478             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1479         }
1480         break;
1481     case CANY:
1482         REXEC_FBC_SCAN(
1483             if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1484                 goto got_it;
1485             else
1486                 tmp = doevery;
1487         );
1488         break;
1489
1490     case EXACTFA:
1491         if (is_utf8_pat || utf8_target) {
1492             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1493             goto do_exactf_utf8;
1494         }
1495         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1496         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1497         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1498
1499     case EXACTF:
1500         if (utf8_target) {
1501
1502             /* regcomp.c already folded this if pattern is in UTF-8 */
1503             utf8_fold_flags = 0;
1504             goto do_exactf_utf8;
1505         }
1506         fold_array = PL_fold;
1507         folder = foldEQ;
1508         goto do_exactf_non_utf8;
1509
1510     case EXACTFL:
1511         if (is_utf8_pat || utf8_target) {
1512             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1513             goto do_exactf_utf8;
1514         }
1515         fold_array = PL_fold_locale;
1516         folder = foldEQ_locale;
1517         goto do_exactf_non_utf8;
1518
1519     case EXACTFU_SS:
1520         if (is_utf8_pat) {
1521             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1522         }
1523         goto do_exactf_utf8;
1524
1525     case EXACTFU_TRICKYFOLD:
1526     case EXACTFU:
1527         if (is_utf8_pat || utf8_target) {
1528             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1529             goto do_exactf_utf8;
1530         }
1531
1532         /* Any 'ss' in the pattern should have been replaced by regcomp,
1533          * so we don't have to worry here about this single special case
1534          * in the Latin1 range */
1535         fold_array = PL_fold_latin1;
1536         folder = foldEQ_latin1;
1537
1538         /* FALL THROUGH */
1539
1540     do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1541                            are no glitches with fold-length differences
1542                            between the target string and pattern */
1543
1544         /* The idea in the non-utf8 EXACTF* cases is to first find the
1545          * first character of the EXACTF* node and then, if necessary,
1546          * case-insensitively compare the full text of the node.  c1 is the
1547          * first character.  c2 is its fold.  This logic will not work for
1548          * Unicode semantics and the german sharp ss, which hence should
1549          * not be compiled into a node that gets here. */
1550         pat_string = STRING(c);
1551         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1552
1553         /* We know that we have to match at least 'ln' bytes (which is the
1554          * same as characters, since not utf8).  If we have to match 3
1555          * characters, and there are only 2 availabe, we know without
1556          * trying that it will fail; so don't start a match past the
1557          * required minimum number from the far end */
1558         e = HOP3c(strend, -((I32)ln), s);
1559
1560         if (reginfo->intuit && e < s) {
1561             e = s;                      /* Due to minlen logic of intuit() */
1562         }
1563
1564         c1 = *pat_string;
1565         c2 = fold_array[c1];
1566         if (c1 == c2) { /* If char and fold are the same */
1567             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1568         }
1569         else {
1570             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1571         }
1572         break;
1573
1574     do_exactf_utf8:
1575     {
1576         unsigned expansion;
1577
1578         /* If one of the operands is in utf8, we can't use the simpler folding
1579          * above, due to the fact that many different characters can have the
1580          * same fold, or portion of a fold, or different- length fold */
1581         pat_string = STRING(c);
1582         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1583         pat_end = pat_string + ln;
1584         lnc = is_utf8_pat       /* length to match in characters */
1585                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1586                 : ln;
1587
1588         /* We have 'lnc' characters to match in the pattern, but because of
1589          * multi-character folding, each character in the target can match
1590          * up to 3 characters (Unicode guarantees it will never exceed
1591          * this) if it is utf8-encoded; and up to 2 if not (based on the
1592          * fact that the Latin 1 folds are already determined, and the
1593          * only multi-char fold in that range is the sharp-s folding to
1594          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1595          * string character.  Adjust lnc accordingly, rounding up, so that
1596          * if we need to match at least 4+1/3 chars, that really is 5. */
1597         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1598         lnc = (lnc + expansion - 1) / expansion;
1599
1600         /* As in the non-UTF8 case, if we have to match 3 characters, and
1601          * only 2 are left, it's guaranteed to fail, so don't start a
1602          * match that would require us to go beyond the end of the string
1603          */
1604         e = HOP3c(strend, -((I32)lnc), s);
1605
1606         if (reginfo->intuit && e < s) {
1607             e = s;                      /* Due to minlen logic of intuit() */
1608         }
1609
1610         /* XXX Note that we could recalculate e to stop the loop earlier,
1611          * as the worst case expansion above will rarely be met, and as we
1612          * go along we would usually find that e moves further to the left.
1613          * This would happen only after we reached the point in the loop
1614          * where if there were no expansion we should fail.  Unclear if
1615          * worth the expense */
1616
1617         while (s <= e) {
1618             char *my_strend= (char *)strend;
1619             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1620                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1621                 && (reginfo->intuit || regtry(reginfo, &s)) )
1622             {
1623                 goto got_it;
1624             }
1625             s += (utf8_target) ? UTF8SKIP(s) : 1;
1626         }
1627         break;
1628     }
1629     case BOUNDL:
1630         RXp_MATCH_TAINTED_on(prog);
1631         FBC_BOUND(isWORDCHAR_LC,
1632                   isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1633                   isWORDCHAR_LC_utf8((U8*)s));
1634         break;
1635     case NBOUNDL:
1636         RXp_MATCH_TAINTED_on(prog);
1637         FBC_NBOUND(isWORDCHAR_LC,
1638                    isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1639                    isWORDCHAR_LC_utf8((U8*)s));
1640         break;
1641     case BOUND:
1642         FBC_BOUND(isWORDCHAR,
1643                   isWORDCHAR_uni(tmp),
1644                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1645         break;
1646     case BOUNDA:
1647         FBC_BOUND_NOLOAD(isWORDCHAR_A,
1648                          isWORDCHAR_A(tmp),
1649                          isWORDCHAR_A((U8*)s));
1650         break;
1651     case NBOUND:
1652         FBC_NBOUND(isWORDCHAR,
1653                    isWORDCHAR_uni(tmp),
1654                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1655         break;
1656     case NBOUNDA:
1657         FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1658                           isWORDCHAR_A(tmp),
1659                           isWORDCHAR_A((U8*)s));
1660         break;
1661     case BOUNDU:
1662         FBC_BOUND(isWORDCHAR_L1,
1663                   isWORDCHAR_uni(tmp),
1664                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1665         break;
1666     case NBOUNDU:
1667         FBC_NBOUND(isWORDCHAR_L1,
1668                    isWORDCHAR_uni(tmp),
1669                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1670         break;
1671     case LNBREAK:
1672         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1673                         is_LNBREAK_latin1_safe(s, strend)
1674         );
1675         break;
1676
1677     /* The argument to all the POSIX node types is the class number to pass to
1678      * _generic_isCC() to build a mask for searching in PL_charclass[] */
1679
1680     case NPOSIXL:
1681         to_complement = 1;
1682         /* FALLTHROUGH */
1683
1684     case POSIXL:
1685         RXp_MATCH_TAINTED_on(prog);
1686         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1687                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1688         break;
1689
1690     case NPOSIXD:
1691         to_complement = 1;
1692         /* FALLTHROUGH */
1693
1694     case POSIXD:
1695         if (utf8_target) {
1696             goto posix_utf8;
1697         }
1698         goto posixa;
1699
1700     case NPOSIXA:
1701         if (utf8_target) {
1702             /* The complement of something that matches only ASCII matches all
1703              * UTF-8 variant code points, plus everything in ASCII that isn't
1704              * in the class */
1705             REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1706                                       || ! _generic_isCC_A(*s, FLAGS(c)));
1707             break;
1708         }
1709
1710         to_complement = 1;
1711         /* FALLTHROUGH */
1712
1713     case POSIXA:
1714       posixa:
1715         /* Don't need to worry about utf8, as it can match only a single
1716          * byte invariant character. */
1717         REXEC_FBC_CLASS_SCAN(
1718                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1719         break;
1720
1721     case NPOSIXU:
1722         to_complement = 1;
1723         /* FALLTHROUGH */
1724
1725     case POSIXU:
1726         if (! utf8_target) {
1727             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1728                                                                     FLAGS(c))));
1729         }
1730         else {
1731
1732       posix_utf8:
1733             classnum = (_char_class_number) FLAGS(c);
1734             if (classnum < _FIRST_NON_SWASH_CC) {
1735                 while (s < strend) {
1736
1737                     /* We avoid loading in the swash as long as possible, but
1738                      * should we have to, we jump to a separate loop.  This
1739                      * extra 'if' statement is what keeps this code from being
1740                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1741                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
1742                         goto found_above_latin1;
1743                     }
1744                     if ((UTF8_IS_INVARIANT(*s)
1745                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1746                                                                 classnum)))
1747                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
1748                             && to_complement ^ cBOOL(
1749                                 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1750                                               classnum))))
1751                     {
1752                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1753                             goto got_it;
1754                         else {
1755                             tmp = doevery;
1756                         }
1757                     }
1758                     else {
1759                         tmp = 1;
1760                     }
1761                     s += UTF8SKIP(s);
1762                 }
1763             }
1764             else switch (classnum) {    /* These classes are implemented as
1765                                            macros */
1766                 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1767                                         revert the change of \v matching this */
1768                     /* FALL THROUGH */
1769
1770                 case _CC_ENUM_PSXSPC:
1771                     REXEC_FBC_UTF8_CLASS_SCAN(
1772                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
1773                     break;
1774
1775                 case _CC_ENUM_BLANK:
1776                     REXEC_FBC_UTF8_CLASS_SCAN(
1777                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
1778                     break;
1779
1780                 case _CC_ENUM_XDIGIT:
1781                     REXEC_FBC_UTF8_CLASS_SCAN(
1782                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1783                     break;
1784
1785                 case _CC_ENUM_VERTSPACE:
1786                     REXEC_FBC_UTF8_CLASS_SCAN(
1787                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
1788                     break;
1789
1790                 case _CC_ENUM_CNTRL:
1791                     REXEC_FBC_UTF8_CLASS_SCAN(
1792                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
1793                     break;
1794
1795                 default:
1796                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1797                     assert(0); /* NOTREACHED */
1798             }
1799         }
1800         break;
1801
1802       found_above_latin1:   /* Here we have to load a swash to get the result
1803                                for the current code point */
1804         if (! PL_utf8_swash_ptrs[classnum]) {
1805             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1806             PL_utf8_swash_ptrs[classnum] =
1807                     _core_swash_init("utf8", swash_property_names[classnum],
1808                                      &PL_sv_undef, 1, 0, NULL, &flags);
1809         }
1810
1811         /* This is a copy of the loop above for swash classes, though using the
1812          * FBC macro instead of being expanded out.  Since we've loaded the
1813          * swash, we don't have to check for that each time through the loop */
1814         REXEC_FBC_UTF8_CLASS_SCAN(
1815                 to_complement ^ cBOOL(_generic_utf8(
1816                                       classnum,
1817                                       s,
1818                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
1819                                                   (U8 *) s, TRUE))));
1820         break;
1821
1822     case AHOCORASICKC:
1823     case AHOCORASICK:
1824         {
1825             DECL_TRIE_TYPE(c);
1826             /* what trie are we using right now */
1827             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1828             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1829             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1830
1831             const char *last_start = strend - trie->minlen;
1832 #ifdef DEBUGGING
1833             const char *real_start = s;
1834 #endif
1835             STRLEN maxlen = trie->maxlen;
1836             SV *sv_points;
1837             U8 **points; /* map of where we were in the input string
1838                             when reading a given char. For ASCII this
1839                             is unnecessary overhead as the relationship
1840                             is always 1:1, but for Unicode, especially
1841                             case folded Unicode this is not true. */
1842             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1843             U8 *bitmap=NULL;
1844
1845
1846             GET_RE_DEBUG_FLAGS_DECL;
1847
1848             /* We can't just allocate points here. We need to wrap it in
1849              * an SV so it gets freed properly if there is a croak while
1850              * running the match */
1851             ENTER;
1852             SAVETMPS;
1853             sv_points=newSV(maxlen * sizeof(U8 *));
1854             SvCUR_set(sv_points,
1855                 maxlen * sizeof(U8 *));
1856             SvPOK_on(sv_points);
1857             sv_2mortal(sv_points);
1858             points=(U8**)SvPV_nolen(sv_points );
1859             if ( trie_type != trie_utf8_fold
1860                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
1861             {
1862                 if (trie->bitmap)
1863                     bitmap=(U8*)trie->bitmap;
1864                 else
1865                     bitmap=(U8*)ANYOF_BITMAP(c);
1866             }
1867             /* this is the Aho-Corasick algorithm modified a touch
1868                to include special handling for long "unknown char" sequences.
1869                The basic idea being that we use AC as long as we are dealing
1870                with a possible matching char, when we encounter an unknown char
1871                (and we have not encountered an accepting state) we scan forward
1872                until we find a legal starting char.
1873                AC matching is basically that of trie matching, except that when
1874                we encounter a failing transition, we fall back to the current
1875                states "fail state", and try the current char again, a process
1876                we repeat until we reach the root state, state 1, or a legal
1877                transition. If we fail on the root state then we can either
1878                terminate if we have reached an accepting state previously, or
1879                restart the entire process from the beginning if we have not.
1880
1881              */
1882             while (s <= last_start) {
1883                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1884                 U8 *uc = (U8*)s;
1885                 U16 charid = 0;
1886                 U32 base = 1;
1887                 U32 state = 1;
1888                 UV uvc = 0;
1889                 STRLEN len = 0;
1890                 STRLEN foldlen = 0;
1891                 U8 *uscan = (U8*)NULL;
1892                 U8 *leftmost = NULL;
1893 #ifdef DEBUGGING
1894                 U32 accepted_word= 0;
1895 #endif
1896                 U32 pointpos = 0;
1897
1898                 while ( state && uc <= (U8*)strend ) {
1899                     int failed=0;
1900                     U32 word = aho->states[ state ].wordnum;
1901
1902                     if( state==1 ) {
1903                         if ( bitmap ) {
1904                             DEBUG_TRIE_EXECUTE_r(
1905                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1906                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1907                                         (char *)uc, utf8_target );
1908                                     PerlIO_printf( Perl_debug_log,
1909                                         " Scanning for legal start char...\n");
1910                                 }
1911                             );
1912                             if (utf8_target) {
1913                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1914                                     uc += UTF8SKIP(uc);
1915                                 }
1916                             } else {
1917                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1918                                     uc++;
1919                                 }
1920                             }
1921                             s= (char *)uc;
1922                         }
1923                         if (uc >(U8*)last_start) break;
1924                     }
1925
1926                     if ( word ) {
1927                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1928                         if (!leftmost || lpos < leftmost) {
1929                             DEBUG_r(accepted_word=word);
1930                             leftmost= lpos;
1931                         }
1932                         if (base==0) break;
1933
1934                     }
1935                     points[pointpos++ % maxlen]= uc;
1936                     if (foldlen || uc < (U8*)strend) {
1937                         REXEC_TRIE_READ_CHAR(trie_type, trie,
1938                                          widecharmap, uc,
1939                                          uscan, len, uvc, charid, foldlen,
1940                                          foldbuf, uniflags);
1941                         DEBUG_TRIE_EXECUTE_r({
1942                             dump_exec_pos( (char *)uc, c, strend,
1943                                         real_start, s, utf8_target);
1944                             PerlIO_printf(Perl_debug_log,
1945                                 " Charid:%3u CP:%4"UVxf" ",
1946                                  charid, uvc);
1947                         });
1948                     }
1949                     else {
1950                         len = 0;
1951                         charid = 0;
1952                     }
1953
1954
1955                     do {
1956 #ifdef DEBUGGING
1957                         word = aho->states[ state ].wordnum;
1958 #endif
1959                         base = aho->states[ state ].trans.base;
1960
1961                         DEBUG_TRIE_EXECUTE_r({
1962                             if (failed)
1963                                 dump_exec_pos( (char *)uc, c, strend, real_start,
1964                                     s,   utf8_target );
1965                             PerlIO_printf( Perl_debug_log,
1966                                 "%sState: %4"UVxf", word=%"UVxf,
1967                                 failed ? " Fail transition to " : "",
1968                                 (UV)state, (UV)word);
1969                         });
1970                         if ( base ) {
1971                             U32 tmp;
1972                             I32 offset;
1973                             if (charid &&
1974                                  ( ((offset = base + charid
1975                                     - 1 - trie->uniquecharcount)) >= 0)
1976                                  && ((U32)offset < trie->lasttrans)
1977                                  && trie->trans[offset].check == state
1978                                  && (tmp=trie->trans[offset].next))
1979                             {
1980                                 DEBUG_TRIE_EXECUTE_r(
1981                                     PerlIO_printf( Perl_debug_log," - legal\n"));
1982                                 state = tmp;
1983                                 break;
1984                             }
1985                             else {
1986                                 DEBUG_TRIE_EXECUTE_r(
1987                                     PerlIO_printf( Perl_debug_log," - fail\n"));
1988                                 failed = 1;
1989                                 state = aho->fail[state];
1990                             }
1991                         }
1992                         else {
1993                             /* we must be accepting here */
1994                             DEBUG_TRIE_EXECUTE_r(
1995                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
1996                             failed = 1;
1997                             break;
1998                         }
1999                     } while(state);
2000                     uc += len;
2001                     if (failed) {
2002                         if (leftmost)
2003                             break;
2004                         if (!state) state = 1;
2005                     }
2006                 }
2007                 if ( aho->states[ state ].wordnum ) {
2008                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2009                     if (!leftmost || lpos < leftmost) {
2010                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2011                         leftmost = lpos;
2012                     }
2013                 }
2014                 if (leftmost) {
2015                     s = (char*)leftmost;
2016                     DEBUG_TRIE_EXECUTE_r({
2017                         PerlIO_printf(
2018                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2019                             (UV)accepted_word, (IV)(s - real_start)
2020                         );
2021                     });
2022                     if (reginfo->intuit || regtry(reginfo, &s)) {
2023                         FREETMPS;
2024                         LEAVE;
2025                         goto got_it;
2026                     }
2027                     s = HOPc(s,1);
2028                     DEBUG_TRIE_EXECUTE_r({
2029                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2030                     });
2031                 } else {
2032                     DEBUG_TRIE_EXECUTE_r(
2033                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2034                     break;
2035                 }
2036             }
2037             FREETMPS;
2038             LEAVE;
2039         }
2040         break;
2041     default:
2042         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2043         break;
2044     }
2045     return 0;
2046   got_it:
2047     return s;
2048 }
2049
2050 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2051  * flags have same meanings as with regexec_flags() */
2052
2053 static void
2054 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2055                             char *strbeg,
2056                             char *strend,
2057                             SV *sv,
2058                             U32 flags,
2059                             bool utf8_target)
2060 {
2061     struct regexp *const prog = ReANY(rx);
2062
2063     if (flags & REXEC_COPY_STR) {
2064 #ifdef PERL_ANY_COW
2065         if (SvCANCOW(sv)) {
2066             if (DEBUG_C_TEST) {
2067                 PerlIO_printf(Perl_debug_log,
2068                               "Copy on write: regexp capture, type %d\n",
2069                               (int) SvTYPE(sv));
2070             }
2071             /* Create a new COW SV to share the match string and store
2072              * in saved_copy, unless the current COW SV in saved_copy
2073              * is valid and suitable for our purpose */
2074             if ((   prog->saved_copy
2075                  && SvIsCOW(prog->saved_copy)
2076                  && SvPOKp(prog->saved_copy)
2077                  && SvIsCOW(sv)
2078                  && SvPOKp(sv)
2079                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2080             {
2081                 /* just reuse saved_copy SV */
2082                 if (RXp_MATCH_COPIED(prog)) {
2083                     Safefree(prog->subbeg);
2084                     RXp_MATCH_COPIED_off(prog);
2085                 }
2086             }
2087             else {
2088                 /* create new COW SV to share string */
2089                 RX_MATCH_COPY_FREE(rx);
2090                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2091             }
2092             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2093             assert (SvPOKp(prog->saved_copy));
2094             prog->sublen  = strend - strbeg;
2095             prog->suboffset = 0;
2096             prog->subcoffset = 0;
2097         } else
2098 #endif
2099         {
2100             I32 min = 0;
2101             I32 max = strend - strbeg;
2102             I32 sublen;
2103
2104             if (    (flags & REXEC_COPY_SKIP_POST)
2105                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2106                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2107             ) { /* don't copy $' part of string */
2108                 U32 n = 0;
2109                 max = -1;
2110                 /* calculate the right-most part of the string covered
2111                  * by a capture. Due to look-ahead, this may be to
2112                  * the right of $&, so we have to scan all captures */
2113                 while (n <= prog->lastparen) {
2114                     if (prog->offs[n].end > max)
2115                         max = prog->offs[n].end;
2116                     n++;
2117                 }
2118                 if (max == -1)
2119                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2120                             ? prog->offs[0].start
2121                             : 0;
2122                 assert(max >= 0 && max <= strend - strbeg);
2123             }
2124
2125             if (    (flags & REXEC_COPY_SKIP_PRE)
2126                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2127                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2128             ) { /* don't copy $` part of string */
2129                 U32 n = 0;
2130                 min = max;
2131                 /* calculate the left-most part of the string covered
2132                  * by a capture. Due to look-behind, this may be to
2133                  * the left of $&, so we have to scan all captures */
2134                 while (min && n <= prog->lastparen) {
2135                     if (   prog->offs[n].start != -1
2136                         && prog->offs[n].start < min)
2137                     {
2138                         min = prog->offs[n].start;
2139                     }
2140                     n++;
2141                 }
2142                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2143                     && min >  prog->offs[0].end
2144                 )
2145                     min = prog->offs[0].end;
2146
2147             }
2148
2149             assert(min >= 0 && min <= max && min <= strend - strbeg);
2150             sublen = max - min;
2151
2152             if (RX_MATCH_COPIED(rx)) {
2153                 if (sublen > prog->sublen)
2154                     prog->subbeg =
2155                             (char*)saferealloc(prog->subbeg, sublen+1);
2156             }
2157             else
2158                 prog->subbeg = (char*)safemalloc(sublen+1);
2159             Copy(strbeg + min, prog->subbeg, sublen, char);
2160             prog->subbeg[sublen] = '\0';
2161             prog->suboffset = min;
2162             prog->sublen = sublen;
2163             RX_MATCH_COPIED_on(rx);
2164         }
2165         prog->subcoffset = prog->suboffset;
2166         if (prog->suboffset && utf8_target) {
2167             /* Convert byte offset to chars.
2168              * XXX ideally should only compute this if @-/@+
2169              * has been seen, a la PL_sawampersand ??? */
2170
2171             /* If there's a direct correspondence between the
2172              * string which we're matching and the original SV,
2173              * then we can use the utf8 len cache associated with
2174              * the SV. In particular, it means that under //g,
2175              * sv_pos_b2u() will use the previously cached
2176              * position to speed up working out the new length of
2177              * subcoffset, rather than counting from the start of
2178              * the string each time. This stops
2179              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2180              * from going quadratic */
2181             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2182                 sv_pos_b2u(sv, &(prog->subcoffset));
2183             else
2184                 prog->subcoffset = utf8_length((U8*)strbeg,
2185                                     (U8*)(strbeg+prog->suboffset));
2186         }
2187     }
2188     else {
2189         RX_MATCH_COPY_FREE(rx);
2190         prog->subbeg = strbeg;
2191         prog->suboffset = 0;
2192         prog->subcoffset = 0;
2193         prog->sublen = strend - strbeg;
2194     }
2195 }
2196
2197
2198
2199
2200 /*
2201  - regexec_flags - match a regexp against a string
2202  */
2203 I32
2204 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2205               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2206 /* stringarg: the point in the string at which to begin matching */
2207 /* strend:    pointer to null at end of string */
2208 /* strbeg:    real beginning of string */
2209 /* minend:    end of match must be >= minend bytes after stringarg. */
2210 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2211  *            itself is accessed via the pointers above */
2212 /* data:      May be used for some additional optimizations.
2213               Currently unused. */
2214 /* flags:     For optimizations. See REXEC_* in regexp.h */
2215
2216 {
2217     dVAR;
2218     struct regexp *const prog = ReANY(rx);
2219     char *s;
2220     regnode *c;
2221     char *startpos;
2222     I32 minlen;         /* must match at least this many chars */
2223     I32 dontbother = 0; /* how many characters not to try at end */
2224     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2225     const bool utf8_target = cBOOL(DO_UTF8(sv));
2226     I32 multiline;
2227     RXi_GET_DECL(prog,progi);
2228     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2229     regmatch_info *const reginfo = &reginfo_buf;
2230     regexp_paren_pair *swap = NULL;
2231     I32 oldsave;
2232     GET_RE_DEBUG_FLAGS_DECL;
2233
2234     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2235     PERL_UNUSED_ARG(data);
2236
2237     /* Be paranoid... */
2238     if (prog == NULL || stringarg == NULL) {
2239         Perl_croak(aTHX_ "NULL regexp parameter");
2240         return 0;
2241     }
2242
2243     DEBUG_EXECUTE_r(
2244         debug_start_match(rx, utf8_target, stringarg, strend,
2245         "Matching");
2246     );
2247
2248     startpos = stringarg;
2249
2250     if (prog->extflags & RXf_GPOS_SEEN) {
2251         MAGIC *mg;
2252
2253         /* set reginfo->ganch, the position where \G can match */
2254
2255         reginfo->ganch =
2256             (flags & REXEC_IGNOREPOS)
2257             ? stringarg /* use start pos rather than pos() */
2258             : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2259             ? strbeg + mg->mg_len /* Defined pos() */
2260             : strbeg; /* pos() not defined; use start of string */
2261
2262         DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2263             "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg));
2264
2265         /* in the presence of \G, we may need to start looking earlier in
2266          * the string than the suggested start point of stringarg:
2267          * if gofs->prog is set, then that's a known, fixed minimum
2268          * offset, such as
2269          * /..\G/:   gofs = 2
2270          * /ab|c\G/: gofs = 1
2271          * or if the minimum offset isn't known, then we have to go back
2272          * to the start of the string, e.g. /w+\G/
2273          */
2274
2275         if (prog->extflags & RXf_ANCH_GPOS) {
2276             startpos  = reginfo->ganch - prog->gofs;
2277             if (startpos <
2278                 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2279             {
2280                 DEBUG_r(PerlIO_printf(Perl_debug_log,
2281                         "fail: ganch-gofs before earliest possible start\n"));
2282                 return 0;
2283             }
2284         }
2285         else if (prog->gofs) {
2286             if (startpos - prog->gofs < strbeg)
2287                 startpos = strbeg;
2288             else
2289                 startpos -= prog->gofs;
2290         }
2291         else if (prog->extflags & RXf_GPOS_FLOAT)
2292             startpos = strbeg;
2293     }
2294
2295     minlen = prog->minlen;
2296     if ((startpos + minlen) > strend || startpos < strbeg) {
2297         DEBUG_r(PerlIO_printf(Perl_debug_log,
2298                     "Regex match can't succeed, so not even tried\n"));
2299         return 0;
2300     }
2301
2302     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2303      * which will call destuctors to reset PL_regmatch_state, free higher
2304      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2305      * regmatch_info_aux_eval */
2306
2307     oldsave = PL_savestack_ix;
2308
2309     s = startpos;
2310
2311     if ((prog->extflags & RXf_USE_INTUIT)
2312         && !(flags & REXEC_CHECKED))
2313     {
2314         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2315                                     flags, NULL);
2316         if (!s)
2317             return 0;
2318
2319         if (prog->extflags & RXf_CHECK_ALL) {
2320             /* we can match based purely on the result of INTUIT.
2321              * Set up captures etc just for $& and $-[0]
2322              * (an intuit-only match wont have $1,$2,..) */
2323             assert(!prog->nparens);
2324
2325             /* s/// doesn't like it if $& is earlier than where we asked it to
2326              * start searching (which can happen on something like /.\G/) */
2327             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2328                     && (s < stringarg))
2329             {
2330                 /* this should only be possible under \G */
2331                 assert(prog->extflags & RXf_GPOS_SEEN);
2332                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2333                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2334                 goto phooey;
2335             }
2336
2337             /* match via INTUIT shouldn't have any captures.
2338              * Let @-, @+, $^N know */
2339             prog->lastparen = prog->lastcloseparen = 0;
2340             RX_MATCH_UTF8_set(rx, utf8_target);
2341             prog->offs[0].start = s - strbeg;
2342             prog->offs[0].end = utf8_target
2343                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2344                 : s - strbeg + prog->minlenret;
2345             if ( !(flags & REXEC_NOT_FIRST) )
2346                 S_reg_set_capture_string(aTHX_ rx,
2347                                         strbeg, strend,
2348                                         sv, flags, utf8_target);
2349
2350             return 1;
2351         }
2352     }
2353
2354     multiline = prog->extflags & RXf_PMf_MULTILINE;
2355
2356     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2357         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2358                               "String too short [regexec_flags]...\n"));
2359         goto phooey;
2360     }
2361
2362     /* Check validity of program. */
2363     if (UCHARAT(progi->program) != REG_MAGIC) {
2364         Perl_croak(aTHX_ "corrupted regexp program");
2365     }
2366
2367     RX_MATCH_TAINTED_off(rx);
2368
2369     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2370     reginfo->intuit = 0;
2371     reginfo->is_utf8_target = cBOOL(utf8_target);
2372     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2373     reginfo->warned = FALSE;
2374     reginfo->strbeg  = strbeg;
2375     reginfo->sv = sv;
2376     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2377     reginfo->strend = strend;
2378     /* see how far we have to get to not match where we matched before */
2379     reginfo->till = stringarg + minend;
2380
2381     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
2382         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2383            S_cleanup_regmatch_info_aux has executed (registered by
2384            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
2385            magic belonging to this SV.
2386            Not newSVsv, either, as it does not COW.
2387         */
2388         reginfo->sv = newSV(0);
2389         sv_setsv(reginfo->sv, sv);
2390         SAVEFREESV(reginfo->sv);
2391     }
2392
2393     /* reserve next 2 or 3 slots in PL_regmatch_state:
2394      * slot N+0: may currently be in use: skip it
2395      * slot N+1: use for regmatch_info_aux struct
2396      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2397      * slot N+3: ready for use by regmatch()
2398      */
2399
2400     {
2401         regmatch_state *old_regmatch_state;
2402         regmatch_slab  *old_regmatch_slab;
2403         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2404
2405         /* on first ever match, allocate first slab */
2406         if (!PL_regmatch_slab) {
2407             Newx(PL_regmatch_slab, 1, regmatch_slab);
2408             PL_regmatch_slab->prev = NULL;
2409             PL_regmatch_slab->next = NULL;
2410             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2411         }
2412
2413         old_regmatch_state = PL_regmatch_state;
2414         old_regmatch_slab  = PL_regmatch_slab;
2415
2416         for (i=0; i <= max; i++) {
2417             if (i == 1)
2418                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2419             else if (i ==2)
2420                 reginfo->info_aux_eval =
2421                 reginfo->info_aux->info_aux_eval =
2422                             &(PL_regmatch_state->u.info_aux_eval);
2423
2424             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2425                 PL_regmatch_state = S_push_slab(aTHX);
2426         }
2427
2428         /* note initial PL_regmatch_state position; at end of match we'll
2429          * pop back to there and free any higher slabs */
2430
2431         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2432         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
2433         reginfo->info_aux->poscache = NULL;
2434
2435         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2436
2437         if ((prog->extflags & RXf_EVAL_SEEN))
2438             S_setup_eval_state(aTHX_ reginfo);
2439         else
2440             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2441     }
2442
2443     /* If there is a "must appear" string, look for it. */
2444
2445     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2446         /* We have to be careful. If the previous successful match
2447            was from this regex we don't want a subsequent partially
2448            successful match to clobber the old results.
2449            So when we detect this possibility we add a swap buffer
2450            to the re, and switch the buffer each match. If we fail,
2451            we switch it back; otherwise we leave it swapped.
2452         */
2453         swap = prog->offs;
2454         /* do we need a save destructor here for eval dies? */
2455         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2456         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2457             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2458             PTR2UV(prog),
2459             PTR2UV(swap),
2460             PTR2UV(prog->offs)
2461         ));
2462     }
2463
2464     /* Simplest case:  anchored match need be tried only once. */
2465     /*  [unless only anchor is BOL and multiline is set] */
2466     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2467         if (s == startpos && regtry(reginfo, &s))
2468             goto got_it;
2469         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2470                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2471         {
2472             char *end;
2473
2474             if (minlen)
2475                 dontbother = minlen - 1;
2476             end = HOP3c(strend, -dontbother, strbeg) - 1;
2477             /* for multiline we only have to try after newlines */
2478             if (prog->check_substr || prog->check_utf8) {
2479                 /* because of the goto we can not easily reuse the macros for bifurcating the
2480                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2481                 if (utf8_target) {
2482                     if (s == startpos)
2483                         goto after_try_utf8;
2484                     while (1) {
2485                         if (regtry(reginfo, &s)) {
2486                             goto got_it;
2487                         }
2488                       after_try_utf8:
2489                         if (s > end) {
2490                             goto phooey;
2491                         }
2492                         if (prog->extflags & RXf_USE_INTUIT) {
2493                             s = re_intuit_start(rx, sv, strbeg,
2494                                     s + UTF8SKIP(s), strend, flags, NULL);
2495                             if (!s) {
2496                                 goto phooey;
2497                             }
2498                         }
2499                         else {
2500                             s += UTF8SKIP(s);
2501                         }
2502                     }
2503                 } /* end search for check string in unicode */
2504                 else {
2505                     if (s == startpos) {
2506                         goto after_try_latin;
2507                     }
2508                     while (1) {
2509                         if (regtry(reginfo, &s)) {
2510                             goto got_it;
2511                         }
2512                       after_try_latin:
2513                         if (s > end) {
2514                             goto phooey;
2515                         }
2516                         if (prog->extflags & RXf_USE_INTUIT) {
2517                             s = re_intuit_start(rx, sv, strbeg,
2518                                         s + 1, strend, flags, NULL);
2519                             if (!s) {
2520                                 goto phooey;
2521                             }
2522                         }
2523                         else {
2524                             s++;
2525                         }
2526                     }
2527                 } /* end search for check string in latin*/
2528             } /* end search for check string */
2529             else { /* search for newline */
2530                 if (s > startpos) {
2531                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2532                     s--;
2533                 }
2534                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2535                 while (s <= end) { /* note it could be possible to match at the end of the string */
2536                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2537                         if (regtry(reginfo, &s))
2538                             goto got_it;
2539                     }
2540                 }
2541             } /* end search for newline */
2542         } /* end anchored/multiline check string search */
2543         goto phooey;
2544     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2545     {
2546         /* For anchored \G, the only position it can match from is
2547          * (ganch-gofs); we already set startpos to this above; if intuit
2548          * moved us on from there, we can't possibly succeed */
2549         assert(startpos == reginfo->ganch - prog->gofs);
2550         if (s == startpos && regtry(reginfo, &s))
2551             goto got_it;
2552         goto phooey;
2553     }
2554
2555     /* Messy cases:  unanchored match. */
2556     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2557         /* we have /x+whatever/ */
2558         /* it must be a one character string (XXXX Except is_utf8_pat?) */
2559         char ch;
2560 #ifdef DEBUGGING
2561         int did_match = 0;
2562 #endif
2563         if (utf8_target) {
2564             if (! prog->anchored_utf8) {
2565                 to_utf8_substr(prog);
2566             }
2567             ch = SvPVX_const(prog->anchored_utf8)[0];
2568             REXEC_FBC_SCAN(
2569                 if (*s == ch) {
2570                     DEBUG_EXECUTE_r( did_match = 1 );
2571                     if (regtry(reginfo, &s)) goto got_it;
2572                     s += UTF8SKIP(s);
2573                     while (s < strend && *s == ch)
2574                         s += UTF8SKIP(s);
2575                 }
2576             );
2577
2578         }
2579         else {
2580             if (! prog->anchored_substr) {
2581                 if (! to_byte_substr(prog)) {
2582                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2583                 }
2584             }
2585             ch = SvPVX_const(prog->anchored_substr)[0];
2586             REXEC_FBC_SCAN(
2587                 if (*s == ch) {
2588                     DEBUG_EXECUTE_r( did_match = 1 );
2589                     if (regtry(reginfo, &s)) goto got_it;
2590                     s++;
2591                     while (s < strend && *s == ch)
2592                         s++;
2593                 }
2594             );
2595         }
2596         DEBUG_EXECUTE_r(if (!did_match)
2597                 PerlIO_printf(Perl_debug_log,
2598                                   "Did not find anchored character...\n")
2599                );
2600     }
2601     else if (prog->anchored_substr != NULL
2602               || prog->anchored_utf8 != NULL
2603               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2604                   && prog->float_max_offset < strend - s)) {
2605         SV *must;
2606         I32 back_max;
2607         I32 back_min;
2608         char *last;
2609         char *last1;            /* Last position checked before */
2610 #ifdef DEBUGGING
2611         int did_match = 0;
2612 #endif
2613         if (prog->anchored_substr || prog->anchored_utf8) {
2614             if (utf8_target) {
2615                 if (! prog->anchored_utf8) {
2616                     to_utf8_substr(prog);
2617                 }
2618                 must = prog->anchored_utf8;
2619             }
2620             else {
2621                 if (! prog->anchored_substr) {
2622                     if (! to_byte_substr(prog)) {
2623                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2624                     }
2625                 }
2626                 must = prog->anchored_substr;
2627             }
2628             back_max = back_min = prog->anchored_offset;
2629         } else {
2630             if (utf8_target) {
2631                 if (! prog->float_utf8) {
2632                     to_utf8_substr(prog);
2633                 }
2634                 must = prog->float_utf8;
2635             }
2636             else {
2637                 if (! prog->float_substr) {
2638                     if (! to_byte_substr(prog)) {
2639                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2640                     }
2641                 }
2642                 must = prog->float_substr;
2643             }
2644             back_max = prog->float_max_offset;
2645             back_min = prog->float_min_offset;
2646         }
2647
2648         if (back_min<0) {
2649             last = strend;
2650         } else {
2651             last = HOP3c(strend,        /* Cannot start after this */
2652                   -(I32)(CHR_SVLEN(must)
2653                          - (SvTAIL(must) != 0) + back_min), strbeg);
2654         }
2655         if (s > reginfo->strbeg)
2656             last1 = HOPc(s, -1);
2657         else
2658             last1 = s - 1;      /* bogus */
2659
2660         /* XXXX check_substr already used to find "s", can optimize if
2661            check_substr==must. */
2662         dontbother = end_shift;
2663         strend = HOPc(strend, -dontbother);
2664         while ( (s <= last) &&
2665                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2666                                   (unsigned char*)strend, must,
2667                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2668             DEBUG_EXECUTE_r( did_match = 1 );
2669             if (HOPc(s, -back_max) > last1) {
2670                 last1 = HOPc(s, -back_min);
2671                 s = HOPc(s, -back_max);
2672             }
2673             else {
2674                 char * const t = (last1 >= reginfo->strbeg)
2675                                     ? HOPc(last1, 1) : last1 + 1;
2676
2677                 last1 = HOPc(s, -back_min);
2678                 s = t;
2679             }
2680             if (utf8_target) {
2681                 while (s <= last1) {
2682                     if (regtry(reginfo, &s))
2683                         goto got_it;
2684                     if (s >= last1) {
2685                         s++; /* to break out of outer loop */
2686                         break;
2687                     }
2688                     s += UTF8SKIP(s);
2689                 }
2690             }
2691             else {
2692                 while (s <= last1) {
2693                     if (regtry(reginfo, &s))
2694                         goto got_it;
2695                     s++;
2696                 }
2697             }
2698         }
2699         DEBUG_EXECUTE_r(if (!did_match) {
2700             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2701                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2702             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2703                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2704                                ? "anchored" : "floating"),
2705                 quoted, RE_SV_TAIL(must));
2706         });
2707         goto phooey;
2708     }
2709     else if ( (c = progi->regstclass) ) {
2710         if (minlen) {
2711             const OPCODE op = OP(progi->regstclass);
2712             /* don't bother with what can't match */
2713             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2714                 strend = HOPc(strend, -(minlen - 1));
2715         }
2716         DEBUG_EXECUTE_r({
2717             SV * const prop = sv_newmortal();
2718             regprop(prog, prop, c);
2719             {
2720                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2721                     s,strend-s,60);
2722                 PerlIO_printf(Perl_debug_log,
2723                     "Matching stclass %.*s against %s (%d bytes)\n",
2724                     (int)SvCUR(prop), SvPVX_const(prop),
2725                      quoted, (int)(strend - s));
2726             }
2727         });
2728         if (find_byclass(prog, c, s, strend, reginfo))
2729             goto got_it;
2730         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2731     }
2732     else {
2733         dontbother = 0;
2734         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2735             /* Trim the end. */
2736             char *last= NULL;
2737             SV* float_real;
2738             STRLEN len;
2739             const char *little;
2740
2741             if (utf8_target) {
2742                 if (! prog->float_utf8) {
2743                     to_utf8_substr(prog);
2744                 }
2745                 float_real = prog->float_utf8;
2746             }
2747             else {
2748                 if (! prog->float_substr) {
2749                     if (! to_byte_substr(prog)) {
2750                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2751                     }
2752                 }
2753                 float_real = prog->float_substr;
2754             }
2755
2756             little = SvPV_const(float_real, len);
2757             if (SvTAIL(float_real)) {
2758                     /* This means that float_real contains an artificial \n on
2759                      * the end due to the presence of something like this:
2760                      * /foo$/ where we can match both "foo" and "foo\n" at the
2761                      * end of the string.  So we have to compare the end of the
2762                      * string first against the float_real without the \n and
2763                      * then against the full float_real with the string.  We
2764                      * have to watch out for cases where the string might be
2765                      * smaller than the float_real or the float_real without
2766                      * the \n. */
2767                     char *checkpos= strend - len;
2768                     DEBUG_OPTIMISE_r(
2769                         PerlIO_printf(Perl_debug_log,
2770                             "%sChecking for float_real.%s\n",
2771                             PL_colors[4], PL_colors[5]));
2772                     if (checkpos + 1 < strbeg) {
2773                         /* can't match, even if we remove the trailing \n
2774                          * string is too short to match */
2775                         DEBUG_EXECUTE_r(
2776                             PerlIO_printf(Perl_debug_log,
2777                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2778                                 PL_colors[4], PL_colors[5]));
2779                         goto phooey;
2780                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2781                         /* can match, the end of the string matches without the
2782                          * "\n" */
2783                         last = checkpos + 1;
2784                     } else if (checkpos < strbeg) {
2785                         /* cant match, string is too short when the "\n" is
2786                          * included */
2787                         DEBUG_EXECUTE_r(
2788                             PerlIO_printf(Perl_debug_log,
2789                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2790                                 PL_colors[4], PL_colors[5]));
2791                         goto phooey;
2792                     } else if (!multiline) {
2793                         /* non multiline match, so compare with the "\n" at the
2794                          * end of the string */
2795                         if (memEQ(checkpos, little, len)) {
2796                             last= checkpos;
2797                         } else {
2798                             DEBUG_EXECUTE_r(
2799                                 PerlIO_printf(Perl_debug_log,
2800                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2801                                     PL_colors[4], PL_colors[5]));
2802                             goto phooey;
2803                         }
2804                     } else {
2805                         /* multiline match, so we have to search for a place
2806                          * where the full string is located */
2807                         goto find_last;
2808                     }
2809             } else {
2810                   find_last:
2811                     if (len)
2812                         last = rninstr(s, strend, little, little + len);
2813                     else
2814                         last = strend;  /* matching "$" */
2815             }
2816             if (!last) {
2817                 /* at one point this block contained a comment which was
2818                  * probably incorrect, which said that this was a "should not
2819                  * happen" case.  Even if it was true when it was written I am
2820                  * pretty sure it is not anymore, so I have removed the comment
2821                  * and replaced it with this one. Yves */
2822                 DEBUG_EXECUTE_r(
2823                     PerlIO_printf(Perl_debug_log,
2824                         "String does not contain required substring, cannot match.\n"
2825                     ));
2826                 goto phooey;
2827             }
2828             dontbother = strend - last + prog->float_min_offset;
2829         }
2830         if (minlen && (dontbother < minlen))
2831             dontbother = minlen - 1;
2832         strend -= dontbother;              /* this one's always in bytes! */
2833         /* We don't know much -- general case. */
2834         if (utf8_target) {
2835             for (;;) {
2836                 if (regtry(reginfo, &s))
2837                     goto got_it;
2838                 if (s >= strend)
2839                     break;
2840                 s += UTF8SKIP(s);
2841             };
2842         }
2843         else {
2844             do {
2845                 if (regtry(reginfo, &s))
2846                     goto got_it;
2847             } while (s++ < strend);
2848         }
2849     }
2850
2851     /* Failure. */
2852     goto phooey;
2853
2854 got_it:
2855     /* s/// doesn't like it if $& is earlier than where we asked it to
2856      * start searching (which can happen on something like /.\G/) */
2857     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2858             && (prog->offs[0].start < stringarg - strbeg))
2859     {
2860         /* this should only be possible under \G */
2861         assert(prog->extflags & RXf_GPOS_SEEN);
2862         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2863             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2864         goto phooey;
2865     }
2866
2867     DEBUG_BUFFERS_r(
2868         if (swap)
2869             PerlIO_printf(Perl_debug_log,
2870                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2871                 PTR2UV(prog),
2872                 PTR2UV(swap)
2873             );
2874     );
2875     Safefree(swap);
2876
2877     /* clean up; this will trigger destructors that will free all slabs
2878      * above the current one, and cleanup the regmatch_info_aux
2879      * and regmatch_info_aux_eval sructs */
2880
2881     LEAVE_SCOPE(oldsave);
2882
2883     if (RXp_PAREN_NAMES(prog))
2884         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2885
2886     RX_MATCH_UTF8_set(rx, utf8_target);
2887
2888     /* make sure $`, $&, $', and $digit will work later */
2889     if ( !(flags & REXEC_NOT_FIRST) )
2890         S_reg_set_capture_string(aTHX_ rx,
2891                                     strbeg, reginfo->strend,
2892                                     sv, flags, utf8_target);
2893
2894     return 1;
2895
2896 phooey:
2897     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2898                           PL_colors[4], PL_colors[5]));
2899
2900     /* clean up; this will trigger destructors that will free all slabs
2901      * above the current one, and cleanup the regmatch_info_aux
2902      * and regmatch_info_aux_eval sructs */
2903
2904     LEAVE_SCOPE(oldsave);
2905
2906     if (swap) {
2907         /* we failed :-( roll it back */
2908         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2909             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2910             PTR2UV(prog),
2911             PTR2UV(prog->offs),
2912             PTR2UV(swap)
2913         ));
2914         Safefree(prog->offs);
2915         prog->offs = swap;
2916     }
2917     return 0;
2918 }
2919
2920
2921 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
2922  * Do inc before dec, in case old and new rex are the same */
2923 #define SET_reg_curpm(Re2) \
2924     if (reginfo->info_aux_eval) {                   \
2925         (void)ReREFCNT_inc(Re2);                    \
2926         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2927         PM_SETRE((PL_reg_curpm), (Re2));            \
2928     }
2929
2930
2931 /*
2932  - regtry - try match at specific point
2933  */
2934 STATIC I32                      /* 0 failure, 1 success */
2935 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2936 {
2937     dVAR;
2938     CHECKPOINT lastcp;
2939     REGEXP *const rx = reginfo->prog;
2940     regexp *const prog = ReANY(rx);
2941     I32 result;
2942     RXi_GET_DECL(prog,progi);
2943     GET_RE_DEBUG_FLAGS_DECL;
2944
2945     PERL_ARGS_ASSERT_REGTRY;
2946
2947     reginfo->cutpoint=NULL;
2948
2949     prog->offs[0].start = *startposp - reginfo->strbeg;
2950     prog->lastparen = 0;
2951     prog->lastcloseparen = 0;
2952
2953     /* XXXX What this code is doing here?!!!  There should be no need
2954        to do this again and again, prog->lastparen should take care of
2955        this!  --ilya*/
2956
2957     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2958      * Actually, the code in regcppop() (which Ilya may be meaning by
2959      * prog->lastparen), is not needed at all by the test suite
2960      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2961      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2962      * Meanwhile, this code *is* needed for the
2963      * above-mentioned test suite tests to succeed.  The common theme
2964      * on those tests seems to be returning null fields from matches.
2965      * --jhi updated by dapm */
2966 #if 1
2967     if (prog->nparens) {
2968         regexp_paren_pair *pp = prog->offs;
2969         I32 i;
2970         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2971             ++pp;
2972             pp->start = -1;
2973             pp->end = -1;
2974         }
2975     }
2976 #endif
2977     REGCP_SET(lastcp);
2978     result = regmatch(reginfo, *startposp, progi->program + 1);
2979     if (result != -1) {
2980         prog->offs[0].end = result;
2981         return 1;
2982     }
2983     if (reginfo->cutpoint)
2984         *startposp= reginfo->cutpoint;
2985     REGCP_UNWIND(lastcp);
2986     return 0;
2987 }
2988
2989
2990 #define sayYES goto yes
2991 #define sayNO goto no
2992 #define sayNO_SILENT goto no_silent
2993
2994 /* we dont use STMT_START/END here because it leads to
2995    "unreachable code" warnings, which are bogus, but distracting. */
2996 #define CACHEsayNO \
2997     if (ST.cache_mask) \
2998        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
2999     sayNO
3000
3001 /* this is used to determine how far from the left messages like
3002    'failed...' are printed. It should be set such that messages
3003    are inline with the regop output that created them.
3004 */
3005 #define REPORT_CODE_OFF 32
3006
3007
3008 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3009 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3010 #define CHRTEST_NOT_A_CP_1 -999
3011 #define CHRTEST_NOT_A_CP_2 -998
3012
3013 /* grab a new slab and return the first slot in it */
3014
3015 STATIC regmatch_state *
3016 S_push_slab(pTHX)
3017 {
3018 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3019     dMY_CXT;
3020 #endif
3021     regmatch_slab *s = PL_regmatch_slab->next;
3022     if (!s) {
3023         Newx(s, 1, regmatch_slab);
3024         s->prev = PL_regmatch_slab;
3025         s->next = NULL;
3026         PL_regmatch_slab->next = s;
3027     }
3028     PL_regmatch_slab = s;
3029     return SLAB_FIRST(s);
3030 }
3031
3032
3033 /* push a new state then goto it */
3034
3035 #define PUSH_STATE_GOTO(state, node, input) \
3036     pushinput = input; \
3037     scan = node; \
3038     st->resume_state = state; \
3039     goto push_state;
3040
3041 /* push a new state with success backtracking, then goto it */
3042
3043 #define PUSH_YES_STATE_GOTO(state, node, input) \
3044     pushinput = input; \
3045     scan = node; \
3046     st->resume_state = state; \
3047     goto push_yes_state;
3048
3049
3050
3051
3052 /*
3053
3054 regmatch() - main matching routine
3055
3056 This is basically one big switch statement in a loop. We execute an op,
3057 set 'next' to point the next op, and continue. If we come to a point which
3058 we may need to backtrack to on failure such as (A|B|C), we push a
3059 backtrack state onto the backtrack stack. On failure, we pop the top
3060 state, and re-enter the loop at the state indicated. If there are no more
3061 states to pop, we return failure.
3062
3063 Sometimes we also need to backtrack on success; for example /A+/, where
3064 after successfully matching one A, we need to go back and try to
3065 match another one; similarly for lookahead assertions: if the assertion
3066 completes successfully, we backtrack to the state just before the assertion
3067 and then carry on.  In these cases, the pushed state is marked as
3068 'backtrack on success too'. This marking is in fact done by a chain of
3069 pointers, each pointing to the previous 'yes' state. On success, we pop to
3070 the nearest yes state, discarding any intermediate failure-only states.
3071 Sometimes a yes state is pushed just to force some cleanup code to be
3072 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3073 it to free the inner regex.
3074
3075 Note that failure backtracking rewinds the cursor position, while
3076 success backtracking leaves it alone.
3077
3078 A pattern is complete when the END op is executed, while a subpattern
3079 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3080 ops trigger the "pop to last yes state if any, otherwise return true"
3081 behaviour.
3082
3083 A common convention in this function is to use A and B to refer to the two
3084 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3085 the subpattern to be matched possibly multiple times, while B is the entire
3086 rest of the pattern. Variable and state names reflect this convention.
3087
3088 The states in the main switch are the union of ops and failure/success of
3089 substates associated with with that op.  For example, IFMATCH is the op
3090 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3091 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3092 successfully matched A and IFMATCH_A_fail is a state saying that we have
3093 just failed to match A. Resume states always come in pairs. The backtrack
3094 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3095 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3096 on success or failure.
3097
3098 The struct that holds a backtracking state is actually a big union, with
3099 one variant for each major type of op. The variable st points to the
3100 top-most backtrack struct. To make the code clearer, within each
3101 block of code we #define ST to alias the relevant union.
3102
3103 Here's a concrete example of a (vastly oversimplified) IFMATCH
3104 implementation:
3105
3106     switch (state) {
3107     ....
3108
3109 #define ST st->u.ifmatch
3110
3111     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3112         ST.foo = ...; // some state we wish to save
3113         ...
3114         // push a yes backtrack state with a resume value of
3115         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3116         // first node of A:
3117         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3118         // NOTREACHED
3119
3120     case IFMATCH_A: // we have successfully executed A; now continue with B
3121         next = B;
3122         bar = ST.foo; // do something with the preserved value
3123         break;
3124
3125     case IFMATCH_A_fail: // A failed, so the assertion failed
3126         ...;   // do some housekeeping, then ...
3127         sayNO; // propagate the failure
3128
3129 #undef ST
3130
3131     ...
3132     }
3133
3134 For any old-timers reading this who are familiar with the old recursive
3135 approach, the code above is equivalent to:
3136
3137     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3138     {
3139         int foo = ...
3140         ...
3141         if (regmatch(A)) {
3142             next = B;
3143             bar = foo;
3144             break;
3145         }
3146         ...;   // do some housekeeping, then ...
3147         sayNO; // propagate the failure
3148     }
3149
3150 The topmost backtrack state, pointed to by st, is usually free. If you
3151 want to claim it, populate any ST.foo fields in it with values you wish to
3152 save, then do one of
3153
3154         PUSH_STATE_GOTO(resume_state, node, newinput);
3155         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3156
3157 which sets that backtrack state's resume value to 'resume_state', pushes a
3158 new free entry to the top of the backtrack stack, then goes to 'node'.
3159 On backtracking, the free slot is popped, and the saved state becomes the
3160 new free state. An ST.foo field in this new top state can be temporarily
3161 accessed to retrieve values, but once the main loop is re-entered, it
3162 becomes available for reuse.
3163
3164 Note that the depth of the backtrack stack constantly increases during the
3165 left-to-right execution of the pattern, rather than going up and down with
3166 the pattern nesting. For example the stack is at its maximum at Z at the
3167 end of the pattern, rather than at X in the following:
3168
3169     /(((X)+)+)+....(Y)+....Z/
3170
3171 The only exceptions to this are lookahead/behind assertions and the cut,
3172 (?>A), which pop all the backtrack states associated with A before
3173 continuing.
3174
3175 Backtrack state structs are allocated in slabs of about 4K in size.
3176 PL_regmatch_state and st always point to the currently active state,
3177 and PL_regmatch_slab points to the slab currently containing
3178 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3179 allocated, and is never freed until interpreter destruction. When the slab
3180 is full, a new one is allocated and chained to the end. At exit from
3181 regmatch(), slabs allocated since entry are freed.
3182
3183 */
3184
3185
3186 #define DEBUG_STATE_pp(pp)                                  \
3187     DEBUG_STATE_r({                                         \
3188         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3189         PerlIO_printf(Perl_debug_log,                       \
3190             "    %*s"pp" %s%s%s%s%s\n",                     \
3191             depth*2, "",                                    \
3192             PL_reg_name[st->resume_state],                     \
3193             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3194             ((st==yes_state) ? "Y" : ""),                   \
3195             ((st==mark_state) ? "M" : ""),                  \
3196             ((st==yes_state||st==mark_state) ? "]" : "")    \
3197         );                                                  \
3198     });
3199
3200
3201 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3202
3203 #ifdef DEBUGGING
3204
3205 STATIC void
3206 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3207     const char *start, const char *end, const char *blurb)
3208 {
3209     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3210
3211     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3212
3213     if (!PL_colorset)
3214             reginitcolors();
3215     {
3216         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3217             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3218
3219         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3220             start, end - start, 60);
3221
3222         PerlIO_printf(Perl_debug_log,
3223             "%s%s REx%s %s against %s\n",
3224                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3225
3226         if (utf8_target||utf8_pat)
3227             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3228                 utf8_pat ? "pattern" : "",
3229                 utf8_pat && utf8_target ? " and " : "",
3230                 utf8_target ? "string" : ""
3231             );
3232     }
3233 }
3234
3235 STATIC void
3236 S_dump_exec_pos(pTHX_ const char *locinput,
3237                       const regnode *scan,
3238                       const char *loc_regeol,
3239                       const char *loc_bostr,
3240                       const char *loc_reg_starttry,
3241                       const bool utf8_target)
3242 {
3243     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3244     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3245     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3246     /* The part of the string before starttry has one color
3247        (pref0_len chars), between starttry and current
3248        position another one (pref_len - pref0_len chars),
3249        after the current position the third one.
3250        We assume that pref0_len <= pref_len, otherwise we
3251        decrease pref0_len.  */
3252     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3253         ? (5 + taill) - l : locinput - loc_bostr;
3254     int pref0_len;
3255
3256     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3257
3258     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3259         pref_len++;
3260     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3261     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3262         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3263               ? (5 + taill) - pref_len : loc_regeol - locinput);
3264     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3265         l--;
3266     if (pref0_len < 0)
3267         pref0_len = 0;
3268     if (pref0_len > pref_len)
3269         pref0_len = pref_len;
3270     {
3271         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3272
3273         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3274             (locinput - pref_len),pref0_len, 60, 4, 5);
3275
3276         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3277                     (locinput - pref_len + pref0_len),
3278                     pref_len - pref0_len, 60, 2, 3);
3279
3280         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3281                     locinput, loc_regeol - locinput, 10, 0, 1);
3282
3283         const STRLEN tlen=len0+len1+len2;
3284         PerlIO_printf(Perl_debug_log,
3285                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3286                     (IV)(locinput - loc_bostr),
3287                     len0, s0,
3288                     len1, s1,
3289                     (docolor ? "" : "> <"),
3290                     len2, s2,
3291                     (int)(tlen > 19 ? 0 :  19 - tlen),
3292                     "");
3293     }
3294 }
3295
3296 #endif
3297
3298 /* reg_check_named_buff_matched()
3299  * Checks to see if a named buffer has matched. The data array of
3300  * buffer numbers corresponding to the buffer is expected to reside
3301  * in the regexp->data->data array in the slot stored in the ARG() of
3302  * node involved. Note that this routine doesn't actually care about the
3303  * name, that information is not preserved from compilation to execution.
3304  * Returns the index of the leftmost defined buffer with the given name
3305  * or 0 if non of the buffers matched.
3306  */
3307 STATIC I32
3308 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3309 {
3310     I32 n;
3311     RXi_GET_DECL(rex,rexi);
3312     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3313     I32 *nums=(I32*)SvPVX(sv_dat);
3314
3315     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3316
3317     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3318         if ((I32)rex->lastparen >= nums[n] &&
3319             rex->offs[nums[n]].end != -1)
3320         {
3321             return nums[n];
3322         }
3323     }
3324     return 0;
3325 }
3326
3327
3328 static bool
3329 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3330         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3331 {
3332     /* This function determines if there are one or two characters that match
3333      * the first character of the passed-in EXACTish node <text_node>, and if
3334      * so, returns them in the passed-in pointers.
3335      *
3336      * If it determines that no possible character in the target string can
3337      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3338      * the first character in <text_node> requires UTF-8 to represent, and the
3339      * target string isn't in UTF-8.)
3340      *
3341      * If there are more than two characters that could match the beginning of
3342      * <text_node>, or if more context is required to determine a match or not,
3343      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3344      *
3345      * The motiviation behind this function is to allow the caller to set up
3346      * tight loops for matching.  If <text_node> is of type EXACT, there is
3347      * only one possible character that can match its first character, and so
3348      * the situation is quite simple.  But things get much more complicated if
3349      * folding is involved.  It may be that the first character of an EXACTFish
3350      * node doesn't participate in any possible fold, e.g., punctuation, so it
3351      * can be matched only by itself.  The vast majority of characters that are
3352      * in folds match just two things, their lower and upper-case equivalents.
3353      * But not all are like that; some have multiple possible matches, or match
3354      * sequences of more than one character.  This function sorts all that out.
3355      *
3356      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3357      * loop of trying to match A*, we know we can't exit where the thing
3358      * following it isn't a B.  And something can't be a B unless it is the
3359      * beginning of B.  By putting a quick test for that beginning in a tight
3360      * loop, we can rule out things that can't possibly be B without having to
3361      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3362      * character, we can make a tight loop matching A*, using the outputs of
3363      * this function.
3364      *
3365      * If the target string to match isn't in UTF-8, and there aren't
3366      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3367      * the one or two possible octets (which are characters in this situation)
3368      * that can match.  In all cases, if there is only one character that can
3369      * match, *<c1p> and *<c2p> will be identical.
3370      *
3371      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3372      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3373      * can match the beginning of <text_node>.  They should be declared with at
3374      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3375      * undefined what these contain.)  If one or both of the buffers are
3376      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3377      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3378      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3379      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3380      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3381
3382     const bool utf8_target = reginfo->is_utf8_target;
3383
3384     UV c1 = CHRTEST_NOT_A_CP_1;
3385     UV c2 = CHRTEST_NOT_A_CP_2;
3386     bool use_chrtest_void = FALSE;
3387     const bool is_utf8_pat = reginfo->is_utf8_pat;
3388
3389     /* Used when we have both utf8 input and utf8 output, to avoid converting
3390      * to/from code points */
3391     bool utf8_has_been_setup = FALSE;
3392
3393     dVAR;
3394
3395     U8 *pat = (U8*)STRING(text_node);
3396
3397     if (OP(text_node) == EXACT) {
3398
3399         /* In an exact node, only one thing can be matched, that first
3400          * character.  If both the pat and the target are UTF-8, we can just
3401          * copy the input to the output, avoiding finding the code point of
3402          * that character */
3403         if (!is_utf8_pat) {
3404             c2 = c1 = *pat;
3405         }
3406         else if (utf8_target) {
3407             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3408             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3409             utf8_has_been_setup = TRUE;
3410         }
3411         else {
3412             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3413         }
3414     }
3415     else /* an EXACTFish node */
3416          if ((is_utf8_pat
3417                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3418                                                     pat + STR_LEN(text_node)))
3419              || (!is_utf8_pat
3420                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3421                                                     pat + STR_LEN(text_node))))
3422     {
3423         /* Multi-character folds require more context to sort out.  Also
3424          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3425          * handled outside this routine */
3426         use_chrtest_void = TRUE;
3427     }
3428     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3429         c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3430         if (c1 > 256) {
3431             /* Load the folds hash, if not already done */
3432             SV** listp;
3433             if (! PL_utf8_foldclosures) {
3434                 if (! PL_utf8_tofold) {
3435                     U8 dummy[UTF8_MAXBYTES+1];
3436
3437                     /* Force loading this by folding an above-Latin1 char */
3438                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3439                     assert(PL_utf8_tofold); /* Verify that worked */
3440                 }
3441                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3442             }
3443
3444             /* The fold closures data structure is a hash with the keys being
3445              * the UTF-8 of every character that is folded to, like 'k', and
3446              * the values each an array of all code points that fold to its
3447              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3448              * not included */
3449             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3450                                      (char *) pat,
3451                                      UTF8SKIP(pat),
3452                                      FALSE))))
3453             {
3454                 /* Not found in the hash, therefore there are no folds
3455                  * containing it, so there is only a single character that
3456                  * could match */
3457                 c2 = c1;
3458             }
3459             else {  /* Does participate in folds */
3460                 AV* list = (AV*) *listp;
3461                 if (av_len(list) != 1) {
3462
3463                     /* If there aren't exactly two folds to this, it is outside
3464                      * the scope of this function */
3465                     use_chrtest_void = TRUE;
3466                 }
3467                 else {  /* There are two.  Get them */
3468                     SV** c_p = av_fetch(list, 0, FALSE);
3469                     if (c_p == NULL) {
3470                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3471                     }
3472                     c1 = SvUV(*c_p);
3473
3474                     c_p = av_fetch(list, 1, FALSE);
3475                     if (c_p == NULL) {
3476                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3477                     }
3478                     c2 = SvUV(*c_p);
3479
3480                     /* Folds that cross the 255/256 boundary are forbidden if
3481                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3482                      * pattern character is above 256, and its only other match
3483                      * is below 256, the only legal match will be to itself.
3484                      * We have thrown away the original, so have to compute
3485                      * which is the one above 255 */
3486                     if ((c1 < 256) != (c2 < 256)) {
3487                         if (OP(text_node) == EXACTFL
3488                             || (OP(text_node) == EXACTFA
3489                                 && (isASCII(c1) || isASCII(c2))))
3490                         {
3491                             if (c1 < 256) {
3492                                 c1 = c2;
3493                             }
3494                             else {
3495                                 c2 = c1;
3496                             }
3497                         }
3498                     }
3499                 }
3500             }
3501         }
3502         else /* Here, c1 is < 255 */
3503              if (utf8_target
3504                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3505                  && OP(text_node) != EXACTFL
3506                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3507         {
3508             /* Here, there could be something above Latin1 in the target which
3509              * folds to this character in the pattern.  All such cases except
3510              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3511              * involved in their folds, so are outside the scope of this
3512              * function */
3513             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3514                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3515             }
3516             else {
3517                 use_chrtest_void = TRUE;
3518             }
3519         }
3520         else { /* Here nothing above Latin1 can fold to the pattern character */
3521             switch (OP(text_node)) {
3522
3523                 case EXACTFL:   /* /l rules */
3524                     c2 = PL_fold_locale[c1];
3525                     break;
3526
3527                 case EXACTF:
3528                     if (! utf8_target) {    /* /d rules */
3529                         c2 = PL_fold[c1];
3530                         break;
3531                     }
3532                     /* FALLTHROUGH */
3533                     /* /u rules for all these.  This happens to work for
3534                      * EXACTFA as nothing in Latin1 folds to ASCII */
3535                 case EXACTFA:
3536                 case EXACTFU_TRICKYFOLD:
3537                 case EXACTFU_SS:
3538                 case EXACTFU:
3539                     c2 = PL_fold_latin1[c1];
3540                     break;
3541
3542                 default:
3543                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3544                     assert(0); /* NOTREACHED */
3545             }
3546         }
3547     }
3548
3549     /* Here have figured things out.  Set up the returns */
3550     if (use_chrtest_void) {
3551         *c2p = *c1p = CHRTEST_VOID;
3552     }
3553     else if (utf8_target) {
3554         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3555             uvchr_to_utf8(c1_utf8, c1);
3556             uvchr_to_utf8(c2_utf8, c2);
3557         }
3558
3559         /* Invariants are stored in both the utf8 and byte outputs; Use
3560          * negative numbers otherwise for the byte ones.  Make sure that the
3561          * byte ones are the same iff the utf8 ones are the same */
3562         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3563         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3564                 ? *c2_utf8
3565                 : (c1 == c2)
3566                   ? CHRTEST_NOT_A_CP_1
3567                   : CHRTEST_NOT_A_CP_2;
3568     }
3569     else if (c1 > 255) {
3570        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3571                            can represent */
3572            return FALSE;
3573        }
3574
3575        *c1p = *c2p = c2;    /* c2 is the only representable value */
3576     }
3577     else {  /* c1 is representable; see about c2 */
3578        *c1p = c1;
3579        *c2p = (c2 < 256) ? c2 : c1;
3580     }
3581
3582     return TRUE;
3583 }
3584
3585 /* returns -1 on failure, $+[0] on success */
3586 STATIC I32
3587 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3588 {
3589 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3590     dMY_CXT;
3591 #endif
3592     dVAR;
3593     const bool utf8_target = reginfo->is_utf8_target;
3594     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3595     REGEXP *rex_sv = reginfo->prog;
3596     regexp *rex = ReANY(rex_sv);
3597     RXi_GET_DECL(rex,rexi);
3598     /* the current state. This is a cached copy of PL_regmatch_state */
3599     regmatch_state *st;
3600     /* cache heavy used fields of st in registers */
3601     regnode *scan;
3602     regnode *next;
3603     U32 n = 0;  /* general value; init to avoid compiler warning */
3604     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3605     char *locinput = startpos;
3606     char *pushinput; /* where to continue after a PUSH */
3607     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3608
3609     bool result = 0;        /* return value of S_regmatch */
3610     int depth = 0;          /* depth of backtrack stack */
3611     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3612     const U32 max_nochange_depth =
3613         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3614         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3615     regmatch_state *yes_state = NULL; /* state to pop to on success of
3616                                                             subpattern */
3617     /* mark_state piggy backs on the yes_state logic so that when we unwind
3618        the stack on success we can update the mark_state as we go */
3619     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3620     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3621     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3622     U32 state_num;
3623     bool no_final = 0;      /* prevent failure from backtracking? */
3624     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3625     char *startpoint = locinput;
3626     SV *popmark = NULL;     /* are we looking for a mark? */
3627     SV *sv_commit = NULL;   /* last mark name seen in failure */
3628     SV *sv_yes_mark = NULL; /* last mark name we have seen
3629                                during a successful match */
3630     U32 lastopen = 0;       /* last open we saw */
3631     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3632     SV* const oreplsv = GvSV(PL_replgv);
3633     /* these three flags are set by various ops to signal information to
3634      * the very next op. They have a useful lifetime of exactly one loop
3635      * iteration, and are not preserved or restored by state pushes/pops
3636      */
3637     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3638     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3639     int logical = 0;        /* the following EVAL is:
3640                                 0: (?{...})
3641                                 1: (?(?{...})X|Y)
3642                                 2: (??{...})
3643                                or the following IFMATCH/UNLESSM is:
3644                                 false: plain (?=foo)
3645                                 true:  used as a condition: (?(?=foo))
3646                             */
3647     PAD* last_pad = NULL;
3648     dMULTICALL;
3649     I32 gimme = G_SCALAR;
3650     CV *caller_cv = NULL;       /* who called us */
3651     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3652     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3653     U32 maxopenparen = 0;       /* max '(' index seen so far */
3654     int to_complement;  /* Invert the result? */
3655     _char_class_number classnum;
3656     bool is_utf8_pat = reginfo->is_utf8_pat;
3657
3658 #ifdef DEBUGGING
3659     GET_RE_DEBUG_FLAGS_DECL;
3660 #endif
3661
3662     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3663     multicall_oldcatch = 0;
3664     multicall_cv = NULL;
3665     cx = NULL;
3666     PERL_UNUSED_VAR(multicall_cop);
3667     PERL_UNUSED_VAR(newsp);
3668
3669
3670     PERL_ARGS_ASSERT_REGMATCH;
3671
3672     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3673             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3674     }));
3675
3676     st = PL_regmatch_state;
3677
3678     /* Note that nextchr is a byte even in UTF */
3679     SET_nextchr;
3680     scan = prog;
3681     while (scan != NULL) {
3682
3683         DEBUG_EXECUTE_r( {
3684             SV * const prop = sv_newmortal();
3685             regnode *rnext=regnext(scan);
3686             DUMP_EXEC_POS( locinput, scan, utf8_target );
3687             regprop(rex, prop, scan);
3688
3689             PerlIO_printf(Perl_debug_log,
3690                     "%3"IVdf":%*s%s(%"IVdf")\n",
3691                     (IV)(scan - rexi->program), depth*2, "",
3692                     SvPVX_const(prop),
3693                     (PL_regkind[OP(scan)] == END || !rnext) ?
3694                         0 : (IV)(rnext - rexi->program));
3695         });
3696
3697         next = scan + NEXT_OFF(scan);
3698         if (next == scan)
3699             next = NULL;
3700         state_num = OP(scan);
3701
3702       reenter_switch:
3703         to_complement = 0;
3704
3705         SET_nextchr;
3706         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3707
3708         switch (state_num) {
3709         case BOL: /*  /^../  */
3710             if (locinput == reginfo->strbeg)
3711                 break;
3712             sayNO;
3713
3714         case MBOL: /*  /^../m  */
3715             if (locinput == reginfo->strbeg ||
3716                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3717             {
3718                 break;
3719             }
3720             sayNO;
3721
3722         case SBOL: /*  /^../s  */
3723             if (locinput == reginfo->strbeg)
3724                 break;
3725             sayNO;
3726
3727         case GPOS: /*  \G  */
3728             if (locinput == reginfo->ganch)
3729                 break;
3730             sayNO;
3731
3732         case KEEPS: /*   \K  */
3733             /* update the startpoint */
3734             st->u.keeper.val = rex->offs[0].start;
3735             rex->offs[0].start = locinput - reginfo->strbeg;
3736             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3737             assert(0); /*NOTREACHED*/
3738         case KEEPS_next_fail:
3739             /* rollback the start point change */
3740             rex->offs[0].start = st->u.keeper.val;
3741             sayNO_SILENT;
3742             assert(0); /*NOTREACHED*/
3743
3744         case EOL: /* /..$/  */
3745                 goto seol;
3746
3747         case MEOL: /* /..$/m  */
3748             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3749                 sayNO;
3750             break;
3751
3752         case SEOL: /* /..$/s  */
3753           seol:
3754             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3755                 sayNO;
3756             if (reginfo->strend - locinput > 1)
3757                 sayNO;
3758             break;
3759
3760         case EOS: /*  \z  */
3761             if (!NEXTCHR_IS_EOS)
3762                 sayNO;
3763             break;
3764
3765         case SANY: /*  /./s  */
3766             if (NEXTCHR_IS_EOS)
3767                 sayNO;
3768             goto increment_locinput;
3769
3770         case CANY: /*  \C  */
3771             if (NEXTCHR_IS_EOS)
3772                 sayNO;
3773             locinput++;
3774             break;
3775
3776         case REG_ANY: /*  /./  */
3777             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3778                 sayNO;
3779             goto increment_locinput;
3780
3781
3782 #undef  ST
3783 #define ST st->u.trie
3784         case TRIEC: /* (ab|cd) with known charclass */
3785             /* In this case the charclass data is available inline so
3786                we can fail fast without a lot of extra overhead.
3787              */
3788             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3789                 DEBUG_EXECUTE_r(
3790                     PerlIO_printf(Perl_debug_log,
3791                               "%*s  %sfailed to match trie start class...%s\n",
3792                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3793                 );
3794                 sayNO_SILENT;
3795                 assert(0); /* NOTREACHED */
3796             }
3797             /* FALL THROUGH */
3798         case TRIE:  /* (ab|cd)  */
3799             /* the basic plan of execution of the trie is:
3800              * At the beginning, run though all the states, and
3801              * find the longest-matching word. Also remember the position
3802              * of the shortest matching word. For example, this pattern:
3803              *    1  2 3 4    5
3804              *    ab|a|x|abcd|abc
3805              * when matched against the string "abcde", will generate
3806              * accept states for all words except 3, with the longest
3807              * matching word being 4, and the shortest being 2 (with
3808              * the position being after char 1 of the string).
3809              *
3810              * Then for each matching word, in word order (i.e. 1,2,4,5),
3811              * we run the remainder of the pattern; on each try setting
3812              * the current position to the character following the word,
3813              * returning to try the next word on failure.
3814              *
3815              * We avoid having to build a list of words at runtime by
3816              * using a compile-time structure, wordinfo[].prev, which
3817              * gives, for each word, the previous accepting word (if any).
3818              * In the case above it would contain the mappings 1->2, 2->0,
3819              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3820              * the longest word (4 above), a list of all words, by
3821              * following the list of prev pointers; this gives us the
3822              * unordered list 4,5,1,2. Then given the current word we have
3823              * just tried, we can go through the list and find the
3824              * next-biggest word to try (so if we just failed on word 2,
3825              * the next in the list is 4).
3826              *
3827              * Since at runtime we don't record the matching position in
3828              * the string for each word, we have to work that out for
3829              * each word we're about to process. The wordinfo table holds
3830              * the character length of each word; given that we recorded
3831              * at the start: the position of the shortest word and its
3832              * length in chars, we just need to move the pointer the
3833              * difference between the two char lengths. Depending on
3834              * Unicode status and folding, that's cheap or expensive.
3835              *
3836              * This algorithm is optimised for the case where are only a
3837              * small number of accept states, i.e. 0,1, or maybe 2.
3838              * With lots of accepts states, and having to try all of them,
3839              * it becomes quadratic on number of accept states to find all
3840              * the next words.
3841              */
3842
3843             {
3844                 /* what type of TRIE am I? (utf8 makes this contextual) */
3845                 DECL_TRIE_TYPE(scan);
3846
3847                 /* what trie are we using right now */
3848                 reg_trie_data * const trie
3849                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3850                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3851                 U32 state = trie->startstate;
3852
3853                 if (   trie->bitmap
3854                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3855                 {
3856                     if (trie->states[ state ].wordnum) {
3857                          DEBUG_EXECUTE_r(
3858                             PerlIO_printf(Perl_debug_log,
3859                                           "%*s  %smatched empty string...%s\n",
3860                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3861                         );
3862                         if (!trie->jump)
3863                             break;
3864                     } else {
3865                         DEBUG_EXECUTE_r(
3866                             PerlIO_printf(Perl_debug_log,
3867                                           "%*s  %sfailed to match trie start class...%s\n",
3868                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3869                         );
3870                         sayNO_SILENT;
3871                    }
3872                 }
3873
3874             {
3875                 U8 *uc = ( U8* )locinput;
3876
3877                 STRLEN len = 0;
3878                 STRLEN foldlen = 0;
3879                 U8 *uscan = (U8*)NULL;
3880                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3881                 U32 charcount = 0; /* how many input chars we have matched */
3882                 U32 accepted = 0; /* have we seen any accepting states? */
3883
3884                 ST.jump = trie->jump;
3885                 ST.me = scan;
3886                 ST.firstpos = NULL;
3887                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3888                 ST.nextword = 0;
3889
3890                 /* fully traverse the TRIE; note the position of the
3891                    shortest accept state and the wordnum of the longest
3892                    accept state */
3893
3894                 while ( state && uc <= (U8*)(reginfo->strend) ) {
3895                     U32 base = trie->states[ state ].trans.base;
3896                     UV uvc = 0;
3897                     U16 charid = 0;
3898                     U16 wordnum;
3899                     wordnum = trie->states[ state ].wordnum;
3900
3901                     if (wordnum) { /* it's an accept state */
3902                         if (!accepted) {
3903                             accepted = 1;
3904                             /* record first match position */
3905                             if (ST.longfold) {
3906                                 ST.firstpos = (U8*)locinput;
3907                                 ST.firstchars = 0;
3908                             }
3909                             else {
3910                                 ST.firstpos = uc;
3911                                 ST.firstchars = charcount;
3912                             }
3913                         }
3914                         if (!ST.nextword || wordnum < ST.nextword)
3915                             ST.nextword = wordnum;
3916                         ST.topword = wordnum;
3917                     }
3918
3919                     DEBUG_TRIE_EXECUTE_r({
3920                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3921                                 PerlIO_printf( Perl_debug_log,
3922                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3923                                     2+depth * 2, "", PL_colors[4],
3924                                     (UV)state, (accepted ? 'Y' : 'N'));
3925                     });
3926
3927                     /* read a char and goto next state */
3928                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3929                         I32 offset;
3930                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3931                                              uscan, len, uvc, charid, foldlen,
3932                                              foldbuf, uniflags);
3933                         charcount++;
3934                         if (foldlen>0)
3935                             ST.longfold = TRUE;
3936                         if (charid &&
3937                              ( ((offset =
3938                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3939
3940                              && ((U32)offset < trie->lasttrans)
3941                              && trie->trans[offset].check == state)
3942                         {
3943                             state = trie->trans[offset].next;
3944                         }
3945                         else {
3946                             state = 0;
3947                         }
3948                         uc += len;
3949
3950                     }
3951                     else {
3952                         state = 0;
3953                     }
3954                     DEBUG_TRIE_EXECUTE_r(
3955                         PerlIO_printf( Perl_debug_log,
3956                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3957                             charid, uvc, (UV)state, PL_colors[5] );
3958                     );
3959                 }
3960                 if (!accepted)
3961                    sayNO;
3962
3963                 /* calculate total number of accept states */
3964                 {
3965                     U16 w = ST.topword;
3966                     accepted = 0;
3967                     while (w) {
3968                         w = trie->wordinfo[w].prev;
3969                         accepted++;
3970                     }
3971                     ST.accepted = accepted;
3972                 }
3973
3974                 DEBUG_EXECUTE_r(
3975                     PerlIO_printf( Perl_debug_log,
3976                         "%*s  %sgot %"IVdf" possible matches%s\n",
3977                         REPORT_CODE_OFF + depth * 2, "",
3978                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3979                 );
3980                 goto trie_first_try; /* jump into the fail handler */
3981             }}
3982             assert(0); /* NOTREACHED */
3983
3984         case TRIE_next_fail: /* we failed - try next alternative */
3985         {
3986             U8 *uc;
3987             if ( ST.jump) {
3988                 REGCP_UNWIND(ST.cp);
3989                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3990             }
3991             if (!--ST.accepted) {
3992                 DEBUG_EXECUTE_r({
3993                     PerlIO_printf( Perl_debug_log,
3994                         "%*s  %sTRIE failed...%s\n",
3995                         REPORT_CODE_OFF+depth*2, "",
3996                         PL_colors[4],
3997                         PL_colors[5] );
3998                 });
3999                 sayNO_SILENT;
4000             }
4001             {
4002                 /* Find next-highest word to process.  Note that this code
4003                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
4004                 U16 min = 0;
4005                 U16 word;
4006                 U16 const nextword = ST.nextword;
4007                 reg_trie_wordinfo * const wordinfo
4008                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4009                 for (word=ST.topword; word; word=wordinfo[word].prev) {
4010                     if (word > nextword && (!min || word < min))
4011                         min = word;
4012                 }
4013                 ST.nextword = min;
4014             }
4015
4016           trie_first_try:
4017             if (do_cutgroup) {
4018                 do_cutgroup = 0;
4019                 no_final = 0;
4020             }
4021
4022             if ( ST.jump) {
4023                 ST.lastparen = rex->lastparen;
4024                 ST.lastcloseparen = rex->lastcloseparen;
4025                 REGCP_SET(ST.cp);
4026             }
4027
4028             /* find start char of end of current word */
4029             {
4030                 U32 chars; /* how many chars to skip */
4031                 reg_trie_data * const trie
4032                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4033
4034                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4035                             >=  ST.firstchars);
4036                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4037                             - ST.firstchars;
4038                 uc = ST.firstpos;
4039
4040                 if (ST.longfold) {
4041                     /* the hard option - fold each char in turn and find
4042                      * its folded length (which may be different */
4043                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4044                     STRLEN foldlen;
4045                     STRLEN len;
4046                     UV uvc;
4047                     U8 *uscan;
4048
4049                     while (chars) {
4050                         if (utf8_target) {
4051                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4052                                                     uniflags);
4053                             uc += len;
4054                         }
4055                         else {
4056                             uvc = *uc;
4057                             uc++;
4058                         }
4059                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4060                         uscan = foldbuf;
4061                         while (foldlen) {
4062                             if (!--chars)
4063                                 break;
4064                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4065                                             uniflags);
4066                             uscan += len;
4067                             foldlen -= len;
4068                         }
4069                     }
4070                 }
4071                 else {
4072                     if (utf8_target)
4073                         while (chars--)
4074                             uc += UTF8SKIP(uc);
4075                     else
4076                         uc += chars;
4077                 }
4078             }
4079
4080             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4081                             ? ST.jump[ST.nextword]
4082                             : NEXT_OFF(ST.me));
4083
4084             DEBUG_EXECUTE_r({
4085                 PerlIO_printf( Perl_debug_log,
4086                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4087                     REPORT_CODE_OFF+depth*2, "",
4088                     PL_colors[4],
4089                     ST.nextword,
4090                     PL_colors[5]
4091                     );
4092             });
4093
4094             if (ST.accepted > 1 || has_cutgroup) {
4095                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4096                 assert(0); /* NOTREACHED */
4097             }
4098             /* only one choice left - just continue */
4099             DEBUG_EXECUTE_r({
4100                 AV *const trie_words
4101                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4102                 SV ** const tmp = av_fetch( trie_words,
4103                     ST.nextword-1, 0 );
4104                 SV *sv= tmp ? sv_newmortal() : NULL;
4105
4106                 PerlIO_printf( Perl_debug_log,
4107                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4108                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4109                     ST.nextword,
4110                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4111                             PL_colors[0], PL_colors[1],
4112                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4113                         )
4114                     : "not compiled under -Dr",
4115                     PL_colors[5] );
4116             });
4117
4118             locinput = (char*)uc;
4119             continue; /* execute rest of RE */
4120             assert(0); /* NOTREACHED */
4121         }
4122 #undef  ST
4123
4124         case EXACT: {            /*  /abc/        */
4125             char *s = STRING(scan);
4126             ln = STR_LEN(scan);
4127             if (utf8_target != is_utf8_pat) {
4128                 /* The target and the pattern have differing utf8ness. */
4129                 char *l = locinput;
4130                 const char * const e = s + ln;
4131
4132                 if (utf8_target) {
4133                     /* The target is utf8, the pattern is not utf8.
4134                      * Above-Latin1 code points can't match the pattern;
4135                      * invariants match exactly, and the other Latin1 ones need
4136                      * to be downgraded to a single byte in order to do the
4137                      * comparison.  (If we could be confident that the target
4138                      * is not malformed, this could be refactored to have fewer
4139                      * tests by just assuming that if the first bytes match, it
4140                      * is an invariant, but there are tests in the test suite
4141                      * dealing with (??{...}) which violate this) */
4142                     while (s < e) {
4143                         if (l >= reginfo->strend
4144                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4145                         {
4146                             sayNO;
4147                         }
4148                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4149                             if (*l != *s) {
4150                                 sayNO;
4151                             }
4152                             l++;
4153                         }
4154                         else {
4155                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4156                                 sayNO;
4157                             }
4158                             l += 2;
4159                         }
4160                         s++;
4161                     }
4162                 }
4163                 else {
4164                     /* The target is not utf8, the pattern is utf8. */
4165                     while (s < e) {
4166                         if (l >= reginfo->strend
4167                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4168                         {
4169                             sayNO;
4170                         }
4171                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4172                             if (*s != *l) {
4173                                 sayNO;
4174                             }
4175                             s++;
4176                         }
4177                         else {
4178                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4179                                 sayNO;
4180                             }
4181                             s += 2;
4182                         }
4183                         l++;
4184                     }
4185                 }
4186                 locinput = l;
4187             }
4188             else {
4189                 /* The target and the pattern have the same utf8ness. */
4190                 /* Inline the first character, for speed. */
4191                 if (reginfo->strend - locinput < ln
4192                     || UCHARAT(s) != nextchr
4193                     || (ln > 1 && memNE(s, locinput, ln)))
4194                 {
4195                     sayNO;
4196                 }
4197                 locinput += ln;
4198             }
4199             break;
4200             }
4201
4202         case EXACTFL: {          /*  /abc/il      */
4203             re_fold_t folder;
4204             const U8 * fold_array;
4205             const char * s;
4206             U32 fold_utf8_flags;
4207
4208             RX_MATCH_TAINTED_on(reginfo->prog);
4209             folder = foldEQ_locale;
4210             fold_array = PL_fold_locale;
4211             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4212             goto do_exactf;
4213
4214         case EXACTFU_SS:         /*  /\x{df}/iu   */
4215         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4216         case EXACTFU:            /*  /abc/iu      */
4217             folder = foldEQ_latin1;
4218             fold_array = PL_fold_latin1;
4219             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4220             goto do_exactf;
4221
4222         case EXACTFA:            /*  /abc/iaa     */
4223             folder = foldEQ_latin1;
4224             fold_array = PL_fold_latin1;
4225             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4226             goto do_exactf;
4227
4228         case EXACTF:             /*  /abc/i       */
4229             folder = foldEQ;
4230             fold_array = PL_fold;
4231             fold_utf8_flags = 0;
4232
4233           do_exactf:
4234             s = STRING(scan);
4235             ln = STR_LEN(scan);
4236
4237             if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
4238               /* Either target or the pattern are utf8, or has the issue where
4239                * the fold lengths may differ. */
4240                 const char * const l = locinput;
4241                 char *e = reginfo->strend;
4242
4243                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4244                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4245                 {
4246                     sayNO;
4247                 }
4248                 locinput = e;
4249                 break;
4250             }
4251
4252             /* Neither the target nor the pattern are utf8 */
4253             if (UCHARAT(s) != nextchr
4254                 && !NEXTCHR_IS_EOS
4255                 && UCHARAT(s) != fold_array[nextchr])
4256             {
4257                 sayNO;
4258             }
4259             if (reginfo->strend - locinput < ln)
4260                 sayNO;
4261             if (ln > 1 && ! folder(s, locinput, ln))
4262                 sayNO;
4263             locinput += ln;
4264             break;
4265         }
4266
4267         /* XXX Could improve efficiency by separating these all out using a
4268          * macro or in-line function.  At that point regcomp.c would no longer
4269          * have to set the FLAGS fields of these */
4270         case BOUNDL:  /*  /\b/l  */
4271         case NBOUNDL: /*  /\B/l  */
4272             RX_MATCH_TAINTED_on(reginfo->prog);
4273             /* FALL THROUGH */
4274         case BOUND:   /*  /\b/   */
4275         case BOUNDU:  /*  /\b/u  */
4276         case BOUNDA:  /*  /\b/a  */
4277         case NBOUND:  /*  /\B/   */
4278         case NBOUNDU: /*  /\B/u  */
4279         case NBOUNDA: /*  /\B/a  */
4280             /* was last char in word? */
4281             if (utf8_target
4282                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4283                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4284             {
4285                 if (locinput == reginfo->strbeg)
4286                     ln = '\n';
4287                 else {
4288                     const U8 * const r =
4289                             reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4290
4291                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4292                 }
4293                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4294                     ln = isWORDCHAR_uni(ln);
4295                     if (NEXTCHR_IS_EOS)
4296                         n = 0;
4297                     else {
4298                         LOAD_UTF8_CHARCLASS_ALNUM();
4299                         n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4300                                                                 utf8_target);
4301                     }
4302                 }
4303                 else {
4304                     ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
4305                     n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4306                 }
4307             }
4308             else {
4309
4310                 /* Here the string isn't utf8, or is utf8 and only ascii
4311                  * characters are to match \w.  In the latter case looking at
4312                  * the byte just prior to the current one may be just the final
4313                  * byte of a multi-byte character.  This is ok.  There are two
4314                  * cases:
4315                  * 1) it is a single byte character, and then the test is doing
4316                  *      just what it's supposed to.
4317                  * 2) it is a multi-byte character, in which case the final
4318                  *      byte is never mistakable for ASCII, and so the test
4319                  *      will say it is not a word character, which is the
4320                  *      correct answer. */
4321                 ln = (locinput != reginfo->strbeg) ?
4322                     UCHARAT(locinput - 1) : '\n';
4323                 switch (FLAGS(scan)) {
4324                     case REGEX_UNICODE_CHARSET:
4325                         ln = isWORDCHAR_L1(ln);
4326                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4327                         break;
4328                     case REGEX_LOCALE_CHARSET:
4329                         ln = isWORDCHAR_LC(ln);
4330                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4331                         break;
4332                     case REGEX_DEPENDS_CHARSET:
4333                         ln = isWORDCHAR(ln);
4334                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4335                         break;
4336                     case REGEX_ASCII_RESTRICTED_CHARSET:
4337                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4338                         ln = isWORDCHAR_A(ln);
4339                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4340                         break;
4341                     default:
4342                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4343                         break;
4344                 }
4345             }
4346             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4347              * regcomp.sym */
4348             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4349                     sayNO;
4350             break;
4351
4352         case ANYOF:  /*  /[abc]/       */
4353         case ANYOF_WARN_SUPER:
4354             if (NEXTCHR_IS_EOS)
4355                 sayNO;
4356             if (utf8_target) {
4357                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4358                     sayNO;
4359                 locinput += UTF8SKIP(locinput);
4360             }
4361             else {
4362                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4363                     sayNO;
4364                 locinput++;
4365             }
4366             break;
4367
4368         /* The argument (FLAGS) to all the POSIX node types is the class number
4369          * */
4370
4371         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4372             to_complement = 1;
4373             /* FALLTHROUGH */
4374
4375         case POSIXL:    /* \w or [:punct:] etc. under /l */
4376             if (NEXTCHR_IS_EOS)
4377                 sayNO;
4378
4379             /* The locale hasn't influenced the outcome before this, so defer
4380              * tainting until now */
4381             RX_MATCH_TAINTED_on(reginfo->prog);
4382
4383             /* Use isFOO_lc() for characters within Latin1.  (Note that
4384              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4385              * wouldn't be invariant) */
4386             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4387                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4388                     sayNO;
4389                 }
4390             }
4391             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4392                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4393                                         (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
4394                                                             *(locinput + 1))))))
4395                 {
4396                     sayNO;
4397                 }
4398             }
4399             else { /* Here, must be an above Latin-1 code point */
4400                 goto utf8_posix_not_eos;
4401             }
4402
4403             /* Here, must be utf8 */
4404             locinput += UTF8SKIP(locinput);
4405             break;
4406
4407         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4408             to_complement = 1;
4409             /* FALLTHROUGH */
4410
4411         case POSIXD:    /* \w or [:punct:] etc. under /d */
4412             if (utf8_target) {
4413                 goto utf8_posix;
4414             }
4415             goto posixa;
4416
4417         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4418
4419             if (NEXTCHR_IS_EOS) {
4420                 sayNO;
4421             }
4422
4423             /* All UTF-8 variants match */
4424             if (! UTF8_IS_INVARIANT(nextchr)) {
4425                 goto increment_locinput;
4426             }
4427
4428             to_complement = 1;
4429             /* FALLTHROUGH */
4430
4431         case POSIXA:    /* \w or [:punct:] etc. under /a */
4432
4433           posixa:
4434             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4435              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4436              * character is a single byte */
4437
4438             if (NEXTCHR_IS_EOS
4439                 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4440                                                             FLAGS(scan)))))
4441             {
4442                 sayNO;
4443             }
4444
4445             /* Here we are either not in utf8, or we matched a utf8-invariant,
4446              * so the next char is the next byte */
4447             locinput++;
4448             break;
4449
4450         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4451             to_complement = 1;
4452             /* FALLTHROUGH */
4453
4454         case POSIXU:    /* \w or [:punct:] etc. under /u */
4455           utf8_posix:
4456             if (NEXTCHR_IS_EOS) {
4457                 sayNO;
4458             }
4459           utf8_posix_not_eos:
4460
4461             /* Use _generic_isCC() for characters within Latin1.  (Note that
4462              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4463              * wouldn't be invariant) */
4464             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4465                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4466                                                            FLAGS(scan)))))
4467                 {
4468                     sayNO;
4469                 }
4470                 locinput++;
4471             }
4472             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4473                 if (! (to_complement
4474                        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
4475                                                                *(locinput + 1)),
4476                                               FLAGS(scan)))))
4477                 {
4478                     sayNO;
4479                 }
4480                 locinput += 2;
4481             }
4482             else {  /* Handle above Latin-1 code points */
4483                 classnum = (_char_class_number) FLAGS(scan);
4484                 if (classnum < _FIRST_NON_SWASH_CC) {
4485
4486                     /* Here, uses a swash to find such code points.  Load if if
4487                      * not done already */
4488                     if (! PL_utf8_swash_ptrs[classnum]) {
4489                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4490                         PL_utf8_swash_ptrs[classnum]
4491                                 = _core_swash_init("utf8",
4492                                         swash_property_names[classnum],
4493                                         &PL_sv_undef, 1, 0, NULL, &flags);
4494                     }
4495                     if (! (to_complement
4496                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4497                                                (U8 *) locinput, TRUE))))
4498                     {
4499                         sayNO;
4500                     }
4501                 }
4502                 else {  /* Here, uses macros to find above Latin-1 code points */
4503                     switch (classnum) {
4504                         case _CC_ENUM_SPACE:    /* XXX would require separate
4505                                                    code if we revert the change
4506                                                    of \v matching this */
4507                         case _CC_ENUM_PSXSPC:
4508                             if (! (to_complement
4509                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
4510                             {
4511                                 sayNO;
4512                             }
4513                             break;
4514                         case _CC_ENUM_BLANK:
4515                             if (! (to_complement
4516                                             ^ cBOOL(is_HORIZWS_high(locinput))))
4517                             {
4518                                 sayNO;
4519                             }
4520                             break;
4521                         case _CC_ENUM_XDIGIT:
4522                             if (! (to_complement
4523                                             ^ cBOOL(is_XDIGIT_high(locinput))))
4524                             {
4525                                 sayNO;
4526                             }
4527                             break;
4528                         case _CC_ENUM_VERTSPACE:
4529                             if (! (to_complement
4530                                             ^ cBOOL(is_VERTWS_high(locinput))))
4531                             {
4532                                 sayNO;
4533                             }
4534                             break;
4535                         default:    /* The rest, e.g. [:cntrl:], can't match
4536                                        above Latin1 */
4537                             if (! to_complement) {
4538                                 sayNO;
4539                             }
4540                             break;
4541                     }
4542                 }
4543                 locinput += UTF8SKIP(locinput);
4544             }
4545             break;
4546
4547         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4548                        a Unicode extended Grapheme Cluster */
4549             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4550               extended Grapheme Cluster is:
4551
4552             CR LF
4553             | Prepend* Begin Extend*
4554             | .
4555
4556             Begin is:           ( Special_Begin | ! Control )
4557             Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4558             Extend is:          ( Grapheme_Extend | Spacing_Mark )
4559             Control is:         [ GCB_Control | CR | LF ]
4560             Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4561
4562                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4563                we can rewrite
4564
4565                    Begin is ( Regular_Begin + Special Begin )
4566
4567                It turns out that 98.4% of all Unicode code points match
4568                Regular_Begin.  Doing it this way eliminates a table match in
4569                the previous implementation for almost all Unicode code points.
4570
4571                There is a subtlety with Prepend* which showed up in testing.
4572                Note that the Begin, and only the Begin is required in:
4573                 | Prepend* Begin Extend*
4574                Also, Begin contains '! Control'.  A Prepend must be a
4575                '!  Control', which means it must also be a Begin.  What it
4576                comes down to is that if we match Prepend* and then find no
4577                suitable Begin afterwards, that if we backtrack the last
4578                Prepend, that one will be a suitable Begin.
4579             */
4580
4581             if (NEXTCHR_IS_EOS)
4582                 sayNO;
4583             if  (! utf8_target) {
4584
4585                 /* Match either CR LF  or '.', as all the other possibilities
4586                  * require utf8 */
4587                 locinput++;         /* Match the . or CR */
4588                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4589                                        match the LF */
4590                     && locinput < reginfo->strend
4591                     && UCHARAT(locinput) == '\n')
4592                 {
4593                     locinput++;
4594                 }
4595             }
4596             else {
4597
4598                 /* Utf8: See if is ( CR LF ); already know that locinput <
4599                  * reginfo->strend, so locinput+1 is in bounds */
4600                 if ( nextchr == '\r' && locinput+1 < reginfo->strend
4601                      && UCHARAT(locinput + 1) == '\n')
4602                 {
4603                     locinput += 2;
4604                 }
4605                 else {
4606                     STRLEN len;
4607
4608                     /* In case have to backtrack to beginning, then match '.' */
4609                     char *starting = locinput;
4610
4611                     /* In case have to backtrack the last prepend */
4612                     char *previous_prepend = NULL;
4613
4614                     LOAD_UTF8_CHARCLASS_GCB();
4615
4616                     /* Match (prepend)*   */
4617                     while (locinput < reginfo->strend
4618                            && (len = is_GCB_Prepend_utf8(locinput)))
4619                     {
4620                         previous_prepend = locinput;
4621                         locinput += len;
4622                     }
4623
4624                     /* As noted above, if we matched a prepend character, but
4625                      * the next thing won't match, back off the last prepend we
4626                      * matched, as it is guaranteed to match the begin */
4627                     if (previous_prepend
4628                         && (locinput >=  reginfo->strend
4629                             || (! swash_fetch(PL_utf8_X_regular_begin,
4630                                              (U8*)locinput, utf8_target)
4631                                  && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4632                         )
4633                     {
4634                         locinput = previous_prepend;
4635                     }
4636
4637                     /* Note that here we know reginfo->strend > locinput, as we
4638                      * tested that upon input to this switch case, and if we
4639                      * moved locinput forward, we tested the result just above
4640                      * and it either passed, or we backed off so that it will
4641                      * now pass */
4642                     if (swash_fetch(PL_utf8_X_regular_begin,
4643                                     (U8*)locinput, utf8_target)) {
4644                         locinput += UTF8SKIP(locinput);
4645                     }
4646                     else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4647
4648                         /* Here did not match the required 'Begin' in the
4649                          * second term.  So just match the very first
4650                          * character, the '.' of the final term of the regex */
4651                         locinput = starting + UTF8SKIP(starting);
4652                         goto exit_utf8;
4653                     } else {
4654
4655                         /* Here is a special begin.  It can be composed of
4656                          * several individual characters.  One possibility is
4657                          * RI+ */
4658                         if ((len = is_GCB_RI_utf8(locinput))) {
4659                             locinput += len;
4660                             while (locinput < reginfo->strend
4661                                    && (len = is_GCB_RI_utf8(locinput)))
4662                             {
4663                                 locinput += len;
4664                             }
4665                         } else if ((len = is_GCB_T_utf8(locinput))) {
4666                             /* Another possibility is T+ */
4667                             locinput += len;
4668                             while (locinput < reginfo->strend
4669                                 && (len = is_GCB_T_utf8(locinput)))
4670                             {
4671                                 locinput += len;
4672                             }
4673                         } else {
4674
4675                             /* Here, neither RI+ nor T+; must be some other
4676                              * Hangul.  That means it is one of the others: L,
4677                              * LV, LVT or V, and matches:
4678                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4679
4680                             /* Match L*           */
4681                             while (locinput < reginfo->strend
4682                                    && (len = is_GCB_L_utf8(locinput)))
4683                             {
4684                                 locinput += len;
4685                             }
4686
4687                             /* Here, have exhausted L*.  If the next character
4688                              * is not an LV, LVT nor V, it means we had to have
4689                              * at least one L, so matches L+ in the original
4690                              * equation, we have a complete hangul syllable.
4691                              * Are done. */
4692
4693                             if (locinput < reginfo->strend
4694                                 && is_GCB_LV_LVT_V_utf8(locinput))
4695                             {
4696                                 /* Otherwise keep going.  Must be LV, LVT or V.
4697                                  * See if LVT, by first ruling out V, then LV */
4698                                 if (! is_GCB_V_utf8(locinput)
4699                                         /* All but every TCount one is LV */
4700                                     && (valid_utf8_to_uvchr((U8 *) locinput,
4701                                                                          NULL)
4702                                                                         - SBASE)
4703                                         % TCount != 0)
4704                                 {
4705                                     locinput += UTF8SKIP(locinput);
4706                                 } else {
4707
4708                                     /* Must be  V or LV.  Take it, then match
4709                                      * V*     */
4710                                     locinput += UTF8SKIP(locinput);
4711                                     while (locinput < reginfo->strend
4712                                            && (len = is_GCB_V_utf8(locinput)))
4713                                     {
4714                                         locinput += len;
4715                                     }
4716                                 }
4717
4718                                 /* And any of LV, LVT, or V can be followed
4719                                  * by T*            */
4720                                 while (locinput < reginfo->strend
4721                                        && (len = is_GCB_T_utf8(locinput)))
4722                                 {
4723                                     locinput += len;
4724                                 }
4725                             }
4726                         }
4727                     }
4728
4729                     /* Match any extender */
4730                     while (locinput < reginfo->strend
4731                             && swash_fetch(PL_utf8_X_extend,
4732                                             (U8*)locinput, utf8_target))
4733                     {
4734                         locinput += UTF8SKIP(locinput);
4735                     }
4736                 }
4737             exit_utf8:
4738                 if (locinput > reginfo->strend) sayNO;
4739             }
4740             break;
4741
4742         case NREFFL:  /*  /\g{name}/il  */
4743         {   /* The capture buffer cases.  The ones beginning with N for the
4744                named buffers just convert to the equivalent numbered and
4745                pretend they were called as the corresponding numbered buffer
4746                op.  */
4747             /* don't initialize these in the declaration, it makes C++
4748                unhappy */
4749             const char *s;
4750             char type;
4751             re_fold_t folder;
4752             const U8 *fold_array;
4753             UV utf8_fold_flags;
4754
4755             RX_MATCH_TAINTED_on(reginfo->prog);
4756             folder = foldEQ_locale;
4757             fold_array = PL_fold_locale;
4758             type = REFFL;
4759             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4760             goto do_nref;
4761
4762         case NREFFA:  /*  /\g{name}/iaa  */
4763             folder = foldEQ_latin1;
4764             fold_array = PL_fold_latin1;
4765             type = REFFA;
4766             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4767             goto do_nref;
4768
4769         case NREFFU:  /*  /\g{name}/iu  */
4770             folder = foldEQ_latin1;
4771             fold_array = PL_fold_latin1;
4772             type = REFFU;
4773             utf8_fold_flags = 0;
4774             goto do_nref;
4775
4776         case NREFF:  /*  /\g{name}/i  */
4777             folder = foldEQ;
4778             fold_array = PL_fold;
4779             type = REFF;
4780             utf8_fold_flags = 0;
4781             goto do_nref;
4782
4783         case NREF:  /*  /\g{name}/   */
4784             type = REF;
4785             folder = NULL;
4786             fold_array = NULL;
4787             utf8_fold_flags = 0;
4788           do_nref:
4789
4790             /* For the named back references, find the corresponding buffer
4791              * number */
4792             n = reg_check_named_buff_matched(rex,scan);
4793
4794             if ( ! n ) {
4795                 sayNO;
4796             }
4797             goto do_nref_ref_common;
4798
4799         case REFFL:  /*  /\1/il  */
4800             RX_MATCH_TAINTED_on(reginfo->prog);
4801             folder = foldEQ_locale;
4802             fold_array = PL_fold_locale;
4803             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4804             goto do_ref;
4805
4806         case REFFA:  /*  /\1/iaa  */
4807             folder = foldEQ_latin1;
4808             fold_array = PL_fold_latin1;
4809             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4810             goto do_ref;
4811
4812         case REFFU:  /*  /\1/iu  */
4813             folder = foldEQ_latin1;
4814             fold_array = PL_fold_latin1;
4815             utf8_fold_flags = 0;
4816             goto do_ref;
4817
4818         case REFF:  /*  /\1/i  */
4819             folder = foldEQ;
4820             fold_array = PL_fold;
4821             utf8_fold_flags = 0;
4822             goto do_ref;
4823
4824         case REF:  /*  /\1/    */
4825             folder = NULL;
4826             fold_array = NULL;
4827             utf8_fold_flags = 0;
4828
4829           do_ref:
4830             type = OP(scan);
4831             n = ARG(scan);  /* which paren pair */
4832
4833           do_nref_ref_common:
4834             ln = rex->offs[n].start;
4835             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
4836             if (rex->lastparen < n || ln == -1)
4837                 sayNO;                  /* Do not match unless seen CLOSEn. */
4838             if (ln == rex->offs[n].end)
4839                 break;
4840
4841             s = reginfo->strbeg + ln;
4842             if (type != REF     /* REF can do byte comparison */
4843                 && (utf8_target || type == REFFU))
4844             { /* XXX handle REFFL better */
4845                 char * limit = reginfo->strend;
4846
4847                 /* This call case insensitively compares the entire buffer
4848                     * at s, with the current input starting at locinput, but
4849                     * not going off the end given by reginfo->strend, and
4850                     * returns in <limit> upon success, how much of the
4851                     * current input was matched */
4852                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4853                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4854                 {
4855                     sayNO;
4856                 }
4857                 locinput = limit;
4858                 break;
4859             }
4860
4861             /* Not utf8:  Inline the first character, for speed. */
4862             if (!NEXTCHR_IS_EOS &&
4863                 UCHARAT(s) != nextchr &&
4864                 (type == REF ||
4865                  UCHARAT(s) != fold_array[nextchr]))
4866                 sayNO;
4867             ln = rex->offs[n].end - ln;
4868             if (locinput + ln > reginfo->strend)
4869                 sayNO;
4870             if (ln > 1 && (type == REF
4871                            ? memNE(s, locinput, ln)
4872                            : ! folder(s, locinput, ln)))
4873                 sayNO;
4874             locinput += ln;
4875             break;
4876         }
4877
4878         case NOTHING: /* null op; e.g. the 'nothing' following
4879                        * the '*' in m{(a+|b)*}' */
4880             break;
4881         case TAIL: /* placeholder while compiling (A|B|C) */
4882             break;
4883
4884         case BACK: /* ??? doesn't appear to be used ??? */
4885             break;
4886
4887 #undef  ST
4888 #define ST st->u.eval
4889         {
4890             SV *ret;
4891             REGEXP *re_sv;
4892             regexp *re;
4893             regexp_internal *rei;
4894             regnode *startpoint;
4895
4896         case GOSTART: /*  (?R)  */
4897         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4898             if (cur_eval && cur_eval->locinput==locinput) {
4899                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4900                     Perl_croak(aTHX_ "Infinite recursion in regex");
4901                 if ( ++nochange_depth > max_nochange_depth )
4902                     Perl_croak(aTHX_
4903                         "Pattern subroutine nesting without pos change"
4904                         " exceeded limit in regex");
4905             } else {
4906                 nochange_depth = 0;
4907             }
4908             re_sv = rex_sv;
4909             re = rex;
4910             rei = rexi;
4911             if (OP(scan)==GOSUB) {
4912                 startpoint = scan + ARG2L(scan);
4913                 ST.close_paren = ARG(scan);
4914             } else {
4915                 startpoint = rei->program+1;
4916                 ST.close_paren = 0;
4917             }
4918             goto eval_recurse_doit;
4919             assert(0); /* NOTREACHED */
4920
4921         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4922             if (cur_eval && cur_eval->locinput==locinput) {
4923                 if ( ++nochange_depth > max_nochange_depth )
4924                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4925             } else {
4926                 nochange_depth = 0;
4927             }
4928             {
4929                 /* execute the code in the {...} */
4930
4931                 dSP;
4932                 IV before;
4933                 OP * const oop = PL_op;
4934                 COP * const ocurcop = PL_curcop;
4935                 OP *nop;
4936                 CV *newcv;
4937
4938                 /* save *all* paren positions */
4939                 regcppush(rex, 0, maxopenparen);
4940                 REGCP_SET(runops_cp);
4941
4942                 if (!caller_cv)
4943                     caller_cv = find_runcv(NULL);
4944
4945                 n = ARG(scan);
4946
4947                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4948                     newcv = (ReANY(
4949                                                 (REGEXP*)(rexi->data->data[n])
4950                                             ))->qr_anoncv
4951                                         ;
4952                     nop = (OP*)rexi->data->data[n+1];
4953                 }
4954                 else if (rexi->data->what[n] == 'l') { /* literal code */
4955                     newcv = caller_cv;
4956                     nop = (OP*)rexi->data->data[n];
4957                     assert(CvDEPTH(newcv));
4958                 }
4959                 else {
4960                     /* literal with own CV */
4961                     assert(rexi->data->what[n] == 'L');
4962                     newcv = rex->qr_anoncv;
4963                     nop = (OP*)rexi->data->data[n];
4964                 }
4965
4966                 /* normally if we're about to execute code from the same
4967                  * CV that we used previously, we just use the existing
4968                  * CX stack entry. However, its possible that in the
4969                  * meantime we may have backtracked, popped from the save
4970                  * stack, and undone the SAVECOMPPAD(s) associated with
4971                  * PUSH_MULTICALL; in which case PL_comppad no longer
4972                  * points to newcv's pad. */
4973                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4974                 {
4975                     U8 flags = (CXp_SUB_RE |
4976                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
4977                     if (last_pushed_cv) {
4978                         CHANGE_MULTICALL_FLAGS(newcv, flags);
4979                     }
4980                     else {
4981                         PUSH_MULTICALL_FLAGS(newcv, flags);
4982                     }
4983                     last_pushed_cv = newcv;
4984                 }
4985                 else {
4986                     /* these assignments are just to silence compiler
4987                      * warnings */
4988                     multicall_cop = NULL;
4989                     newsp = NULL;
4990                 }
4991                 last_pad = PL_comppad;
4992
4993                 /* the initial nextstate you would normally execute
4994                  * at the start of an eval (which would cause error
4995                  * messages to come from the eval), may be optimised
4996                  * away from the execution path in the regex code blocks;
4997                  * so manually set PL_curcop to it initially */
4998                 {
4999                     OP *o = cUNOPx(nop)->op_first;
5000                     assert(o->op_type == OP_NULL);
5001                     if (o->op_targ == OP_SCOPE) {
5002                         o = cUNOPo->op_first;
5003                     }
5004                     else {
5005                         assert(o->op_targ == OP_LEAVE);
5006                         o = cUNOPo->op_first;
5007                         assert(o->op_type == OP_ENTER);
5008                         o = o->op_sibling;
5009                     }
5010
5011                     if (o->op_type != OP_STUB) {
5012                         assert(    o->op_type == OP_NEXTSTATE
5013                                 || o->op_type == OP_DBSTATE
5014                                 || (o->op_type == OP_NULL
5015                                     &&  (  o->op_targ == OP_NEXTSTATE
5016                                         || o->op_targ == OP_DBSTATE
5017                                         )
5018                                     )
5019                         );
5020                         PL_curcop = (COP*)o;
5021                     }
5022                 }
5023                 nop = nop->op_next;
5024
5025                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
5026                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5027
5028                 rex->offs[0].end = locinput - reginfo->strbeg;
5029                 if (reginfo->info_aux_eval->pos_magic)
5030                         reginfo->info_aux_eval->pos_magic->mg_len
5031                                         = locinput - reginfo->strbeg;
5032
5033                 if (sv_yes_mark) {
5034                     SV *sv_mrk = get_sv("REGMARK", 1);
5035                     sv_setsv(sv_mrk, sv_yes_mark);
5036                 }
5037
5038                 /* we don't use MULTICALL here as we want to call the
5039                  * first op of the block of interest, rather than the
5040                  * first op of the sub */
5041                 before = (IV)(SP-PL_stack_base);
5042                 PL_op = nop;
5043                 CALLRUNOPS(aTHX);                       /* Scalar context. */
5044                 SPAGAIN;
5045                 if ((IV)(SP-PL_stack_base) == before)
5046                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
5047                 else {
5048                     ret = POPs;
5049                     PUTBACK;
5050                 }
5051
5052                 /* before restoring everything, evaluate the returned
5053                  * value, so that 'uninit' warnings don't use the wrong
5054                  * PL_op or pad. Also need to process any magic vars
5055                  * (e.g. $1) *before* parentheses are restored */
5056
5057                 PL_op = NULL;
5058
5059                 re_sv = NULL;
5060                 if (logical == 0)        /*   (?{})/   */
5061                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5062                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
5063                     sw = cBOOL(SvTRUE(ret));
5064                     logical = 0;
5065                 }
5066                 else {                   /*  /(??{})  */
5067                     /*  if its overloaded, let the regex compiler handle
5068                      *  it; otherwise extract regex, or stringify  */
5069                     if (!SvAMAGIC(ret)) {
5070                         SV *sv = ret;
5071                         if (SvROK(sv))
5072                             sv = SvRV(sv);
5073                         if (SvTYPE(sv) == SVt_REGEXP)
5074                             re_sv = (REGEXP*) sv;
5075                         else if (SvSMAGICAL(sv)) {
5076                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5077                             if (mg)
5078                                 re_sv = (REGEXP *) mg->mg_obj;
5079                         }
5080
5081                         /* force any magic, undef warnings here */
5082                         if (!re_sv) {
5083                             ret = sv_mortalcopy(ret);
5084                             (void) SvPV_force_nolen(ret);
5085                         }
5086                     }
5087
5088                 }
5089
5090                 /* *** Note that at this point we don't restore
5091                  * PL_comppad, (or pop the CxSUB) on the assumption it may
5092                  * be used again soon. This is safe as long as nothing
5093                  * in the regexp code uses the pad ! */
5094                 PL_op = oop;
5095                 PL_curcop = ocurcop;
5096                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5097                 PL_curpm = PL_reg_curpm;
5098
5099                 if (logical != 2)
5100                     break;
5101             }
5102
5103                 /* only /(??{})/  from now on */
5104                 logical = 0;
5105                 {
5106                     /* extract RE object from returned value; compiling if
5107                      * necessary */
5108
5109                     if (re_sv) {
5110                         re_sv = reg_temp_copy(NULL, re_sv);
5111                     }
5112                     else {
5113                         U32 pm_flags = 0;
5114
5115                         if (SvUTF8(ret) && IN_BYTES) {
5116                             /* In use 'bytes': make a copy of the octet
5117                              * sequence, but without the flag on */
5118                             STRLEN len;
5119                             const char *const p = SvPV(ret, len);
5120                             ret = newSVpvn_flags(p, len, SVs_TEMP);
5121                         }
5122                         if (rex->intflags & PREGf_USE_RE_EVAL)
5123                             pm_flags |= PMf_USE_RE_EVAL;
5124
5125                         /* if we got here, it should be an engine which
5126                          * supports compiling code blocks and stuff */
5127                         assert(rex->engine && rex->engine->op_comp);
5128                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5129                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5130                                     rex->engine, NULL, NULL,
5131                                     /* copy /msix etc to inner pattern */
5132                                     scan->flags,
5133                                     pm_flags);
5134
5135                         if (!(SvFLAGS(ret)
5136                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5137                                  | SVs_GMG))) {
5138                             /* This isn't a first class regexp. Instead, it's
5139                                caching a regexp onto an existing, Perl visible
5140                                scalar.  */
5141                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5142                         }
5143                         /* safe to do now that any $1 etc has been
5144                          * interpolated into the new pattern string and
5145                          * compiled */
5146                         S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5147                     }
5148                     SAVEFREESV(re_sv);
5149                     re = ReANY(re_sv);
5150                 }
5151                 RXp_MATCH_COPIED_off(re);
5152                 re->subbeg = rex->subbeg;
5153                 re->sublen = rex->sublen;
5154                 re->suboffset = rex->suboffset;
5155                 re->subcoffset = rex->subcoffset;
5156                 rei = RXi_GET(re);
5157                 DEBUG_EXECUTE_r(
5158                     debug_start_match(re_sv, utf8_target, locinput,
5159                                     reginfo->strend, "Matching embedded");
5160                 );
5161                 startpoint = rei->program + 1;
5162                 ST.close_paren = 0; /* only used for GOSUB */
5163
5164         eval_recurse_doit: /* Share code with GOSUB below this line */
5165                 /* run the pattern returned from (??{...}) */
5166
5167                 /* Save *all* the positions. */
5168                 ST.cp = regcppush(rex, 0, maxopenparen);
5169                 REGCP_SET(ST.lastcp);
5170
5171                 re->lastparen = 0;
5172                 re->lastcloseparen = 0;
5173
5174                 maxopenparen = 0;
5175
5176                 /* invalidate the S-L poscache. We're now executing a
5177                  * different set of WHILEM ops (and their associated
5178                  * indexes) against the same string, so the bits in the
5179                  * cache are meaningless. Setting maxiter to zero forces
5180                  * the cache to be invalidated and zeroed before reuse.
5181                  * XXX This is too dramatic a measure. Ideally we should
5182                  * save the old cache and restore when running the outer
5183                  * pattern again */
5184                 reginfo->poscache_maxiter = 0;
5185
5186                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5187
5188                 ST.prev_rex = rex_sv;
5189                 ST.prev_curlyx = cur_curlyx;
5190                 rex_sv = re_sv;
5191                 SET_reg_curpm(rex_sv);
5192                 rex = re;
5193                 rexi = rei;
5194                 cur_curlyx = NULL;
5195                 ST.B = next;
5196                 ST.prev_eval = cur_eval;
5197                 cur_eval = st;
5198                 /* now continue from first node in postoned RE */
5199                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5200                 assert(0); /* NOTREACHED */
5201         }
5202
5203         case EVAL_AB: /* cleanup after a successful (??{A})B */
5204             /* note: this is called twice; first after popping B, then A */
5205             rex_sv = ST.prev_rex;
5206             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5207             SET_reg_curpm(rex_sv);
5208             rex = ReANY(rex_sv);
5209             rexi = RXi_GET(rex);
5210             regcpblow(ST.cp);
5211             cur_eval = ST.prev_eval;
5212             cur_curlyx = ST.prev_curlyx;
5213
5214             /* Invalidate cache. See "invalidate" comment above. */
5215             reginfo->poscache_maxiter = 0;
5216             if ( nochange_depth )
5217                 nochange_depth--;
5218             sayYES;
5219
5220
5221         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5222             /* note: this is called twice; first after popping B, then A */
5223             rex_sv = ST.prev_rex;
5224             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5225             SET_reg_curpm(rex_sv);
5226             rex = ReANY(rex_sv);
5227             rexi = RXi_GET(rex);
5228
5229             REGCP_UNWIND(ST.lastcp);
5230             regcppop(rex, &maxopenparen);
5231             cur_eval = ST.prev_eval;
5232             cur_curlyx = ST.prev_curlyx;
5233             /* Invalidate cache. See "invalidate" comment above. */
5234             reginfo->poscache_maxiter = 0;
5235             if ( nochange_depth )
5236                 nochange_depth--;
5237             sayNO_SILENT;
5238 #undef ST
5239
5240         case OPEN: /*  (  */
5241             n = ARG(scan);  /* which paren pair */
5242             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5243             if (n > maxopenparen)
5244                 maxopenparen = n;
5245             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5246                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5247                 PTR2UV(rex),
5248                 PTR2UV(rex->offs),
5249                 (UV)n,
5250                 (IV)rex->offs[n].start_tmp,
5251                 (UV)maxopenparen
5252             ));
5253             lastopen = n;
5254             break;
5255
5256 /* XXX really need to log other places start/end are set too */
5257 #define CLOSE_CAPTURE \
5258     rex->offs[n].start = rex->offs[n].start_tmp; \
5259     rex->offs[n].end = locinput - reginfo->strbeg; \
5260     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5261         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5262         PTR2UV(rex), \
5263         PTR2UV(rex->offs), \
5264         (UV)n, \
5265         (IV)rex->offs[n].start, \
5266         (IV)rex->offs[n].end \
5267     ))
5268
5269         case CLOSE:  /*  )  */
5270             n = ARG(scan);  /* which paren pair */
5271             CLOSE_CAPTURE;
5272             if (n > rex->lastparen)
5273                 rex->lastparen = n;
5274             rex->lastcloseparen = n;
5275             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5276                 goto fake_end;
5277             }
5278             break;
5279
5280         case ACCEPT:  /*  (*ACCEPT)  */
5281             if (ARG(scan)){
5282                 regnode *cursor;
5283                 for (cursor=scan;
5284                      cursor && OP(cursor)!=END;
5285                      cursor=regnext(cursor))
5286                 {
5287                     if ( OP(cursor)==CLOSE ){
5288                         n = ARG(cursor);
5289                         if ( n <= lastopen ) {
5290                             CLOSE_CAPTURE;
5291                             if (n > rex->lastparen)
5292                                 rex->lastparen = n;
5293                             rex->lastcloseparen = n;
5294                             if ( n == ARG(scan) || (cur_eval &&
5295                                 cur_eval->u.eval.close_paren == n))
5296                                 break;
5297                         }
5298                     }
5299                 }
5300             }
5301             goto fake_end;
5302             /*NOTREACHED*/
5303
5304         case GROUPP:  /*  (?(1))  */
5305             n = ARG(scan);  /* which paren pair */
5306             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5307             break;
5308
5309         case NGROUPP:  /*  (?(<name>))  */
5310             /* reg_check_named_buff_matched returns 0 for no match */
5311             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5312             break;
5313
5314         case INSUBP:   /*  (?(R))  */
5315             n = ARG(scan);
5316             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5317             break;
5318
5319         case DEFINEP:  /*  (?(DEFINE))  */
5320             sw = 0;
5321             break;
5322
5323         case IFTHEN:   /*  (?(cond)A|B)  */
5324             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5325             if (sw)
5326                 next = NEXTOPER(NEXTOPER(scan));
5327             else {
5328                 next = scan + ARG(scan);
5329                 if (OP(next) == IFTHEN) /* Fake one. */
5330                     next = NEXTOPER(NEXTOPER(next));
5331             }
5332             break;
5333
5334         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5335             logical = scan->flags;
5336             break;
5337
5338 /*******************************************************************
5339
5340 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5341 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5342 STAR/PLUS/CURLY/CURLYN are used instead.)
5343
5344 A*B is compiled as <CURLYX><A><WHILEM><B>
5345
5346 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5347 state, which contains the current count, initialised to -1. It also sets
5348 cur_curlyx to point to this state, with any previous value saved in the
5349 state block.
5350
5351 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5352 since the pattern may possibly match zero times (i.e. it's a while {} loop
5353 rather than a do {} while loop).
5354
5355 Each entry to WHILEM represents a successful match of A. The count in the
5356 CURLYX block is incremented, another WHILEM state is pushed, and execution
5357 passes to A or B depending on greediness and the current count.
5358
5359 For example, if matching against the string a1a2a3b (where the aN are
5360 substrings that match /A/), then the match progresses as follows: (the
5361 pushed states are interspersed with the bits of strings matched so far):
5362
5363     <CURLYX cnt=-1>
5364     <CURLYX cnt=0><WHILEM>
5365     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5366     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5367     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5368     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5369
5370 (Contrast this with something like CURLYM, which maintains only a single
5371 backtrack state:
5372
5373     <CURLYM cnt=0> a1
5374     a1 <CURLYM cnt=1> a2
5375     a1 a2 <CURLYM cnt=2> a3
5376     a1 a2 a3 <CURLYM cnt=3> b
5377 )
5378
5379 Each WHILEM state block marks a point to backtrack to upon partial failure
5380 of A or B, and also contains some minor state data related to that
5381 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5382 overall state, such as the count, and pointers to the A and B ops.
5383
5384 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5385 must always point to the *current* CURLYX block, the rules are:
5386
5387 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5388 and set cur_curlyx to point the new block.
5389
5390 When popping the CURLYX block after a successful or unsuccessful match,
5391 restore the previous cur_curlyx.
5392
5393 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5394 to the outer one saved in the CURLYX block.
5395
5396 When popping the WHILEM block after a successful or unsuccessful B match,
5397 restore the previous cur_curlyx.
5398
5399 Here's an example for the pattern (AI* BI)*BO
5400 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5401
5402 cur_
5403 curlyx backtrack stack
5404 ------ ---------------
5405 NULL
5406 CO     <CO prev=NULL> <WO>
5407 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5408 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5409 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5410
5411 At this point the pattern succeeds, and we work back down the stack to
5412 clean up, restoring as we go:
5413
5414 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5415 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5416 CO     <CO prev=NULL> <WO>
5417 NULL
5418
5419 *******************************************************************/
5420
5421 #define ST st->u.curlyx
5422
5423         case CURLYX:    /* start of /A*B/  (for complex A) */
5424         {
5425             /* No need to save/restore up to this paren */
5426             I32 parenfloor = scan->flags;
5427
5428             assert(next); /* keep Coverity happy */
5429             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5430                 next += ARG(next);
5431
5432             /* XXXX Probably it is better to teach regpush to support
5433                parenfloor > maxopenparen ... */
5434             if (parenfloor > (I32)rex->lastparen)
5435                 parenfloor = rex->lastparen; /* Pessimization... */
5436
5437             ST.prev_curlyx= cur_curlyx;
5438             cur_curlyx = st;
5439             ST.cp = PL_savestack_ix;
5440
5441             /* these fields contain the state of the current curly.
5442              * they are accessed by subsequent WHILEMs */
5443             ST.parenfloor = parenfloor;
5444             ST.me = scan;
5445             ST.B = next;
5446             ST.minmod = minmod;
5447             minmod = 0;
5448             ST.count = -1;      /* this will be updated by WHILEM */
5449             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5450
5451             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5452             assert(0); /* NOTREACHED */
5453         }
5454
5455         case CURLYX_end: /* just finished matching all of A*B */
5456             cur_curlyx = ST.prev_curlyx;
5457             sayYES;
5458             assert(0); /* NOTREACHED */
5459
5460         case CURLYX_end_fail: /* just failed to match all of A*B */
5461             regcpblow(ST.cp);
5462             cur_curlyx = ST.prev_curlyx;
5463             sayNO;
5464             assert(0); /* NOTREACHED */
5465
5466
5467 #undef ST
5468 #define ST st->u.whilem
5469
5470         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5471         {
5472             /* see the discussion above about CURLYX/WHILEM */
5473             I32 n;
5474             int min = ARG1(cur_curlyx->u.curlyx.me);
5475             int max = ARG2(cur_curlyx->u.curlyx.me);
5476             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5477
5478             assert(cur_curlyx); /* keep Coverity happy */
5479             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5480             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5481             ST.cache_offset = 0;
5482             ST.cache_mask = 0;
5483
5484
5485             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5486                   "%*s  whilem: matched %ld out of %d..%d\n",
5487                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5488             );
5489
5490             /* First just match a string of min A's. */
5491
5492             if (n < min) {
5493                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5494                                     maxopenparen);
5495                 cur_curlyx->u.curlyx.lastloc = locinput;
5496                 REGCP_SET(ST.lastcp);
5497
5498                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5499                 assert(0); /* NOTREACHED */
5500             }
5501
5502             /* If degenerate A matches "", assume A done. */
5503
5504             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5505                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5506                    "%*s  whilem: empty match detected, trying continuation...\n",
5507                    REPORT_CODE_OFF+depth*2, "")
5508                 );
5509                 goto do_whilem_B_max;
5510             }
5511
5512             /* super-linear cache processing.
5513              *
5514              * The idea here is that for certain types of CURLYX/WHILEM -
5515              * principally those whose upper bound is infinity (and
5516              * excluding regexes that have things like \1 and other very
5517              * non-regular expresssiony things), then if a pattern like
5518              * /....A*.../ fails and we backtrack to the WHILEM, then we
5519              * make a note that this particular WHILEM op was at string
5520              * position 47 (say) when the rest of pattern failed. Then, if
5521              * we ever find ourselves back at that WHILEM, and at string
5522              * position 47 again, we can just fail immediately rather than
5523              * running the rest of the pattern again.
5524              *
5525              * This is very handy when patterns start to go
5526              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5527              * with a combinatorial explosion of backtracking.
5528              *
5529              * The cache is implemented as a bit array, with one bit per
5530              * string byte position per WHILEM op (up to 16) - so its
5531              * between 0.25 and 2x the string size.
5532              *
5533              * To avoid allocating a poscache buffer every time, we do an
5534              * initially countdown; only after we have  executed a WHILEM
5535              * op (string-length x #WHILEMs) times do we allocate the
5536              * cache.
5537              *
5538              * The top 4 bits of scan->flags byte say how many different
5539              * relevant CURLLYX/WHILEM op pairs there are, while the
5540              * bottom 4-bits is the identifying index number of this
5541              * WHILEM.
5542              */
5543
5544             if (scan->flags) {
5545
5546                 if (!reginfo->poscache_maxiter) {
5547                     /* start the countdown: Postpone detection until we
5548                      * know the match is not *that* much linear. */
5549                     reginfo->poscache_maxiter
5550                         =    (reginfo->strend - reginfo->strbeg + 1)
5551                            * (scan->flags>>4);
5552                     /* possible overflow for long strings and many CURLYX's */
5553                     if (reginfo->poscache_maxiter < 0)
5554                         reginfo->poscache_maxiter = I32_MAX;
5555                     reginfo->poscache_iter = reginfo->poscache_maxiter;
5556                 }
5557
5558                 if (reginfo->poscache_iter-- == 0) {
5559                     /* initialise cache */
5560                     const I32 size = (reginfo->poscache_maxiter + 7)/8;
5561                     regmatch_info_aux *const aux = reginfo->info_aux;
5562                     if (aux->poscache) {
5563                         if ((I32)reginfo->poscache_size < size) {
5564                             Renew(aux->poscache, size, char);
5565                             reginfo->poscache_size = size;
5566                         }
5567                         Zero(aux->poscache, size, char);
5568                     }
5569                     else {
5570                         reginfo->poscache_size = size;
5571                         Newxz(aux->poscache, size, char);
5572                     }
5573                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5574       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5575                               PL_colors[4], PL_colors[5])
5576                     );
5577                 }
5578
5579                 if (reginfo->poscache_iter < 0) {
5580                     /* have we already failed at this position? */
5581                     I32 offset, mask;
5582
5583                     reginfo->poscache_iter = -1; /* stop eventual underflow */
5584                     offset  = (scan->flags & 0xf) - 1
5585                                 +   (locinput - reginfo->strbeg)
5586                                   * (scan->flags>>4);
5587                     mask    = 1 << (offset % 8);
5588                     offset /= 8;
5589                     if (reginfo->info_aux->poscache[offset] & mask) {
5590                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5591                             "%*s  whilem: (cache) already tried at this position...\n",
5592                             REPORT_CODE_OFF+depth*2, "")
5593                         );
5594                         sayNO; /* cache records failure */
5595                     }
5596                     ST.cache_offset = offset;
5597                     ST.cache_mask   = mask;
5598                 }
5599             }
5600
5601             /* Prefer B over A for minimal matching. */
5602
5603             if (cur_curlyx->u.curlyx.minmod) {
5604                 ST.save_curlyx = cur_curlyx;
5605                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5606                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5607                             maxopenparen);
5608                 REGCP_SET(ST.lastcp);
5609                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5610                                     locinput);
5611                 assert(0); /* NOTREACHED */
5612             }
5613
5614             /* Prefer A over B for maximal matching. */
5615
5616             if (n < max) { /* More greed allowed? */
5617                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5618                             maxopenparen);
5619                 cur_curlyx->u.curlyx.lastloc = locinput;
5620                 REGCP_SET(ST.lastcp);
5621                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5622                 assert(0); /* NOTREACHED */
5623             }
5624             goto do_whilem_B_max;
5625         }
5626         assert(0); /* NOTREACHED */
5627
5628         case WHILEM_B_min: /* just matched B in a minimal match */
5629         case WHILEM_B_max: /* just matched B in a maximal match */
5630             cur_curlyx = ST.save_curlyx;
5631             sayYES;
5632             assert(0); /* NOTREACHED */
5633
5634         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5635             cur_curlyx = ST.save_curlyx;
5636             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5637             cur_curlyx->u.curlyx.count--;
5638             CACHEsayNO;
5639             assert(0); /* NOTREACHED */
5640
5641         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5642             /* FALL THROUGH */
5643         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5644             REGCP_UNWIND(ST.lastcp);
5645             regcppop(rex, &maxopenparen);
5646             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5647             cur_curlyx->u.curlyx.count--;
5648             CACHEsayNO;
5649             assert(0); /* NOTREACHED */
5650
5651         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5652             REGCP_UNWIND(ST.lastcp);
5653             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5654             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5655                 "%*s  whilem: failed, trying continuation...\n",
5656                 REPORT_CODE_OFF+depth*2, "")
5657             );
5658           do_whilem_B_max:
5659             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5660                 && ckWARN(WARN_REGEXP)
5661                 && !reginfo->warned)
5662             {
5663                 reginfo->warned = TRUE;
5664                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5665                      "Complex regular subexpression recursion limit (%d) "
5666                      "exceeded",
5667                      REG_INFTY - 1);
5668             }
5669
5670             /* now try B */
5671             ST.save_curlyx = cur_curlyx;
5672             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5673             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5674                                 locinput);
5675             assert(0); /* NOTREACHED */
5676
5677         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5678             cur_curlyx = ST.save_curlyx;
5679             REGCP_UNWIND(ST.lastcp);
5680             regcppop(rex, &maxopenparen);
5681
5682             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5683                 /* Maximum greed exceeded */
5684                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5685                     && ckWARN(WARN_REGEXP)
5686                     && !reginfo->warned)
5687                 {
5688                     reginfo->warned     = TRUE;
5689                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5690                         "Complex regular subexpression recursion "
5691                         "limit (%d) exceeded",
5692                         REG_INFTY - 1);
5693                 }
5694                 cur_curlyx->u.curlyx.count--;
5695                 CACHEsayNO;
5696             }
5697
5698             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5699                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5700             );
5701             /* Try grabbing another A and see if it helps. */
5702             cur_curlyx->u.curlyx.lastloc = locinput;
5703             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5704                             maxopenparen);
5705             REGCP_SET(ST.lastcp);
5706             PUSH_STATE_GOTO(WHILEM_A_min,
5707                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5708                 locinput);
5709             assert(0); /* NOTREACHED */
5710
5711 #undef  ST
5712 #define ST st->u.branch
5713
5714         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5715             next = scan + ARG(scan);
5716             if (next == scan)
5717                 next = NULL;
5718             scan = NEXTOPER(scan);
5719             /* FALL THROUGH */
5720
5721         case BRANCH:        /*  /(...|A|...)/ */
5722             scan = NEXTOPER(scan); /* scan now points to inner node */
5723             ST.lastparen = rex->lastparen;
5724             ST.lastcloseparen = rex->lastcloseparen;
5725             ST.next_branch = next;
5726             REGCP_SET(ST.cp);
5727
5728             /* Now go into the branch */
5729             if (has_cutgroup) {
5730                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5731             } else {
5732                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5733             }
5734             assert(0); /* NOTREACHED */
5735
5736         case CUTGROUP:  /*  /(*THEN)/  */
5737             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5738                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5739             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5740             assert(0); /* NOTREACHED */
5741
5742         case CUTGROUP_next_fail:
5743             do_cutgroup = 1;
5744             no_final = 1;
5745             if (st->u.mark.mark_name)
5746                 sv_commit = st->u.mark.mark_name;
5747             sayNO;
5748             assert(0); /* NOTREACHED */
5749
5750         case BRANCH_next:
5751             sayYES;
5752             assert(0); /* NOTREACHED */
5753
5754         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5755             if (do_cutgroup) {
5756                 do_cutgroup = 0;
5757                 no_final = 0;
5758             }
5759             REGCP_UNWIND(ST.cp);
5760             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5761             scan = ST.next_branch;
5762             /* no more branches? */
5763             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5764                 DEBUG_EXECUTE_r({
5765                     PerlIO_printf( Perl_debug_log,
5766                         "%*s  %sBRANCH failed...%s\n",
5767                         REPORT_CODE_OFF+depth*2, "",
5768                         PL_colors[4],
5769                         PL_colors[5] );
5770                 });
5771                 sayNO_SILENT;
5772             }
5773             continue; /* execute next BRANCH[J] op */
5774             assert(0); /* NOTREACHED */
5775
5776         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5777             minmod = 1;
5778             break;
5779
5780 #undef  ST
5781 #define ST st->u.curlym
5782
5783         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5784
5785             /* This is an optimisation of CURLYX that enables us to push
5786              * only a single backtracking state, no matter how many matches
5787              * there are in {m,n}. It relies on the pattern being constant
5788              * length, with no parens to influence future backrefs
5789              */
5790
5791             ST.me = scan;
5792             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5793
5794             ST.lastparen      = rex->lastparen;
5795             ST.lastcloseparen = rex->lastcloseparen;
5796
5797             /* if paren positive, emulate an OPEN/CLOSE around A */
5798             if (ST.me->flags) {
5799                 U32 paren = ST.me->flags;
5800                 if (paren > maxopenparen)
5801                     maxopenparen = paren;
5802                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5803             }
5804             ST.A = scan;
5805             ST.B = next;
5806             ST.alen = 0;
5807             ST.count = 0;
5808             ST.minmod = minmod;
5809             minmod = 0;
5810             ST.c1 = CHRTEST_UNINIT;
5811             REGCP_SET(ST.cp);
5812
5813             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5814                 goto curlym_do_B;
5815
5816           curlym_do_A: /* execute the A in /A{m,n}B/  */
5817             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5818             assert(0); /* NOTREACHED */
5819
5820         case CURLYM_A: /* we've just matched an A */
5821             ST.count++;
5822             /* after first match, determine A's length: u.curlym.alen */
5823             if (ST.count == 1) {
5824                 if (reginfo->is_utf8_target) {
5825                     char *s = st->locinput;
5826                     while (s < locinput) {
5827                         ST.alen++;
5828                         s += UTF8SKIP(s);
5829                     }
5830                 }
5831                 else {
5832                     ST.alen = locinput - st->locinput;
5833                 }
5834                 if (ST.alen == 0)
5835                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5836             }
5837             DEBUG_EXECUTE_r(
5838                 PerlIO_printf(Perl_debug_log,
5839                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5840                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5841                           (IV) ST.count, (IV)ST.alen)
5842             );
5843
5844             if (cur_eval && cur_eval->u.eval.close_paren &&
5845                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5846                 goto fake_end;
5847
5848             {
5849                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5850                 if ( max == REG_INFTY || ST.count < max )
5851                     goto curlym_do_A; /* try to match another A */
5852             }
5853             goto curlym_do_B; /* try to match B */
5854
5855         case CURLYM_A_fail: /* just failed to match an A */
5856             REGCP_UNWIND(ST.cp);
5857
5858             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5859                 || (cur_eval && cur_eval->u.eval.close_paren &&
5860                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5861                 sayNO;
5862
5863           curlym_do_B: /* execute the B in /A{m,n}B/  */
5864             if (ST.c1 == CHRTEST_UNINIT) {
5865                 /* calculate c1 and c2 for possible match of 1st char
5866                  * following curly */
5867                 ST.c1 = ST.c2 = CHRTEST_VOID;
5868                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5869                     regnode *text_node = ST.B;
5870                     if (! HAS_TEXT(text_node))
5871                         FIND_NEXT_IMPT(text_node);
5872                     /* this used to be
5873
5874                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5875
5876                         But the former is redundant in light of the latter.
5877
5878                         if this changes back then the macro for
5879                         IS_TEXT and friends need to change.
5880                      */
5881                     if (PL_regkind[OP(text_node)] == EXACT) {
5882                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5883                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5884                            reginfo))
5885                         {
5886                             sayNO;
5887                         }
5888                     }
5889                 }
5890             }
5891
5892             DEBUG_EXECUTE_r(
5893                 PerlIO_printf(Perl_debug_log,
5894                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5895                     (int)(REPORT_CODE_OFF+(depth*2)),
5896                     "", (IV)ST.count)
5897                 );
5898             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5899                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5900                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5901                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5902                     {
5903                         /* simulate B failing */
5904                         DEBUG_OPTIMISE_r(
5905                             PerlIO_printf(Perl_debug_log,
5906                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5907                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5908                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5909                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5910                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5911                         );
5912                         state_num = CURLYM_B_fail;
5913                         goto reenter_switch;
5914                     }
5915                 }
5916                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5917                     /* simulate B failing */
5918                     DEBUG_OPTIMISE_r(
5919                         PerlIO_printf(Perl_debug_log,
5920                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5921                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5922                             (int) nextchr, ST.c1, ST.c2)
5923                     );
5924                     state_num = CURLYM_B_fail;
5925                     goto reenter_switch;
5926                 }
5927             }
5928
5929             if (ST.me->flags) {
5930                 /* emulate CLOSE: mark current A as captured */
5931                 I32 paren = ST.me->flags;
5932                 if (ST.count) {
5933                     rex->offs[paren].start
5934                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
5935                     rex->offs[paren].end = locinput - reginfo->strbeg;
5936                     if ((U32)paren > rex->lastparen)
5937                         rex->lastparen = paren;
5938                     rex->lastcloseparen = paren;
5939                 }
5940                 else
5941                     rex->offs[paren].end = -1;
5942                 if (cur_eval && cur_eval->u.eval.close_paren &&
5943                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5944                 {
5945                     if (ST.count)
5946                         goto fake_end;
5947                     else
5948                         sayNO;
5949                 }
5950             }
5951
5952             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5953             assert(0); /* NOTREACHED */
5954
5955         case CURLYM_B_fail: /* just failed to match a B */
5956             REGCP_UNWIND(ST.cp);
5957             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5958             if (ST.minmod) {
5959                 I32 max = ARG2(ST.me);
5960                 if (max != REG_INFTY && ST.count == max)
5961                     sayNO;
5962                 goto curlym_do_A; /* try to match a further A */
5963             }
5964             /* backtrack one A */
5965             if (ST.count == ARG1(ST.me) /* min */)
5966                 sayNO;
5967             ST.count--;
5968             SET_locinput(HOPc(locinput, -ST.alen));
5969             goto curlym_do_B; /* try to match B */
5970
5971 #undef ST
5972 #define ST st->u.curly
5973
5974 #define CURLY_SETPAREN(paren, success) \
5975     if (paren) { \
5976         if (success) { \
5977             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
5978             rex->offs[paren].end = locinput - reginfo->strbeg; \
5979             if (paren > rex->lastparen) \
5980                 rex->lastparen = paren; \
5981             rex->lastcloseparen = paren; \
5982         } \
5983         else { \
5984             rex->offs[paren].end = -1; \
5985             rex->lastparen      = ST.lastparen; \
5986             rex->lastcloseparen = ST.lastcloseparen; \
5987         } \
5988     }
5989
5990         case STAR:              /*  /A*B/ where A is width 1 char */
5991             ST.paren = 0;
5992             ST.min = 0;
5993             ST.max = REG_INFTY;
5994             scan = NEXTOPER(scan);
5995             goto repeat;
5996
5997         case PLUS:              /*  /A+B/ where A is width 1 char */
5998             ST.paren = 0;
5999             ST.min = 1;
6000             ST.max = REG_INFTY;
6001             scan = NEXTOPER(scan);
6002             goto repeat;
6003
6004         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
6005             ST.paren = scan->flags;     /* Which paren to set */
6006             ST.lastparen      = rex->lastparen;
6007             ST.lastcloseparen = rex->lastcloseparen;
6008             if (ST.paren > maxopenparen)
6009                 maxopenparen = ST.paren;
6010             ST.min = ARG1(scan);  /* min to match */
6011             ST.max = ARG2(scan);  /* max to match */
6012             if (cur_eval && cur_eval->u.eval.close_paren &&
6013                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6014                 ST.min=1;
6015                 ST.max=1;
6016             }
6017             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6018             goto repeat;
6019
6020         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
6021             ST.paren = 0;
6022             ST.min = ARG1(scan);  /* min to match */
6023             ST.max = ARG2(scan);  /* max to match */
6024             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6025           repeat:
6026             /*
6027             * Lookahead to avoid useless match attempts
6028             * when we know what character comes next.
6029             *
6030             * Used to only do .*x and .*?x, but now it allows
6031             * for )'s, ('s and (?{ ... })'s to be in the way
6032             * of the quantifier and the EXACT-like node.  -- japhy
6033             */
6034
6035             assert(ST.min <= ST.max);
6036             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6037                 ST.c1 = ST.c2 = CHRTEST_VOID;
6038             }
6039             else {
6040                 regnode *text_node = next;
6041
6042                 if (! HAS_TEXT(text_node))
6043                     FIND_NEXT_IMPT(text_node);
6044
6045                 if (! HAS_TEXT(text_node))
6046                     ST.c1 = ST.c2 = CHRTEST_VOID;
6047                 else {
6048                     if ( PL_regkind[OP(text_node)] != EXACT ) {
6049                         ST.c1 = ST.c2 = CHRTEST_VOID;
6050                     }
6051                     else {
6052
6053                     /*  Currently we only get here when
6054
6055                         PL_rekind[OP(text_node)] == EXACT
6056
6057                         if this changes back then the macro for IS_TEXT and
6058                         friends need to change. */
6059                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6060                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6061                            reginfo))
6062                         {
6063                             sayNO;
6064                         }
6065                     }
6066                 }
6067             }
6068
6069             ST.A = scan;
6070             ST.B = next;
6071             if (minmod) {
6072                 char *li = locinput;
6073                 minmod = 0;
6074                 if (ST.min &&
6075                         regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
6076                             < ST.min)
6077                     sayNO;
6078                 SET_locinput(li);
6079                 ST.count = ST.min;
6080                 REGCP_SET(ST.cp);
6081                 if (ST.c1 == CHRTEST_VOID)
6082                     goto curly_try_B_min;
6083
6084                 ST.oldloc = locinput;
6085
6086                 /* set ST.maxpos to the furthest point along the
6087                  * string that could possibly match */
6088                 if  (ST.max == REG_INFTY) {
6089                     ST.maxpos = reginfo->strend - 1;
6090                     if (utf8_target)
6091                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6092                             ST.maxpos--;
6093                 }
6094                 else if (utf8_target) {
6095                     int m = ST.max - ST.min;
6096                     for (ST.maxpos = locinput;
6097                          m >0 && ST.maxpos < reginfo->strend; m--)
6098                         ST.maxpos += UTF8SKIP(ST.maxpos);
6099                 }
6100                 else {
6101                     ST.maxpos = locinput + ST.max - ST.min;
6102                     if (ST.maxpos >= reginfo->strend)
6103                         ST.maxpos = reginfo->strend - 1;
6104                 }
6105                 goto curly_try_B_min_known;
6106
6107             }
6108             else {
6109                 /* avoid taking address of locinput, so it can remain
6110                  * a register var */
6111                 char *li = locinput;
6112                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6113                 if (ST.count < ST.min)
6114                     sayNO;
6115                 SET_locinput(li);
6116                 if ((ST.count > ST.min)
6117                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6118                 {
6119                     /* A{m,n} must come at the end of the string, there's
6120                      * no point in backing off ... */
6121                     ST.min = ST.count;
6122                     /* ...except that $ and \Z can match before *and* after
6123                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6124                        We may back off by one in this case. */
6125                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6126                         ST.min--;
6127                 }
6128                 REGCP_SET(ST.cp);
6129                 goto curly_try_B_max;
6130             }
6131             assert(0); /* NOTREACHED */
6132
6133
6134         case CURLY_B_min_known_fail:
6135             /* failed to find B in a non-greedy match where c1,c2 valid */
6136
6137             REGCP_UNWIND(ST.cp);
6138             if (ST.paren) {
6139                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6140             }
6141             /* Couldn't or didn't -- move forward. */
6142             ST.oldloc = locinput;
6143             if (utf8_target)
6144                 locinput += UTF8SKIP(locinput);
6145             else
6146                 locinput++;
6147             ST.count++;
6148           curly_try_B_min_known:
6149              /* find the next place where 'B' could work, then call B */
6150             {
6151                 int n;
6152                 if (utf8_target) {
6153                     n = (ST.oldloc == locinput) ? 0 : 1;
6154                     if (ST.c1 == ST.c2) {
6155                         /* set n to utf8_distance(oldloc, locinput) */
6156                         while (locinput <= ST.maxpos
6157                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6158                         {
6159                             locinput += UTF8SKIP(locinput);
6160                             n++;
6161                         }
6162                     }
6163                     else {
6164                         /* set n to utf8_distance(oldloc, locinput) */
6165                         while (locinput <= ST.maxpos
6166                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6167                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6168                         {
6169                             locinput += UTF8SKIP(locinput);
6170                             n++;
6171                         }
6172                     }
6173                 }
6174                 else {  /* Not utf8_target */
6175                     if (ST.c1 == ST.c2) {
6176                         while (locinput <= ST.maxpos &&
6177                                UCHARAT(locinput) != ST.c1)
6178                             locinput++;
6179                     }
6180                     else {
6181                         while (locinput <= ST.maxpos
6182                                && UCHARAT(locinput) != ST.c1
6183                                && UCHARAT(locinput) != ST.c2)
6184                             locinput++;
6185                     }
6186                     n = locinput - ST.oldloc;
6187                 }
6188                 if (locinput > ST.maxpos)
6189                     sayNO;
6190                 if (n) {
6191                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6192                      * at b; check that everything between oldloc and
6193                      * locinput matches */
6194                     char *li = ST.oldloc;
6195                     ST.count += n;
6196                     if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6197                         sayNO;
6198                     assert(n == REG_INFTY || locinput == li);
6199                 }
6200                 CURLY_SETPAREN(ST.paren, ST.count);
6201                 if (cur_eval && cur_eval->u.eval.close_paren &&
6202                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6203                     goto fake_end;
6204                 }
6205                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6206             }
6207             assert(0); /* NOTREACHED */
6208
6209
6210         case CURLY_B_min_fail:
6211             /* failed to find B in a non-greedy match where c1,c2 invalid */
6212
6213             REGCP_UNWIND(ST.cp);
6214             if (ST.paren) {
6215                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6216             }
6217             /* failed -- move forward one */
6218             {
6219                 char *li = locinput;
6220                 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6221                     sayNO;
6222                 }
6223                 locinput = li;
6224             }
6225             {
6226                 ST.count++;
6227                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6228                         ST.count > 0)) /* count overflow ? */
6229                 {
6230                   curly_try_B_min:
6231                     CURLY_SETPAREN(ST.paren, ST.count);
6232                     if (cur_eval && cur_eval->u.eval.close_paren &&
6233                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6234                         goto fake_end;
6235                     }
6236                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6237                 }
6238             }
6239             sayNO;
6240             assert(0); /* NOTREACHED */
6241
6242
6243         curly_try_B_max:
6244             /* a successful greedy match: now try to match B */
6245             if (cur_eval && cur_eval->u.eval.close_paren &&
6246                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6247                 goto fake_end;
6248             }
6249             {
6250                 bool could_match = locinput < reginfo->strend;
6251
6252                 /* If it could work, try it. */
6253                 if (ST.c1 != CHRTEST_VOID && could_match) {
6254                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6255                     {
6256                         could_match = memEQ(locinput,
6257                                             ST.c1_utf8,
6258                                             UTF8SKIP(locinput))
6259                                     || memEQ(locinput,
6260                                              ST.c2_utf8,
6261                                              UTF8SKIP(locinput));
6262                     }
6263                     else {
6264                         could_match = UCHARAT(locinput) == ST.c1
6265                                       || UCHARAT(locinput) == ST.c2;
6266                     }
6267                 }
6268                 if (ST.c1 == CHRTEST_VOID || could_match) {
6269                     CURLY_SETPAREN(ST.paren, ST.count);
6270                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6271                     assert(0); /* NOTREACHED */
6272                 }
6273             }
6274             /* FALL THROUGH */
6275
6276         case CURLY_B_max_fail:
6277             /* failed to find B in a greedy match */
6278
6279             REGCP_UNWIND(ST.cp);
6280             if (ST.paren) {
6281                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6282             }
6283             /*  back up. */
6284             if (--ST.count < ST.min)
6285                 sayNO;
6286             locinput = HOPc(locinput, -1);
6287             goto curly_try_B_max;
6288
6289 #undef ST
6290
6291         case END: /*  last op of main pattern  */
6292             fake_end:
6293             if (cur_eval) {
6294                 /* we've just finished A in /(??{A})B/; now continue with B */
6295
6296                 st->u.eval.prev_rex = rex_sv;           /* inner */
6297
6298                 /* Save *all* the positions. */
6299                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6300                 rex_sv = cur_eval->u.eval.prev_rex;
6301                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6302                 SET_reg_curpm(rex_sv);
6303                 rex = ReANY(rex_sv);
6304                 rexi = RXi_GET(rex);
6305                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6306
6307                 REGCP_SET(st->u.eval.lastcp);
6308
6309                 /* Restore parens of the outer rex without popping the
6310                  * savestack */
6311                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6312                                         &maxopenparen);
6313
6314                 st->u.eval.prev_eval = cur_eval;
6315                 cur_eval = cur_eval->u.eval.prev_eval;
6316                 DEBUG_EXECUTE_r(
6317                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6318                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6319                 if ( nochange_depth )
6320                     nochange_depth--;
6321
6322                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6323                                     locinput); /* match B */
6324             }
6325
6326             if (locinput < reginfo->till) {
6327                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6328                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6329                                       PL_colors[4],
6330                                       (long)(locinput - startpos),
6331                                       (long)(reginfo->till - startpos),
6332                                       PL_colors[5]));
6333
6334                 sayNO_SILENT;           /* Cannot match: too short. */
6335             }
6336             sayYES;                     /* Success! */
6337
6338         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6339             DEBUG_EXECUTE_r(
6340             PerlIO_printf(Perl_debug_log,
6341                 "%*s  %ssubpattern success...%s\n",
6342                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6343             sayYES;                     /* Success! */
6344
6345 #undef  ST
6346 #define ST st->u.ifmatch
6347
6348         {
6349             char *newstart;
6350
6351         case SUSPEND:   /* (?>A) */
6352             ST.wanted = 1;
6353             newstart = locinput;
6354             goto do_ifmatch;
6355
6356         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6357             ST.wanted = 0;
6358             goto ifmatch_trivial_fail_test;
6359
6360         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6361             ST.wanted = 1;
6362           ifmatch_trivial_fail_test:
6363             if (scan->flags) {
6364                 char * const s = HOPBACKc(locinput, scan->flags);
6365                 if (!s) {
6366                     /* trivial fail */
6367                     if (logical) {
6368                         logical = 0;
6369                         sw = 1 - cBOOL(ST.wanted);
6370                     }
6371                     else if (ST.wanted)
6372                         sayNO;
6373                     next = scan + ARG(scan);
6374                     if (next == scan)
6375                         next = NULL;
6376                     break;
6377                 }
6378                 newstart = s;
6379             }
6380             else
6381                 newstart = locinput;
6382
6383           do_ifmatch:
6384             ST.me = scan;
6385             ST.logical = logical;
6386             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6387
6388             /* execute body of (?...A) */
6389             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6390             assert(0); /* NOTREACHED */
6391         }
6392
6393         case IFMATCH_A_fail: /* body of (?...A) failed */
6394             ST.wanted = !ST.wanted;
6395             /* FALL THROUGH */
6396
6397         case IFMATCH_A: /* body of (?...A) succeeded */
6398             if (ST.logical) {
6399                 sw = cBOOL(ST.wanted);
6400             }
6401             else if (!ST.wanted)
6402                 sayNO;
6403
6404             if (OP(ST.me) != SUSPEND) {
6405                 /* restore old position except for (?>...) */
6406                 locinput = st->locinput;
6407             }
6408             scan = ST.me + ARG(ST.me);
6409             if (scan == ST.me)
6410                 scan = NULL;
6411             continue; /* execute B */
6412
6413 #undef ST
6414
6415         case LONGJMP: /*  alternative with many branches compiles to
6416                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6417             next = scan + ARG(scan);
6418             if (next == scan)
6419                 next = NULL;
6420             break;
6421
6422         case COMMIT:  /*  (*COMMIT)  */
6423             reginfo->cutpoint = reginfo->strend;
6424             /* FALLTHROUGH */
6425
6426         case PRUNE:   /*  (*PRUNE)   */
6427             if (!scan->flags)
6428                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6429             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6430             assert(0); /* NOTREACHED */
6431
6432         case COMMIT_next_fail:
6433             no_final = 1;
6434             /* FALLTHROUGH */
6435
6436         case OPFAIL:   /* (*FAIL)  */
6437             sayNO;
6438             assert(0); /* NOTREACHED */
6439
6440 #define ST st->u.mark
6441         case MARKPOINT: /*  (*MARK:foo)  */
6442             ST.prev_mark = mark_state;
6443             ST.mark_name = sv_commit = sv_yes_mark
6444                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6445             mark_state = st;
6446             ST.mark_loc = locinput;
6447             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6448             assert(0); /* NOTREACHED */
6449
6450         case MARKPOINT_next:
6451             mark_state = ST.prev_mark;
6452             sayYES;
6453             assert(0); /* NOTREACHED */
6454
6455         case MARKPOINT_next_fail:
6456             if (popmark && sv_eq(ST.mark_name,popmark))
6457             {
6458                 if (ST.mark_loc > startpoint)
6459                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6460                 popmark = NULL; /* we found our mark */
6461                 sv_commit = ST.mark_name;
6462
6463                 DEBUG_EXECUTE_r({
6464                         PerlIO_printf(Perl_debug_log,
6465                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6466                             REPORT_CODE_OFF+depth*2, "",
6467                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6468                 });
6469             }
6470             mark_state = ST.prev_mark;
6471             sv_yes_mark = mark_state ?
6472                 mark_state->u.mark.mark_name : NULL;
6473             sayNO;
6474             assert(0); /* NOTREACHED */
6475
6476         case SKIP:  /*  (*SKIP)  */
6477             if (scan->flags) {
6478                 /* (*SKIP) : if we fail we cut here*/
6479                 ST.mark_name = NULL;
6480                 ST.mark_loc = locinput;
6481                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6482             } else {
6483                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6484                    otherwise do nothing.  Meaning we need to scan
6485                  */
6486                 regmatch_state *cur = mark_state;
6487                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6488
6489                 while (cur) {
6490                     if ( sv_eq( cur->u.mark.mark_name,
6491                                 find ) )
6492                     {
6493                         ST.mark_name = find;
6494                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6495                     }
6496                     cur = cur->u.mark.prev_mark;
6497                 }
6498             }
6499             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6500             break;
6501
6502         case SKIP_next_fail:
6503             if (ST.mark_name) {
6504                 /* (*CUT:NAME) - Set up to search for the name as we
6505                    collapse the stack*/
6506                 popmark = ST.mark_name;
6507             } else {
6508                 /* (*CUT) - No name, we cut here.*/
6509                 if (ST.mark_loc > startpoint)
6510                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6511                 /* but we set sv_commit to latest mark_name if there
6512                    is one so they can test to see how things lead to this
6513                    cut */
6514                 if (mark_state)
6515                     sv_commit=mark_state->u.mark.mark_name;
6516             }
6517             no_final = 1;
6518             sayNO;
6519             assert(0); /* NOTREACHED */
6520 #undef ST
6521
6522         case LNBREAK: /* \R */
6523             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6524                 locinput += n;
6525             } else
6526                 sayNO;
6527             break;
6528
6529         default:
6530             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6531                           PTR2UV(scan), OP(scan));
6532             Perl_croak(aTHX_ "regexp memory corruption");
6533
6534         /* this is a point to jump to in order to increment
6535          * locinput by one character */
6536         increment_locinput:
6537             assert(!NEXTCHR_IS_EOS);
6538             if (utf8_target) {
6539                 locinput += PL_utf8skip[nextchr];
6540                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6541                 if (locinput > reginfo->strend)
6542                     sayNO;
6543             }
6544             else
6545                 locinput++;
6546             break;
6547
6548         } /* end switch */
6549
6550         /* switch break jumps here */
6551         scan = next; /* prepare to execute the next op and ... */
6552         continue;    /* ... jump back to the top, reusing st */
6553         assert(0); /* NOTREACHED */
6554
6555       push_yes_state:
6556         /* push a state that backtracks on success */
6557         st->u.yes.prev_yes_state = yes_state;
6558         yes_state = st;
6559         /* FALL THROUGH */
6560       push_state:
6561         /* push a new regex state, then continue at scan  */
6562         {
6563             regmatch_state *newst;
6564
6565             DEBUG_STACK_r({
6566                 regmatch_state *cur = st;
6567                 regmatch_state *curyes = yes_state;
6568                 int curd = depth;
6569                 regmatch_slab *slab = PL_regmatch_slab;
6570                 for (;curd > -1;cur--,curd--) {
6571                     if (cur < SLAB_FIRST(slab)) {
6572                         slab = slab->prev;
6573                         cur = SLAB_LAST(slab);
6574                     }
6575                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6576                         REPORT_CODE_OFF + 2 + depth * 2,"",
6577                         curd, PL_reg_name[cur->resume_state],
6578                         (curyes == cur) ? "yes" : ""
6579                     );
6580                     if (curyes == cur)
6581                         curyes = cur->u.yes.prev_yes_state;
6582                 }
6583             } else
6584                 DEBUG_STATE_pp("push")
6585             );
6586             depth++;
6587             st->locinput = locinput;
6588             newst = st+1;
6589             if (newst >  SLAB_LAST(PL_regmatch_slab))
6590                 newst = S_push_slab(aTHX);
6591             PL_regmatch_state = newst;
6592
6593             locinput = pushinput;
6594             st = newst;
6595             continue;
6596             assert(0); /* NOTREACHED */
6597         }
6598     }
6599
6600     /*
6601     * We get here only if there's trouble -- normally "case END" is
6602     * the terminating point.
6603     */
6604     Perl_croak(aTHX_ "corrupted regexp pointers");
6605     /*NOTREACHED*/
6606     sayNO;
6607
6608 yes:
6609     if (yes_state) {
6610         /* we have successfully completed a subexpression, but we must now
6611          * pop to the state marked by yes_state and continue from there */
6612         assert(st != yes_state);
6613 #ifdef DEBUGGING
6614         while (st != yes_state) {
6615             st--;
6616             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6617                 PL_regmatch_slab = PL_regmatch_slab->prev;
6618                 st = SLAB_LAST(PL_regmatch_slab);
6619             }
6620             DEBUG_STATE_r({
6621                 if (no_final) {
6622                     DEBUG_STATE_pp("pop (no final)");
6623                 } else {
6624                     DEBUG_STATE_pp("pop (yes)");
6625                 }
6626             });
6627             depth--;
6628         }
6629 #else
6630         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6631             || yes_state > SLAB_LAST(PL_regmatch_slab))
6632         {
6633             /* not in this slab, pop slab */
6634             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6635             PL_regmatch_slab = PL_regmatch_slab->prev;
6636             st = SLAB_LAST(PL_regmatch_slab);
6637         }
6638         depth -= (st - yes_state);
6639 #endif
6640         st = yes_state;
6641         yes_state = st->u.yes.prev_yes_state;
6642         PL_regmatch_state = st;
6643
6644         if (no_final)
6645             locinput= st->locinput;
6646         state_num = st->resume_state + no_final;
6647         goto reenter_switch;
6648     }
6649
6650     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6651                           PL_colors[4], PL_colors[5]));
6652
6653     if (reginfo->info_aux_eval) {
6654         /* each successfully executed (?{...}) block does the equivalent of
6655          *   local $^R = do {...}
6656          * When popping the save stack, all these locals would be undone;
6657          * bypass this by setting the outermost saved $^R to the latest
6658          * value */
6659         if (oreplsv != GvSV(PL_replgv))
6660             sv_setsv(oreplsv, GvSV(PL_replgv));
6661     }
6662     result = 1;
6663     goto final_exit;
6664
6665 no:
6666     DEBUG_EXECUTE_r(
6667         PerlIO_printf(Perl_debug_log,
6668             "%*s  %sfailed...%s\n",
6669             REPORT_CODE_OFF+depth*2, "",
6670             PL_colors[4], PL_colors[5])
6671         );
6672
6673 no_silent:
6674     if (no_final) {
6675         if (yes_state) {
6676             goto yes;
6677         } else {
6678             goto final_exit;
6679         }
6680     }
6681     if (depth) {
6682         /* there's a previous state to backtrack to */
6683         st--;
6684         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6685             PL_regmatch_slab = PL_regmatch_slab->prev;
6686             st = SLAB_LAST(PL_regmatch_slab);
6687         }
6688         PL_regmatch_state = st;
6689         locinput= st->locinput;
6690
6691         DEBUG_STATE_pp("pop");
6692         depth--;
6693         if (yes_state == st)
6694             yes_state = st->u.yes.prev_yes_state;
6695
6696         state_num = st->resume_state + 1; /* failure = success + 1 */
6697         goto reenter_switch;
6698     }
6699     result = 0;
6700
6701   final_exit:
6702     if (rex->intflags & PREGf_VERBARG_SEEN) {
6703         SV *sv_err = get_sv("REGERROR", 1);
6704         SV *sv_mrk = get_sv("REGMARK", 1);
6705         if (result) {
6706             sv_commit = &PL_sv_no;
6707             if (!sv_yes_mark)
6708                 sv_yes_mark = &PL_sv_yes;
6709         } else {
6710             if (!sv_commit)
6711                 sv_commit = &PL_sv_yes;
6712             sv_yes_mark = &PL_sv_no;
6713         }
6714         sv_setsv(sv_err, sv_commit);
6715         sv_setsv(sv_mrk, sv_yes_mark);
6716     }
6717
6718
6719     if (last_pushed_cv) {
6720         dSP;
6721         POP_MULTICALL;
6722         PERL_UNUSED_VAR(SP);
6723     }
6724
6725     assert(!result ||  locinput - reginfo->strbeg >= 0);
6726     return result ?  locinput - reginfo->strbeg : -1;
6727 }
6728
6729 /*
6730  - regrepeat - repeatedly match something simple, report how many
6731  *
6732  * What 'simple' means is a node which can be the operand of a quantifier like
6733  * '+', or {1,3}
6734  *
6735  * startposp - pointer a pointer to the start position.  This is updated
6736  *             to point to the byte following the highest successful
6737  *             match.
6738  * p         - the regnode to be repeatedly matched against.
6739  * reginfo   - struct holding match state, such as strend
6740  * max       - maximum number of things to match.
6741  * depth     - (for debugging) backtracking depth.
6742  */
6743 STATIC I32
6744 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
6745             regmatch_info *const reginfo, I32 max, int depth)
6746 {
6747     dVAR;
6748     char *scan;     /* Pointer to current position in target string */
6749     I32 c;
6750     char *loceol = reginfo->strend;   /* local version */
6751     I32 hardcount = 0;  /* How many matches so far */
6752     bool utf8_target = reginfo->is_utf8_target;
6753     int to_complement = 0;  /* Invert the result? */
6754     UV utf8_flags;
6755     _char_class_number classnum;
6756 #ifndef DEBUGGING
6757     PERL_UNUSED_ARG(depth);
6758 #endif
6759
6760     PERL_ARGS_ASSERT_REGREPEAT;
6761
6762     scan = *startposp;
6763     if (max == REG_INFTY)
6764         max = I32_MAX;
6765     else if (! utf8_target && loceol - scan > max)
6766         loceol = scan + max;
6767
6768     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6769      * to the maximum of how far we should go in it (leaving it set to the real
6770      * end, if the maximum permissible would take us beyond that).  This allows
6771      * us to make the loop exit condition that we haven't gone past <loceol> to
6772      * also mean that we haven't exceeded the max permissible count, saving a
6773      * test each time through the loop.  But it assumes that the OP matches a
6774      * single byte, which is true for most of the OPs below when applied to a
6775      * non-UTF-8 target.  Those relatively few OPs that don't have this
6776      * characteristic will have to compensate.
6777      *
6778      * There is no adjustment for UTF-8 targets, as the number of bytes per
6779      * character varies.  OPs will have to test both that the count is less
6780      * than the max permissible (using <hardcount> to keep track), and that we
6781      * are still within the bounds of the string (using <loceol>.  A few OPs
6782      * match a single byte no matter what the encoding.  They can omit the max
6783      * test if, for the UTF-8 case, they do the adjustment that was skipped
6784      * above.
6785      *
6786      * Thus, the code above sets things up for the common case; and exceptional
6787      * cases need extra work; the common case is to make sure <scan> doesn't
6788      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6789      * count doesn't exceed the maximum permissible */
6790
6791     switch (OP(p)) {
6792     case REG_ANY:
6793         if (utf8_target) {
6794             while (scan < loceol && hardcount < max && *scan != '\n') {
6795                 scan += UTF8SKIP(scan);
6796                 hardcount++;
6797             }
6798         } else {
6799             while (scan < loceol && *scan != '\n')
6800                 scan++;
6801         }
6802         break;
6803     case SANY:
6804         if (utf8_target) {
6805             while (scan < loceol && hardcount < max) {
6806                 scan += UTF8SKIP(scan);
6807                 hardcount++;
6808             }
6809         }
6810         else
6811             scan = loceol;
6812         break;
6813     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6814         if (utf8_target && loceol - scan > max) {
6815
6816             /* <loceol> hadn't been adjusted in the UTF-8 case */
6817             scan +=  max;
6818         }
6819         else {
6820             scan = loceol;
6821         }
6822         break;
6823     case EXACT:
6824         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6825
6826         c = (U8)*STRING(p);
6827
6828         /* Can use a simple loop if the pattern char to match on is invariant
6829          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6830          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6831          * true iff it doesn't matter if the argument is in UTF-8 or not */
6832         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
6833             if (utf8_target && loceol - scan > max) {
6834                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6835                  * since here, to match at all, 1 char == 1 byte */
6836                 loceol = scan + max;
6837             }
6838             while (scan < loceol && UCHARAT(scan) == c) {
6839                 scan++;
6840             }
6841         }
6842         else if (reginfo->is_utf8_pat) {
6843             if (utf8_target) {
6844                 STRLEN scan_char_len;
6845
6846                 /* When both target and pattern are UTF-8, we have to do
6847                  * string EQ */
6848                 while (hardcount < max
6849                        && scan < loceol
6850                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6851                        && memEQ(scan, STRING(p), scan_char_len))
6852                 {
6853                     scan += scan_char_len;
6854                     hardcount++;
6855                 }
6856             }
6857             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6858
6859                 /* Target isn't utf8; convert the character in the UTF-8
6860                  * pattern to non-UTF8, and do a simple loop */
6861                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6862                 while (scan < loceol && UCHARAT(scan) == c) {
6863                     scan++;
6864                 }
6865             } /* else pattern char is above Latin1, can't possibly match the
6866                  non-UTF-8 target */
6867         }
6868         else {
6869
6870             /* Here, the string must be utf8; pattern isn't, and <c> is
6871              * different in utf8 than not, so can't compare them directly.
6872              * Outside the loop, find the two utf8 bytes that represent c, and
6873              * then look for those in sequence in the utf8 string */
6874             U8 high = UTF8_TWO_BYTE_HI(c);
6875             U8 low = UTF8_TWO_BYTE_LO(c);
6876
6877             while (hardcount < max
6878                     && scan + 1 < loceol
6879                     && UCHARAT(scan) == high
6880                     && UCHARAT(scan + 1) == low)
6881             {
6882                 scan += 2;
6883                 hardcount++;
6884             }
6885         }
6886         break;
6887
6888     case EXACTFA:
6889         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6890         goto do_exactf;
6891
6892     case EXACTFL:
6893         RXp_MATCH_TAINTED_on(prog);
6894         utf8_flags = FOLDEQ_UTF8_LOCALE;
6895         goto do_exactf;
6896
6897     case EXACTF:
6898             utf8_flags = 0;
6899             goto do_exactf;
6900
6901     case EXACTFU_SS:
6902     case EXACTFU_TRICKYFOLD:
6903     case EXACTFU:
6904         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6905
6906     do_exactf: {
6907         int c1, c2;
6908         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6909
6910         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6911
6912         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6913                                         reginfo))
6914         {
6915             if (c1 == CHRTEST_VOID) {
6916                 /* Use full Unicode fold matching */
6917                 char *tmpeol = reginfo->strend;
6918                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
6919                 while (hardcount < max
6920                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6921                                              STRING(p), NULL, pat_len,
6922                                              reginfo->is_utf8_pat, utf8_flags))
6923                 {
6924                     scan = tmpeol;
6925                     tmpeol = reginfo->strend;
6926                     hardcount++;
6927                 }
6928             }
6929             else if (utf8_target) {
6930                 if (c1 == c2) {
6931                     while (scan < loceol
6932                            && hardcount < max
6933                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6934                     {
6935                         scan += UTF8SKIP(scan);
6936                         hardcount++;
6937                     }
6938                 }
6939                 else {
6940                     while (scan < loceol
6941                            && hardcount < max
6942                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6943                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6944                     {
6945                         scan += UTF8SKIP(scan);
6946                         hardcount++;
6947                     }
6948                 }
6949             }
6950             else if (c1 == c2) {
6951                 while (scan < loceol && UCHARAT(scan) == c1) {
6952                     scan++;
6953                 }
6954             }
6955             else {
6956                 while (scan < loceol &&
6957                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6958                 {
6959                     scan++;
6960                 }
6961             }
6962         }
6963         break;
6964     }
6965     case ANYOF:
6966     case ANYOF_WARN_SUPER:
6967         if (utf8_target) {
6968             while (hardcount < max
6969                    && scan < loceol
6970                    && reginclass(prog, p, (U8*)scan, utf8_target))
6971             {
6972                 scan += UTF8SKIP(scan);
6973                 hardcount++;
6974             }
6975         } else {
6976             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6977                 scan++;
6978         }
6979         break;
6980
6981     /* The argument (FLAGS) to all the POSIX node types is the class number */
6982
6983     case NPOSIXL:
6984         to_complement = 1;
6985         /* FALLTHROUGH */
6986
6987     case POSIXL:
6988         RXp_MATCH_TAINTED_on(prog);
6989         if (! utf8_target) {
6990             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
6991                                                                    *scan)))
6992             {
6993                 scan++;
6994             }
6995         } else {
6996             while (hardcount < max && scan < loceol
6997                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
6998                                                                   (U8 *) scan)))
6999             {
7000                 scan += UTF8SKIP(scan);
7001                 hardcount++;
7002             }
7003         }
7004         break;
7005
7006     case POSIXD:
7007         if (utf8_target) {
7008             goto utf8_posix;
7009         }
7010         /* FALLTHROUGH */
7011
7012     case POSIXA:
7013         if (utf8_target && loceol - scan > max) {
7014
7015             /* We didn't adjust <loceol> at the beginning of this routine
7016              * because is UTF-8, but it is actually ok to do so, since here, to
7017              * match, 1 char == 1 byte. */
7018             loceol = scan + max;
7019         }
7020         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
7021             scan++;
7022         }
7023         break;
7024
7025     case NPOSIXD:
7026         if (utf8_target) {
7027             to_complement = 1;
7028             goto utf8_posix;
7029         }
7030         /* FALL THROUGH */
7031
7032     case NPOSIXA:
7033         if (! utf8_target) {
7034             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
7035                 scan++;
7036             }
7037         }
7038         else {
7039
7040             /* The complement of something that matches only ASCII matches all
7041              * UTF-8 variant code points, plus everything in ASCII that isn't
7042              * in the class. */
7043             while (hardcount < max && scan < loceol
7044                    && (! UTF8_IS_INVARIANT(*scan)
7045                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
7046             {
7047                 scan += UTF8SKIP(scan);
7048                 hardcount++;
7049             }
7050         }
7051         break;
7052
7053     case NPOSIXU:
7054         to_complement = 1;
7055         /* FALLTHROUGH */
7056
7057     case POSIXU:
7058         if (! utf8_target) {
7059             while (scan < loceol && to_complement
7060                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
7061             {
7062                 scan++;
7063             }
7064         }
7065         else {
7066       utf8_posix:
7067             classnum = (_char_class_number) FLAGS(p);
7068             if (classnum < _FIRST_NON_SWASH_CC) {
7069
7070                 /* Here, a swash is needed for above-Latin1 code points.
7071                  * Process as many Latin1 code points using the built-in rules.
7072                  * Go to another loop to finish processing upon encountering
7073                  * the first Latin1 code point.  We could do that in this loop
7074                  * as well, but the other way saves having to test if the swash
7075                  * has been loaded every time through the loop: extra space to
7076                  * save a test. */
7077                 while (hardcount < max && scan < loceol) {
7078                     if (UTF8_IS_INVARIANT(*scan)) {
7079                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7080                                                                    classnum))))
7081                         {
7082                             break;
7083                         }
7084                         scan++;
7085                     }
7086                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7087                         if (! (to_complement
7088                               ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
7089                                                                    *(scan + 1)),
7090                                                     classnum))))
7091                         {
7092                             break;
7093                         }
7094                         scan += 2;
7095                     }
7096                     else {
7097                         goto found_above_latin1;
7098                     }
7099
7100                     hardcount++;
7101                 }
7102             }
7103             else {
7104                 /* For these character classes, the knowledge of how to handle
7105                  * every code point is compiled in to Perl via a macro.  This
7106                  * code is written for making the loops as tight as possible.
7107                  * It could be refactored to save space instead */
7108                 switch (classnum) {
7109                     case _CC_ENUM_SPACE:    /* XXX would require separate code
7110                                                if we revert the change of \v
7111                                                matching this */
7112                         /* FALL THROUGH */
7113                     case _CC_ENUM_PSXSPC:
7114                         while (hardcount < max
7115                                && scan < loceol
7116                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7117                         {
7118                             scan += UTF8SKIP(scan);
7119                             hardcount++;
7120                         }
7121                         break;
7122                     case _CC_ENUM_BLANK:
7123                         while (hardcount < max
7124                                && scan < loceol
7125                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7126                         {
7127                             scan += UTF8SKIP(scan);
7128                             hardcount++;
7129                         }
7130                         break;
7131                     case _CC_ENUM_XDIGIT:
7132                         while (hardcount < max
7133                                && scan < loceol
7134                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7135                         {
7136                             scan += UTF8SKIP(scan);
7137                             hardcount++;
7138                         }
7139                         break;
7140                     case _CC_ENUM_VERTSPACE:
7141                         while (hardcount < max
7142                                && scan < loceol
7143                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7144                         {
7145                             scan += UTF8SKIP(scan);
7146                             hardcount++;
7147                         }
7148                         break;
7149                     case _CC_ENUM_CNTRL:
7150                         while (hardcount < max
7151                                && scan < loceol
7152                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7153                         {
7154                             scan += UTF8SKIP(scan);
7155                             hardcount++;
7156                         }
7157                         break;
7158                     default:
7159                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7160                 }
7161             }
7162         }
7163         break;
7164
7165       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7166
7167         /* Load the swash if not already present */
7168         if (! PL_utf8_swash_ptrs[classnum]) {
7169             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7170             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7171                                         "utf8", swash_property_names[classnum],
7172                                         &PL_sv_undef, 1, 0, NULL, &flags);
7173         }
7174
7175         while (hardcount < max && scan < loceol
7176                && to_complement ^ cBOOL(_generic_utf8(
7177                                        classnum,
7178                                        scan,
7179                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
7180                                                    (U8 *) scan,
7181                                                    TRUE))))
7182         {
7183             scan += UTF8SKIP(scan);
7184             hardcount++;
7185         }
7186         break;
7187
7188     case LNBREAK:
7189         if (utf8_target) {
7190             while (hardcount < max && scan < loceol &&
7191                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7192                 scan += c;
7193                 hardcount++;
7194             }
7195         } else {
7196             /* LNBREAK can match one or two latin chars, which is ok, but we
7197              * have to use hardcount in this situation, and throw away the
7198              * adjustment to <loceol> done before the switch statement */
7199             loceol = reginfo->strend;
7200             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7201                 scan+=c;
7202                 hardcount++;
7203             }
7204         }
7205         break;
7206
7207     case BOUND:
7208     case BOUNDA:
7209     case BOUNDL:
7210     case BOUNDU:
7211     case EOS:
7212     case GPOS:
7213     case KEEPS:
7214     case NBOUND:
7215     case NBOUNDA:
7216     case NBOUNDL:
7217     case NBOUNDU:
7218     case OPFAIL:
7219     case SBOL:
7220     case SEOL:
7221         /* These are all 0 width, so match right here or not at all. */
7222         break;
7223
7224     default:
7225         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7226         assert(0); /* NOTREACHED */
7227
7228     }
7229
7230     if (hardcount)
7231         c = hardcount;
7232     else
7233         c = scan - *startposp;
7234     *startposp = scan;
7235
7236     DEBUG_r({
7237         GET_RE_DEBUG_FLAGS_DECL;
7238         DEBUG_EXECUTE_r({
7239             SV * const prop = sv_newmortal();
7240             regprop(prog, prop, p);
7241             PerlIO_printf(Perl_debug_log,
7242                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7243                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7244         });
7245     });
7246
7247     return(c);
7248 }
7249
7250
7251 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7252 /*
7253 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7254 create a copy so that changes the caller makes won't change the shared one.
7255 If <altsvp> is non-null, will return NULL in it, for back-compat.
7256  */
7257 SV *
7258 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7259 {
7260     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7261
7262     if (altsvp) {
7263         *altsvp = NULL;
7264     }
7265
7266     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7267 }
7268 #endif
7269
7270 STATIC SV *
7271 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7272 {
7273     /* Returns the swash for the input 'node' in the regex 'prog'.
7274      * If <doinit> is 'true', will attempt to create the swash if not already
7275      *    done.
7276      * If <listsvp> is non-null, will return the printable contents of the
7277      *    swash.  This can be used to get debugging information even before the
7278      *    swash exists, by calling this function with 'doinit' set to false, in
7279      *    which case the components that will be used to eventually create the
7280      *    swash are returned  (in a printable form).
7281      * Tied intimately to how regcomp.c sets up the data structure */
7282
7283     dVAR;
7284     SV *sw  = NULL;
7285     SV *si  = NULL;         /* Input swash initialization string */
7286     SV*  invlist = NULL;
7287
7288     RXi_GET_DECL(prog,progi);
7289     const struct reg_data * const data = prog ? progi->data : NULL;
7290
7291     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7292
7293     assert(ANYOF_NONBITMAP(node));
7294
7295     if (data && data->count) {
7296         const U32 n = ARG(node);
7297
7298         if (data->what[n] == 's') {
7299             SV * const rv = MUTABLE_SV(data->data[n]);
7300             AV * const av = MUTABLE_AV(SvRV(rv));
7301             SV **const ary = AvARRAY(av);
7302             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7303
7304             si = *ary;  /* ary[0] = the string to initialize the swash with */
7305
7306             /* Elements 2 and 3 are either both present or both absent. [2] is
7307              * any inversion list generated at compile time; [3] indicates if
7308              * that inversion list has any user-defined properties in it. */
7309             if (av_len(av) >= 2) {
7310                 invlist = ary[2];
7311                 if (SvUV(ary[3])) {
7312                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7313                 }
7314             }
7315             else {
7316                 invlist = NULL;
7317             }
7318
7319             /* Element [1] is reserved for the set-up swash.  If already there,
7320              * return it; if not, create it and store it there */
7321             if (SvROK(ary[1])) {
7322                 sw = ary[1];
7323             }
7324             else if (si && doinit) {
7325
7326                 sw = _core_swash_init("utf8", /* the utf8 package */
7327                                       "", /* nameless */
7328                                       si,
7329                                       1, /* binary */
7330                                       0, /* not from tr/// */
7331                                       invlist,
7332                                       &swash_init_flags);
7333                 (void)av_store(av, 1, sw);
7334             }
7335         }
7336     }
7337
7338     /* If requested, return a printable version of what this swash matches */
7339     if (listsvp) {
7340         SV* matches_string = newSVpvn("", 0);
7341
7342         /* The swash should be used, if possible, to get the data, as it
7343          * contains the resolved data.  But this function can be called at
7344          * compile-time, before everything gets resolved, in which case we
7345          * return the currently best available information, which is the string
7346          * that will eventually be used to do that resolving, 'si' */
7347         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7348             && (si && si != &PL_sv_undef))
7349         {
7350             sv_catsv(matches_string, si);
7351         }
7352
7353         /* Add the inversion list to whatever we have.  This may have come from
7354          * the swash, or from an input parameter */
7355         if (invlist) {
7356             sv_catsv(matches_string, _invlist_contents(invlist));
7357         }
7358         *listsvp = matches_string;
7359     }
7360
7361     return sw;
7362 }
7363
7364 /*
7365  - reginclass - determine if a character falls into a character class
7366
7367   n is the ANYOF regnode
7368   p is the target string
7369   utf8_target tells whether p is in UTF-8.
7370
7371   Returns true if matched; false otherwise.
7372
7373   Note that this can be a synthetic start class, a combination of various
7374   nodes, so things you think might be mutually exclusive, such as locale,
7375   aren't.  It can match both locale and non-locale
7376
7377  */
7378
7379 STATIC bool
7380 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7381 {
7382     dVAR;
7383     const char flags = ANYOF_FLAGS(n);
7384     bool match = FALSE;
7385     UV c = *p;
7386
7387     PERL_ARGS_ASSERT_REGINCLASS;
7388
7389     /* If c is not already the code point, get it.  Note that
7390      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7391     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7392         STRLEN c_len = 0;
7393         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7394                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7395                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7396                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7397                  * UTF8_ALLOW_FFFF */
7398         if (c_len == (STRLEN)-1)
7399             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7400     }
7401
7402     /* If this character is potentially in the bitmap, check it */
7403     if (c < 256) {
7404         if (ANYOF_BITMAP_TEST(n, c))
7405             match = TRUE;
7406         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7407                 && ! utf8_target
7408                 && ! isASCII(c))
7409         {
7410             match = TRUE;
7411         }
7412         else if (flags & ANYOF_LOCALE) {
7413             RXp_MATCH_TAINTED_on(prog);
7414
7415             if ((flags & ANYOF_LOC_FOLD)
7416                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7417             {
7418                 match = TRUE;
7419             }
7420             else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7421
7422                 /* The data structure is arranged so bits 0, 2, 4, ... are set
7423                  * if the class includes the Posix character class given by
7424                  * bit/2; and 1, 3, 5, ... are set if the class includes the
7425                  * complemented Posix class given by int(bit/2).  So we loop
7426                  * through the bits, each time changing whether we complement
7427                  * the result or not.  Suppose for the sake of illustration
7428                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7429                  * is set, it means there is a match for this ANYOF node if the
7430                  * character is in the class given by the expression (0 / 2 = 0
7431                  * = \w).  If it is in that class, isFOO_lc() will return 1,
7432                  * and since 'to_complement' is 0, the result will stay TRUE,
7433                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
7434                  * bit 1 is 1.  That means there is a match if the character
7435                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7436                  * but will on bit 1.  On the second iteration 'to_complement'
7437                  * will be 1, so the exclusive or will reverse things, so we
7438                  * are testing for \W.  On the third iteration, 'to_complement'
7439                  * will be 0, and we would be testing for \s; the fourth
7440                  * iteration would test for \S, etc.
7441                  *
7442                  * Note that this code assumes that all the classes are closed
7443                  * under folding.  For example, if a character matches \w, then
7444                  * its fold does too; and vice versa.  This should be true for
7445                  * any well-behaved locale for all the currently defined Posix
7446                  * classes, except for :lower: and :upper:, which are handled
7447                  * by the pseudo-class :cased: which matches if either of the
7448                  * other two does.  To get rid of this assumption, an outer
7449                  * loop could be used below to iterate over both the source
7450                  * character, and its fold (if different) */
7451
7452                 int count = 0;
7453                 int to_complement = 0;
7454                 while (count < ANYOF_MAX) {
7455                     if (ANYOF_CLASS_TEST(n, count)
7456                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7457                     {
7458                         match = TRUE;
7459                         break;
7460                     }
7461                     count++;
7462                     to_complement ^= 1;
7463                 }
7464             }
7465         }
7466     }
7467
7468     /* If the bitmap didn't (or couldn't) match, and something outside the
7469      * bitmap could match, try that.  Locale nodes specify completely the
7470      * behavior of code points in the bit map (otherwise, a utf8 target would
7471      * cause them to be treated as Unicode and not locale), except in
7472      * the very unlikely event when this node is a synthetic start class, which
7473      * could be a combination of locale and non-locale nodes.  So allow locale
7474      * to match for the synthetic start class, which will give a false
7475      * positive that will be resolved when the match is done again as not part
7476      * of the synthetic start class */
7477     if (!match) {
7478         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7479             match = TRUE;       /* Everything above 255 matches */
7480         }
7481         else if (ANYOF_NONBITMAP(n)
7482                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7483                      || (utf8_target
7484                          && (c >=256
7485                              || (! (flags & ANYOF_LOCALE))
7486                              || OP(n) == ANYOF_SYNTHETIC))))
7487         {
7488             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7489             if (sw) {
7490                 U8 * utf8_p;
7491                 if (utf8_target) {
7492                     utf8_p = (U8 *) p;
7493                 } else { /* Convert to utf8 */
7494                     STRLEN len = 1;
7495                     utf8_p = bytes_to_utf8(p, &len);
7496                 }
7497
7498                 if (swash_fetch(sw, utf8_p, TRUE)) {
7499                     match = TRUE;
7500                 }
7501
7502                 /* If we allocated a string above, free it */
7503                 if (! utf8_target) Safefree(utf8_p);
7504             }
7505         }
7506
7507         if (UNICODE_IS_SUPER(c)
7508             && OP(n) == ANYOF_WARN_SUPER
7509             && ckWARN_d(WARN_NON_UNICODE))
7510         {
7511             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7512                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7513         }
7514     }
7515
7516     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7517     return cBOOL(flags & ANYOF_INVERT) ^ match;
7518 }
7519
7520 STATIC U8 *
7521 S_reghop3(U8 *s, I32 off, const U8* lim)
7522 {
7523     /* return the position 'off' UTF-8 characters away from 's', forward if
7524      * 'off' >= 0, backwards if negative.  But don't go outside of position
7525      * 'lim', which better be < s  if off < 0 */
7526
7527     dVAR;
7528
7529     PERL_ARGS_ASSERT_REGHOP3;
7530
7531     if (off >= 0) {
7532         while (off-- && s < lim) {
7533             /* XXX could check well-formedness here */
7534             s += UTF8SKIP(s);
7535         }
7536     }
7537     else {
7538         while (off++ && s > lim) {
7539             s--;
7540             if (UTF8_IS_CONTINUED(*s)) {
7541                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7542                     s--;
7543             }
7544             /* XXX could check well-formedness here */
7545         }
7546     }
7547     return s;
7548 }
7549
7550 #ifdef XXX_dmq
7551 /* there are a bunch of places where we use two reghop3's that should
7552    be replaced with this routine. but since thats not done yet
7553    we ifdef it out - dmq
7554 */
7555 STATIC U8 *
7556 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7557 {
7558     dVAR;
7559
7560     PERL_ARGS_ASSERT_REGHOP4;
7561
7562     if (off >= 0) {
7563         while (off-- && s < rlim) {
7564             /* XXX could check well-formedness here */
7565             s += UTF8SKIP(s);
7566         }
7567     }
7568     else {
7569         while (off++ && s > llim) {
7570             s--;
7571             if (UTF8_IS_CONTINUED(*s)) {
7572                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7573                     s--;
7574             }
7575             /* XXX could check well-formedness here */
7576         }
7577     }
7578     return s;
7579 }
7580 #endif
7581
7582 STATIC U8 *
7583 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7584 {
7585     dVAR;
7586
7587     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7588
7589     if (off >= 0) {
7590         while (off-- && s < lim) {
7591             /* XXX could check well-formedness here */
7592             s += UTF8SKIP(s);
7593         }
7594         if (off >= 0)
7595             return NULL;
7596     }
7597     else {
7598         while (off++ && s > lim) {
7599             s--;
7600             if (UTF8_IS_CONTINUED(*s)) {
7601                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7602                     s--;
7603             }
7604             /* XXX could check well-formedness here */
7605         }
7606         if (off <= 0)
7607             return NULL;
7608     }
7609     return s;
7610 }
7611
7612
7613 /* when executing a regex that may have (?{}), extra stuff needs setting
7614    up that will be visible to the called code, even before the current
7615    match has finished. In particular:
7616
7617    * $_ is localised to the SV currently being matched;
7618    * pos($_) is created if necessary, ready to be updated on each call-out
7619      to code;
7620    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7621      isn't set until the current pattern is successfully finished), so that
7622      $1 etc of the match-so-far can be seen;
7623    * save the old values of subbeg etc of the current regex, and  set then
7624      to the current string (again, this is normally only done at the end
7625      of execution)
7626 */
7627
7628 static void
7629 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7630 {
7631     MAGIC *mg;
7632     regexp *const rex = ReANY(reginfo->prog);
7633     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7634
7635     eval_state->rex = rex;
7636
7637     if (reginfo->sv) {
7638         /* Make $_ available to executed code. */
7639         if (reginfo->sv != DEFSV) {
7640             SAVE_DEFSV;
7641             DEFSV_set(reginfo->sv);
7642         }
7643
7644         if (!(mg = mg_find_mglob(reginfo->sv))) {
7645             /* prepare for quick setting of pos */
7646             mg = sv_magicext_mglob(reginfo->sv);
7647             mg->mg_len = -1;
7648         }
7649         eval_state->pos_magic = mg;
7650         eval_state->pos       = mg->mg_len;
7651     }
7652     else
7653         eval_state->pos_magic = NULL;
7654
7655     if (!PL_reg_curpm) {
7656         /* PL_reg_curpm is a fake PMOP that we can attach the current
7657          * regex to and point PL_curpm at, so that $1 et al are visible
7658          * within a /(?{})/. It's just allocated once per interpreter the
7659          * first time its needed */
7660         Newxz(PL_reg_curpm, 1, PMOP);
7661 #ifdef USE_ITHREADS
7662         {
7663             SV* const repointer = &PL_sv_undef;
7664             /* this regexp is also owned by the new PL_reg_curpm, which
7665                will try to free it.  */
7666             av_push(PL_regex_padav, repointer);
7667             PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
7668             PL_regex_pad = AvARRAY(PL_regex_padav);
7669         }
7670 #endif
7671     }
7672     SET_reg_curpm(reginfo->prog);
7673     eval_state->curpm = PL_curpm;
7674     PL_curpm = PL_reg_curpm;
7675     if (RXp_MATCH_COPIED(rex)) {
7676         /*  Here is a serious problem: we cannot rewrite subbeg,
7677             since it may be needed if this match fails.  Thus
7678             $` inside (?{}) could fail... */
7679         eval_state->subbeg     = rex->subbeg;
7680         eval_state->sublen     = rex->sublen;
7681         eval_state->suboffset  = rex->suboffset;
7682         eval_state->subcoffset = rex->subcoffset;
7683 #ifdef PERL_ANY_COW
7684         eval_state->saved_copy = rex->saved_copy;
7685 #endif
7686         RXp_MATCH_COPIED_off(rex);
7687     }
7688     else
7689         eval_state->subbeg = NULL;
7690     rex->subbeg = (char *)reginfo->strbeg;
7691     rex->suboffset = 0;
7692     rex->subcoffset = 0;
7693     rex->sublen = reginfo->strend - reginfo->strbeg;
7694 }
7695
7696
7697 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
7698
7699 static void
7700 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
7701 {
7702     dVAR;
7703     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7704     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
7705     regmatch_slab *s;
7706
7707     Safefree(aux->poscache);
7708
7709     if (eval_state) {
7710
7711         /* undo the effects of S_setup_eval_state() */
7712
7713         if (eval_state->subbeg) {
7714             regexp * const rex = eval_state->rex;
7715             rex->subbeg     = eval_state->subbeg;
7716             rex->sublen     = eval_state->sublen;
7717             rex->suboffset  = eval_state->suboffset;
7718             rex->subcoffset = eval_state->subcoffset;
7719 #ifdef PERL_ANY_COW
7720             rex->saved_copy = eval_state->saved_copy;
7721 #endif
7722             RXp_MATCH_COPIED_on(rex);
7723         }
7724         if (eval_state->pos_magic)
7725             eval_state->pos_magic->mg_len = eval_state->pos;
7726
7727         PL_curpm = eval_state->curpm;
7728     }
7729
7730     PL_regmatch_state = aux->old_regmatch_state;
7731     PL_regmatch_slab  = aux->old_regmatch_slab;
7732
7733     /* free all slabs above current one - this must be the last action
7734      * of this function, as aux and eval_state are allocated within
7735      * slabs and may be freed here */
7736
7737     s = PL_regmatch_slab->next;
7738     if (s) {
7739         PL_regmatch_slab->next = NULL;
7740         while (s) {
7741             regmatch_slab * const osl = s;
7742             s = s->next;
7743             Safefree(osl);
7744         }
7745     }
7746 }
7747
7748
7749 STATIC void
7750 S_to_utf8_substr(pTHX_ regexp *prog)
7751 {
7752     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7753      * on the converted value */
7754
7755     int i = 1;
7756
7757     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7758
7759     do {
7760         if (prog->substrs->data[i].substr
7761             && !prog->substrs->data[i].utf8_substr) {
7762             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7763             prog->substrs->data[i].utf8_substr = sv;
7764             sv_utf8_upgrade(sv);
7765             if (SvVALID(prog->substrs->data[i].substr)) {
7766                 if (SvTAIL(prog->substrs->data[i].substr)) {
7767                     /* Trim the trailing \n that fbm_compile added last
7768                        time.  */
7769                     SvCUR_set(sv, SvCUR(sv) - 1);
7770                     /* Whilst this makes the SV technically "invalid" (as its
7771                        buffer is no longer followed by "\0") when fbm_compile()
7772                        adds the "\n" back, a "\0" is restored.  */
7773                     fbm_compile(sv, FBMcf_TAIL);
7774                 } else
7775                     fbm_compile(sv, 0);
7776             }
7777             if (prog->substrs->data[i].substr == prog->check_substr)
7778                 prog->check_utf8 = sv;
7779         }
7780     } while (i--);
7781 }
7782
7783 STATIC bool
7784 S_to_byte_substr(pTHX_ regexp *prog)
7785 {
7786     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7787      * on the converted value; returns FALSE if can't be converted. */
7788
7789     dVAR;
7790     int i = 1;
7791
7792     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7793
7794     do {
7795         if (prog->substrs->data[i].utf8_substr
7796             && !prog->substrs->data[i].substr) {
7797             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7798             if (! sv_utf8_downgrade(sv, TRUE)) {
7799                 return FALSE;
7800             }
7801             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7802                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7803                     /* Trim the trailing \n that fbm_compile added last
7804                         time.  */
7805                     SvCUR_set(sv, SvCUR(sv) - 1);
7806                     fbm_compile(sv, FBMcf_TAIL);
7807                 } else
7808                     fbm_compile(sv, 0);
7809             }
7810             prog->substrs->data[i].substr = sv;
7811             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7812                 prog->check_substr = sv;
7813         }
7814     } while (i--);
7815
7816     return TRUE;
7817 }
7818
7819 /*
7820  * Local variables:
7821  * c-indentation-style: bsd
7822  * c-basic-offset: 4
7823  * indent-tabs-mode: nil
7824  * End:
7825  *
7826  * ex: set ts=8 sts=4 sw=4 et:
7827  */