src/5019001/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which, funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * be confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8.  NON_UTF8_TARGET_BUT_UTF8_REQUIRED(label) prints this message under DEBUG_EXECUTE_r and then jumps to 'label'. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86 #include "re_defs.h"
  87
  88 #ifdef PERL_IN_XSUB_RE
  89 #  include "re_comp.h"
  90 #else
  91 #  include "regcomp.h"
  92 #endif
  93
  94 #include "inline_invlist.c"
  95 #include "unicode_constants.h"
  96
  97 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) /* short local alias for the long-named macro */
  98
  99 #ifndef STATIC /* fall back to plain 'static' when STATIC has not been defined already */
 100 #define STATIC  static
 101 #endif
 102
 103 /* Valid for non-utf8 strings: avoids the reginclass
 104  * call if there are no complications: i.e., if everything matchable is
 105  * straightforward in the bitmap (that is, ANYOF_FLAGS(p) is zero) */
 106 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 107                                               : ANYOF_BITMAP_TEST(p,*(c)))
 108
 109 /*
 110  * Forwards.
 111  */
 112
 113 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) /* relies on a 'utf8_target' variable in the expanding scope */
 114 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b) /* relies on 'reginfo' in scope; NB: a and b are not parenthesized in the non-UTF-8 arm */
 115
 116 #define HOPc(pos,off) \
 117         (char *)(reginfo->is_utf8_target \
 118             ? reghop3((U8*)pos, off, \
 119                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 120             : (U8*)(pos + off)) /* HOPc: move 'off' from pos; the UTF-8 arm calls reghop3() limited by strend (off >= 0) or strbeg (off < 0), the byte arm is plain pointer arithmetic with no limit check */
 121 #define HOPBACKc(pos, off) \
 122         (char*)(reginfo->is_utf8_target \
 123             ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 124             : (pos - off >= reginfo->strbeg)    \
 125                 ? (U8*)pos - off                \
 126                 : NULL) /* HOPBACKc: move back 'off' from pos; the byte arm yields NULL when that would land before reginfo->strbeg */
 127
 128 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) /* like HOPc, but the UTF-8 arm uses the caller-supplied limit 'lim' */
 129 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) /* HOP3() with the result cast to char* */
 130
 131
 132 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 133 #define NEXTCHR_IS_EOS (nextchr < 0) /* tests for any negative nextchr; SET_nextchr stores NEXTCHR_EOS in that case */
 134
 135 #define SET_nextchr \
 136     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) /* reload nextchr from locinput, or NEXTCHR_EOS when locinput is at or past reginfo->strend */
 137
 138 #define SET_locinput(p) \
 139     locinput = (p);  \
 140     SET_nextchr /* NB: SET_locinput expands to two statements and is not wrapped in STMT_START/STMT_END */
 141
 142
 143 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START {            \
 144         if (!swash_ptr) {                                                     \
 145             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 146             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 147                                          1, 0, NULL, &flags);                 \
 148             assert(swash_ptr);                                                \
 149         }                                                                     \
 150     } STMT_END /* LOAD_UTF8_CHARCLASS: initialise swash_ptr via _core_swash_init() only if it is still NULL/false; swash_ptr must be an assignable lvalue */
 151
 152 /* If in debug mode, we test that a known character properly matches */
 153 #ifdef DEBUGGING
 154 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 155                                           property_name,                      \
 156                                           utf8_char_in_property)              \
 157         LOAD_UTF8_CHARCLASS(swash_ptr, property_name);                        \
 158         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); /* NB: this variant expands to two statements and already ends in ';' */
 159 #else
 160 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 161                                           property_name,                      \
 162                                           utf8_char_in_property)              \
 163         LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
 164 #endif
 165
 166 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 167                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 168                                         swash_property_names[_CC_WORDCHAR],   \
 169                                         GREEK_SMALL_LETTER_IOTA_UTF8) /* loads the _CC_WORDCHAR swash; the debug build checks it against Greek small letter iota */
 170
 171 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 172     STMT_START {                                                              \
 173         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 174                                        "_X_regular_begin",                    \
 175                                        GREEK_SMALL_LETTER_IOTA_UTF8);         \
 176         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 177                                        "_X_extend",                           \
 178                                        COMBINING_GRAVE_ACCENT_UTF8);          \
 179     } STMT_END /* loads both PL_utf8_X_regular_begin and PL_utf8_X_extend */
 180
 181 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 182 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 183
 184 /* for use after a quantifier and before an EXACT-like node -- japhy */
 185 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 186  *
 187  * NOTE that *nothing* that affects backtracking should be in here, specifically
 188  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 189  * node that is in between two EXACT like nodes when ascertaining what the required
 190  * "follow" character is. This should probably be moved to regex compile time
 191  * although it may be done at run time because of the REF possibility - more
 192  * investigation required. -- demerphq
 193 */
 194 #define JUMPABLE(rn) (      \
 195     OP(rn) == OPEN ||       \
 196     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 197     OP(rn) == EVAL ||   \
 198     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 199     OP(rn) == PLUS || OP(rn) == MINMOD || \
 200     OP(rn) == KEEPS || \
 201     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 202 ) /* JUMPABLE relies on a 'cur_eval' variable in the expanding scope and evaluates rn several times */
 203 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) /* true for every opcode whose regkind is EXACT */
 204
 205 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF ) /* EXACT-kind or REF-kind node */
 206
 207 #if 0 /* disabled: broader definitions that also accept the REF/NREF opcodes */
 208 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 209    we don't need this definition. */
 210 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 211 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 212 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 213
 214 #else
 215 /* ... so we use this as it's faster. */
 216 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 217 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) /* NB: only this live branch defines IS_TEXTFU */
 218 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 219 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 220
 221 #endif
 222
 223 /*
 224   Search for mandatory following text node; for lookahead, the text must
 225   follow but for lookbehind (rn->flags != 0) we skip to the next step.  rn is advanced in place until JUMPABLE(rn) is false.
 226 */
 227 #define FIND_NEXT_IMPT(rn) STMT_START { \
 228     while (JUMPABLE(rn)) { \
 229         const OPCODE type = OP(rn); \
 230         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 231             rn = NEXTOPER(NEXTOPER(rn)); \
 232         else if (type == PLUS) \
 233             rn = NEXTOPER(rn); \
 234         else if (type == IFMATCH) \
 235             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 236         else rn += NEXT_OFF(rn); \
 237     } \
 238 } STMT_END
 239
 240 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 241  * These are for the pre-composed Hangul syllables, which are all in a
 242  * contiguous block and arranged there in such a way so as to facilitate
 243  * algorithmic determination of their characteristics.  As such, they don't need
 244  * a swash, but can be determined by simple arithmetic.  Almost all are
 245  * GCB=LVT, but every 28th one is a GCB=LV */
 246 #define SBASE 0xAC00    /* Start of block */
 247 #define SCount 11172    /* Length of block */
 248 #define TCount 28       /* Spacing of the GCB=LV syllables within the block */
 249
 250 #define SLAB_FIRST(s) (&(s)->states[0]) /* address of the first entry of slab s's states[] array */
 251 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) /* address of the last entry of slab s's states[] array */
 252
 253 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); /* forward declaration */
 254 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); /* forward declaration */
 255 static regmatch_state * S_push_slab(pTHX); /* forward declaration */
 256
 257 #define REGCP_PAREN_ELEMS 3 /* per capture group: end, start, start_tmp */
 258 #define REGCP_OTHER_ELEMS 3 /* always pushed: maxopenparen, lastparen, lastcloseparen */
 259 #define REGCP_FRAME_ELEMS 1 /* the SAVEt_REGCONTEXT cookie that carries the element count */
 260 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 261  * are needed for the regexp context stack bookkeeping. */
 262
 263 STATIC CHECKPOINT
 264 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 265 { /* Push the capture state of groups parenfloor+1..maxopenparen, then maxopenparen/lastparen/lastcloseparen and a SAVEt_REGCONTEXT cookie, onto the savestack.  Returns the savestack index from before the push.  Croaks if the element count is negative or does not survive the shift into the cookie. */
 266     dVAR;
 267     const int retval = PL_savestack_ix; /* checkpoint handed back to the caller */
 268     const int paren_elems_to_push =
 269                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 270     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 271     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; /* count positioned so it can be OR-ed with the SAVEt_ type below */
 272     I32 p;
 273     GET_RE_DEBUG_FLAGS_DECL;
 274
 275     PERL_ARGS_ASSERT_REGCPPUSH;
 276
 277     if (paren_elems_to_push < 0)
 278         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 279                    paren_elems_to_push);
 280
 281     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) /* the shift lost high bits */
 282         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 283                    " out of range (%lu-%ld)",
 284                    total_elems,
 285                    (unsigned long)maxopenparen,
 286                    (long)parenfloor);
 287
 288     SSGROW(total_elems + REGCP_FRAME_ELEMS); /* room for every element plus the cookie */
 289
 290     DEBUG_BUFFERS_r(
 291         if ((int)maxopenparen > (int)parenfloor)
 292             PerlIO_printf(Perl_debug_log,
 293                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 294                 PTR2UV(rex),
 295                 PTR2UV(rex->offs)
 296             );
 297     );
 298     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 299 /* REGCP_PAREN_ELEMS are pushed per pair of parentheses, in the order end, start, start_tmp. */
 300         SSPUSHINT(rex->offs[p].end);
 301         SSPUSHINT(rex->offs[p].start);
 302         SSPUSHINT(rex->offs[p].start_tmp);
 303         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 304             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 305             (UV)p,
 306             (IV)rex->offs[p].start,
 307             (IV)rex->offs[p].start_tmp,
 308             (IV)rex->offs[p].end
 309         ));
 310     }
 311 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 312     SSPUSHINT(maxopenparen);
 313     SSPUSHINT(rex->lastparen);
 314     SSPUSHINT(rex->lastcloseparen);
 315     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 316
 317     return retval;
 318 }
 319
 320 /* These are needed since we do not localize EVAL nodes: */
 321 #define REGCP_SET(cp)                                           \
 322     DEBUG_STATE_r(                                              \
 323             PerlIO_printf(Perl_debug_log,                       \
 324                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 325                 (IV)PL_savestack_ix));                          \
 326     cp = PL_savestack_ix /* REGCP_SET: record the current savestack index in cp (after optional debug output) */
 327
 328 #define REGCP_UNWIND(cp)                                        \
 329     DEBUG_STATE_r(                                              \
 330         if (cp != PL_savestack_ix)                              \
 331             PerlIO_printf(Perl_debug_log,                       \
 332                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 333                 (IV)(cp), (IV)PL_savestack_ix));                \
 334     regcpblow(cp) /* REGCP_UNWIND: hand cp to regcpblow(); the debug message is printed only when cp differs from the current index */
 335
 336 #define UNWIND_PAREN(lp, lcp)               \
 337     for (n = rex->lastparen; n > lp; n--)   \
 338         rex->offs[n].end = -1;              \
 339     rex->lastparen = n;                     \
 340     rex->lastcloseparen = lcp; /* UNWIND_PAREN: set .end to -1 for every group above lp, then store the loop's final n in lastparen and lcp in lastcloseparen; uses 'n' and 'rex' from the expanding scope and is several unwrapped statements */
 341
 342
 343 STATIC void
 344 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 345 { /* Pop one frame written by S_regcppush() from the savestack: restore lastcloseparen, lastparen and *maxopenparen_p, then the saved start/start_tmp/end of each group, and finally mark the groups above lastparen as unset. */
 346     dVAR;
 347     UV i;
 348     U32 paren;
 349     GET_RE_DEBUG_FLAGS_DECL;
 350
 351     PERL_ARGS_ASSERT_REGCPPOP;
 352
 353     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 354     i = SSPOPUV;
 355     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 356     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 357     rex->lastcloseparen = SSPOPINT;
 358     rex->lastparen = SSPOPINT;
 359     *maxopenparen_p = SSPOPINT;
 360
 361     i -= REGCP_OTHER_ELEMS; /* what remains is REGCP_PAREN_ELEMS per saved group */
 362     /* Now restore the parentheses context. */
 363     DEBUG_BUFFERS_r(
 364         if (i || rex->lastparen + 1 <= rex->nparens)
 365             PerlIO_printf(Perl_debug_log,
 366                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 367                 PTR2UV(rex),
 368                 PTR2UV(rex->offs)
 369             );
 370     );
 371     paren = *maxopenparen_p; /* groups were pushed in ascending order, so they come off from the highest one down */
 372     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 373         I32 tmps;
 374         rex->offs[paren].start_tmp = SSPOPINT;
 375         rex->offs[paren].start = SSPOPINT;
 376         tmps = SSPOPINT;
 377         if (paren <= rex->lastparen) /* the saved end is only written back for groups at or below the restored lastparen */
 378             rex->offs[paren].end = tmps;
 379         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 380             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 381             (UV)paren,
 382             (IV)rex->offs[paren].start,
 383             (IV)rex->offs[paren].start_tmp,
 384             (IV)rex->offs[paren].end,
 385             (paren > rex->lastparen ? "(skipped)" : ""));
 386         );
 387         paren--;
 388     }
 389 #if 1
 390     /* It would seem that the similar code in regtry()
 391      * already takes care of this, and in fact it is in
 392      * a better location too, since this code can be #if 0-ed out
 393      * but the code in regtry() is needed or otherwise tests
 394      * requiring null fields (pat.t#187 and split.t#{13,14}
 395      * (as of patchlevel 7877)  will fail.  Then again,
 396      * this code seems to be necessary or otherwise
 397      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 398      * --jhi updated by dapm */
 399     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 400         if (i > *maxopenparen_p) /* start is only cleared for groups above the restored maxopenparen */
 401             rex->offs[i].start = -1;
 402         rex->offs[i].end = -1;
 403         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 404             "    \\%"UVuf": %s   ..-1 undeffing\n",
 405             (UV)i,
 406             (i > *maxopenparen_p) ? "-1" : "  "
 407         ));
 408     }
 409 #endif
 410 }
 411
 412 /* restore the parens and associated vars at savestack position ix,
 413  * but without popping the stack */
 414
 415 STATIC void
 416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 417 {
 418     I32 tmpix = PL_savestack_ix; /* remember the real top of the savestack */
 419     PL_savestack_ix = ix; /* temporarily make regcppop() read the frame that ends at ix */
 420     regcppop(rex, maxopenparen_p);
 421     PL_savestack_ix = tmpix; /* put the real top back, so the stack depth is unchanged on return */
 422 }
 423
 424 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 425
 426 STATIC bool
 427 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 428 {
 429     /* Returns a boolean as to whether or not 'character' is a member of the
 430      * Posix character class given by 'classnum' that should be equivalent to a
 431      * value in the typedef '_char_class_number'.
 432      *
 433      * Ideally this could be replaced by just an array of function pointers
 434      * to the C library functions that implement the macros this calls.
 435      * However, to compile, the precise function signatures are required, and
 436      * these may vary from platform to platform.  To avoid having to figure
 437      * out what those all are on each platform, I (khw) am using this method,
 438      * which adds an extra layer of function call overhead (unless the C
 439      * optimizer strips it away).  But we don't particularly care about
 440      * performance with locales anyway.  An unrecognised 'classnum' croaks. */
 441
 442     switch ((_char_class_number) classnum) {
 443         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 444         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 445         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 446         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 447         case _CC_ENUM_CASED:     return isLOWER_LC(character) /* CASED is tested here as lower-or-upper */
 448                                         || isUPPER_LC(character);
 449         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 450         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 451         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 452         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 453         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 454         case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 455         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 456         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 457         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 458         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 459         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 460         default:    /* VERTSPACE should never occur in locales */
 461             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 462     }
 463
 464     assert(0); /* NOTREACHED */
 465     return FALSE;
 466 }
 467
 468 STATIC bool
 469 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 470 {
 471     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 472      * 'character' is a member of the Posix character class given by 'classnum'
 473      * that should be equivalent to a value in the typedef
 474      * '_char_class_number'.
 475      *
 476      * This just calls isFOO_lc on the code point for the character if it is in
 477      * the range 0-255.  Outside that range, all characters use Unicode
 478      * rules, ignoring any locale.  So use the Unicode function if this class
 479      * requires a swash, and use the Unicode macro otherwise. */
 480
 481     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 482
 483     if (UTF8_IS_INVARIANT(*character)) { /* the first byte is passed to isFOO_lc() as-is */
 484         return isFOO_lc(classnum, *character);
 485     }
 486     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { /* two-byte sequence: rebuild the code point from both bytes first */
 487         return isFOO_lc(classnum,
 488                         TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
 489     }
 490
 491     if (classnum < _FIRST_NON_SWASH_CC) { /* classes below this bound are answered from a swash */
 492
 493         /* Initialize the swash unless done already */
 494         if (! PL_utf8_swash_ptrs[classnum]) {
 495             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 496             PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
 497                 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
 498         }
 499
 500         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 501                                  character,
 502                                  TRUE /* is UTF */ ));
 503     }
 504
 505     switch ((_char_class_number) classnum) { /* remaining classes are answered by the is_..._high() macros */
 506         case _CC_ENUM_SPACE:
 507         case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 508
 509         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 510         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 511         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 512         default:                 return 0;  /* Things like CNTRL are always
 513                                                below 256 */
 514     }
 515
 516     assert(0); /* NOTREACHED */
 517     return FALSE;
 518 }
 519
 520 /*
 521  * pregexec and friends
 522  */
 523
 524 #ifndef PERL_IN_XSUB_RE
 525 /*
 526  - pregexec - match a regexp against a string; forwards to regexec_flags() and returns its result
 527  */
 528 I32
 529 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 530          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 531 /* stringarg: the point in the string at which to begin matching */
 532 /* strend:    pointer to null at end of string */
 533 /* strbeg:    real beginning of string */
 534 /* minend:    end of match must be >= minend bytes after stringarg. */
 535 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 536  *            itself is accessed via the pointers above */
 537 /* nosave:    For optimizations: when non-zero, REXEC_COPY_STR is left out of the flags passed on. */
 538 {
 539     PERL_ARGS_ASSERT_PREGEXEC;
 540
 541     return
 542         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 543                       nosave ? 0 : REXEC_COPY_STR);
 544 }
 545 #endif
 546
 547 /*
 548  * Need to implement the following flags for reg_anch:
 549  *
 550  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 551  * USE_INTUIT_ML
 552  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 553  * INTUIT_AUTORITATIVE_ML
 554  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 555  * INTUIT_ONCE_ML
 556  *
 557  * Another flag for this function: SECOND_TIME (so that float substrs
 558  * with giant delta may be not rechecked).
 559  */
 560
 561 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 562
 563 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 564    Otherwise, only SvCUR(sv) is used to get strbeg. */
 565
 566 /* XXXX We assume that strpos is strbeg unless sv. */
 567
 568 /* XXXX Some places assume that there is a fixed substring.
 569         An update may be needed if optimizer marks as "INTUITable"
 570         RExen without fixed substrings.  Similarly, it is assumed that
 571         lengths of all the strings are no more than minlen, thus they
 572         cannot come from lookahead.
 573         (Or minlen should take into account lookahead.)
 574   NOTE: Some of this comment is not correct. minlen does now take account
 575   of lookahead/behind. Further research is required. -- demerphq
 576
 577 */
 578
 579 /* A failure to find a constant substring means that there is no need to make
 580    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 581    finding a substring too deep into the string means that fewer calls to
 582    regtry() should be needed.
 583
 584    REx compiler's optimizer found 4 possible hints:
 585         a) Anchored substring;
 586         b) Fixed substring;
 587         c) Whether we are anchored (beginning-of-line or \G);
 588         d) First node (of those at offset 0) which may distinguish positions;
 589    We use a)b)d) and multiline-part of c), and try to find a position in the
 590    string which does not contradict any of them.
 591  */
 592
 593 /* Most of the decisions we make here should have been made at compile time.
 594    The nodes of the REx which we used for the search should have been
 595    deleted from the finite automaton. */
 596
 597 /* args:
 598  * rx:     the regex to match against
 599  * sv:     the SV being matched: only used for utf8 flag; the string
 600  *         itself is accessed via the pointers below. Note that on
 601  *         something like an overloaded SV, SvPOK(sv) may be false
 602  *         and the string pointers may point to something unrelated to
 603  *         the SV itself.
 604  * strbeg: real beginning of string
 605  * strpos: the point in the string at which to begin matching
 606  * strend: pointer to the byte following the last char of the string
 607  * flags   currently unused; set to 0
 608  * data:   currently unused; set to NULL
 609  */
 610
 611 char *
 612 Perl_re_intuit_start(pTHX_
 613                     REGEXP * const rx,
 614                     SV *sv,
 615                     const char * const strbeg,
 616                     char *strpos,
 617                     char *strend,
 618                     const U32 flags,
 619                     re_scream_pos_data *data)
 620 {
 621     dVAR;
 622     struct regexp *const prog = ReANY(rx);
 623     I32 start_shift = 0;
 624     /* Should be nonnegative! */
 625     I32 end_shift   = 0;
 626     char *s;
 627     SV *check;
 628     char *t;
 629     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 630     I32 ml_anch;
 631     char *other_last = NULL;    /* other substr checked before this */
 632     char *check_at = NULL;              /* check substr found at this pos */
 633     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 634     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 635     RXi_GET_DECL(prog,progi);
 636     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 637     regmatch_info *const reginfo = &reginfo_buf;
 638 #ifdef DEBUGGING
 639     const char * const i_strpos = strpos;
 640 #endif
 641     GET_RE_DEBUG_FLAGS_DECL;
 642
 643     PERL_ARGS_ASSERT_RE_INTUIT_START;
 644     PERL_UNUSED_ARG(flags);
 645     PERL_UNUSED_ARG(data);
 646
 647     /* CHR_DIST() would be more correct here but it makes things slow. */
 648     if (prog->minlen > strend - strpos) {
 649         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 650                               "String too short... [re_intuit_start]\n"));
 651         goto fail;
 652     }
 653
 654     reginfo->is_utf8_target = cBOOL(utf8_target);
 655     reginfo->info_aux = NULL;
 656     reginfo->strbeg = strbeg;
 657     reginfo->strend = strend;
 658     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 659     reginfo->intuit = 1;
 660     /* not actually used within intuit, but zero for safety anyway */
 661     reginfo->poscache_maxiter = 0;
 662
 663     if (utf8_target) {
 664         if (!prog->check_utf8 && prog->check_substr)
 665             to_utf8_substr(prog);
 666         check = prog->check_utf8;
 667     } else {
 668         if (!prog->check_substr && prog->check_utf8) {
 669             if (! to_byte_substr(prog)) {
 670                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 671             }
 672         }
 673         check = prog->check_substr;
 674     }
 675     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 676         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 677                      || ( (prog->extflags & RXf_ANCH_BOL)
 678                           && !multiline ) );    /* Check after \n? */
 679
 680         if (!ml_anch) {
 681           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 682                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 683                && (strpos != strbeg)) {
 684               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 685               goto fail;
 686           }
 687           if (prog->check_offset_min == prog->check_offset_max
 688               && !(prog->extflags & RXf_CANY_SEEN)
 689               && ! multiline)   /* /m can cause \n's to match that aren't
 690                                    accounted for in the string max length.
 691                                    See [perl #115242] */
 692           {
 693             /* Substring at constant offset from beg-of-str... */
 694             I32 slen;
 695
 696             s = HOP3c(strpos, prog->check_offset_min, strend);
 697
 698             if (SvTAIL(check)) {
 699                 slen = SvCUR(check);    /* >= 1 */
 700
 701                 if ( strend - s > slen || strend - s < slen - 1
 702                      || (strend - s == slen && strend[-1] != '\n')) {
 703                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 704                     goto fail_finish;
 705                 }
 706                 /* Now should match s[0..slen-2] */
 707                 slen--;
 708                 if (slen && (*SvPVX_const(check) != *s
 709                              || (slen > 1
 710                                  && memNE(SvPVX_const(check), s, slen)))) {
 711                   report_neq:
 712                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 713                     goto fail_finish;
 714                 }
 715             }
 716             else if (*SvPVX_const(check) != *s
 717                      || ((slen = SvCUR(check)) > 1
 718                          && memNE(SvPVX_const(check), s, slen)))
 719                 goto report_neq;
 720             check_at = s;
 721             goto success_at_start;
 722           }
 723         }
 724         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 725         s = strpos;
 726         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 727         end_shift = prog->check_end_shift;
 728
 729         if (!ml_anch) {
 730             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 731                                          - (SvTAIL(check) != 0);
 732             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 733
 734             if (end_shift < eshift)
 735                 end_shift = eshift;
 736         }
 737     }
 738     else {                              /* Can match at random position */
 739         ml_anch = 0;
 740         s = strpos;
 741         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 742         end_shift = prog->check_end_shift;
 743
 744         /* end shift should be non negative here */
 745     }
 746
 747 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 748     if (end_shift < 0)
 749         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 750                    (IV)end_shift, RX_PRECOMP(prog));
 751 #endif
 752
 753   restart:
 754     /* Find a possible match in the region s..strend by looking for
 755        the "check" substring in the region corrected by start/end_shift. */
 756
 757     {
 758         I32 srch_start_shift = start_shift;
 759         I32 srch_end_shift = end_shift;
 760         U8* start_point;
 761         U8* end_point;
 762         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 763             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 764             srch_start_shift = strbeg - s;
 765         }
 766     DEBUG_OPTIMISE_MORE_r({
 767         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 768             (IV)prog->check_offset_min,
 769             (IV)srch_start_shift,
 770             (IV)srch_end_shift,
 771             (IV)prog->check_end_shift);
 772     });
 773
 774         if (prog->extflags & RXf_CANY_SEEN) {
 775             start_point= (U8*)(s + srch_start_shift);
 776             end_point= (U8*)(strend - srch_end_shift);
 777         } else {
 778             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 779             end_point= HOP3(strend, -srch_end_shift, strbeg);
 780         }
 781         DEBUG_OPTIMISE_MORE_r({
 782             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 783                 (int)(end_point - start_point),
 784                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 785                 start_point);
 786         });
 787
 788         s = fbm_instr( start_point, end_point,
 789                       check, multiline ? FBMrf_MULTILINE : 0);
 790     }
 791     /* Update the count-of-usability, remove useless subpatterns,
 792         unshift s.  */
 793
 794     DEBUG_EXECUTE_r({
 795         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 796             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 797         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 798                           (s ? "Found" : "Did not find"),
 799             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 800                 ? "anchored" : "floating"),
 801             quoted,
 802             RE_SV_TAIL(check),
 803             (s ? " at offset " : "...\n") );
 804     });
 805
 806     if (!s)
 807         goto fail_finish;
 808     /* Finish the diagnostic message */
 809     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 810
 811     /* XXX dmq: first branch is for positive lookbehind...
 812        Our check string is offset from the beginning of the pattern.
 813        So we need to do any stclass tests offset forward from that
 814        point. I think. :-(
 815      */
 816
 817
 818
 819     check_at=s;
 820
 821
 822     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 823        Start with the other substr.
 824        XXXX no SCREAM optimization yet - and a very coarse implementation
 825        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 826                 *always* match.  Probably should be marked during compile...
 827        Probably it is right to do no SCREAM here...
 828      */
 829
 830     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 831                 : (prog->float_substr && prog->anchored_substr))
 832     {
 833         /* Take into account the "other" substring. */
 834         /* XXXX May be hopelessly wrong for UTF... */
 835         if (!other_last)
 836             other_last = strpos;
 837         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 838           do_other_anchored:
 839             {
 840                 char * const last = HOP3c(s, -start_shift, strbeg);
 841                 char *last1, *last2;
 842                 char * const saved_s = s;
 843                 SV* must;
 844
 845                 t = s - prog->check_offset_max;
 846                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 847                     && (!utf8_target
 848                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 849                             && t > strpos)))
 850                     NOOP;
 851                 else
 852                     t = strpos;
 853                 t = HOP3c(t, prog->anchored_offset, strend);
 854                 if (t < other_last)     /* These positions already checked */
 855                     t = other_last;
 856                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 857                 if (last < last1)
 858                     last1 = last;
 859                 /* XXXX It is not documented what units *_offsets are in.
 860                    We assume bytes, but this is clearly wrong.
 861                    Meaning this code needs to be carefully reviewed for errors.
 862                    dmq.
 863                   */
 864
 865                 /* On end-of-str: see comment below. */
 866                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 867                 if (must == &PL_sv_undef) {
 868                     s = (char*)NULL;
 869                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 870                 }
 871                 else
 872                     s = fbm_instr(
 873                         (unsigned char*)t,
 874                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 875                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 876                         must,
 877                         multiline ? FBMrf_MULTILINE : 0
 878                     );
 879                 DEBUG_EXECUTE_r({
 880                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 881                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 882                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 883                         (s ? "Found" : "Contradicts"),
 884                         quoted, RE_SV_TAIL(must));
 885                 });
 886
 887
 888                 if (!s) {
 889                     if (last1 >= last2) {
 890                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 891                                                 ", giving up...\n"));
 892                         goto fail_finish;
 893                     }
 894                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 895                         ", trying floating at offset %ld...\n",
 896                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 897                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 898                     s = HOP3c(last, 1, strend);
 899                     goto restart;
 900                 }
 901                 else {
 902                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 903                           (long)(s - i_strpos)));
 904                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 905                     other_last = HOP3c(s, 1, strend);
 906                     s = saved_s;
 907                     if (t == strpos)
 908                         goto try_at_start;
 909                     goto try_at_offset;
 910                 }
 911             }
 912         }
 913         else {          /* Take into account the floating substring. */
 914             char *last, *last1;
 915             char * const saved_s = s;
 916             SV* must;
 917
 918             t = HOP3c(s, -start_shift, strbeg);
 919             last1 = last =
 920                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 921             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 922                 last = HOP3c(t, prog->float_max_offset, strend);
 923             s = HOP3c(t, prog->float_min_offset, strend);
 924             if (s < other_last)
 925                 s = other_last;
 926  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 927             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 928             /* fbm_instr() takes into account exact value of end-of-str
 929                if the check is SvTAIL(ed).  Since false positives are OK,
 930                and end-of-str is not later than strend we are OK. */
 931             if (must == &PL_sv_undef) {
 932                 s = (char*)NULL;
 933                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 934             }
 935             else
 936                 s = fbm_instr((unsigned char*)s,
 937                               (unsigned char*)last + SvCUR(must)
 938                                   - (SvTAIL(must)!=0),
 939                               must, multiline ? FBMrf_MULTILINE : 0);
 940             DEBUG_EXECUTE_r({
 941                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 942                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 943                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 944                     (s ? "Found" : "Contradicts"),
 945                     quoted, RE_SV_TAIL(must));
 946             });
 947             if (!s) {
 948                 if (last1 == last) {
 949                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 950                                             ", giving up...\n"));
 951                     goto fail_finish;
 952                 }
 953                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 954                     ", trying anchored starting at offset %ld...\n",
 955                     (long)(saved_s + 1 - i_strpos)));
 956                 other_last = last;
 957                 s = HOP3c(t, 1, strend);
 958                 goto restart;
 959             }
 960             else {
 961                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 962                       (long)(s - i_strpos)));
 963                 other_last = s; /* Fix this later. --Hugo */
 964                 s = saved_s;
 965                 if (t == strpos)
 966                     goto try_at_start;
 967                 goto try_at_offset;
 968             }
 969         }
 970     }
 971
 972
 973     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 974
 975     DEBUG_OPTIMISE_MORE_r(
 976         PerlIO_printf(Perl_debug_log,
 977             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 978             (IV)prog->check_offset_min,
 979             (IV)prog->check_offset_max,
 980             (IV)(s-strpos),
 981             (IV)(t-strpos),
 982             (IV)(t-s),
 983             (IV)(strend-strpos)
 984         )
 985     );
 986
 987     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 988         && (!utf8_target
 989             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 990                  && t > strpos)))
 991     {
 992         /* Fixed substring is found far enough so that the match
 993            cannot start at strpos. */
 994       try_at_offset:
 995         if (ml_anch && t[-1] != '\n') {
 996             /* Eventually fbm_*() should handle this, but often
 997                anchored_offset is not 0, so this check will not be wasted. */
 998             /* XXXX In the code below we prefer to look for "^" even in
 999                presence of anchored substrings.  And we search even
1000                beyond the found float position.  These pessimizations
1001                are historical artefacts only.  */
1002           find_anchor:
1003             while (t < strend - prog->minlen) {
1004                 if (*t == '\n') {
1005                     if (t < check_at - prog->check_offset_min) {
1006                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1007                             /* Since we moved from the found position,
1008                                we definitely contradict the found anchored
1009                                substr.  Due to the above check we do not
1010                                contradict "check" substr.
1011                                Thus we can arrive here only if check substr
1012                                is float.  Redo checking for "other"=="fixed".
1013                              */
1014                             strpos = t + 1;
1015                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1016                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1017                             goto do_other_anchored;
1018                         }
1019                         /* We don't contradict the found floating substring. */
1020                         /* XXXX Why not check for STCLASS? */
1021                         s = t + 1;
1022                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1023                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1024                         goto set_useful;
1025                     }
1026                     /* Position contradicts check-string */
1027                     /* XXXX probably better to look for check-string
1028                        than for "\n", so one should lower the limit for t? */
1029                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1030                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1031                     other_last = strpos = s = t + 1;
1032                     goto restart;
1033                 }
1034                 t++;
1035             }
1036             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1037                         PL_colors[0], PL_colors[1]));
1038             goto fail_finish;
1039         }
1040         else {
1041             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1042                         PL_colors[0], PL_colors[1]));
1043         }
1044         s = t;
1045       set_useful:
1046         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1047     }
1048     else {
1049         /* The found string does not prohibit matching at strpos,
1050            - no optimization of calling REx engine can be performed,
1051            unless it was an MBOL and we are not after MBOL,
1052            or a future STCLASS check will fail this. */
1053       try_at_start:
1054         /* Even in this situation we may use MBOL flag if strpos is offset
1055            wrt the start of the string. */
1056         if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1057             /* May be due to an implicit anchor of m{.*foo}  */
1058             && !(prog->intflags & PREGf_IMPLICIT))
1059         {
1060             t = strpos;
1061             goto find_anchor;
1062         }
1063         DEBUG_EXECUTE_r( if (ml_anch)
1064             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1065                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1066         );
1067       success_at_start:
1068         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1069             && (utf8_target ? (
1070                 prog->check_utf8                /* Could be deleted already */
1071                 && --BmUSEFUL(prog->check_utf8) < 0
1072                 && (prog->check_utf8 == prog->float_utf8)
1073             ) : (
1074                 prog->check_substr              /* Could be deleted already */
1075                 && --BmUSEFUL(prog->check_substr) < 0
1076                 && (prog->check_substr == prog->float_substr)
1077             )))
1078         {
1079             /* If flags & SOMETHING - do not do it many times on the same match */
1080             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1081             /* XXX Does the destruction order has to change with utf8_target? */
1082             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1083             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1084             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1085             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1086             check = NULL;                       /* abort */
1087             s = strpos;
1088             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1089                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1090             if (prog->intflags & PREGf_IMPLICIT)
1091                 prog->extflags &= ~RXf_ANCH_MBOL;
1092             /* XXXX This is a remnant of the old implementation.  It
1093                     looks wasteful, since now INTUIT can use many
1094                     other heuristics. */
1095             prog->extflags &= ~RXf_USE_INTUIT;
1096             /* XXXX What other flags might need to be cleared in this branch? */
1097         }
1098         else
1099             s = strpos;
1100     }
1101
1102     /* Last resort... */
1103     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1104     /* trie stclasses are too expensive to use here, we are better off to
1105        leave it to regmatch itself */
1106     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1107         /* minlen == 0 is possible if regstclass is \b or \B,
1108            and the fixed substr is ''$.
1109            Since minlen is already taken into account, s+1 is before strend;
1110            accidentally, minlen >= 1 guaranties no false positives at s + 1
1111            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1112            regstclass does not come from lookahead...  */
1113         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1114            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1115         const U8* const str = (U8*)STRING(progi->regstclass);
1116         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1117                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1118                     : 1);
1119         char * endpos;
1120         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1121             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1122         else if (prog->float_substr || prog->float_utf8)
1123             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1124         else
1125             endpos= strend;
1126
1127         if (checked_upto < s)
1128            checked_upto = s;
1129         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1130                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1131
1132         t = s;
1133         s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1134                             reginfo);
1135         if (s) {
1136             checked_upto = s;
1137         } else {
1138 #ifdef DEBUGGING
1139             const char *what = NULL;
1140 #endif
1141             if (endpos == strend) {
1142                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1143                                 "Could not match STCLASS...\n") );
1144                 goto fail;
1145             }
1146             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1147                                    "This position contradicts STCLASS...\n") );
1148             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1149                 goto fail;
1150             checked_upto = HOPBACKc(endpos, start_shift);
1151             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1152                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1153             /* Contradict one of substrings */
1154             if (prog->anchored_substr || prog->anchored_utf8) {
1155                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1156                     DEBUG_EXECUTE_r( what = "anchored" );
1157                   hop_and_restart:
1158                     s = HOP3c(t, 1, strend);
1159                     if (s + start_shift + end_shift > strend) {
1160                         /* XXXX Should be taken into account earlier? */
1161                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1162                                                "Could not match STCLASS...\n") );
1163                         goto fail;
1164                     }
1165                     if (!check)
1166                         goto giveup;
1167                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1168                                 "Looking for %s substr starting at offset %ld...\n",
1169                                  what, (long)(s + start_shift - i_strpos)) );
1170                     goto restart;
1171                 }
1172                 /* Have both, check_string is floating */
1173                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1174                     goto retry_floating_check;
1175                 /* Recheck anchored substring, but not floating... */
1176                 s = check_at;
1177                 if (!check)
1178                     goto giveup;
1179                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1180                           "Looking for anchored substr starting at offset %ld...\n",
1181                           (long)(other_last - i_strpos)) );
1182                 goto do_other_anchored;
1183             }
1184             /* Another way we could have checked stclass at the
1185                current position only: */
1186             if (ml_anch) {
1187                 s = t = t + 1;
1188                 if (!check)
1189                     goto giveup;
1190                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1191                           "Looking for /%s^%s/m starting at offset %ld...\n",
1192                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1193                 goto try_at_offset;
1194             }
1195             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1196                 goto fail;
1197             /* Check is floating substring. */
1198           retry_floating_check:
1199             t = check_at - start_shift;
1200             DEBUG_EXECUTE_r( what = "floating" );
1201             goto hop_and_restart;
1202         }
1203         if (t != s) {
1204             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1205                         "By STCLASS: moving %ld --> %ld\n",
1206                                   (long)(t - i_strpos), (long)(s - i_strpos))
1207                    );
1208         }
1209         else {
1210             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1211                                   "Does not contradict STCLASS...\n");
1212                    );
1213         }
1214     }
1215   giveup:
1216     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1217                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1218                           PL_colors[5], (long)(s - i_strpos)) );
1219     return s;
1220
1221   fail_finish:                          /* Substring not found */
1222     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1223         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1224   fail:
1225     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1226                           PL_colors[4], PL_colors[5]));
1227     return NULL;
1228 }
1229
/*
 * DECL_TRIE_TYPE(scan) - declare a block-local constant 'trie_type' that
 * records how characters must be read for the trie node 'scan'.
 *
 * The value is chosen from two inputs: whether scan's 'flags' field equals
 * EXACT, and the 'utf8_target' variable that must be in scope at the
 * expansion site:
 *
 *      flags == EXACT, utf8_target      -> trie_utf8
 *      flags == EXACT, !utf8_target     -> trie_plain
 *      flags != EXACT, utf8_target      -> trie_utf8_fold
 *      flags != EXACT, !utf8_target     -> trie_latin_utf8_fold
 *
 * The expansion is a declaration without its trailing semicolon; write
 * "DECL_TRIE_TYPE(node);" among the declarations of a block.  'scan' is
 * parenthesized so that any pointer expression (for example "&node") can be
 * passed safely.
 */
#define DECL_TRIE_TYPE(scan) \
    const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
                    trie_type = (((scan)->flags == EXACT) \
                              ? (utf8_target ? trie_utf8 : trie_plain) \
                              : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1235
/*
 * REXEC_TRIE_READ_CHAR - read the next code point for a trie match and
 * translate it into the trie's character id.
 *
 *   trie_type    trie_plain, trie_utf8, trie_utf8_fold or
 *                trie_latin_utf8_fold; selects how the input is read
 *   trie         pointer to a structure whose charmap[] array is indexed by
 *                code points below 256
 *   widecharmap  hash consulted for code points >= 256, keyed by the bytes
 *                of the UV; may be NULL
 *   uc           current read position in the target string
 *   uscan        (lvalue) cursor into foldbuf while a fold is being drained
 *   len          (lvalue) in the fold cases: set to 0 when the code point
 *                came out of foldbuf, to UTF8SKIP(uc) (utf8 fold) or 1
 *                (latin1 fold) when a new character was folded; set to 1
 *                for trie_plain; filled in by utf8n_to_uvuni() for trie_utf8
 *   uvc          (lvalue) receives the code point
 *   charid       (lvalue) receives trie->charmap[uvc] for uvc < 256;
 *                otherwise the value found in widecharmap, or 0 when there
 *                is no hash or no entry
 *   foldlen      (lvalue) number of fold bytes still pending in foldbuf
 *   foldbuf      buffer that receives the fold of the current character
 *   uniflags     flags handed to utf8n_to_uvuni()
 *
 * For the two folding types a pending fold (foldlen > 0) is consumed first;
 * only when none is pending is a new character folded from uc.
 *
 * Parameters are parenthesized wherever operator precedence could bite, so
 * expressions such as "p + 1" are safe for uc.  Most parameters are
 * evaluated more than once: do not pass expressions with side effects.
 */
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
STMT_START {                                                                        \
    STRLEN skiplen;                                                                 \
    switch (trie_type) {                                                            \
    case trie_utf8_fold:                                                            \
        if ( (foldlen) > 0 ) {                                                      \
            (uvc) = utf8n_to_uvuni( (const U8*) (uscan), UTF8_MAXLEN, &(len), (uniflags) ); \
            (foldlen) -= (len);                                                     \
            (uscan) += (len);                                                       \
            (len) = 0;                                                              \
        } else {                                                                    \
            (uvc) = to_utf8_fold( (const U8*) (uc), (foldbuf), &(foldlen) );        \
            (len) = UTF8SKIP(uc);                                                   \
            skiplen = UNISKIP( uvc );                                               \
            (foldlen) -= skiplen;                                                   \
            (uscan) = (foldbuf) + skiplen;                                          \
        }                                                                           \
        break;                                                                      \
    case trie_latin_utf8_fold:                                                      \
        if ( (foldlen) > 0 ) {                                                      \
            (uvc) = utf8n_to_uvuni( (const U8*) (uscan), UTF8_MAXLEN, &(len), (uniflags) ); \
            (foldlen) -= (len);                                                     \
            (uscan) += (len);                                                       \
            (len) = 0;                                                              \
        } else {                                                                    \
            (len) = 1;                                                              \
            (uvc) = _to_fold_latin1( (U8) *(uc), (foldbuf), &(foldlen), FOLD_FLAGS_FULL); \
            skiplen = UNISKIP( uvc );                                               \
            (foldlen) -= skiplen;                                                   \
            (uscan) = (foldbuf) + skiplen;                                          \
        }                                                                           \
        break;                                                                      \
    case trie_utf8:                                                                 \
        (uvc) = utf8n_to_uvuni( (const U8*) (uc), UTF8_MAXLEN, &(len), (uniflags) ); \
        break;                                                                      \
    case trie_plain:                                                                \
        (uvc) = (UV)*(uc);                                                          \
        (len) = 1;                                                                  \
        break;                                                                      \
    }                                                                               \
    if ((uvc) < 256) {                                                              \
        (charid) = (trie)->charmap[ (uvc) ];                                        \
    }                                                                               \
    else {                                                                          \
        (charid) = 0;                                                               \
        if (widecharmap) {                                                          \
            SV** const svpp = hv_fetch(widecharmap,                                 \
                        (char*)&(uvc), sizeof(UV), 0);                              \
            if (svpp)                                                               \
                (charid) = (U16)SvIV(*svpp);                                        \
        }                                                                           \
    }                                                                               \
} STMT_END
1288
/*
 * REXEC_FBC_EXACTISH_SCAN(CoNd) - walk 's' one byte at a time from its
 * current value up to and including 'e', looking for a start position.
 *
 * A position is accepted, by jumping to the 'got_it' label, when all of the
 * following hold, tested in this order:
 *   1. CoNd is true (a quick first-character test supplied by the caller);
 *   2. ln is 1, or folder(s, pat_string, ln) reports a match;
 *   3. reginfo->intuit is set, or regtry(reginfo, &s) succeeds.
 *
 * 's', 'e', 'ln', 'pat_string', 'folder', 'reginfo' and the 'got_it' label
 * are taken from the expansion site.  If no position is accepted the loop
 * ends with s > e.
 */
#define REXEC_FBC_EXACTISH_SCAN(CoNd)                          \
STMT_START {                                                   \
    for ( ; s <= e; s++) {                                     \
        if (!(CoNd))                                           \
            continue;                                          \
        if (ln != 1 && !folder(s, pat_string, ln))             \
            continue;                                          \
        if (reginfo->intuit || regtry(reginfo, &s))            \
            goto got_it;                                       \
    }                                                          \
} STMT_END
1299
/*
 * REXEC_FBC_UTF8_SCAN(CoDe) - run the statement(s) CoDe at every position
 * from 's' up to (but not including) 'strend', stepping 's' forward by
 * UTF8SKIP(s) bytes after each pass.
 *
 * 's' and 'strend' come from the expansion site.  CoDe is pasted verbatim
 * into the loop body, so it must supply its own terminating ';' or braces
 * and may leave the loop with goto.
 */
#define REXEC_FBC_UTF8_SCAN(CoDe)                          \
STMT_START {                                               \
    while (strend > s) {                                   \
        CoDe                                               \
        s = s + UTF8SKIP(s);                               \
    }                                                      \
} STMT_END
1307
1308 #define REXEC_FBC_SCAN(CoDe) /* Byte-wise counterpart of REXEC_FBC_UTF8_SCAN: execute CoDe at every s below strend */ \
1309 STMT_START {                                          \
1310     while (s < strend) {                              \
1311         CoDe /* pasted verbatim, so it must bring its own statement terminators */ \
1312         s++; /* one byte per step; skipped if CoDe jumps away */ \
1313     }                                                 \
1314 } STMT_END
1315
1316 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) /* UTF-8 scan that attempts a match at positions where CoNd holds */ \
1317 REXEC_FBC_UTF8_SCAN(                                            \
1318     if (! (CoNd)) {                                             \
1319         tmp = 1;        /* position fails CoNd: re-arm so the next hit is tried */ \
1320     }                                                           \
1321     else if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1322         goto got_it;                                            \
1323     else {                                                      \
1324         tmp = doevery;  /* hit not tried or try failed; if doevery is 0 further hits are skipped until re-armed */ \
1325     }                                                           \
1326 )
1327
1328 #define REXEC_FBC_CLASS_SCAN(CoNd) /* Byte-wise scan that attempts a match at positions where CoNd holds */ \
1329 REXEC_FBC_SCAN(                                                 \
1330     if (! (CoNd)) {                                             \
1331         tmp = 1;        /* position fails CoNd: re-arm so the next hit is tried */ \
1332     }                                                           \
1333     else if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1334         goto got_it;                                            \
1335     else {                                                      \
1336         tmp = doevery;  /* hit not tried or try failed; if doevery is 0 further hits are skipped until re-armed */ \
1337     }                                                           \
1338 )
1339
1340 #define REXEC_FBC_TRYIT /* Attempt a match at s. Expands to a bare, unterminated if: use only as a complete statement inside braces */ \
1341 if ((reginfo->intuit || regtry(reginfo, &s))) /* regtry is not called when reginfo->intuit is set */ \
1342     goto got_it
1343
1344 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) /* Class scan using CoNdUtF8 for a UTF-8 target and CoNd otherwise */ \
1345     if (! utf8_target) {                                       \
1346         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1347     }                                                          \
1348     else {                                                     \
1349         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1350     }
1351
1352 #define DUMP_EXEC_POS(li,s,doutf8) /* Shorthand for dump_exec_pos(): string bounds come from reginfo, start from the local startpos */ \
1353     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1354                 startpos, doutf8)
1355
1356
1357 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) /* UTF-8 scan that classifies single bytes with TEST_NON_UTF8; IF_SUCCESS runs where the result flips, IF_FAIL elsewhere */ \
1358         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; /* previous byte, or a newline at string start */ \
1359         tmp = TEST_NON_UTF8(tmp); /* tmp now holds the test result for that previous byte */ \
1360         REXEC_FBC_UTF8_SCAN(                                                   \
1361             if (tmp == ! TEST_NON_UTF8((U8) *s)) { /* result at s differs from the previous one (assuming the test yields 0 or 1) */ \
1362                 tmp = !tmp; /* remember the new state for the next position */ \
1363                 IF_SUCCESS;                                                    \
1364             }                                                                  \
1365             else {                                                             \
1366                 IF_FAIL;                                                       \
1367             }                                                                  \
1368         );                                                                     \
1369
1370 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) /* UTF-8 scan on decoded characters; IF_SUCCESS runs where the test result flips, IF_FAIL elsewhere */ \
1371         if (s == reginfo->strbeg) {                                            \
1372             tmp = '\n'; /* no previous character: pretend it was a newline */ \
1373         }                                                                      \
1374         else {                                                                 \
1375             U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); /* r: one hop back from s, limited by strbeg */ \
1376             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); /* decode the character found at r */ \
1377         }                                                                      \
1378         tmp = TeSt1_UtF8; /* caller expression, expected to be written in terms of tmp */ \
1379         LOAD_UTF8_CHARCLASS_ALNUM(); /* NOTE(review): presumably makes the word-character data available before the loop - confirm */ \
1380         REXEC_FBC_UTF8_SCAN(                                                   \
1381             if (tmp == ! (TeSt2_UtF8)) { /* caller expression for the character at s; true here means the result flipped */ \
1382                 tmp = !tmp; /* remember the new state for the next position */ \
1383                 IF_SUCCESS;                                                    \
1384             }                                                                  \
1385             else {                                                             \
1386                 IF_FAIL;                                                       \
1387             }                                                                  \
1388         );                                                                     \
1389
1390 /* The only difference between the BOUND and NBOUND cases is that
1391  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1392  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1393  * with the other one being empty */
1394 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) /* Boundary search: REXEC_FBC_TRYIT where the test result flips; UTF-8 targets go through UTF8_LOAD */ \
1395     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1396
1397 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) /* As FBC_BOUND but via UTF8_NOLOAD; TEST1_UTF8 and TEST2_UTF8 are accepted yet unused in the expansion */ \
1398     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1399
1400 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) /* Non-boundary search: REXEC_FBC_TRYIT where the test result does NOT flip; UTF-8 targets go through UTF8_LOAD */ \
1401     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1402
1403 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) /* As FBC_NBOUND but via UTF8_NOLOAD; TEST1_UTF8 and TEST2_UTF8 are accepted yet unused in the expansion */ \
1404     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1405
1406
1407 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1408  * be passed in completely with the variable name being tested, which isn't
1409  * such a clean interface, but this is easier to read than it was before.  We
1410  * are looking for the boundary (or non-boundary) between a word and non-word
1411  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1412  * must be different.  Find the "wordness" of the character just prior to this
1413  * one, and compare it with the wordness of this one.  If they differ, we have
1414  * a boundary.  At the beginning of the string, pretend that the previous
1415  * character was a new-line */
1416 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) /* Scan from s to strend: IF_SUCCESS where the test result flips between neighbours, IF_FAIL where it does not */ \
1417     if (utf8_target) {                                                         \
1418                 UTF8_CODE /* complete UTF-8 scan supplied by the caller */ \
1419     }                                                                          \
1420     else {  /* Not utf8 */                                                     \
1421         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; /* previous byte, or a newline at string start */ \
1422         tmp = TEST_NON_UTF8(tmp);                                              \
1423         REXEC_FBC_SCAN(                                                        \
1424             if (tmp == ! TEST_NON_UTF8((U8) *s)) { /* result at s differs from the previous one */ \
1425                 tmp = !tmp;                                                    \
1426                 IF_SUCCESS;                                                    \
1427             }                                                                  \
1428             else {                                                             \
1429                 IF_FAIL;                                                       \
1430             }                                                                  \
1431         );                                                                     \
1432     }                                                                          \
1433     if (!prog->minlen) { /* s is at end of string, which counts as a non-word position: a true tmp is a flip, a false tmp is not; dispatch so BOUND and NBOUND each get their own case */ \
1434         if (tmp) { IF_SUCCESS; } else { IF_FAIL; } }
1435
1436 /* We know what class REx starts with.  Try to find this position... */
1437 /* if reginfo->intuit is set, it's a dry run */
1438 /* annoyingly all the vars in this routine have different names from their counterparts
1439    in regmatch. /grrr */
1440
1441 STATIC char *
1442 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1443     const char *strend, regmatch_info *reginfo)
1444 {
1445     dVAR;
1446     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1447     char *pat_string;   /* The pattern's exactish string */
1448     char *pat_end;          /* ptr to end char of pat_string */
1449     re_fold_t folder;   /* Function for computing non-utf8 folds */
1450     const U8 *fold_array;   /* array for folding ords < 256 */
1451     STRLEN ln;
1452     STRLEN lnc;
1453     U8 c1;
1454     U8 c2;
1455     char *e;
1456     I32 tmp = 1;        /* Scratch variable? */
1457     const bool utf8_target = reginfo->is_utf8_target;
1458     UV utf8_fold_flags = 0;
1459     const bool is_utf8_pat = reginfo->is_utf8_pat;
1460     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1461                                    with a result inverts that result, as 0^1 =
1462                                    1 and 1^1 = 0 */
1463     _char_class_number classnum;
1464
1465     RXi_GET_DECL(prog,progi);
1466
1467     PERL_ARGS_ASSERT_FIND_BYCLASS;
1468
1469     /* We know what class it must start with. */
1470     switch (OP(c)) {
1471     case ANYOF:
1472     case ANYOF_SYNTHETIC:
1473     case ANYOF_WARN_SUPER:
1474         if (utf8_target) {
1475             REXEC_FBC_UTF8_CLASS_SCAN(
1476                       reginclass(prog, c, (U8*)s, utf8_target));
1477         }
1478         else {
1479             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1480         }
1481         break;
1482     case CANY:
1483         REXEC_FBC_SCAN(
1484             if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1485                 goto got_it;
1486             else
1487                 tmp = doevery;
1488         );
1489         break;
1490
1491     case EXACTFA:
1492         if (is_utf8_pat || utf8_target) {
1493             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1494             goto do_exactf_utf8;
1495         }
1496         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1497         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1498         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1499
1500     case EXACTF:
1501         if (utf8_target) {
1502
1503             /* regcomp.c already folded this if pattern is in UTF-8 */
1504             utf8_fold_flags = 0;
1505             goto do_exactf_utf8;
1506         }
1507         fold_array = PL_fold;
1508         folder = foldEQ;
1509         goto do_exactf_non_utf8;
1510
1511     case EXACTFL:
1512         if (is_utf8_pat || utf8_target) {
1513             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1514             goto do_exactf_utf8;
1515         }
1516         fold_array = PL_fold_locale;
1517         folder = foldEQ_locale;
1518         goto do_exactf_non_utf8;
1519
1520     case EXACTFU_SS:
1521         if (is_utf8_pat) {
1522             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1523         }
1524         goto do_exactf_utf8;
1525
1526     case EXACTFU_TRICKYFOLD:
1527     case EXACTFU:
1528         if (is_utf8_pat || utf8_target) {
1529             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1530             goto do_exactf_utf8;
1531         }
1532
1533         /* Any 'ss' in the pattern should have been replaced by regcomp,
1534          * so we don't have to worry here about this single special case
1535          * in the Latin1 range */
1536         fold_array = PL_fold_latin1;
1537         folder = foldEQ_latin1;
1538
1539         /* FALL THROUGH */
1540
1541     do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1542                            are no glitches with fold-length differences
1543                            between the target string and pattern */
1544
1545         /* The idea in the non-utf8 EXACTF* cases is to first find the
1546          * first character of the EXACTF* node and then, if necessary,
1547          * case-insensitively compare the full text of the node.  c1 is the
1548          * first character.  c2 is its fold.  This logic will not work for
1549          * Unicode semantics and the german sharp ss, which hence should
1550          * not be compiled into a node that gets here. */
1551         pat_string = STRING(c);
1552         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1553
1554         /* We know that we have to match at least 'ln' bytes (which is the
1555          * same as characters, since not utf8).  If we have to match 3
1556          * characters, and there are only 2 available, we know without
1557          * trying that it will fail; so don't start a match past the
1558          * required minimum number from the far end */
1559         e = HOP3c(strend, -((I32)ln), s);
1560
1561         if (reginfo->intuit && e < s) {
1562             e = s;                      /* Due to minlen logic of intuit() */
1563         }
1564
1565         c1 = *pat_string;
1566         c2 = fold_array[c1];
1567         if (c1 == c2) { /* If char and fold are the same */
1568             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1569         }
1570         else {
1571             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1572         }
1573         break;
1574
1575     do_exactf_utf8:
1576     {
1577         unsigned expansion;
1578
1579         /* If one of the operands is in utf8, we can't use the simpler folding
1580          * above, due to the fact that many different characters can have the
1581          * same fold, or portion of a fold, or different-length fold */
1582         pat_string = STRING(c);
1583         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1584         pat_end = pat_string + ln;
1585         lnc = is_utf8_pat       /* length to match in characters */
1586                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1587                 : ln;
1588
1589         /* We have 'lnc' characters to match in the pattern, but because of
1590          * multi-character folding, each character in the target can match
1591          * up to 3 characters (Unicode guarantees it will never exceed
1592          * this) if it is utf8-encoded; and up to 2 if not (based on the
1593          * fact that the Latin 1 folds are already determined, and the
1594          * only multi-char fold in that range is the sharp-s folding to
1595          * 'ss').  Thus, a pattern character can match as little as 1/3 of a
1596          * string character.  Adjust lnc accordingly, rounding up, so that
1597          * if we need to match at least 4+1/3 chars, that really is 5. */
1598         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1599         lnc = (lnc + expansion - 1) / expansion;
1600
1601         /* As in the non-UTF8 case, if we have to match 3 characters, and
1602          * only 2 are left, it's guaranteed to fail, so don't start a
1603          * match that would require us to go beyond the end of the string
1604          */
1605         e = HOP3c(strend, -((I32)lnc), s);
1606
1607         if (reginfo->intuit && e < s) {
1608             e = s;                      /* Due to minlen logic of intuit() */
1609         }
1610
1611         /* XXX Note that we could recalculate e to stop the loop earlier,
1612          * as the worst case expansion above will rarely be met, and as we
1613          * go along we would usually find that e moves further to the left.
1614          * This would happen only after we reached the point in the loop
1615          * where if there were no expansion we should fail.  Unclear if
1616          * worth the expense */
1617
1618         while (s <= e) {
1619             char *my_strend= (char *)strend;
1620             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1621                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1622                 && (reginfo->intuit || regtry(reginfo, &s)) )
1623             {
1624                 goto got_it;
1625             }
1626             s += (utf8_target) ? UTF8SKIP(s) : 1;
1627         }
1628         break;
1629     }
1630     case BOUNDL:
1631         RXp_MATCH_TAINTED_on(prog);
1632         FBC_BOUND(isWORDCHAR_LC,
1633                   isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1634                   isWORDCHAR_LC_utf8((U8*)s));
1635         break;
1636     case NBOUNDL:
1637         RXp_MATCH_TAINTED_on(prog);
1638         FBC_NBOUND(isWORDCHAR_LC,
1639                    isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1640                    isWORDCHAR_LC_utf8((U8*)s));
1641         break;
1642     case BOUND:
1643         FBC_BOUND(isWORDCHAR,
1644                   isWORDCHAR_uni(tmp),
1645                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1646         break;
1647     case BOUNDA:
1648         FBC_BOUND_NOLOAD(isWORDCHAR_A,
1649                          isWORDCHAR_A(tmp),
1650                          isWORDCHAR_A((U8*)s));
1651         break;
1652     case NBOUND:
1653         FBC_NBOUND(isWORDCHAR,
1654                    isWORDCHAR_uni(tmp),
1655                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1656         break;
1657     case NBOUNDA:
1658         FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1659                           isWORDCHAR_A(tmp),
1660                           isWORDCHAR_A((U8*)s));
1661         break;
1662     case BOUNDU:
1663         FBC_BOUND(isWORDCHAR_L1,
1664                   isWORDCHAR_uni(tmp),
1665                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1666         break;
1667     case NBOUNDU:
1668         FBC_NBOUND(isWORDCHAR_L1,
1669                    isWORDCHAR_uni(tmp),
1670                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1671         break;
1672     case LNBREAK:
1673         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1674                         is_LNBREAK_latin1_safe(s, strend)
1675         );
1676         break;
1677
1678     /* The argument to all the POSIX node types is the class number to pass to
1679      * _generic_isCC() to build a mask for searching in PL_charclass[] */
1680
1681     case NPOSIXL:
1682         to_complement = 1;
1683         /* FALLTHROUGH */
1684
1685     case POSIXL:
1686         RXp_MATCH_TAINTED_on(prog);
1687         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1688                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1689         break;
1690
1691     case NPOSIXD:
1692         to_complement = 1;
1693         /* FALLTHROUGH */
1694
1695     case POSIXD:
1696         if (utf8_target) {
1697             goto posix_utf8;
1698         }
1699         goto posixa;
1700
1701     case NPOSIXA:
1702         if (utf8_target) {
1703             /* The complement of something that matches only ASCII matches all
1704              * UTF-8 variant code points, plus everything in ASCII that isn't
1705              * in the class */
1706             REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1707                                       || ! _generic_isCC_A(*s, FLAGS(c)));
1708             break;
1709         }
1710
1711         to_complement = 1;
1712         /* FALLTHROUGH */
1713
1714     case POSIXA:
1715       posixa:
1716         /* Don't need to worry about utf8, as it can match only a single
1717          * byte invariant character. */
1718         REXEC_FBC_CLASS_SCAN(
1719                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1720         break;
1721
1722     case NPOSIXU:
1723         to_complement = 1;
1724         /* FALLTHROUGH */
1725
1726     case POSIXU:
1727         if (! utf8_target) {
1728             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1729                                                                     FLAGS(c))));
1730         }
1731         else {
1732
1733       posix_utf8:
1734             classnum = (_char_class_number) FLAGS(c);
1735             if (classnum < _FIRST_NON_SWASH_CC) {
1736                 while (s < strend) {
1737
1738                     /* We avoid loading in the swash as long as possible, but
1739                      * should we have to, we jump to a separate loop.  This
1740                      * extra 'if' statement is what keeps this code from being
1741                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1742                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
1743                         goto found_above_latin1;
1744                     }
1745                     if ((UTF8_IS_INVARIANT(*s)
1746                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1747                                                                 classnum)))
1748                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
1749                             && to_complement ^ cBOOL(
1750                                 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1751                                               classnum))))
1752                     {
1753                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1754                             goto got_it;
1755                         else {
1756                             tmp = doevery;
1757                         }
1758                     }
1759                     else {
1760                         tmp = 1;
1761                     }
1762                     s += UTF8SKIP(s);
1763                 }
1764             }
1765             else switch (classnum) {    /* These classes are implemented as
1766                                            macros */
1767                 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1768                                         revert the change of \v matching this */
1769                     /* FALL THROUGH */
1770
1771                 case _CC_ENUM_PSXSPC:
1772                     REXEC_FBC_UTF8_CLASS_SCAN(
1773                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
1774                     break;
1775
1776                 case _CC_ENUM_BLANK:
1777                     REXEC_FBC_UTF8_CLASS_SCAN(
1778                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
1779                     break;
1780
1781                 case _CC_ENUM_XDIGIT:
1782                     REXEC_FBC_UTF8_CLASS_SCAN(
1783                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1784                     break;
1785
1786                 case _CC_ENUM_VERTSPACE:
1787                     REXEC_FBC_UTF8_CLASS_SCAN(
1788                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
1789                     break;
1790
1791                 case _CC_ENUM_CNTRL:
1792                     REXEC_FBC_UTF8_CLASS_SCAN(
1793                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
1794                     break;
1795
1796                 default:
1797                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1798                     assert(0); /* NOTREACHED */
1799             }
1800         }
1801         break;
1802
1803       found_above_latin1:   /* Here we have to load a swash to get the result
1804                                for the current code point */
1805         if (! PL_utf8_swash_ptrs[classnum]) {
1806             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1807             PL_utf8_swash_ptrs[classnum] =
1808                     _core_swash_init("utf8", swash_property_names[classnum],
1809                                      &PL_sv_undef, 1, 0, NULL, &flags);
1810         }
1811
1812         /* This is a copy of the loop above for swash classes, though using the
1813          * FBC macro instead of being expanded out.  Since we've loaded the
1814          * swash, we don't have to check for that each time through the loop */
1815         REXEC_FBC_UTF8_CLASS_SCAN(
1816                 to_complement ^ cBOOL(_generic_utf8(
1817                                       classnum,
1818                                       s,
1819                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
1820                                                   (U8 *) s, TRUE))));
1821         break;
1822
1823     case AHOCORASICKC:
1824     case AHOCORASICK:
1825         {
1826             DECL_TRIE_TYPE(c);
1827             /* what trie are we using right now */
1828             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1829             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1830             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1831
1832             const char *last_start = strend - trie->minlen;
1833 #ifdef DEBUGGING
1834             const char *real_start = s;
1835 #endif
1836             STRLEN maxlen = trie->maxlen;
1837             SV *sv_points;
1838             U8 **points; /* map of where we were in the input string
1839                             when reading a given char. For ASCII this
1840                             is unnecessary overhead as the relationship
1841                             is always 1:1, but for Unicode, especially
1842                             case folded Unicode this is not true. */
1843             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1844             U8 *bitmap=NULL;
1845
1846
1847             GET_RE_DEBUG_FLAGS_DECL;
1848
1849             /* We can't just allocate points here. We need to wrap it in
1850              * an SV so it gets freed properly if there is a croak while
1851              * running the match */
1852             ENTER;
1853             SAVETMPS;
1854             sv_points=newSV(maxlen * sizeof(U8 *));
1855             SvCUR_set(sv_points,
1856                 maxlen * sizeof(U8 *));
1857             SvPOK_on(sv_points);
1858             sv_2mortal(sv_points);
1859             points=(U8**)SvPV_nolen(sv_points );
1860             if ( trie_type != trie_utf8_fold
1861                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
1862             {
1863                 if (trie->bitmap)
1864                     bitmap=(U8*)trie->bitmap;
1865                 else
1866                     bitmap=(U8*)ANYOF_BITMAP(c);
1867             }
1868             /* this is the Aho-Corasick algorithm modified a touch
1869                to include special handling for long "unknown char" sequences.
1870                The basic idea being that we use AC as long as we are dealing
1871                with a possible matching char, when we encounter an unknown char
1872                (and we have not encountered an accepting state) we scan forward
1873                until we find a legal starting char.
1874                AC matching is basically that of trie matching, except that when
1875                we encounter a failing transition, we fall back to the current
1876                states "fail state", and try the current char again, a process
1877                we repeat until we reach the root state, state 1, or a legal
1878                transition. If we fail on the root state then we can either
1879                terminate if we have reached an accepting state previously, or
1880                restart the entire process from the beginning if we have not.
1881
1882              */
1883             while (s <= last_start) {
1884                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1885                 U8 *uc = (U8*)s;
1886                 U16 charid = 0;
1887                 U32 base = 1;
1888                 U32 state = 1;
1889                 UV uvc = 0;
1890                 STRLEN len = 0;
1891                 STRLEN foldlen = 0;
1892                 U8 *uscan = (U8*)NULL;
1893                 U8 *leftmost = NULL;
1894 #ifdef DEBUGGING
1895                 U32 accepted_word= 0;
1896 #endif
1897                 U32 pointpos = 0;
1898
1899                 while ( state && uc <= (U8*)strend ) {
1900                     int failed=0;
1901                     U32 word = aho->states[ state ].wordnum;
1902
1903                     if( state==1 ) {
1904                         if ( bitmap ) {
1905                             DEBUG_TRIE_EXECUTE_r(
1906                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1907                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1908                                         (char *)uc, utf8_target );
1909                                     PerlIO_printf( Perl_debug_log,
1910                                         " Scanning for legal start char...\n");
1911                                 }
1912                             );
1913                             if (utf8_target) {
1914                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1915                                     uc += UTF8SKIP(uc);
1916                                 }
1917                             } else {
1918                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1919                                     uc++;
1920                                 }
1921                             }
1922                             s= (char *)uc;
1923                         }
1924                         if (uc >(U8*)last_start) break;
1925                     }
1926
1927                     if ( word ) {
1928                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1929                         if (!leftmost || lpos < leftmost) {
1930                             DEBUG_r(accepted_word=word);
1931                             leftmost= lpos;
1932                         }
1933                         if (base==0) break;
1934
1935                     }
1936                     points[pointpos++ % maxlen]= uc;
1937                     if (foldlen || uc < (U8*)strend) {
1938                         REXEC_TRIE_READ_CHAR(trie_type, trie,
1939                                          widecharmap, uc,
1940                                          uscan, len, uvc, charid, foldlen,
1941                                          foldbuf, uniflags);
1942                         DEBUG_TRIE_EXECUTE_r({
1943                             dump_exec_pos( (char *)uc, c, strend,
1944                                         real_start, s, utf8_target);
1945                             PerlIO_printf(Perl_debug_log,
1946                                 " Charid:%3u CP:%4"UVxf" ",
1947                                  charid, uvc);
1948                         });
1949                     }
1950                     else {
1951                         len = 0;
1952                         charid = 0;
1953                     }
1954
1955
1956                     do {
1957 #ifdef DEBUGGING
1958                         word = aho->states[ state ].wordnum;
1959 #endif
1960                         base = aho->states[ state ].trans.base;
1961
1962                         DEBUG_TRIE_EXECUTE_r({
1963                             if (failed)
1964                                 dump_exec_pos( (char *)uc, c, strend, real_start,
1965                                     s,   utf8_target );
1966                             PerlIO_printf( Perl_debug_log,
1967                                 "%sState: %4"UVxf", word=%"UVxf,
1968                                 failed ? " Fail transition to " : "",
1969                                 (UV)state, (UV)word);
1970                         });
1971                         if ( base ) {
1972                             U32 tmp;
1973                             I32 offset;
1974                             if (charid &&
1975                                  ( ((offset = base + charid
1976                                     - 1 - trie->uniquecharcount)) >= 0)
1977                                  && ((U32)offset < trie->lasttrans)
1978                                  && trie->trans[offset].check == state
1979                                  && (tmp=trie->trans[offset].next))
1980                             {
1981                                 DEBUG_TRIE_EXECUTE_r(
1982                                     PerlIO_printf( Perl_debug_log," - legal\n"));
1983                                 state = tmp;
1984                                 break;
1985                             }
1986                             else {
1987                                 DEBUG_TRIE_EXECUTE_r(
1988                                     PerlIO_printf( Perl_debug_log," - fail\n"));
1989                                 failed = 1;
1990                                 state = aho->fail[state];
1991                             }
1992                         }
1993                         else {
1994                             /* we must be accepting here */
1995                             DEBUG_TRIE_EXECUTE_r(
1996                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
1997                             failed = 1;
1998                             break;
1999                         }
2000                     } while(state);
2001                     uc += len;
2002                     if (failed) {
2003                         if (leftmost)
2004                             break;
2005                         if (!state) state = 1;
2006                     }
2007                 }
2008                 if ( aho->states[ state ].wordnum ) {
2009                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2010                     if (!leftmost || lpos < leftmost) {
2011                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2012                         leftmost = lpos;
2013                     }
2014                 }
2015                 if (leftmost) {
2016                     s = (char*)leftmost;
2017                     DEBUG_TRIE_EXECUTE_r({
2018                         PerlIO_printf(
2019                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2020                             (UV)accepted_word, (IV)(s - real_start)
2021                         );
2022                     });
2023                     if (reginfo->intuit || regtry(reginfo, &s)) {
2024                         FREETMPS;
2025                         LEAVE;
2026                         goto got_it;
2027                     }
2028                     s = HOPc(s,1);
2029                     DEBUG_TRIE_EXECUTE_r({
2030                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2031                     });
2032                 } else {
2033                     DEBUG_TRIE_EXECUTE_r(
2034                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2035                     break;
2036                 }
2037             }
2038             FREETMPS;
2039             LEAVE;
2040         }
2041         break;
2042     default:
2043         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2044         break;
2045     }
2046     return 0;
2047   got_it:
2048     return s;
2049 }
2050
2051
2052 /*
2053  - regexec_flags - match a regexp against a string
2054  */
2055 I32
2056 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2057               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2058 /* stringarg: the point in the string at which to begin matching */
2059 /* strend:    pointer to null at end of string */
2060 /* strbeg:    real beginning of string */
2061 /* minend:    end of match must be >= minend bytes after stringarg. */
2062 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2063  *            itself is accessed via the pointers above */
2064 /* data:      May be used for some additional optimizations.
2065               Currently it's only used, converted with PTR2UV, for transmitting
2066               the ganch offset when doing a /g match. This will change */
2067 /* flags:     REXEC_* bits (e.g. REXEC_CHECKED, REXEC_IGNOREPOS, REXEC_COPY_STR). */
2068
2069 {
2070     dVAR;
2071     struct regexp *const prog = ReANY(rx);
2072     char *s;
2073     regnode *c;
2074     char *startpos = stringarg;
2075     I32 minlen;         /* must match at least this many chars */
2076     I32 dontbother = 0; /* how many characters not to try at end */
2077     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2078     I32 scream_pos = -1;                /* Internal iterator of scream. */
2079     char *scream_olds = NULL;
2080     const bool utf8_target = cBOOL(DO_UTF8(sv));
2081     I32 multiline;
2082     RXi_GET_DECL(prog,progi);
2083     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2084     regmatch_info *const reginfo = &reginfo_buf;
2085     regexp_paren_pair *swap = NULL;
2086     I32 oldsave;
2087     GET_RE_DEBUG_FLAGS_DECL;
2088
2089     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2090     PERL_UNUSED_ARG(data);
2091
2092     /* Be paranoid... */
2093     if (prog == NULL || startpos == NULL) {
2094         Perl_croak(aTHX_ "NULL regexp parameter");
2095         return 0;
2096     }
2097
2098     DEBUG_EXECUTE_r(
2099         debug_start_match(rx, utf8_target, startpos, strend,
2100         "Matching");
2101     );
2102
2103
2104     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2105      * which will call destructors to reset PL_regmatch_state, free higher
2106      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2107      * regmatch_info_aux_eval */
2108
2109     oldsave = PL_savestack_ix;
2110
2111     multiline = prog->extflags & RXf_PMf_MULTILINE;
2112     minlen = prog->minlen;
2113
2114     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2115         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2116                               "String too short [regexec_flags]...\n"));
2117         goto phooey;
2118     }
2119
2120     /* Check validity of program. */
2121     if (UCHARAT(progi->program) != REG_MAGIC) {
2122         Perl_croak(aTHX_ "corrupted regexp program");
2123     }
2124
2125     RX_MATCH_TAINTED_off(rx);
2126
2127     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2128     reginfo->intuit = 0;
2129     reginfo->is_utf8_target = cBOOL(utf8_target);
2130     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2131     reginfo->warned = FALSE;
2132     reginfo->strbeg  = strbeg;
2133     reginfo->sv = sv;
2134     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2135     reginfo->strend = strend;
2136     /* see how far we have to get to not match where we matched before */
2137     reginfo->till = startpos+minend;
2138
2139     /* reserve next 2 or 3 slots in PL_regmatch_state:
2140      * slot N+0: may currently be in use: skip it
2141      * slot N+1: use for regmatch_info_aux struct
2142      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2143      * slot N+3: ready for use by regmatch()
2144      */
2145
2146     {
2147         regmatch_state *old_regmatch_state;
2148         regmatch_slab  *old_regmatch_slab;
2149         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2150
2151         /* on first ever match, allocate first slab */
2152         if (!PL_regmatch_slab) {
2153             Newx(PL_regmatch_slab, 1, regmatch_slab);
2154             PL_regmatch_slab->prev = NULL;
2155             PL_regmatch_slab->next = NULL;
2156             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2157         }
2158
2159         old_regmatch_state = PL_regmatch_state;
2160         old_regmatch_slab  = PL_regmatch_slab;
2161
2162         for (i=0; i <= max; i++) {
2163             if (i == 1)
2164                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2165             else if (i ==2)
2166                 reginfo->info_aux_eval =
2167                 reginfo->info_aux->info_aux_eval =
2168                             &(PL_regmatch_state->u.info_aux_eval);
2169
2170             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2171                 PL_regmatch_state = S_push_slab(aTHX);
2172         }
2173
2174         /* note initial PL_regmatch_state position; at end of match we'll
2175          * pop back to there and free any higher slabs */
2176
2177         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2178         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
2179         reginfo->info_aux->poscache = NULL;
2180
2181         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2182
2183         if ((prog->extflags & RXf_EVAL_SEEN))
2184             S_setup_eval_state(aTHX_ reginfo);
2185         else
2186             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2187     }
2188
2189     /* If there is a "must appear" string, look for it. */
2190     s = startpos;
2191
2192     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2193         MAGIC *mg;
2194         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2195             reginfo->ganch = startpos + prog->gofs;
2196             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2197               "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2198         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2199                   && SvMAGIC(sv)
2200                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2201                   && mg->mg_len >= 0) {
2202             reginfo->ganch = strbeg + mg->mg_len;       /* Defined pos() */
2203             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2204                 "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2205
2206             if (prog->extflags & RXf_ANCH_GPOS) {
2207                 if (s > reginfo->ganch)
2208                     goto phooey;
2209                 s = reginfo->ganch - prog->gofs;
2210                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2211                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2212                 if (s < strbeg)
2213                     goto phooey;
2214             }
2215         }
2216         else if (data) {
2217             reginfo->ganch = strbeg + PTR2UV(data);
2218             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2219                  "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2220
2221         } else {                                /* pos() not defined */
2222             reginfo->ganch = strbeg;
2223             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2224                  "GPOS: reginfo->ganch = strbeg\n"));
2225         }
2226     }
2227     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2228         /* We have to be careful. If the previous successful match
2229            was from this regex we don't want a subsequent partially
2230            successful match to clobber the old results.
2231            So when we detect this possibility we add a swap buffer
2232            to the re, and switch the buffer each match. If we fail,
2233            we switch it back; otherwise we leave it swapped.
2234         */
2235         swap = prog->offs;
2236         /* do we need a save destructor here for eval dies? */
2237         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2238         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2239             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2240             PTR2UV(prog),
2241             PTR2UV(swap),
2242             PTR2UV(prog->offs)
2243         ));
2244     }
2245     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2246         re_scream_pos_data d;
2247
2248         d.scream_olds = &scream_olds;
2249         d.scream_pos = &scream_pos;
2250         s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d);
2251         if (!s) {
2252             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2253             goto phooey;        /* not present */
2254         }
2255     }
2256
2257
2258
2259     /* Simplest case:  anchored match need be tried only once. */
2260     /*  [unless only anchor is BOL and multiline is set] */
2261     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2262         if (s == startpos && regtry(reginfo, &startpos))
2263             goto got_it;
2264         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2265                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2266         {
2267             char *end;
2268
2269             if (minlen)
2270                 dontbother = minlen - 1;
2271             end = HOP3c(strend, -dontbother, strbeg) - 1;
2272             /* for multiline we only have to try after newlines */
2273             if (prog->check_substr || prog->check_utf8) {
2274                 /* because of the goto we can not easily reuse the macros for bifurcating the
2275                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2276                 if (utf8_target) {
2277                     if (s == startpos)
2278                         goto after_try_utf8;
2279                     while (1) {
2280                         if (regtry(reginfo, &s)) {
2281                             goto got_it;
2282                         }
2283                       after_try_utf8:
2284                         if (s > end) {
2285                             goto phooey;
2286                         }
2287                         if (prog->extflags & RXf_USE_INTUIT) {
2288                             s = re_intuit_start(rx, sv, strbeg,
2289                                     s + UTF8SKIP(s), strend, flags, NULL);
2290                             if (!s) {
2291                                 goto phooey;
2292                             }
2293                         }
2294                         else {
2295                             s += UTF8SKIP(s);
2296                         }
2297                     }
2298                 } /* end search for check string in unicode */
2299                 else {
2300                     if (s == startpos) {
2301                         goto after_try_latin;
2302                     }
2303                     while (1) {
2304                         if (regtry(reginfo, &s)) {
2305                             goto got_it;
2306                         }
2307                       after_try_latin:
2308                         if (s > end) {
2309                             goto phooey;
2310                         }
2311                         if (prog->extflags & RXf_USE_INTUIT) {
2312                             s = re_intuit_start(rx, sv, strbeg,
2313                                         s + 1, strend, flags, NULL);
2314                             if (!s) {
2315                                 goto phooey;
2316                             }
2317                         }
2318                         else {
2319                             s++;
2320                         }
2321                     }
2322                 } /* end search for check string in latin*/
2323             } /* end search for check string */
2324             else { /* search for newline */
2325                 if (s > startpos) {
2326                     /*XXX: The s-- is almost definitely wrong here under unicode - demerphq*/
2327                     s--;
2328                 }
2329                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2330                 while (s <= end) { /* note it could be possible to match at the end of the string */
2331                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2332                         if (regtry(reginfo, &s))
2333                             goto got_it;
2334                     }
2335                 }
2336             } /* end search for newline */
2337         } /* end anchored/multiline check string search */
2338         goto phooey;
2339     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2340     {
2341         /* the warning about reginfo->ganch being used without initialization
2342            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2343            and we only enter this block when the same bit is set. */
2344         char *tmp_s = reginfo->ganch - prog->gofs;
2345
2346         if (tmp_s >= strbeg && regtry(reginfo, &tmp_s))
2347             goto got_it;
2348         goto phooey;
2349     }
2350
2351     /* Messy cases:  unanchored match. */
2352     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2353         /* we have /x+whatever/ */
2354         /* it must be a one character string (XXXX Except is_utf8_pat?) */
2355         char ch;
2356 #ifdef DEBUGGING
2357         int did_match = 0;
2358 #endif
2359         if (utf8_target) {
2360             if (! prog->anchored_utf8) {
2361                 to_utf8_substr(prog);
2362             }
2363             ch = SvPVX_const(prog->anchored_utf8)[0];
2364             REXEC_FBC_SCAN(
2365                 if (*s == ch) {
2366                     DEBUG_EXECUTE_r( did_match = 1 );
2367                     if (regtry(reginfo, &s)) goto got_it;
2368                     s += UTF8SKIP(s);
2369                     while (s < strend && *s == ch)
2370                         s += UTF8SKIP(s);
2371                 }
2372             );
2373
2374         }
2375         else {
2376             if (! prog->anchored_substr) {
2377                 if (! to_byte_substr(prog)) {
2378                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2379                 }
2380             }
2381             ch = SvPVX_const(prog->anchored_substr)[0];
2382             REXEC_FBC_SCAN(
2383                 if (*s == ch) {
2384                     DEBUG_EXECUTE_r( did_match = 1 );
2385                     if (regtry(reginfo, &s)) goto got_it;
2386                     s++;
2387                     while (s < strend && *s == ch)
2388                         s++;
2389                 }
2390             );
2391         }
2392         DEBUG_EXECUTE_r(if (!did_match)
2393                 PerlIO_printf(Perl_debug_log,
2394                                   "Did not find anchored character...\n")
2395                );
2396     }
2397     else if (prog->anchored_substr != NULL
2398               || prog->anchored_utf8 != NULL
2399               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2400                   && prog->float_max_offset < strend - s)) {
2401         SV *must;
2402         I32 back_max;
2403         I32 back_min;
2404         char *last;
2405         char *last1;            /* Last position checked before */
2406 #ifdef DEBUGGING
2407         int did_match = 0;
2408 #endif
2409         if (prog->anchored_substr || prog->anchored_utf8) {
2410             if (utf8_target) {
2411                 if (! prog->anchored_utf8) {
2412                     to_utf8_substr(prog);
2413                 }
2414                 must = prog->anchored_utf8;
2415             }
2416             else {
2417                 if (! prog->anchored_substr) {
2418                     if (! to_byte_substr(prog)) {
2419                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2420                     }
2421                 }
2422                 must = prog->anchored_substr;
2423             }
2424             back_max = back_min = prog->anchored_offset;
2425         } else {
2426             if (utf8_target) {
2427                 if (! prog->float_utf8) {
2428                     to_utf8_substr(prog);
2429                 }
2430                 must = prog->float_utf8;
2431             }
2432             else {
2433                 if (! prog->float_substr) {
2434                     if (! to_byte_substr(prog)) {
2435                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2436                     }
2437                 }
2438                 must = prog->float_substr;
2439             }
2440             back_max = prog->float_max_offset;
2441             back_min = prog->float_min_offset;
2442         }
2443
2444         if (back_min<0) {
2445             last = strend;
2446         } else {
2447             last = HOP3c(strend,        /* Cannot start after this */
2448                   -(I32)(CHR_SVLEN(must)
2449                          - (SvTAIL(must) != 0) + back_min), strbeg);
2450         }
2451         if (s > reginfo->strbeg)
2452             last1 = HOPc(s, -1);
2453         else
2454             last1 = s - 1;      /* bogus */
2455
2456         /* XXXX check_substr already used to find "s", can optimize if
2457            check_substr==must. */
2458         scream_pos = -1;
2459         dontbother = end_shift;
2460         strend = HOPc(strend, -dontbother);
2461         while ( (s <= last) &&
2462                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2463                                   (unsigned char*)strend, must,
2464                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2465             DEBUG_EXECUTE_r( did_match = 1 );
2466             if (HOPc(s, -back_max) > last1) {
2467                 last1 = HOPc(s, -back_min);
2468                 s = HOPc(s, -back_max);
2469             }
2470             else {
2471                 char * const t = (last1 >= reginfo->strbeg)
2472                                     ? HOPc(last1, 1) : last1 + 1;
2473
2474                 last1 = HOPc(s, -back_min);
2475                 s = t;
2476             }
2477             if (utf8_target) {
2478                 while (s <= last1) {
2479                     if (regtry(reginfo, &s))
2480                         goto got_it;
2481                     if (s >= last1) {
2482                         s++; /* to break out of outer loop */
2483                         break;
2484                     }
2485                     s += UTF8SKIP(s);
2486                 }
2487             }
2488             else {
2489                 while (s <= last1) {
2490                     if (regtry(reginfo, &s))
2491                         goto got_it;
2492                     s++;
2493                 }
2494             }
2495         }
2496         DEBUG_EXECUTE_r(if (!did_match) {
2497             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2498                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2499             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2500                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2501                                ? "anchored" : "floating"),
2502                 quoted, RE_SV_TAIL(must));
2503         });
2504         goto phooey;
2505     }
2506     else if ( (c = progi->regstclass) ) {
2507         if (minlen) {
2508             const OPCODE op = OP(progi->regstclass);
2509             /* don't bother with what can't match */
2510             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2511                 strend = HOPc(strend, -(minlen - 1));
2512         }
2513         DEBUG_EXECUTE_r({
2514             SV * const prop = sv_newmortal();
2515             regprop(prog, prop, c);
2516             {
2517                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2518                     s,strend-s,60);
2519                 PerlIO_printf(Perl_debug_log,
2520                     "Matching stclass %.*s against %s (%d bytes)\n",
2521                     (int)SvCUR(prop), SvPVX_const(prop),
2522                      quoted, (int)(strend - s));
2523             }
2524         });
2525         if (find_byclass(prog, c, s, strend, reginfo))
2526             goto got_it;
2527         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2528     }
2529     else {
2530         dontbother = 0;
2531         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2532             /* Trim the end. */
2533             char *last= NULL;
2534             SV* float_real;
2535             STRLEN len;
2536             const char *little;
2537
2538             if (utf8_target) {
2539                 if (! prog->float_utf8) {
2540                     to_utf8_substr(prog);
2541                 }
2542                 float_real = prog->float_utf8;
2543             }
2544             else {
2545                 if (! prog->float_substr) {
2546                     if (! to_byte_substr(prog)) {
2547                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2548                     }
2549                 }
2550                 float_real = prog->float_substr;
2551             }
2552
2553             little = SvPV_const(float_real, len);
2554             if (SvTAIL(float_real)) {
2555                     /* This means that float_real contains an artificial \n on
2556                      * the end due to the presence of something like this:
2557                      * /foo$/ where we can match both "foo" and "foo\n" at the
2558                      * end of the string.  So we have to compare the end of the
2559                      * string first against the float_real without the \n and
2560                      * then against the full float_real with the \n.  We
2561                      * have to watch out for cases where the string might be
2562                      * smaller than the float_real or the float_real without
2563                      * the \n. */
2564                     char *checkpos= strend - len;
2565                     DEBUG_OPTIMISE_r(
2566                         PerlIO_printf(Perl_debug_log,
2567                             "%sChecking for float_real.%s\n",
2568                             PL_colors[4], PL_colors[5]));
2569                     if (checkpos + 1 < strbeg) {
2570                         /* can't match, even if we remove the trailing \n
2571                          * string is too short to match */
2572                         DEBUG_EXECUTE_r(
2573                             PerlIO_printf(Perl_debug_log,
2574                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2575                                 PL_colors[4], PL_colors[5]));
2576                         goto phooey;
2577                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2578                         /* can match, the end of the string matches without the
2579                          * "\n" */
2580                         last = checkpos + 1;
2581                     } else if (checkpos < strbeg) {
2582                         /* can't match, string is too short when the "\n" is
2583                          * included */
2584                         DEBUG_EXECUTE_r(
2585                             PerlIO_printf(Perl_debug_log,
2586                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2587                                 PL_colors[4], PL_colors[5]));
2588                         goto phooey;
2589                     } else if (!multiline) {
2590                         /* non multiline match, so compare with the "\n" at the
2591                          * end of the string */
2592                         if (memEQ(checkpos, little, len)) {
2593                             last= checkpos;
2594                         } else {
2595                             DEBUG_EXECUTE_r(
2596                                 PerlIO_printf(Perl_debug_log,
2597                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2598                                     PL_colors[4], PL_colors[5]));
2599                             goto phooey;
2600                         }
2601                     } else {
2602                         /* multiline match, so we have to search for a place
2603                          * where the full string is located */
2604                         goto find_last;
2605                     }
2606             } else {
2607                   find_last:
2608                     if (len)
2609                         last = rninstr(s, strend, little, little + len);
2610                     else
2611                         last = strend;  /* matching "$" */
2612             }
2613             if (!last) {
2614                 /* at one point this block contained a comment which was
2615                  * probably incorrect, which said that this was a "should not
2616                  * happen" case.  Even if it was true when it was written I am
2617                  * pretty sure it is not anymore, so I have removed the comment
2618                  * and replaced it with this one. Yves */
2619                 DEBUG_EXECUTE_r(
2620                     PerlIO_printf(Perl_debug_log,
2621                         "String does not contain required substring, cannot match.\n"
2622                     ));
2623                 goto phooey;
2624             }
2625             dontbother = strend - last + prog->float_min_offset;
2626         }
2627         if (minlen && (dontbother < minlen))
2628             dontbother = minlen - 1;
2629         strend -= dontbother;              /* this one's always in bytes! */
2630         /* We don't know much -- general case. */
2631         if (utf8_target) {
2632             for (;;) {
2633                 if (regtry(reginfo, &s))
2634                     goto got_it;
2635                 if (s >= strend)
2636                     break;
2637                 s += UTF8SKIP(s);
2638             };
2639         }
2640         else {
2641             do {
2642                 if (regtry(reginfo, &s))
2643                     goto got_it;
2644             } while (s++ < strend);
2645         }
2646     }
2647
2648     /* Failure. */
2649     goto phooey;
2650
2651 got_it:
2652     DEBUG_BUFFERS_r(
2653         if (swap)
2654             PerlIO_printf(Perl_debug_log,
2655                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2656                 PTR2UV(prog),
2657                 PTR2UV(swap)
2658             );
2659     );
2660     Safefree(swap);
2661
2662     /* clean up; this will trigger destructors that will free all slabs
2663      * above the current one, and cleanup the regmatch_info_aux
2664      * and regmatch_info_aux_eval structs */
2665
2666     LEAVE_SCOPE(oldsave);
2667
2668     if (RXp_PAREN_NAMES(prog))
2669         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2670
2671     RX_MATCH_UTF8_set(rx, utf8_target);
2672
2673     /* make sure $`, $&, $', and $digit will work later */
2674     if ( !(flags & REXEC_NOT_FIRST) ) {
2675         if (flags & REXEC_COPY_STR) {
2676 #ifdef PERL_ANY_COW
2677             if (SvCANCOW(sv)) {
2678                 if (DEBUG_C_TEST) {
2679                     PerlIO_printf(Perl_debug_log,
2680                                   "Copy on write: regexp capture, type %d\n",
2681                                   (int) SvTYPE(sv));
2682                 }
2683                 RX_MATCH_COPY_FREE(rx);
2684                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2685                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2686                 assert (SvPOKp(prog->saved_copy));
2687                 prog->sublen  = reginfo->strend - strbeg;
2688                 prog->suboffset = 0;
2689                 prog->subcoffset = 0;
2690             } else
2691 #endif
2692             {
2693                 I32 min = 0;
2694                 I32 max = reginfo->strend - strbeg;
2695                 I32 sublen;
2696
2697                 if (    (flags & REXEC_COPY_SKIP_POST)
2698                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2699                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2700                 ) { /* don't copy $' part of string */
2701                     U32 n = 0;
2702                     max = -1;
2703                     /* calculate the right-most part of the string covered
2704                      * by a capture. Due to look-ahead, this may be to
2705                      * the right of $&, so we have to scan all captures */
2706                     while (n <= prog->lastparen) {
2707                         if (prog->offs[n].end > max)
2708                             max = prog->offs[n].end;
2709                         n++;
2710                     }
2711                     if (max == -1)
2712                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2713                                 ? prog->offs[0].start
2714                                 : 0;
2715                     assert(max >= 0 && max <= reginfo->strend - strbeg);
2716                 }
2717
2718                 if (    (flags & REXEC_COPY_SKIP_PRE)
2719                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2720                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2721                 ) { /* don't copy $` part of string */
2722                     U32 n = 0;
2723                     min = max;
2724                     /* calculate the left-most part of the string covered
2725                      * by a capture. Due to look-behind, this may be to
2726                      * the left of $&, so we have to scan all captures */
2727                     while (min && n <= prog->lastparen) {
2728                         if (   prog->offs[n].start != -1
2729                             && prog->offs[n].start < min)
2730                         {
2731                             min = prog->offs[n].start;
2732                         }
2733                         n++;
2734                     }
2735                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2736                         && min >  prog->offs[0].end
2737                     )
2738                         min = prog->offs[0].end;
2739
2740                 }
2741
2742                 assert(min >= 0 && min <= max
2743                     && min <= reginfo->strend - strbeg);
2744                 sublen = max - min;
2745
2746                 if (RX_MATCH_COPIED(rx)) {
2747                     if (sublen > prog->sublen)
2748                         prog->subbeg =
2749                                 (char*)saferealloc(prog->subbeg, sublen+1);
2750                 }
2751                 else
2752                     prog->subbeg = (char*)safemalloc(sublen+1);
2753                 Copy(strbeg + min, prog->subbeg, sublen, char);
2754                 prog->subbeg[sublen] = '\0';
2755                 prog->suboffset = min;
2756                 prog->sublen = sublen;
2757                 RX_MATCH_COPIED_on(rx);
2758             }
2759             prog->subcoffset = prog->suboffset;
2760             if (prog->suboffset && utf8_target) {
2761                 /* Convert byte offset to chars.
2762                  * XXX ideally should only compute this if @-/@+
2763                  * has been seen, a la PL_sawampersand ??? */
2764
2765                 /* If there's a direct correspondence between the
2766                  * string which we're matching and the original SV,
2767                  * then we can use the utf8 len cache associated with
2768                  * the SV. In particular, it means that under //g,
2769                  * sv_pos_b2u() will use the previously cached
2770                  * position to speed up working out the new length of
2771                  * subcoffset, rather than counting from the start of
2772                  * the string each time. This stops
2773                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2774                  * from going quadratic */
2775                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2776                     sv_pos_b2u(sv, &(prog->subcoffset));
2777                 else
2778                     prog->subcoffset = utf8_length((U8*)strbeg,
2779                                         (U8*)(strbeg+prog->suboffset));
2780             }
2781         }
2782         else {
2783             RX_MATCH_COPY_FREE(rx);
2784             prog->subbeg = strbeg;
2785             prog->suboffset = 0;
2786             prog->subcoffset = 0;
2787             /* use reginfo->strend, as strend may have been modified */
2788             prog->sublen = reginfo->strend - strbeg;
2789         }
2790     }
2791
2792     return 1;
2793
2794 phooey:
2795     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2796                           PL_colors[4], PL_colors[5]));
2797
2798     /* clean up; this will trigger destructors that will free all slabs
2799      * above the current one, and cleanup the regmatch_info_aux
2800      * and regmatch_info_aux_eval structs */
2801
2802     LEAVE_SCOPE(oldsave);
2803
2804     if (swap) {
2805         /* we failed :-( roll it back */
2806         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2807             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2808             PTR2UV(prog),
2809             PTR2UV(prog->offs),
2810             PTR2UV(swap)
2811         ));
2812         Safefree(prog->offs);
2813         prog->offs = swap;
2814     }
2815     return 0;
2816 }
2817
2818
2819 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
2820  * Do inc before dec, in case old and new rex are the same */
     /* Usage notes:
      *  - the expansion reads a variable called 'reginfo' from the invoking
      *    scope and does nothing when reginfo->info_aux_eval is false;
      *  - Re2 appears twice in the expansion, so pass an expression without
      *    side effects;
      *  - the expansion is a bare 'if' block (no STMT_START/STMT_END wrapper),
      *    so avoid placing it directly in front of an 'else'. */
2821 #define SET_reg_curpm(Re2) \
2822     if (reginfo->info_aux_eval) {                   \
2823         (void)ReREFCNT_inc(Re2);                    \
2824         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2825         PM_SETRE((PL_reg_curpm), (Re2));            \
2826     }
2827
2828
2829 /*
2830  - regtry - try match at specific point
2831  */
2832 STATIC I32                      /* 0 failure, 1 success */
2833 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2834 {
2835     dVAR;
2836     CHECKPOINT lastcp;
2837     REGEXP *const rx = reginfo->prog;
2838     regexp *const prog = ReANY(rx);
2839     I32 result;
2840     RXi_GET_DECL(prog,progi);
2841     GET_RE_DEBUG_FLAGS_DECL;
2842
2843     PERL_ARGS_ASSERT_REGTRY;
2844
2845     reginfo->cutpoint=NULL;
2846
2847     prog->offs[0].start = *startposp - reginfo->strbeg;
2848     prog->lastparen = 0;
2849     prog->lastcloseparen = 0;
2850
2851     /* XXXX What this code is doing here?!!!  There should be no need
2852        to do this again and again, prog->lastparen should take care of
2853        this!  --ilya*/
2854
2855     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2856      * Actually, the code in regcppop() (which Ilya may be meaning by
2857      * prog->lastparen), is not needed at all by the test suite
2858      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2859      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2860      * Meanwhile, this code *is* needed for the
2861      * above-mentioned test suite tests to succeed.  The common theme
2862      * on those tests seems to be returning null fields from matches.
2863      * --jhi updated by dapm */
2864 #if 1
2865     if (prog->nparens) {
2866         regexp_paren_pair *pp = prog->offs;
2867         I32 i;
2868         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2869             ++pp;
2870             pp->start = -1;
2871             pp->end = -1;
2872         }
2873     }
2874 #endif
2875     REGCP_SET(lastcp);
2876     result = regmatch(reginfo, *startposp, progi->program + 1);
2877     if (result != -1) {
2878         prog->offs[0].end = result;
2879         return 1;
2880     }
2881     if (reginfo->cutpoint)
2882         *startposp= reginfo->cutpoint;
2883     REGCP_UNWIND(lastcp);
2884     return 0;
2885 }
2886
2887
     /* Shorthand for jumping to the 'yes', 'no' and 'no_silent' labels, which
      * must exist in whichever function these macros are used in. */
2888 #define sayYES goto yes
2889 #define sayNO goto no
2890 #define sayNO_SILENT goto no_silent
2891
     /* CACHEsayNO: when ST.cache_mask is non-zero, OR it into
      * reginfo->info_aux->poscache[ST.cache_offset]; then fail via sayNO.
      * 'ST' and 'reginfo' must be defined at the point of use.  The expansion
      * is two statements, so it is not safe as the unbraced body of an
      * if/else/loop. */
2892 /* we don't use STMT_START/END here because it leads to
2893    "unreachable code" warnings, which are bogus, but distracting. */
2894 #define CACHEsayNO \
2895     if (ST.cache_mask) \
2896        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
2897     sayNO
2898
2899 /* this is used to determine how far from the left messages like
2900    'failed...' are printed. It should be set such that messages
2901    are inline with the regop output that created them.
2902 */
2903 #define REPORT_CODE_OFF 32
2904
2905
     /* Out-of-band values for the c1/c2 "next char" tests.  All four are
      * negative and distinct from one another. */
2906 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2907 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
     /* Two different "not a code point" values: S_setup_EXACTISH_ST_c1_c2()
      * uses them as the starting values of its c1 and c2 respectively. */
2908 #define CHRTEST_NOT_A_CP_1 -999
2909 #define CHRTEST_NOT_A_CP_2 -998
2910
2911 /* Make the slab that follows PL_regmatch_slab the current one, allocating
2912  * and chaining a new slab if none follows yet; return its first slot. */
2913 STATIC regmatch_state *
2914 S_push_slab(pTHX)
2915 {
2916 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2917     dMY_CXT;
2918 #endif
2919     if (PL_regmatch_slab->next == NULL) {
2920         regmatch_slab *fresh;
2921         Newx(fresh, 1, regmatch_slab);
2922         fresh->next = NULL;
2923         fresh->prev = PL_regmatch_slab;
2924         PL_regmatch_slab->next = fresh;
2925     }
2926     PL_regmatch_slab = PL_regmatch_slab->next;
2927     return SLAB_FIRST(PL_regmatch_slab);
2928 }
2929
2930
2931 /* push a new state then goto it */
     /* Stores 'input' in pushinput, 'node' in scan and 'state' in
      * st->resume_state, then jumps to the push_state label.  pushinput, scan,
      * st and the label all come from the function the macro is used in.  The
      * expansion is several statements and already ends in ';', so it must not
      * be used as the unbraced body of an if/else/loop. */
2932
2933 #define PUSH_STATE_GOTO(state, node, input) \
2934     pushinput = input; \
2935     scan = node; \
2936     st->resume_state = state; \
2937     goto push_state;
2938
2939 /* push a new state with success backtracking, then goto it */
     /* Identical to PUSH_STATE_GOTO except that the jump target is the
      * push_yes_state label; the same usage restrictions apply. */
2940
2941 #define PUSH_YES_STATE_GOTO(state, node, input) \
2942     pushinput = input; \
2943     scan = node; \
2944     st->resume_state = state; \
2945     goto push_yes_state;
2946
2947
2948
2949
2950 /*
2951
2952 regmatch() - main matching routine
2953
2954 This is basically one big switch statement in a loop. We execute an op,
2955 set 'next' to point to the next op, and continue. If we come to a point which
2956 we may need to backtrack to on failure such as (A|B|C), we push a
2957 backtrack state onto the backtrack stack. On failure, we pop the top
2958 state, and re-enter the loop at the state indicated. If there are no more
2959 states to pop, we return failure.
2960
2961 Sometimes we also need to backtrack on success; for example /A+/, where
2962 after successfully matching one A, we need to go back and try to
2963 match another one; similarly for lookahead assertions: if the assertion
2964 completes successfully, we backtrack to the state just before the assertion
2965 and then carry on.  In these cases, the pushed state is marked as
2966 'backtrack on success too'. This marking is in fact done by a chain of
2967 pointers, each pointing to the previous 'yes' state. On success, we pop to
2968 the nearest yes state, discarding any intermediate failure-only states.
2969 Sometimes a yes state is pushed just to force some cleanup code to be
2970 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2971 it to free the inner regex.
2972
2973 Note that failure backtracking rewinds the cursor position, while
2974 success backtracking leaves it alone.
2975
2976 A pattern is complete when the END op is executed, while a subpattern
2977 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2978 ops trigger the "pop to last yes state if any, otherwise return true"
2979 behaviour.
2980
2981 A common convention in this function is to use A and B to refer to the two
2982 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2983 the subpattern to be matched possibly multiple times, while B is the entire
2984 rest of the pattern. Variable and state names reflect this convention.
2985
2986 The states in the main switch are the union of ops and failure/success of
2987 substates associated with that op.  For example, IFMATCH is the op
2988 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2989 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2990 successfully matched A and IFMATCH_A_fail is a state saying that we have
2991 just failed to match A. Resume states always come in pairs. The backtrack
2992 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2993 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2994 on success or failure.
2995
2996 The struct that holds a backtracking state is actually a big union, with
2997 one variant for each major type of op. The variable st points to the
2998 top-most backtrack struct. To make the code clearer, within each
2999 block of code we #define ST to alias the relevant union.
3000
3001 Here's a concrete example of a (vastly oversimplified) IFMATCH
3002 implementation:
3003
3004     switch (state) {
3005     ....
3006
3007 #define ST st->u.ifmatch
3008
3009     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3010         ST.foo = ...; // some state we wish to save
3011         ...
3012         // push a yes backtrack state with a resume value of
3013         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3014         // first node of A:
3015         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3016         // NOTREACHED
3017
3018     case IFMATCH_A: // we have successfully executed A; now continue with B
3019         next = B;
3020         bar = ST.foo; // do something with the preserved value
3021         break;
3022
3023     case IFMATCH_A_fail: // A failed, so the assertion failed
3024         ...;   // do some housekeeping, then ...
3025         sayNO; // propagate the failure
3026
3027 #undef ST
3028
3029     ...
3030     }
3031
3032 For any old-timers reading this who are familiar with the old recursive
3033 approach, the code above is equivalent to:
3034
3035     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3036     {
3037         int foo = ...
3038         ...
3039         if (regmatch(A)) {
3040             next = B;
3041             bar = foo;
3042             break;
3043         }
3044         ...;   // do some housekeeping, then ...
3045         sayNO; // propagate the failure
3046     }
3047
3048 The topmost backtrack state, pointed to by st, is usually free. If you
3049 want to claim it, populate any ST.foo fields in it with values you wish to
3050 save, then do one of
3051
3052         PUSH_STATE_GOTO(resume_state, node, newinput);
3053         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3054
3055 which sets that backtrack state's resume value to 'resume_state', pushes a
3056 new free entry to the top of the backtrack stack, then goes to 'node'.
3057 On backtracking, the free slot is popped, and the saved state becomes the
3058 new free state. An ST.foo field in this new top state can be temporarily
3059 accessed to retrieve values, but once the main loop is re-entered, it
3060 becomes available for reuse.
3061
3062 Note that the depth of the backtrack stack constantly increases during the
3063 left-to-right execution of the pattern, rather than going up and down with
3064 the pattern nesting. For example the stack is at its maximum at Z at the
3065 end of the pattern, rather than at X in the following:
3066
3067     /(((X)+)+)+....(Y)+....Z/
3068
3069 The only exceptions to this are lookahead/behind assertions and the cut,
3070 (?>A), which pop all the backtrack states associated with A before
3071 continuing.
3072
3073 Backtrack state structs are allocated in slabs of about 4K in size.
3074 PL_regmatch_state and st always point to the currently active state,
3075 and PL_regmatch_slab points to the slab currently containing
3076 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3077 allocated, and is never freed until interpreter destruction. When the slab
3078 is full, a new one is allocated and chained to the end. At exit from
3079 regmatch(), slabs allocated since entry are freed.
3080
3081 */
3082
3083
     /* DEBUG_STATE_pp(pp): debugging aid, wrapped in DEBUG_STATE_r.  It calls
      * DUMP_EXEC_POS for the current position, then prints four spaces plus
      * depth*2 columns of padding, the text 'pp' (pasted into the format
      * string, so it must be a string literal), the PL_reg_name[] entry for
      * st->resume_state, and a bracketed tag containing "Y" when st is
      * yes_state and/or "M" when st is mark_state.  locinput, scan,
      * utf8_target, depth, st, yes_state and mark_state are taken from the
      * invoking scope. */
3084 #define DEBUG_STATE_pp(pp)                                  \
3085     DEBUG_STATE_r({                                         \
3086         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3087         PerlIO_printf(Perl_debug_log,                       \
3088             "    %*s"pp" %s%s%s%s%s\n",                     \
3089             depth*2, "",                                    \
3090             PL_reg_name[st->resume_state],                     \
3091             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3092             ((st==yes_state) ? "Y" : ""),                   \
3093             ((st==mark_state) ? "M" : ""),                  \
3094             ((st==yes_state||st==mark_state) ? "]" : "")    \
3095         );                                                  \
3096     });
3097
3098
     /* Offset of node x from 'prog', as an int, or -1 when x is NULL.  'prog'
      * is whatever variable of that name is visible where the macro is used. */
3099 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3100
3101 #ifdef DEBUGGING
3102
3103 /* Debug output announcing a match: prints blurb followed by renderings of
3104  * the pattern and of the bytes start..end, then notes which are UTF-8. */
3105 STATIC void
3106 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3107     const char *start, const char *end, const char *blurb)
3108 {
3109     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3110
3111     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3112
3113     if (!PL_colorset)
3114         reginitcolors();
3115     {
3116         RE_PV_QUOTED_DECL(pat_text, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3117             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3118         RE_PV_QUOTED_DECL(str_text, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3119             start, end - start, 60);
3120
3121         PerlIO_printf(Perl_debug_log, "%s%s REx%s %s against %s\n",
3122             PL_colors[4], blurb, PL_colors[5], pat_text, str_text);
3123         if (utf8_pat || utf8_target) {
3124             const char * const pat_word = utf8_pat ? "pattern" : "";
3125             const char * const joiner = (utf8_pat && utf8_target) ? " and " : "";
3126             const char * const str_word = utf8_target ? "string" : "";
3127             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3128                 pat_word, joiner, str_word);
3129         }
3130     }
3131 }
3132
     /* Debugging helper: print the position prefix of one trace line, in the
      * form
      *     <offset> <BEFORE1 BEFORE2 "> <" AFTER> <padding>|
      * where <offset> is locinput - loc_bostr, BEFORE1/BEFORE2 are the bytes
      * leading up to locinput (split at loc_reg_starttry so that each part can
      * be coloured differently) and AFTER is text starting at locinput.  The
      * literal "> <" separator is printed only when no colours are configured.
      *
      *   locinput          position being reported
      *   scan              node whose opcode is checked: for CANY the text is
      *                     not rendered as UTF-8
      *   loc_regeol        upper limit of the text following locinput
      *   loc_bostr         origin for the printed offset and lower limit of
      *                     the leading context
      *   loc_reg_starttry  boundary between the two parts of the leading
      *                     context
      *   utf8_target       whether the text is treated as UTF-8
      */
3133 STATIC void
3134 S_dump_exec_pos(pTHX_ const char *locinput,
3135                       const regnode *scan,
3136                       const char *loc_regeol,
3137                       const char *loc_bostr,
3138                       const char *loc_reg_starttry,
3139                       const bool utf8_target)
3140 {
         /* colouring is on if any of colour strings 0, 2 or 4 is non-empty */
3141     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3142     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
         /* l: bytes after locinput budgeted for the tail, capped at taill; it
          * is used below only to work out how much leading context fits */
3143     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3144     /* The part of the string before starttry has one color
3145        (pref0_len chars), between starttry and current
3146        position another one (pref_len - pref0_len chars),
3147        after the current position the third one.
3148        We assume that pref0_len <= pref_len, otherwise we
3149        decrease pref0_len.  */
         /* pref_len: bytes of leading context, limited both by what exists
          * before locinput and by the room left out of (5 + taill) */
3150     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3151         ? (5 + taill) - l : locinput - loc_bostr;
3152     int pref0_len;
3153
3154     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3155
         /* for UTF-8, widen the leading context while its first byte is a
          * continuation byte, so that it begins on a character start */
3156     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3157         pref_len++;
         /* portion of the leading context lying before loc_reg_starttry; may
          * come out negative or too large here, and is clamped further down */
3158     pref0_len = pref_len  - (locinput - loc_reg_starttry);
         /* if the leading context left room to spare and more text follows,
          * let the tail budget grow into the unused room */
3159     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3160         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3161               ? (5 + taill) - pref_len : loc_regeol - locinput);
         /* for UTF-8, shrink l while the byte at locinput + l is a
          * continuation byte */
3162     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3163         l--;
         /* clamp pref0_len into 0..pref_len */
3164     if (pref0_len < 0)
3165         pref0_len = 0;
3166     if (pref0_len > pref_len)
3167         pref0_len = pref_len;
3168     {
3169         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3170
             /* s0/len0: leading context before loc_reg_starttry */
3171         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3172             (locinput - pref_len),pref0_len, 60, 4, 5);
3173
             /* s1/len1: leading context from loc_reg_starttry up to locinput */
3174         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3175                     (locinput - pref_len + pref0_len),
3176                     pref_len - pref0_len, 60, 2, 3);
3177
             /* s2/len2: text from locinput onwards; note that the full
              * remaining length is passed here, not l */
3178         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3179                     locinput, loc_regeol - locinput, 10, 0, 1);
3180
3181         const STRLEN tlen=len0+len1+len2;
             /* pad with spaces up to a combined width of 19 so that the
              * closing '|' lines up when the three parts are short */
3182         PerlIO_printf(Perl_debug_log,
3183                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3184                     (IV)(locinput - loc_bostr),
3185                     len0, s0,
3186                     len1, s1,
3187                     (docolor ? "" : "> <"),
3188                     len2, s2,
3189                     (int)(tlen > 19 ? 0 :  19 - tlen),
3190                     "");
3191     }
3192 }
3193
3194 #endif
3195
3196 /* reg_check_named_buff_matched()
3197  * Reports whether any of the capture buffers sharing one name has matched.
3198  * ARG(scan) selects a slot of the regexp's data array holding an SV; that
3199  * SV's string buffer is read as an array of I32 buffer numbers and its IV
3200  * as the number of entries.  The name itself is not consulted.
3201  * Returns the first listed buffer number that is <= rex->lastparen and
3202  * whose end offset is not -1, or 0 if none of the buffers qualifies.
3203  */
3204 STATIC I32
3205 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3206 {
3207     RXi_GET_DECL(rex,rexi);
3208     SV *name_list = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3209     I32 *buf_nums = (I32*)SvPVX(name_list);
3210     I32 idx = 0;
3211
3212     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3213
3214     for (; idx < SvIVX(name_list); idx++) {
3215         const I32 paren = buf_nums[idx];
3216
3217         /* beyond lastparen, or end offset still -1: try the next one */
3218         if ((I32)rex->lastparen < paren || rex->offs[paren].end == -1)
3219             continue;
3220         return paren;
3221     }
3222     return 0;
3223 }
3224
3225
3226 static bool
3227 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3228         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3229 {
3230     /* This function determines if there are one or two characters that match
3231      * the first character of the passed-in EXACTish node <text_node>, and if
3232      * so, returns them in the passed-in pointers.
3233      *
3234      * If it determines that no possible character in the target string can
3235      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3236      * the first character in <text_node> requires UTF-8 to represent, and the
3237      * target string isn't in UTF-8.)
3238      *
3239      * If there are more than two characters that could match the beginning of
3240      * <text_node>, or if more context is required to determine a match or not,
3241      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3242      *
3243      * The motivation behind this function is to allow the caller to set up
3244      * tight loops for matching.  If <text_node> is of type EXACT, there is
3245      * only one possible character that can match its first character, and so
3246      * the situation is quite simple.  But things get much more complicated if
3247      * folding is involved.  It may be that the first character of an EXACTFish
3248      * node doesn't participate in any possible fold, e.g., punctuation, so it
3249      * can be matched only by itself.  The vast majority of characters that are
3250      * in folds match just two things, their lower and upper-case equivalents.
3251      * But not all are like that; some have multiple possible matches, or match
3252      * sequences of more than one character.  This function sorts all that out.
3253      *
3254      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3255      * loop of trying to match A*, we know we can't exit where the thing
3256      * following it isn't a B.  And something can't be a B unless it is the
3257      * beginning of B.  By putting a quick test for that beginning in a tight
3258      * loop, we can rule out things that can't possibly be B without having to
3259      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3260      * character, we can make a tight loop matching A*, using the outputs of
3261      * this function.
3262      *
3263      * If the target string to match isn't in UTF-8, and there aren't
3264      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3265      * the one or two possible octets (which are characters in this situation)
3266      * that can match.  In all cases, if there is only one character that can
3267      * match, *<c1p> and *<c2p> will be identical.
3268      *
3269      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3270      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3271      * can match the beginning of <text_node>.  They should be declared with at
3272      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3273      * undefined what these contain.)  If one or both of the buffers are
3274      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3275      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3276      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3277      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3278      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3279
3280     const bool utf8_target = reginfo->is_utf8_target;
3281
3282     UV c1 = CHRTEST_NOT_A_CP_1;
3283     UV c2 = CHRTEST_NOT_A_CP_2;
3284     bool use_chrtest_void = FALSE;
3285     const bool is_utf8_pat = reginfo->is_utf8_pat;
3286
3287     /* Used when we have both utf8 input and utf8 output, to avoid converting
3288      * to/from code points */
3289     bool utf8_has_been_setup = FALSE;
3290
3291     dVAR;
3292
3293     U8 *pat = (U8*)STRING(text_node);
3294
3295     if (OP(text_node) == EXACT) {
3296
3297         /* In an exact node, only one thing can be matched, that first
3298          * character.  If both the pat and the target are UTF-8, we can just
3299          * copy the input to the output, avoiding finding the code point of
3300          * that character */
3301         if (!is_utf8_pat) {
3302             c2 = c1 = *pat;
3303         }
3304         else if (utf8_target) {
3305             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3306             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3307             utf8_has_been_setup = TRUE;
3308         }
3309         else {
3310             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3311         }
3312     }
3313     else /* an EXACTFish node */
3314          if ((is_utf8_pat
3315                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3316                                                     pat + STR_LEN(text_node)))
3317              || (!is_utf8_pat
3318                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3319                                                     pat + STR_LEN(text_node))))
3320     {
3321         /* Multi-character folds require more context to sort out.  Also
3322          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3323          * handled outside this routine */
3324         use_chrtest_void = TRUE;
3325     }
3326     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3327         c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3328         if (c1 > 256) {
3329             /* Load the folds hash, if not already done */
3330             SV** listp;
3331             if (! PL_utf8_foldclosures) {
3332                 if (! PL_utf8_tofold) {
3333                     U8 dummy[UTF8_MAXBYTES+1];
3334
3335                     /* Force loading this by folding an above-Latin1 char */
3336                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3337                     assert(PL_utf8_tofold); /* Verify that worked */
3338                 }
3339                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3340             }
3341
3342             /* The fold closures data structure is a hash with the keys being
3343              * the UTF-8 of every character that is folded to, like 'k', and
3344              * the values each an array of all code points that fold to its
3345              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3346              * not included */
3347             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3348                                      (char *) pat,
3349                                      UTF8SKIP(pat),
3350                                      FALSE))))
3351             {
3352                 /* Not found in the hash, therefore there are no folds
3353                  * containing it, so there is only a single character that
3354                  * could match */
3355                 c2 = c1;
3356             }
3357             else {  /* Does participate in folds */
3358                 AV* list = (AV*) *listp;
3359                 if (av_len(list) != 1) {
3360
3361                     /* If there aren't exactly two folds to this, it is outside
3362                      * the scope of this function */
3363                     use_chrtest_void = TRUE;
3364                 }
3365                 else {  /* There are two.  Get them */
3366                     SV** c_p = av_fetch(list, 0, FALSE);
3367                     if (c_p == NULL) {
3368                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3369                     }
3370                     c1 = SvUV(*c_p);
3371
3372                     c_p = av_fetch(list, 1, FALSE);
3373                     if (c_p == NULL) {
3374                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3375                     }
3376                     c2 = SvUV(*c_p);
3377
3378                     /* Folds that cross the 255/256 boundary are forbidden if
3379                      * EXACTFL, or EXACTFA and one is ASCII.  Since the
3380                      * pattern character is above 256, and its only other match
3381                      * is below 256, the only legal match will be to itself.
3382                      * We have thrown away the original, so have to compute
3383                      * which is the one above 255 */
3384                     if ((c1 < 256) != (c2 < 256)) {
3385                         if (OP(text_node) == EXACTFL
3386                             || (OP(text_node) == EXACTFA
3387                                 && (isASCII(c1) || isASCII(c2))))
3388                         {
3389                             if (c1 < 256) {
3390                                 c1 = c2;
3391                             }
3392                             else {
3393                                 c2 = c1;
3394                             }
3395                         }
3396                     }
3397                 }
3398             }
3399         }
3400         else /* Here, c1 is < 255 */
3401              if (utf8_target
3402                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3403                  && OP(text_node) != EXACTFL
3404                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3405         {
3406             /* Here, there could be something above Latin1 in the target which
3407              * folds to this character in the pattern.  All such cases except
3408              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3409              * involved in their folds, so are outside the scope of this
3410              * function */
3411             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3412                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3413             }
3414             else {
3415                 use_chrtest_void = TRUE;
3416             }
3417         }
3418         else { /* Here nothing above Latin1 can fold to the pattern character */
3419             switch (OP(text_node)) {
3420
3421                 case EXACTFL:   /* /l rules */
3422                     c2 = PL_fold_locale[c1];
3423                     break;
3424
3425                 case EXACTF:
3426                     if (! utf8_target) {    /* /d rules */
3427                         c2 = PL_fold[c1];
3428                         break;
3429                     }
3430                     /* FALLTHROUGH */
3431                     /* /u rules for all these.  This happens to work for
3432                      * EXACTFA as nothing in Latin1 folds to ASCII */
3433                 case EXACTFA:
3434                 case EXACTFU_TRICKYFOLD:
3435                 case EXACTFU_SS:
3436                 case EXACTFU:
3437                     c2 = PL_fold_latin1[c1];
3438                     break;
3439
3440                 default:
3441                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3442                     assert(0); /* NOTREACHED */
3443             }
3444         }
3445     }
3446
3447     /* Here have figured things out.  Set up the returns */
3448     if (use_chrtest_void) {
3449         *c2p = *c1p = CHRTEST_VOID;
3450     }
3451     else if (utf8_target) {
3452         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3453             uvchr_to_utf8(c1_utf8, c1);
3454             uvchr_to_utf8(c2_utf8, c2);
3455         }
3456
3457         /* Invariants are stored in both the utf8 and byte outputs; Use
3458          * negative numbers otherwise for the byte ones.  Make sure that the
3459          * byte ones are the same iff the utf8 ones are the same */
3460         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3461         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3462                 ? *c2_utf8
3463                 : (c1 == c2)
3464                   ? CHRTEST_NOT_A_CP_1
3465                   : CHRTEST_NOT_A_CP_2;
3466     }
3467     else if (c1 > 255) {
3468        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3469                            can represent */
3470            return FALSE;
3471        }
3472
3473        *c1p = *c2p = c2;    /* c2 is the only representable value */
3474     }
3475     else {  /* c1 is representable; see about c2 */
3476        *c1p = c1;
3477        *c2p = (c2 < 256) ? c2 : c1;
3478     }
3479
3480     return TRUE;
3481 }
3482
3483 /* returns -1 on failure, $+[0] on success */
3484 STATIC I32
3485 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3486 {
3487 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3488     dMY_CXT;
3489 #endif
3490     dVAR;
3491     const bool utf8_target = reginfo->is_utf8_target;
3492     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3493     REGEXP *rex_sv = reginfo->prog;
3494     regexp *rex = ReANY(rex_sv);
3495     RXi_GET_DECL(rex,rexi);
3496     /* the current state. This is a cached copy of PL_regmatch_state */
3497     regmatch_state *st;
3498     /* cache heavily used fields of st in registers */
3499     regnode *scan;
3500     regnode *next;
3501     U32 n = 0;  /* general value; init to avoid compiler warning */
3502     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3503     char *locinput = startpos;
3504     char *pushinput; /* where to continue after a PUSH */
3505     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3506
3507     bool result = 0;        /* return value of S_regmatch */
3508     int depth = 0;          /* depth of backtrack stack */
3509     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3510     const U32 max_nochange_depth =
3511         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3512         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3513     regmatch_state *yes_state = NULL; /* state to pop to on success of
3514                                                             subpattern */
3515     /* mark_state piggy backs on the yes_state logic so that when we unwind
3516        the stack on success we can update the mark_state as we go */
3517     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3518     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3519     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3520     U32 state_num;
3521     bool no_final = 0;      /* prevent failure from backtracking? */
3522     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3523     char *startpoint = locinput;
3524     SV *popmark = NULL;     /* are we looking for a mark? */
3525     SV *sv_commit = NULL;   /* last mark name seen in failure */
3526     SV *sv_yes_mark = NULL; /* last mark name we have seen
3527                                during a successful match */
3528     U32 lastopen = 0;       /* last open we saw */
3529     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3530     SV* const oreplsv = GvSV(PL_replgv);
3531     /* these three flags are set by various ops to signal information to
3532      * the very next op. They have a useful lifetime of exactly one loop
3533      * iteration, and are not preserved or restored by state pushes/pops
3534      */
3535     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3536     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3537     int logical = 0;        /* the following EVAL is:
3538                                 0: (?{...})
3539                                 1: (?(?{...})X|Y)
3540                                 2: (??{...})
3541                                or the following IFMATCH/UNLESSM is:
3542                                 false: plain (?=foo)
3543                                 true:  used as a condition: (?(?=foo))
3544                             */
3545     PAD* last_pad = NULL;
3546     dMULTICALL;
3547     I32 gimme = G_SCALAR;
3548     CV *caller_cv = NULL;       /* who called us */
3549     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3550     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3551     U32 maxopenparen = 0;       /* max '(' index seen so far */
3552     int to_complement;  /* Invert the result? */
3553     _char_class_number classnum;
3554     bool is_utf8_pat = reginfo->is_utf8_pat;
3555
3556 #ifdef DEBUGGING
3557     GET_RE_DEBUG_FLAGS_DECL;
3558 #endif
3559
3560     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3561     multicall_oldcatch = 0;
3562     multicall_cv = NULL;
3563     cx = NULL;
3564     PERL_UNUSED_VAR(multicall_cop);
3565     PERL_UNUSED_VAR(newsp);
3566
3567
3568     PERL_ARGS_ASSERT_REGMATCH;
3569
3570     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3571             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3572     }));
3573
3574     st = PL_regmatch_state;
3575
3576     /* Note that nextchr is a byte even in UTF */
3577     SET_nextchr;
3578     scan = prog;
3579     while (scan != NULL) {
3580
3581         DEBUG_EXECUTE_r( {
3582             SV * const prop = sv_newmortal();
3583             regnode *rnext=regnext(scan);
3584             DUMP_EXEC_POS( locinput, scan, utf8_target );
3585             regprop(rex, prop, scan);
3586
3587             PerlIO_printf(Perl_debug_log,
3588                     "%3"IVdf":%*s%s(%"IVdf")\n",
3589                     (IV)(scan - rexi->program), depth*2, "",
3590                     SvPVX_const(prop),
3591                     (PL_regkind[OP(scan)] == END || !rnext) ?
3592                         0 : (IV)(rnext - rexi->program));
3593         });
3594
3595         next = scan + NEXT_OFF(scan);
3596         if (next == scan)
3597             next = NULL;
3598         state_num = OP(scan);
3599
3600         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3601       reenter_switch:
3602         to_complement = 0;
3603
3604         SET_nextchr;
3605         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3606
3607         switch (state_num) {
3608         case BOL: /*  /^../  */
3609             if (locinput == reginfo->strbeg)
3610                 break;
3611             sayNO;
3612
3613         case MBOL: /*  /^../m  */
3614             if (locinput == reginfo->strbeg ||
3615                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3616             {
3617                 break;
3618             }
3619             sayNO;
3620
3621         case SBOL: /*  /^../s  */
3622             if (locinput == reginfo->strbeg)
3623                 break;
3624             sayNO;
3625
3626         case GPOS: /*  \G  */
3627             if (locinput == reginfo->ganch)
3628                 break;
3629             sayNO;
3630
3631         case KEEPS: /*   \K  */
3632             /* update the startpoint */
3633             st->u.keeper.val = rex->offs[0].start;
3634             rex->offs[0].start = locinput - reginfo->strbeg;
3635             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3636             assert(0); /*NOTREACHED*/
3637         case KEEPS_next_fail:
3638             /* rollback the start point change */
3639             rex->offs[0].start = st->u.keeper.val;
3640             sayNO_SILENT;
3641             assert(0); /*NOTREACHED*/
3642
3643         case EOL: /* /..$/  */
3644                 goto seol;
3645
3646         case MEOL: /* /..$/m  */
3647             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3648                 sayNO;
3649             break;
3650
3651         case SEOL: /* /..$/s  */
3652           seol:
3653             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3654                 sayNO;
3655             if (reginfo->strend - locinput > 1)
3656                 sayNO;
3657             break;
3658
3659         case EOS: /*  \z  */
3660             if (!NEXTCHR_IS_EOS)
3661                 sayNO;
3662             break;
3663
3664         case SANY: /*  /./s  */
3665             if (NEXTCHR_IS_EOS)
3666                 sayNO;
3667             goto increment_locinput;
3668
3669         case CANY: /*  \C  */
3670             if (NEXTCHR_IS_EOS)
3671                 sayNO;
3672             locinput++;
3673             break;
3674
3675         case REG_ANY: /*  /./  */
3676             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3677                 sayNO;
3678             goto increment_locinput;
3679
3680
3681 #undef  ST
3682 #define ST st->u.trie
3683         case TRIEC: /* (ab|cd) with known charclass */
3684             /* In this case the charclass data is available inline so
3685                we can fail fast without a lot of extra overhead.
3686              */
3687             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3688                 DEBUG_EXECUTE_r(
3689                     PerlIO_printf(Perl_debug_log,
3690                               "%*s  %sfailed to match trie start class...%s\n",
3691                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3692                 );
3693                 sayNO_SILENT;
3694                 assert(0); /* NOTREACHED */
3695             }
3696             /* FALL THROUGH */
3697         case TRIE:  /* (ab|cd)  */
3698             /* the basic plan of execution of the trie is:
3699              * At the beginning, run through all the states, and
3700              * find the longest-matching word. Also remember the position
3701              * of the shortest matching word. For example, this pattern:
3702              *    1  2 3 4    5
3703              *    ab|a|x|abcd|abc
3704              * when matched against the string "abcde", will generate
3705              * accept states for all words except 3, with the longest
3706              * matching word being 4, and the shortest being 2 (with
3707              * the position being after char 1 of the string).
3708              *
3709              * Then for each matching word, in word order (i.e. 1,2,4,5),
3710              * we run the remainder of the pattern; on each try setting
3711              * the current position to the character following the word,
3712              * returning to try the next word on failure.
3713              *
3714              * We avoid having to build a list of words at runtime by
3715              * using a compile-time structure, wordinfo[].prev, which
3716              * gives, for each word, the previous accepting word (if any).
3717              * In the case above it would contain the mappings 1->2, 2->0,
3718              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3719              * the longest word (4 above), a list of all words, by
3720              * following the list of prev pointers; this gives us the
3721              * unordered list 4,5,1,2. Then given the current word we have
3722              * just tried, we can go through the list and find the
3723              * next-biggest word to try (so if we just failed on word 2,
3724              * the next in the list is 4).
3725              *
3726              * Since at runtime we don't record the matching position in
3727              * the string for each word, we have to work that out for
3728              * each word we're about to process. The wordinfo table holds
3729              * the character length of each word; given that we recorded
3730              * at the start: the position of the shortest word and its
3731              * length in chars, we just need to move the pointer the
3732              * difference between the two char lengths. Depending on
3733              * Unicode status and folding, that's cheap or expensive.
3734              *
3735              * This algorithm is optimised for the case where there are only
3736              * a small number of accept states, i.e. 0,1, or maybe 2.
3737              * With lots of accept states, and having to try all of them,
3738              * it becomes quadratic in the number of accept states to find
3739              * all the next words.
3740              */
3741
3742             {
3743                 /* what type of TRIE am I? (utf8 makes this contextual) */
3744                 DECL_TRIE_TYPE(scan);
3745
3746                 /* what trie are we using right now */
3747                 reg_trie_data * const trie
3748                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3749                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3750                 U32 state = trie->startstate;
3751
3752                 if (   trie->bitmap
3753                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3754                 {
3755                     if (trie->states[ state ].wordnum) {
3756                          DEBUG_EXECUTE_r(
3757                             PerlIO_printf(Perl_debug_log,
3758                                           "%*s  %smatched empty string...%s\n",
3759                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3760                         );
3761                         if (!trie->jump)
3762                             break;
3763                     } else {
3764                         DEBUG_EXECUTE_r(
3765                             PerlIO_printf(Perl_debug_log,
3766                                           "%*s  %sfailed to match trie start class...%s\n",
3767                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3768                         );
3769                         sayNO_SILENT;
3770                    }
3771                 }
3772
3773             {
3774                 U8 *uc = ( U8* )locinput;
3775
3776                 STRLEN len = 0;
3777                 STRLEN foldlen = 0;
3778                 U8 *uscan = (U8*)NULL;
3779                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3780                 U32 charcount = 0; /* how many input chars we have matched */
3781                 U32 accepted = 0; /* have we seen any accepting states? */
3782
3783                 ST.jump = trie->jump;
3784                 ST.me = scan;
3785                 ST.firstpos = NULL;
3786                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3787                 ST.nextword = 0;
3788
3789                 /* fully traverse the TRIE; note the position of the
3790                    shortest accept state and the wordnum of the longest
3791                    accept state */
3792
3793                 while ( state && uc <= (U8*)(reginfo->strend) ) {
3794                     U32 base = trie->states[ state ].trans.base;
3795                     UV uvc = 0;
3796                     U16 charid = 0;
3797                     U16 wordnum;
3798                     wordnum = trie->states[ state ].wordnum;
3799
3800                     if (wordnum) { /* it's an accept state */
3801                         if (!accepted) {
3802                             accepted = 1;
3803                             /* record first match position */
3804                             if (ST.longfold) {
3805                                 ST.firstpos = (U8*)locinput;
3806                                 ST.firstchars = 0;
3807                             }
3808                             else {
3809                                 ST.firstpos = uc;
3810                                 ST.firstchars = charcount;
3811                             }
3812                         }
3813                         if (!ST.nextword || wordnum < ST.nextword)
3814                             ST.nextword = wordnum;
3815                         ST.topword = wordnum;
3816                     }
3817
3818                     DEBUG_TRIE_EXECUTE_r({
3819                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3820                                 PerlIO_printf( Perl_debug_log,
3821                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3822                                     2+depth * 2, "", PL_colors[4],
3823                                     (UV)state, (accepted ? 'Y' : 'N'));
3824                     });
3825
3826                     /* read a char and goto next state */
3827                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3828                         I32 offset;
3829                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3830                                              uscan, len, uvc, charid, foldlen,
3831                                              foldbuf, uniflags);
3832                         charcount++;
3833                         if (foldlen>0)
3834                             ST.longfold = TRUE;
3835                         if (charid &&
3836                              ( ((offset =
3837                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3838
3839                              && ((U32)offset < trie->lasttrans)
3840                              && trie->trans[offset].check == state)
3841                         {
3842                             state = trie->trans[offset].next;
3843                         }
3844                         else {
3845                             state = 0;
3846                         }
3847                         uc += len;
3848
3849                     }
3850                     else {
3851                         state = 0;
3852                     }
3853                     DEBUG_TRIE_EXECUTE_r(
3854                         PerlIO_printf( Perl_debug_log,
3855                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3856                             charid, uvc, (UV)state, PL_colors[5] );
3857                     );
3858                 }
3859                 if (!accepted)
3860                    sayNO;
3861
3862                 /* calculate total number of accept states */
3863                 {
3864                     U16 w = ST.topword;
3865                     accepted = 0;
3866                     while (w) {
3867                         w = trie->wordinfo[w].prev;
3868                         accepted++;
3869                     }
3870                     ST.accepted = accepted;
3871                 }
3872
3873                 DEBUG_EXECUTE_r(
3874                     PerlIO_printf( Perl_debug_log,
3875                         "%*s  %sgot %"IVdf" possible matches%s\n",
3876                         REPORT_CODE_OFF + depth * 2, "",
3877                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3878                 );
3879                 goto trie_first_try; /* jump into the fail handler */
3880             }}
3881             assert(0); /* NOTREACHED */
3882
3883         case TRIE_next_fail: /* we failed - try next alternative */
3884         {
3885             U8 *uc;
3886             if ( ST.jump) {
3887                 REGCP_UNWIND(ST.cp);
3888                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3889             }
3890             if (!--ST.accepted) {
3891                 DEBUG_EXECUTE_r({
3892                     PerlIO_printf( Perl_debug_log,
3893                         "%*s  %sTRIE failed...%s\n",
3894                         REPORT_CODE_OFF+depth*2, "",
3895                         PL_colors[4],
3896                         PL_colors[5] );
3897                 });
3898                 sayNO_SILENT;
3899             }
3900             {
3901                 /* Find next-highest word to process.  Note that this code
3902                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3903                 U16 min = 0;
3904                 U16 word;
3905                 U16 const nextword = ST.nextword;
3906                 reg_trie_wordinfo * const wordinfo
3907                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3908                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3909                     if (word > nextword && (!min || word < min))
3910                         min = word;
3911                 }
3912                 ST.nextword = min;
3913             }
3914
3915           trie_first_try:
3916             if (do_cutgroup) {
3917                 do_cutgroup = 0;
3918                 no_final = 0;
3919             }
3920
3921             if ( ST.jump) {
3922                 ST.lastparen = rex->lastparen;
3923                 ST.lastcloseparen = rex->lastcloseparen;
3924                 REGCP_SET(ST.cp);
3925             }
3926
3927             /* find start char of end of current word */
3928             {
3929                 U32 chars; /* how many chars to skip */
3930                 reg_trie_data * const trie
3931                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3932
3933                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3934                             >=  ST.firstchars);
3935                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3936                             - ST.firstchars;
3937                 uc = ST.firstpos;
3938
3939                 if (ST.longfold) {
3940                     /* the hard option - fold each char in turn and find
3941                      * its folded length (which may be different) */
3942                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3943                     STRLEN foldlen;
3944                     STRLEN len;
3945                     UV uvc;
3946                     U8 *uscan;
3947
3948                     while (chars) {
3949                         if (utf8_target) {
3950                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3951                                                     uniflags);
3952                             uc += len;
3953                         }
3954                         else {
3955                             uvc = *uc;
3956                             uc++;
3957                         }
3958                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3959                         uscan = foldbuf;
3960                         while (foldlen) {
3961                             if (!--chars)
3962                                 break;
3963                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3964                                             uniflags);
3965                             uscan += len;
3966                             foldlen -= len;
3967                         }
3968                     }
3969                 }
3970                 else {
3971                     if (utf8_target)
3972                         while (chars--)
3973                             uc += UTF8SKIP(uc);
3974                     else
3975                         uc += chars;
3976                 }
3977             }
3978
3979             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3980                             ? ST.jump[ST.nextword]
3981                             : NEXT_OFF(ST.me));
3982
3983             DEBUG_EXECUTE_r({
3984                 PerlIO_printf( Perl_debug_log,
3985                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3986                     REPORT_CODE_OFF+depth*2, "",
3987                     PL_colors[4],
3988                     ST.nextword,
3989                     PL_colors[5]
3990                     );
3991             });
3992
3993             if (ST.accepted > 1 || has_cutgroup) {
3994                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
3995                 assert(0); /* NOTREACHED */
3996             }
3997             /* only one choice left - just continue */
3998             DEBUG_EXECUTE_r({
3999                 AV *const trie_words
4000                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4001                 SV ** const tmp = av_fetch( trie_words,
4002                     ST.nextword-1, 0 );
4003                 SV *sv= tmp ? sv_newmortal() : NULL;
4004
4005                 PerlIO_printf( Perl_debug_log,
4006                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4007                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4008                     ST.nextword,
4009                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4010                             PL_colors[0], PL_colors[1],
4011                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4012                         )
4013                     : "not compiled under -Dr",
4014                     PL_colors[5] );
4015             });
4016
4017             locinput = (char*)uc;
4018             continue; /* execute rest of RE */
4019             assert(0); /* NOTREACHED */
4020         }
4021 #undef  ST
4022
4023         case EXACT: {            /*  /abc/        */
4024             char *s = STRING(scan);
4025             ln = STR_LEN(scan);
4026             if (utf8_target != is_utf8_pat) {
4027                 /* The target and the pattern have differing utf8ness. */
4028                 char *l = locinput;
4029                 const char * const e = s + ln;
4030
4031                 if (utf8_target) {
4032                     /* The target is utf8, the pattern is not utf8.
4033                      * Above-Latin1 code points can't match the pattern;
4034                      * invariants match exactly, and the other Latin1 ones need
4035                      * to be downgraded to a single byte in order to do the
4036                      * comparison.  (If we could be confident that the target
4037                      * is not malformed, this could be refactored to have fewer
4038                      * tests by just assuming that if the first bytes match, it
4039                      * is an invariant, but there are tests in the test suite
4040                      * dealing with (??{...}) which violate this) */
4041                     while (s < e) {
4042                         if (l >= reginfo->strend
4043                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4044                         {
4045                             sayNO;
4046                         }
4047                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4048                             if (*l != *s) {
4049                                 sayNO;
4050                             }
4051                             l++;
4052                         }
4053                         else {
4054                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4055                                 sayNO;
4056                             }
4057                             l += 2;
4058                         }
4059                         s++;
4060                     }
4061                 }
4062                 else {
4063                     /* The target is not utf8, the pattern is utf8. */
4064                     while (s < e) {
4065                         if (l >= reginfo->strend
4066                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4067                         {
4068                             sayNO;
4069                         }
4070                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4071                             if (*s != *l) {
4072                                 sayNO;
4073                             }
4074                             s++;
4075                         }
4076                         else {
4077                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4078                                 sayNO;
4079                             }
4080                             s += 2;
4081                         }
4082                         l++;
4083                     }
4084                 }
4085                 locinput = l;
4086             }
4087             else {
4088                 /* The target and the pattern have the same utf8ness. */
4089                 /* Inline the first character, for speed. */
4090                 if (reginfo->strend - locinput < ln
4091                     || UCHARAT(s) != nextchr
4092                     || (ln > 1 && memNE(s, locinput, ln)))
4093                 {
4094                     sayNO;
4095                 }
4096                 locinput += ln;
4097             }
4098             break;
4099             }
4100
4101         case EXACTFL: {          /*  /abc/il      */
4102             re_fold_t folder;
4103             const U8 * fold_array;
4104             const char * s;
4105             U32 fold_utf8_flags;
4106
4107             RX_MATCH_TAINTED_on(reginfo->prog);
4108             folder = foldEQ_locale;
4109             fold_array = PL_fold_locale;
4110             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4111             goto do_exactf;
4112
4113         case EXACTFU_SS:         /*  /\x{df}/iu   */
4114         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4115         case EXACTFU:            /*  /abc/iu      */
4116             folder = foldEQ_latin1;
4117             fold_array = PL_fold_latin1;
4118             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4119             goto do_exactf;
4120
4121         case EXACTFA:            /*  /abc/iaa     */
4122             folder = foldEQ_latin1;
4123             fold_array = PL_fold_latin1;
4124             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4125             goto do_exactf;
4126
4127         case EXACTF:             /*  /abc/i       */
4128             folder = foldEQ;
4129             fold_array = PL_fold;
4130             fold_utf8_flags = 0;
4131
4132           do_exactf:
4133             s = STRING(scan);
4134             ln = STR_LEN(scan);
4135
4136             if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
4137               /* Either the target or the pattern is utf8, or the node is one
4138                * (EXACTFU_SS) whose fold lengths may differ. */
4139                 const char * const l = locinput;
4140                 char *e = reginfo->strend;
4141
4142                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4143                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4144                 {
4145                     sayNO;
4146                 }
4147                 locinput = e;
4148                 break;
4149             }
4150
4151             /* Neither the target nor the pattern are utf8 */
4152             if (UCHARAT(s) != nextchr
4153                 && !NEXTCHR_IS_EOS
4154                 && UCHARAT(s) != fold_array[nextchr])
4155             {
4156                 sayNO;
4157             }
4158             if (reginfo->strend - locinput < ln)
4159                 sayNO;
4160             if (ln > 1 && ! folder(s, locinput, ln))
4161                 sayNO;
4162             locinput += ln;
4163             break;
4164         }
4165
4166         /* XXX Could improve efficiency by separating these all out using a
4167          * macro or in-line function.  At that point regcomp.c would no longer
4168          * have to set the FLAGS fields of these */
4169         case BOUNDL:  /*  /\b/l  */
4170         case NBOUNDL: /*  /\B/l  */
4171             RX_MATCH_TAINTED_on(reginfo->prog);
4172             /* FALL THROUGH */
4173         case BOUND:   /*  /\b/   */
4174         case BOUNDU:  /*  /\b/u  */
4175         case BOUNDA:  /*  /\b/a  */
4176         case NBOUND:  /*  /\B/   */
4177         case NBOUNDU: /*  /\B/u  */
4178         case NBOUNDA: /*  /\B/a  */
4179             /* was last char in word? */
4180             if (utf8_target
4181                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4182                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4183             {
4184                 if (locinput == reginfo->strbeg)
4185                     ln = '\n';
4186                 else {
4187                     const U8 * const r =
4188                             reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4189
4190                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4191                 }
4192                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4193                     ln = isWORDCHAR_uni(ln);
4194                     if (NEXTCHR_IS_EOS)
4195                         n = 0;
4196                     else {
4197                         LOAD_UTF8_CHARCLASS_ALNUM();
4198                         n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4199                                                                 utf8_target);
4200                     }
4201                 }
4202                 else {
4203                     ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
4204                     n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4205                 }
4206             }
4207             else {
4208
4209                 /* Here the string isn't utf8, or is utf8 and only ascii
4210                  * characters are to match \w.  In the latter case looking at
4211                  * the byte just prior to the current one may be just the final
4212                  * byte of a multi-byte character.  This is ok.  There are two
4213                  * cases:
4214                  * 1) it is a single byte character, and then the test is doing
4215                  *      just what it's supposed to.
4216                  * 2) it is a multi-byte character, in which case the final
4217                  *      byte is never mistakable for ASCII, and so the test
4218                  *      will say it is not a word character, which is the
4219                  *      correct answer. */
4220                 ln = (locinput != reginfo->strbeg) ?
4221                     UCHARAT(locinput - 1) : '\n';
4222                 switch (FLAGS(scan)) {
4223                     case REGEX_UNICODE_CHARSET:
4224                         ln = isWORDCHAR_L1(ln);
4225                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4226                         break;
4227                     case REGEX_LOCALE_CHARSET:
4228                         ln = isWORDCHAR_LC(ln);
4229                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4230                         break;
4231                     case REGEX_DEPENDS_CHARSET:
4232                         ln = isWORDCHAR(ln);
4233                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4234                         break;
4235                     case REGEX_ASCII_RESTRICTED_CHARSET:
4236                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4237                         ln = isWORDCHAR_A(ln);
4238                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4239                         break;
4240                     default:
4241                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4242                         break;
4243                 }
4244             }
4245             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4246              * regcomp.sym */
4247             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4248                     sayNO;
4249             break;
4250
4251         case ANYOF:  /*  /[abc]/       */
4252         case ANYOF_WARN_SUPER:
4253             if (NEXTCHR_IS_EOS)
4254                 sayNO;
4255             if (utf8_target) {
4256                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4257                     sayNO;
4258                 locinput += UTF8SKIP(locinput);
4259             }
4260             else {
4261                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4262                     sayNO;
4263                 locinput++;
4264             }
4265             break;
4266
4267         /* The argument (FLAGS) to all the POSIX node types is the class number
4268          * */
4269
4270         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4271             to_complement = 1;
4272             /* FALLTHROUGH */
4273
4274         case POSIXL:    /* \w or [:punct:] etc. under /l */
4275             if (NEXTCHR_IS_EOS)
4276                 sayNO;
4277
4278             /* The locale hasn't influenced the outcome before this, so defer
4279              * tainting until now */
4280             RX_MATCH_TAINTED_on(reginfo->prog);
4281
4282             /* Use isFOO_lc() for characters within Latin1.  (Note that
4283              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4284              * wouldn't be invariant) */
4285             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4286                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4287                     sayNO;
4288                 }
4289             }
4290             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4291                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4292                                         (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
4293                                                             *(locinput + 1))))))
4294                 {
4295                     sayNO;
4296                 }
4297             }
4298             else { /* Here, must be an above Latin-1 code point */
4299                 goto utf8_posix_not_eos;
4300             }
4301
4302             /* XXX Not necessarily utf8: the '! utf8_target' branch gets here too */
4303             locinput += UTF8SKIP(locinput);
4304             break;
4305
4306         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4307             to_complement = 1;
4308             /* FALLTHROUGH */
4309
4310         case POSIXD:    /* \w or [:punct:] etc. under /d */
4311             if (utf8_target) {
4312                 goto utf8_posix;
4313             }
4314             goto posixa;
4315
4316         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4317
4318             if (NEXTCHR_IS_EOS) {
4319                 sayNO;
4320             }
4321
4322             /* All UTF-8 variants match */
4323             if (! UTF8_IS_INVARIANT(nextchr)) {
4324                 goto increment_locinput;
4325             }
4326
4327             to_complement = 1;
4328             /* FALLTHROUGH */
4329
4330         case POSIXA:    /* \w or [:punct:] etc. under /a */
4331
4332           posixa:
4333             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4334              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4335              * character is a single byte */
4336
4337             if (NEXTCHR_IS_EOS
4338                 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4339                                                             FLAGS(scan)))))
4340             {
4341                 sayNO;
4342             }
4343
4344             /* Here we are either not in utf8, or we matched a utf8-invariant,
4345              * so the next char is the next byte */
4346             locinput++;
4347             break;
4348
4349         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4350             to_complement = 1;
4351             /* FALLTHROUGH */
4352
4353         case POSIXU:    /* \w or [:punct:] etc. under /u */
4354           utf8_posix:
4355             if (NEXTCHR_IS_EOS) {
4356                 sayNO;
4357             }
4358           utf8_posix_not_eos:
4359
4360             /* Use _generic_isCC() for characters within Latin1.  (Note that
4361              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4362              * wouldn't be invariant) */
4363             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4364                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4365                                                            FLAGS(scan)))))
4366                 {
4367                     sayNO;
4368                 }
4369                 locinput++;
4370             }
4371             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4372                 if (! (to_complement
4373                        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
4374                                                                *(locinput + 1)),
4375                                               FLAGS(scan)))))
4376                 {
4377                     sayNO;
4378                 }
4379                 locinput += 2;
4380             }
4381             else {  /* Handle above Latin-1 code points */
4382                 classnum = (_char_class_number) FLAGS(scan);
4383                 if (classnum < _FIRST_NON_SWASH_CC) {
4384
4385                     /* Here, uses a swash to find such code points.  Load it if
4386                      * not done already */
4387                     if (! PL_utf8_swash_ptrs[classnum]) {
4388                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4389                         PL_utf8_swash_ptrs[classnum]
4390                                 = _core_swash_init("utf8",
4391                                         swash_property_names[classnum],
4392                                         &PL_sv_undef, 1, 0, NULL, &flags);
4393                     }
4394                     if (! (to_complement
4395                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4396                                                (U8 *) locinput, TRUE))))
4397                     {
4398                         sayNO;
4399                     }
4400                 }
4401                 else {  /* Here, uses macros to find above Latin-1 code points */
4402                     switch (classnum) {
4403                         case _CC_ENUM_SPACE:    /* XXX would require separate
4404                                                    code if we revert the change
4405                                                    of \v matching this */
4406                         case _CC_ENUM_PSXSPC:
4407                             if (! (to_complement
4408                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
4409                             {
4410                                 sayNO;
4411                             }
4412                             break;
4413                         case _CC_ENUM_BLANK:
4414                             if (! (to_complement
4415                                             ^ cBOOL(is_HORIZWS_high(locinput))))
4416                             {
4417                                 sayNO;
4418                             }
4419                             break;
4420                         case _CC_ENUM_XDIGIT:
4421                             if (! (to_complement
4422                                             ^ cBOOL(is_XDIGIT_high(locinput))))
4423                             {
4424                                 sayNO;
4425                             }
4426                             break;
4427                         case _CC_ENUM_VERTSPACE:
4428                             if (! (to_complement
4429                                             ^ cBOOL(is_VERTWS_high(locinput))))
4430                             {
4431                                 sayNO;
4432                             }
4433                             break;
4434                         default:    /* The rest, e.g. [:cntrl:], can't match
4435                                        above Latin1 */
4436                             if (! to_complement) {
4437                                 sayNO;
4438                             }
4439                             break;
4440                     }
4441                 }
4442                 locinput += UTF8SKIP(locinput);
4443             }
4444             break;
4445
4446         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4447                        a Unicode extended Grapheme Cluster */
4448             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4449               extended Grapheme Cluster is:
4450
4451             CR LF
4452             | Prepend* Begin Extend*
4453             | .
4454
4455             Begin is:           ( Special_Begin | ! Control )
4456             Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4457             Extend is:          ( Grapheme_Extend | Spacing_Mark )
4458             Control is:         [ GCB_Control | CR | LF ]
4459             Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4460
4461                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4462                we can rewrite
4463
4464                    Begin is ( Regular_Begin + Special_Begin )
4465
4466                It turns out that 98.4% of all Unicode code points match
4467                Regular_Begin.  Doing it this way eliminates a table match in
4468                the previous implementation for almost all Unicode code points.
4469
4470                There is a subtlety with Prepend* which showed up in testing.
4471                Note that the Begin, and only the Begin is required in:
4472                 | Prepend* Begin Extend*
4473                Also, Begin contains '! Control'.  A Prepend must be a
4474                '!  Control', which means it must also be a Begin.  What it
4475                comes down to is that if we match Prepend* and then find no
4476                suitable Begin afterwards, we can backtrack over the last
4477                Prepend, and that one will be a suitable Begin.
4478             */
4479
4480             if (NEXTCHR_IS_EOS)
4481                 sayNO;
4482             if  (! utf8_target) {
4483
4484                 /* Match either CR LF  or '.', as all the other possibilities
4485                  * require utf8 */
4486                 locinput++;         /* Match the . or CR */
4487                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4488                                        match the LF */
4489                     && locinput < reginfo->strend
4490                     && UCHARAT(locinput) == '\n')
4491                 {
4492                     locinput++;
4493                 }
4494             }
4495             else {
4496
4497                 /* Utf8: See if is ( CR LF ); already know that locinput <
4498                  * reginfo->strend, so locinput+1 is in bounds */
4499                 if ( nextchr == '\r' && locinput+1 < reginfo->strend
4500                      && UCHARAT(locinput + 1) == '\n')
4501                 {
4502                     locinput += 2;
4503                 }
4504                 else {
4505                     STRLEN len;
4506
4507                     /* In case have to backtrack to beginning, then match '.' */
4508                     char *starting = locinput;
4509
4510                     /* In case have to backtrack the last prepend */
4511                     char *previous_prepend = NULL;
4512
4513                     LOAD_UTF8_CHARCLASS_GCB();
4514
4515                     /* Match (prepend)*   */
4516                     while (locinput < reginfo->strend
4517                            && (len = is_GCB_Prepend_utf8(locinput)))
4518                     {
4519                         previous_prepend = locinput;
4520                         locinput += len;
4521                     }
4522
4523                     /* As noted above, if we matched a prepend character, but
4524                      * the next thing won't match, back off the last prepend we
4525                      * matched, as it is guaranteed to match the begin */
4526                     if (previous_prepend
4527                         && (locinput >=  reginfo->strend
4528                             || (! swash_fetch(PL_utf8_X_regular_begin,
4529                                              (U8*)locinput, utf8_target)
4530                                  && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4531                         )
4532                     {
4533                         locinput = previous_prepend;
4534                     }
4535
4536                     /* Note that here we know reginfo->strend > locinput, as we
4537                      * tested that upon input to this switch case, and if we
4538                      * moved locinput forward, we tested the result just above
4539                      * and it either passed, or we backed off so that it will
4540                      * now pass */
4541                     if (swash_fetch(PL_utf8_X_regular_begin,
4542                                     (U8*)locinput, utf8_target)) {
4543                         locinput += UTF8SKIP(locinput);
4544                     }
4545                     else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4546
4547                         /* Here did not match the required 'Begin' in the
4548                          * second term.  So just match the very first
4549                          * character, the '.' of the final term of the regex */
4550                         locinput = starting + UTF8SKIP(starting);
4551                         goto exit_utf8;
4552                     } else {
4553
4554                         /* Here is a special begin.  It can be composed of
4555                          * several individual characters.  One possibility is
4556                          * RI+ */
4557                         if ((len = is_GCB_RI_utf8(locinput))) {
4558                             locinput += len;
4559                             while (locinput < reginfo->strend
4560                                    && (len = is_GCB_RI_utf8(locinput)))
4561                             {
4562                                 locinput += len;
4563                             }
4564                         } else if ((len = is_GCB_T_utf8(locinput))) {
4565                             /* Another possibility is T+ */
4566                             locinput += len;
4567                             while (locinput < reginfo->strend
4568                                 && (len = is_GCB_T_utf8(locinput)))
4569                             {
4570                                 locinput += len;
4571                             }
4572                         } else {
4573
4574                             /* Here, neither RI+ nor T+; must be some other
4575                              * Hangul.  That means it is one of the others: L,
4576                              * LV, LVT or V, and matches:
4577                              * L* (L | LVT T* | V  V* T* | LV  V* T*) */
4578
4579                             /* Match L*           */
4580                             while (locinput < reginfo->strend
4581                                    && (len = is_GCB_L_utf8(locinput)))
4582                             {
4583                                 locinput += len;
4584                             }
4585
4586                             /* Here, have exhausted L*.  If the next character
4587                              * is not an LV, LVT nor V, we must have matched at
4588                              * least one L; that matches L+ in the original
4589                              * equation, so we have a complete hangul syllable
4590                              * and are done. */
4591
4592                             if (locinput < reginfo->strend
4593                                 && is_GCB_LV_LVT_V_utf8(locinput))
4594                             {
4595                                 /* Otherwise keep going.  Must be LV, LVT or V.
4596                                  * See if LVT, by first ruling out V, then LV */
4597                                 if (! is_GCB_V_utf8(locinput)
4598                                         /* Only every TCount-th one is LV */
4599                                     && (valid_utf8_to_uvchr((U8 *) locinput,
4600                                                                          NULL)
4601                                                                         - SBASE)
4602                                         % TCount != 0)
4603                                 {
4604                                     locinput += UTF8SKIP(locinput);
4605                                 } else {
4606
4607                                     /* Must be  V or LV.  Take it, then match
4608                                      * V*     */
4609                                     locinput += UTF8SKIP(locinput);
4610                                     while (locinput < reginfo->strend
4611                                            && (len = is_GCB_V_utf8(locinput)))
4612                                     {
4613                                         locinput += len;
4614                                     }
4615                                 }
4616
4617                                 /* And any of LV, LVT, or V can be followed
4618                                  * by T*            */
4619                                 while (locinput < reginfo->strend
4620                                        && (len = is_GCB_T_utf8(locinput)))
4621                                 {
4622                                     locinput += len;
4623                                 }
4624                             }
4625                         }
4626                     }
4627
4628                     /* Match any extender */
4629                     while (locinput < reginfo->strend
4630                             && swash_fetch(PL_utf8_X_extend,
4631                                             (U8*)locinput, utf8_target))
4632                     {
4633                         locinput += UTF8SKIP(locinput);
4634                     }
4635                 }
4636             exit_utf8:
4637                 if (locinput > reginfo->strend) sayNO;
4638             }
4639             break;
4640
4641         case NREFFL:  /*  /\g{name}/il  */
4642         {   /* The capture buffer cases.  The ones beginning with N for the
4643                named buffers just convert to the equivalent numbered and
4644                pretend they were called as the corresponding numbered buffer
4645                op.  */
4646             /* don't initialize these in the declaration, it makes C++
4647                unhappy */
4648             const char *s;
4649             char type;
4650             re_fold_t folder;
4651             const U8 *fold_array;
4652             UV utf8_fold_flags;
4653
4654             RX_MATCH_TAINTED_on(reginfo->prog);
4655             folder = foldEQ_locale;
4656             fold_array = PL_fold_locale;
4657             type = REFFL;
4658             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4659             goto do_nref;
4660
4661         case NREFFA:  /*  /\g{name}/iaa  */
4662             folder = foldEQ_latin1;
4663             fold_array = PL_fold_latin1;
4664             type = REFFA;
4665             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4666             goto do_nref;
4667
4668         case NREFFU:  /*  /\g{name}/iu  */
4669             folder = foldEQ_latin1;
4670             fold_array = PL_fold_latin1;
4671             type = REFFU;
4672             utf8_fold_flags = 0;
4673             goto do_nref;
4674
4675         case NREFF:  /*  /\g{name}/i  */
4676             folder = foldEQ;
4677             fold_array = PL_fold;
4678             type = REFF;
4679             utf8_fold_flags = 0;
4680             goto do_nref;
4681
4682         case NREF:  /*  /\g{name}/   */
4683             type = REF;
4684             folder = NULL;
4685             fold_array = NULL;
4686             utf8_fold_flags = 0;
4687           do_nref:
4688
4689             /* For the named back references, find the corresponding buffer
4690              * number */
4691             n = reg_check_named_buff_matched(rex,scan);
4692
4693             if ( ! n ) {
4694                 sayNO;
4695             }
4696             goto do_nref_ref_common;
4697
4698         case REFFL:  /*  /\1/il  */
4699             RX_MATCH_TAINTED_on(reginfo->prog);
4700             folder = foldEQ_locale;
4701             fold_array = PL_fold_locale;
4702             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4703             goto do_ref;
4704
4705         case REFFA:  /*  /\1/iaa  */
4706             folder = foldEQ_latin1;
4707             fold_array = PL_fold_latin1;
4708             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4709             goto do_ref;
4710
4711         case REFFU:  /*  /\1/iu  */
4712             folder = foldEQ_latin1;
4713             fold_array = PL_fold_latin1;
4714             utf8_fold_flags = 0;
4715             goto do_ref;
4716
4717         case REFF:  /*  /\1/i  */
4718             folder = foldEQ;
4719             fold_array = PL_fold;
4720             utf8_fold_flags = 0;
4721             goto do_ref;
4722
4723         case REF:  /*  /\1/    */
4724             folder = NULL;
4725             fold_array = NULL;
4726             utf8_fold_flags = 0;
4727
4728           do_ref:
4729             type = OP(scan);
4730             n = ARG(scan);  /* which paren pair */
4731
4732           do_nref_ref_common:
4733             ln = rex->offs[n].start;
4734             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
4735             if (rex->lastparen < n || ln == -1)
4736                 sayNO;                  /* Do not match unless seen CLOSEn. */
4737             if (ln == rex->offs[n].end)
4738                 break;
4739
4740             s = reginfo->strbeg + ln;
4741             if (type != REF     /* REF can do byte comparison */
4742                 && (utf8_target || type == REFFU))
4743             { /* XXX handle REFFL better */
4744                 char * limit = reginfo->strend;
4745
4746                 /* This call case insensitively compares the entire buffer
4747                     * at s, with the current input starting at locinput, but
4748                     * not going off the end given by reginfo->strend, and
4749                     * returns in <limit> upon success, how much of the
4750                     * current input was matched */
4751                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4752                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4753                 {
4754                     sayNO;
4755                 }
4756                 locinput = limit;
4757                 break;
4758             }
4759
4760             /* Not utf8:  Inline the first character, for speed. */
4761             if (!NEXTCHR_IS_EOS &&
4762                 UCHARAT(s) != nextchr &&
4763                 (type == REF ||
4764                  UCHARAT(s) != fold_array[nextchr]))
4765                 sayNO;
4766             ln = rex->offs[n].end - ln;
4767             if (locinput + ln > reginfo->strend)
4768                 sayNO;
4769             if (ln > 1 && (type == REF
4770                            ? memNE(s, locinput, ln)
4771                            : ! folder(s, locinput, ln)))
4772                 sayNO;
4773             locinput += ln;
4774             break;
4775         }
4776
4777         case NOTHING: /* null op; e.g. the 'nothing' following
4778                        * the '*' in m{(a+|b)*}' */
4779             break;
4780         case TAIL: /* placeholder while compiling (A|B|C) */
4781             break;
4782
4783         case BACK: /* ??? doesn't appear to be used ??? */
4784             break;
4785
4786 #undef  ST
4787 #define ST st->u.eval
4788         {
4789             SV *ret;
4790             REGEXP *re_sv;
4791             regexp *re;
4792             regexp_internal *rei;
4793             regnode *startpoint;
4794
4795         case GOSTART: /*  (?R)  */
4796         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4797             if (cur_eval && cur_eval->locinput==locinput) {
4798                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4799                     Perl_croak(aTHX_ "Infinite recursion in regex");
4800                 if ( ++nochange_depth > max_nochange_depth )
4801                     Perl_croak(aTHX_
4802                         "Pattern subroutine nesting without pos change"
4803                         " exceeded limit in regex");
4804             } else {
4805                 nochange_depth = 0;
4806             }
4807             re_sv = rex_sv;
4808             re = rex;
4809             rei = rexi;
4810             if (OP(scan)==GOSUB) {
4811                 startpoint = scan + ARG2L(scan);
4812                 ST.close_paren = ARG(scan);
4813             } else {
4814                 startpoint = rei->program+1;
4815                 ST.close_paren = 0;
4816             }
4817             goto eval_recurse_doit;
4818             assert(0); /* NOTREACHED */
4819
4820         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4821             if (cur_eval && cur_eval->locinput==locinput) {
4822                 if ( ++nochange_depth > max_nochange_depth )
4823                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4824             } else {
4825                 nochange_depth = 0;
4826             }
4827             {
4828                 /* execute the code in the {...} */
4829
4830                 dSP;
4831                 IV before;
4832                 OP * const oop = PL_op;
4833                 COP * const ocurcop = PL_curcop;
4834                 OP *nop;
4835                 CV *newcv;
4836
4837                 /* save *all* paren positions */
4838                 regcppush(rex, 0, maxopenparen);
4839                 REGCP_SET(runops_cp);
4840
4841                 if (!caller_cv)
4842                     caller_cv = find_runcv(NULL);
4843
4844                 n = ARG(scan);
4845
4846                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4847                     newcv = (ReANY(
4848                                                 (REGEXP*)(rexi->data->data[n])
4849                                             ))->qr_anoncv
4850                                         ;
4851                     nop = (OP*)rexi->data->data[n+1];
4852                 }
4853                 else if (rexi->data->what[n] == 'l') { /* literal code */
4854                     newcv = caller_cv;
4855                     nop = (OP*)rexi->data->data[n];
4856                     assert(CvDEPTH(newcv));
4857                 }
4858                 else {
4859                     /* literal with own CV */
4860                     assert(rexi->data->what[n] == 'L');
4861                     newcv = rex->qr_anoncv;
4862                     nop = (OP*)rexi->data->data[n];
4863                 }
4864
4865                 /* normally if we're about to execute code from the same
4866                  * CV that we used previously, we just use the existing
4867                  * CX stack entry. However, it's possible that in the
4868                  * meantime we may have backtracked, popped from the save
4869                  * stack, and undone the SAVECOMPPAD(s) associated with
4870                  * PUSH_MULTICALL; in which case PL_comppad no longer
4871                  * points to newcv's pad. */
4872                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4873                 {
4874                     U8 flags = (CXp_SUB_RE |
4875                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
4876                     if (last_pushed_cv) {
4877                         CHANGE_MULTICALL_FLAGS(newcv, flags);
4878                     }
4879                     else {
4880                         PUSH_MULTICALL_FLAGS(newcv, flags);
4881                     }
4882                     last_pushed_cv = newcv;
4883                 }
4884                 else {
4885                     /* these assignments are just to silence compiler
4886                      * warnings */
4887                     multicall_cop = NULL;
4888                     newsp = NULL;
4889                 }
4890                 last_pad = PL_comppad;
4891
4892                 /* the initial nextstate you would normally execute
4893                  * at the start of an eval (which would cause error
4894                  * messages to come from the eval), may be optimised
4895                  * away from the execution path in the regex code blocks;
4896                  * so manually set PL_curcop to it initially */
4897                 {
4898                     OP *o = cUNOPx(nop)->op_first;
4899                     assert(o->op_type == OP_NULL);
4900                     if (o->op_targ == OP_SCOPE) {
4901                         o = cUNOPo->op_first;
4902                     }
4903                     else {
4904                         assert(o->op_targ == OP_LEAVE);
4905                         o = cUNOPo->op_first;
4906                         assert(o->op_type == OP_ENTER);
4907                         o = o->op_sibling;
4908                     }
4909
4910                     if (o->op_type != OP_STUB) {
4911                         assert(    o->op_type == OP_NEXTSTATE
4912                                 || o->op_type == OP_DBSTATE
4913                                 || (o->op_type == OP_NULL
4914                                     &&  (  o->op_targ == OP_NEXTSTATE
4915                                         || o->op_targ == OP_DBSTATE
4916                                         )
4917                                     )
4918                         );
4919                         PL_curcop = (COP*)o;
4920                     }
4921                 }
4922                 nop = nop->op_next;
4923
4924                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4925                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4926
4927                 rex->offs[0].end = locinput - reginfo->strbeg;
4928                 if (reginfo->info_aux_eval->pos_magic)
4929                         reginfo->info_aux_eval->pos_magic->mg_len
4930                                         = locinput - reginfo->strbeg;
4931
4932                 if (sv_yes_mark) {
4933                     SV *sv_mrk = get_sv("REGMARK", 1);
4934                     sv_setsv(sv_mrk, sv_yes_mark);
4935                 }
4936
4937                 /* we don't use MULTICALL here as we want to call the
4938                  * first op of the block of interest, rather than the
4939                  * first op of the sub */
4940                 before = (IV)(SP-PL_stack_base);
4941                 PL_op = nop;
4942                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4943                 SPAGAIN;
4944                 if ((IV)(SP-PL_stack_base) == before)
4945                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4946                 else {
4947                     ret = POPs;
4948                     PUTBACK;
4949                 }
4950
4951                 /* before restoring everything, evaluate the returned
4952                  * value, so that 'uninit' warnings don't use the wrong
4953                  * PL_op or pad. Also need to process any magic vars
4954                  * (e.g. $1) *before* parentheses are restored */
4955
4956                 PL_op = NULL;
4957
4958                 re_sv = NULL;
4959                 if (logical == 0)        /*   (?{})/   */
4960                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4961                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4962                     sw = cBOOL(SvTRUE(ret));
4963                     logical = 0;
4964                 }
4965                 else {                   /*  /(??{})  */
4966                     /*  if it's overloaded, let the regex compiler handle
4967                      *  it; otherwise extract regex, or stringify  */
4968                     if (!SvAMAGIC(ret)) {
4969                         SV *sv = ret;
4970                         if (SvROK(sv))
4971                             sv = SvRV(sv);
4972                         if (SvTYPE(sv) == SVt_REGEXP)
4973                             re_sv = (REGEXP*) sv;
4974                         else if (SvSMAGICAL(sv)) {
4975                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4976                             if (mg)
4977                                 re_sv = (REGEXP *) mg->mg_obj;
4978                         }
4979
4980                         /* force any magic, undef warnings here */
4981                         if (!re_sv) {
4982                             ret = sv_mortalcopy(ret);
4983                             (void) SvPV_force_nolen(ret);
4984                         }
4985                     }
4986
4987                 }
4988
4989                 /* *** Note that at this point we don't restore
4990                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4991                  * be used again soon. This is safe as long as nothing
4992                  * in the regexp code uses the pad ! */
4993                 PL_op = oop;
4994                 PL_curcop = ocurcop;
4995                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
4996
4997                 if (logical != 2)
4998                     break;
4999             }
5000
5001                 /* only /(??{})/  from now on */
5002                 logical = 0;
5003                 {
5004                     /* extract RE object from returned value; compiling if
5005                      * necessary */
5006
5007                     if (re_sv) {
5008                         re_sv = reg_temp_copy(NULL, re_sv);
5009                     }
5010                     else {
5011                         U32 pm_flags = 0;
5012
5013                         if (SvUTF8(ret) && IN_BYTES) {
5014                             /* In use 'bytes': make a copy of the octet
5015                              * sequence, but without the flag on */
5016                             STRLEN len;
5017                             const char *const p = SvPV(ret, len);
5018                             ret = newSVpvn_flags(p, len, SVs_TEMP);
5019                         }
5020                         if (rex->intflags & PREGf_USE_RE_EVAL)
5021                             pm_flags |= PMf_USE_RE_EVAL;
5022
5023                         /* if we got here, it should be an engine which
5024                          * supports compiling code blocks and stuff */
5025                         assert(rex->engine && rex->engine->op_comp);
5026                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5027                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5028                                     rex->engine, NULL, NULL,
5029                                     /* copy /msix etc to inner pattern */
5030                                     scan->flags,
5031                                     pm_flags);
5032
5033                         if (!(SvFLAGS(ret)
5034                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5035                                  | SVs_GMG))) {
5036                             /* This isn't a first class regexp. Instead, it's
5037                                caching a regexp onto an existing, Perl visible
5038                                scalar.  */
5039                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5040                         }
5041                         /* safe to do now that any $1 etc has been
5042                          * interpolated into the new pattern string and
5043                          * compiled */
5044                         S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5045                     }
5046                     SAVEFREESV(re_sv);
5047                     re = ReANY(re_sv);
5048                 }
5049                 RXp_MATCH_COPIED_off(re);
5050                 re->subbeg = rex->subbeg;
5051                 re->sublen = rex->sublen;
5052                 re->suboffset = rex->suboffset;
5053                 re->subcoffset = rex->subcoffset;
5054                 rei = RXi_GET(re);
5055                 DEBUG_EXECUTE_r(
5056                     debug_start_match(re_sv, utf8_target, locinput,
5057                                     reginfo->strend, "Matching embedded");
5058                 );
5059                 startpoint = rei->program + 1;
5060                 ST.close_paren = 0; /* only used for GOSUB */
5061
5062         eval_recurse_doit: /* Share code with GOSUB below this line */
5063                 /* run the pattern returned from (??{...}) */
5064
5065                 /* Save *all* the positions. */
5066                 ST.cp = regcppush(rex, 0, maxopenparen);
5067                 REGCP_SET(ST.lastcp);
5068
5069                 re->lastparen = 0;
5070                 re->lastcloseparen = 0;
5071
5072                 maxopenparen = 0;
5073
5074                 /* invalidate the S-L poscache. We're now executing a
5075                  * different set of WHILEM ops (and their associated
5076                  * indexes) against the same string, so the bits in the
5077                  * cache are meaningless. Setting maxiter to zero forces
5078                  * the cache to be invalidated and zeroed before reuse.
5079                  * XXX This is too dramatic a measure. Ideally we should
5080                  * save the old cache and restore when running the outer
5081                  * pattern again */
5082                 reginfo->poscache_maxiter = 0;
5083
5084                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5085
5086                 ST.prev_rex = rex_sv;
5087                 ST.prev_curlyx = cur_curlyx;
5088                 rex_sv = re_sv;
5089                 SET_reg_curpm(rex_sv);
5090                 rex = re;
5091                 rexi = rei;
5092                 cur_curlyx = NULL;
5093                 ST.B = next;
5094                 ST.prev_eval = cur_eval;
5095                 cur_eval = st;
5096                 /* now continue from first node in postponed RE */
5097                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5098                 assert(0); /* NOTREACHED */
5099         }
5100
5101         case EVAL_AB: /* cleanup after a successful (??{A})B */
5102             /* note: this is called twice; first after popping B, then A */
5103             rex_sv = ST.prev_rex;
5104             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5105             SET_reg_curpm(rex_sv);
5106             rex = ReANY(rex_sv);
5107             rexi = RXi_GET(rex);
5108             regcpblow(ST.cp);
5109             cur_eval = ST.prev_eval;
5110             cur_curlyx = ST.prev_curlyx;
5111
5112             /* Invalidate cache. See "invalidate" comment above. */
5113             reginfo->poscache_maxiter = 0;
5114             if ( nochange_depth )
5115                 nochange_depth--;
5116             sayYES;
5117
5118
5119         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5120             /* note: this is called twice; first after popping B, then A */
5121             rex_sv = ST.prev_rex;
5122             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5123             SET_reg_curpm(rex_sv);
5124             rex = ReANY(rex_sv);
5125             rexi = RXi_GET(rex);
5126
5127             REGCP_UNWIND(ST.lastcp);
5128             regcppop(rex, &maxopenparen);
5129             cur_eval = ST.prev_eval;
5130             cur_curlyx = ST.prev_curlyx;
5131             /* Invalidate cache. See "invalidate" comment above. */
5132             reginfo->poscache_maxiter = 0;
5133             if ( nochange_depth )
5134                 nochange_depth--;
5135             sayNO_SILENT;
5136 #undef ST
5137
5138         case OPEN: /*  (  */
5139             n = ARG(scan);  /* which paren pair */
5140             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5141             if (n > maxopenparen)
5142                 maxopenparen = n;
5143             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5144                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5145                 PTR2UV(rex),
5146                 PTR2UV(rex->offs),
5147                 (UV)n,
5148                 (IV)rex->offs[n].start_tmp,
5149                 (UV)maxopenparen
5150             ));
5151             lastopen = n;
5152             break;
5153
5154 /* XXX really need to log other places start/end are set too */
5155 #define CLOSE_CAPTURE \
5156     rex->offs[n].start = rex->offs[n].start_tmp; \
5157     rex->offs[n].end = locinput - reginfo->strbeg; \
5158     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5159         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5160         PTR2UV(rex), \
5161         PTR2UV(rex->offs), \
5162         (UV)n, \
5163         (IV)rex->offs[n].start, \
5164         (IV)rex->offs[n].end \
5165     ))
5166
5167         case CLOSE:  /*  )  */
5168             n = ARG(scan);  /* which paren pair */
5169             CLOSE_CAPTURE;
5170             if (n > rex->lastparen)
5171                 rex->lastparen = n;
5172             rex->lastcloseparen = n;
5173             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5174                 goto fake_end;
5175             }
5176             break;
5177
5178         case ACCEPT:  /*  (*ACCEPT)  */
5179             if (ARG(scan)){
5180                 regnode *cursor;
5181                 for (cursor=scan;
5182                      cursor && OP(cursor)!=END;
5183                      cursor=regnext(cursor))
5184                 {
5185                     if ( OP(cursor)==CLOSE ){
5186                         n = ARG(cursor);
5187                         if ( n <= lastopen ) {
5188                             CLOSE_CAPTURE;
5189                             if (n > rex->lastparen)
5190                                 rex->lastparen = n;
5191                             rex->lastcloseparen = n;
5192                             if ( n == ARG(scan) || (cur_eval &&
5193                                 cur_eval->u.eval.close_paren == n))
5194                                 break;
5195                         }
5196                     }
5197                 }
5198             }
5199             goto fake_end;
5200             /*NOTREACHED*/
5201
5202         case GROUPP:  /*  (?(1))  */
5203             n = ARG(scan);  /* which paren pair */
5204             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5205             break;
5206
5207         case NGROUPP:  /*  (?(<name>))  */
5208             /* reg_check_named_buff_matched returns 0 for no match */
5209             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5210             break;
5211
5212         case INSUBP:   /*  (?(R))  */
5213             n = ARG(scan);
5214             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5215             break;
5216
5217         case DEFINEP:  /*  (?(DEFINE))  */
5218             sw = 0;
5219             break;
5220
5221         case IFTHEN:   /*  (?(cond)A|B)  */
5222             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5223             if (sw)
5224                 next = NEXTOPER(NEXTOPER(scan));
5225             else {
5226                 next = scan + ARG(scan);
5227                 if (OP(next) == IFTHEN) /* Fake one. */
5228                     next = NEXTOPER(NEXTOPER(next));
5229             }
5230             break;
5231
5232         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5233             logical = scan->flags;
5234             break;
5235
5236 /*******************************************************************
5237
5238 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5239 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5240 STAR/PLUS/CURLY/CURLYN are used instead.)
5241
5242 A*B is compiled as <CURLYX><A><WHILEM><B>
5243
5244 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5245 state, which contains the current count, initialised to -1. It also sets
5246 cur_curlyx to point to this state, with any previous value saved in the
5247 state block.
5248
5249 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5250 since the pattern may possibly match zero times (i.e. it's a while {} loop
5251 rather than a do {} while loop).
5252
5253 Each entry to WHILEM represents a successful match of A. The count in the
5254 CURLYX block is incremented, another WHILEM state is pushed, and execution
5255 passes to A or B depending on greediness and the current count.
5256
5257 For example, if matching against the string a1a2a3b (where the aN are
5258 substrings that match /A/), then the match progresses as follows: (the
5259 pushed states are interspersed with the bits of strings matched so far):
5260
5261     <CURLYX cnt=-1>
5262     <CURLYX cnt=0><WHILEM>
5263     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5264     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5265     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5266     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5267
5268 (Contrast this with something like CURLYM, which maintains only a single
5269 backtrack state:
5270
5271     <CURLYM cnt=0> a1
5272     a1 <CURLYM cnt=1> a2
5273     a1 a2 <CURLYM cnt=2> a3
5274     a1 a2 a3 <CURLYM cnt=3> b
5275 )
5276
5277 Each WHILEM state block marks a point to backtrack to upon partial failure
5278 of A or B, and also contains some minor state data related to that
5279 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5280 overall state, such as the count, and pointers to the A and B ops.
5281
5282 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5283 must always point to the *current* CURLYX block, the rules are:
5284
5285 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5286 and set cur_curlyx to point to the new block.
5287
5288 When popping the CURLYX block after a successful or unsuccessful match,
5289 restore the previous cur_curlyx.
5290
5291 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5292 to the outer one saved in the CURLYX block.
5293
5294 When popping the WHILEM block after a successful or unsuccessful B match,
5295 restore the previous cur_curlyx.
5296
5297 Here's an example for the pattern (AI* BI)*BO
5298 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5299
5300 cur_
5301 curlyx backtrack stack
5302 ------ ---------------
5303 NULL
5304 CO     <CO prev=NULL> <WO>
5305 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5306 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5307 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5308
5309 At this point the pattern succeeds, and we work back down the stack to
5310 clean up, restoring as we go:
5311
5312 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5313 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5314 CO     <CO prev=NULL> <WO>
5315 NULL
5316
5317 *******************************************************************/
5318
5319 #define ST st->u.curlyx
5320
5321         case CURLYX:    /* start of /A*B/  (for complex A) */
5322         {
5323             /* No need to save/restore up to this paren */
5324             I32 parenfloor = scan->flags;
5325
5326             assert(next); /* keep Coverity happy */
5327             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5328                 next += ARG(next);
5329
5330             /* XXXX Probably it is better to teach regpush to support
5331                parenfloor > maxopenparen ... */
5332             if (parenfloor > (I32)rex->lastparen)
5333                 parenfloor = rex->lastparen; /* Pessimization... */
5334
5335             ST.prev_curlyx= cur_curlyx;
5336             cur_curlyx = st;
5337             ST.cp = PL_savestack_ix;
5338
5339             /* these fields contain the state of the current curly.
5340              * they are accessed by subsequent WHILEMs */
5341             ST.parenfloor = parenfloor;
5342             ST.me = scan;
5343             ST.B = next;
5344             ST.minmod = minmod;
5345             minmod = 0;
5346             ST.count = -1;      /* this will be updated by WHILEM */
5347             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5348
5349             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5350             assert(0); /* NOTREACHED */
5351         }
5352
5353         case CURLYX_end: /* just finished matching all of A*B */
5354             cur_curlyx = ST.prev_curlyx;
5355             sayYES;
5356             assert(0); /* NOTREACHED */
5357
5358         case CURLYX_end_fail: /* just failed to match all of A*B */
5359             regcpblow(ST.cp);
5360             cur_curlyx = ST.prev_curlyx;
5361             sayNO;
5362             assert(0); /* NOTREACHED */
5363
5364
5365 #undef ST
5366 #define ST st->u.whilem
5367
5368         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5369         {
5370             /* see the discussion above about CURLYX/WHILEM */
5371             I32 n;
5372             int min = ARG1(cur_curlyx->u.curlyx.me);
5373             int max = ARG2(cur_curlyx->u.curlyx.me);
5374             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5375
5376             assert(cur_curlyx); /* keep Coverity happy */
5377             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5378             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5379             ST.cache_offset = 0;
5380             ST.cache_mask = 0;
5381
5382
5383             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5384                   "%*s  whilem: matched %ld out of %d..%d\n",
5385                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5386             );
5387
5388             /* First just match a string of min A's. */
5389
5390             if (n < min) {
5391                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5392                                     maxopenparen);
5393                 cur_curlyx->u.curlyx.lastloc = locinput;
5394                 REGCP_SET(ST.lastcp);
5395
5396                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5397                 assert(0); /* NOTREACHED */
5398             }
5399
5400             /* If degenerate A matches "", assume A done. */
5401
5402             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5403                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5404                    "%*s  whilem: empty match detected, trying continuation...\n",
5405                    REPORT_CODE_OFF+depth*2, "")
5406                 );
5407                 goto do_whilem_B_max;
5408             }
5409
5410             /* super-linear cache processing.
5411              *
5412              * The idea here is that for certain types of CURLYX/WHILEM -
5413              * principally those whose upper bound is infinity (and
5414              * excluding regexes that have things like \1 and other very
5415              * non-regular expressiony things), then if a pattern like
5416              * /....A*.../ fails and we backtrack to the WHILEM, then we
5417              * make a note that this particular WHILEM op was at string
5418              * position 47 (say) when the rest of pattern failed. Then, if
5419              * we ever find ourselves back at that WHILEM, and at string
5420              * position 47 again, we can just fail immediately rather than
5421              * running the rest of the pattern again.
5422              *
5423              * This is very handy when patterns start to go
5424              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5425              * with a combinatorial explosion of backtracking.
5426              *
5427              * The cache is implemented as a bit array, with one bit per
5428              * string byte position per WHILEM op (up to 16) - so it's
5429              * between 0.25 and 2x the string size.
5430              *
5431              * To avoid allocating a poscache buffer every time, we do an
5432              * initial countdown; only after we have executed a WHILEM
5433              * op (string-length x #WHILEMs) times do we allocate the
5434              * cache.
5435              *
5436              * The top 4 bits of scan->flags byte say how many different
5437              * relevant CURLYX/WHILEM op pairs there are, while the
5438              * bottom 4 bits are the identifying index number of this
5439              * WHILEM.
5440              */
5441
5442             if (scan->flags) {
5443
5444                 if (!reginfo->poscache_maxiter) {
5445                     /* start the countdown: Postpone detection until we
5446                      * know the match is not *that* much linear. */
5447                     reginfo->poscache_maxiter
5448                         =    (reginfo->strend - reginfo->strbeg + 1)
5449                            * (scan->flags>>4);
5450                     /* possible overflow for long strings and many CURLYX's */
5451                     if (reginfo->poscache_maxiter < 0)
5452                         reginfo->poscache_maxiter = I32_MAX;
5453                     reginfo->poscache_iter = reginfo->poscache_maxiter;
5454                 }
5455
5456                 if (reginfo->poscache_iter-- == 0) {
5457                     /* initialise cache */
5458                     const I32 size = (reginfo->poscache_maxiter + 7)/8;
5459                     regmatch_info_aux *const aux = reginfo->info_aux;
5460                     if (aux->poscache) {
5461                         if ((I32)reginfo->poscache_size < size) {
5462                             Renew(aux->poscache, size, char);
5463                             reginfo->poscache_size = size;
5464                         }
5465                         Zero(aux->poscache, size, char);
5466                     }
5467                     else {
5468                         reginfo->poscache_size = size;
5469                         Newxz(aux->poscache, size, char);
5470                     }
5471                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5472       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5473                               PL_colors[4], PL_colors[5])
5474                     );
5475                 }
5476
5477                 if (reginfo->poscache_iter < 0) {
5478                     /* have we already failed at this position? */
5479                     I32 offset, mask;
5480
5481                     reginfo->poscache_iter = -1; /* stop eventual underflow */
5482                     offset  = (scan->flags & 0xf) - 1
5483                                 +   (locinput - reginfo->strbeg)
5484                                   * (scan->flags>>4);
5485                     mask    = 1 << (offset % 8);
5486                     offset /= 8;
5487                     if (reginfo->info_aux->poscache[offset] & mask) {
5488                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5489                             "%*s  whilem: (cache) already tried at this position...\n",
5490                             REPORT_CODE_OFF+depth*2, "")
5491                         );
5492                         sayNO; /* cache records failure */
5493                     }
5494                     ST.cache_offset = offset;
5495                     ST.cache_mask   = mask;
5496                 }
5497             }
5498
5499             /* Prefer B over A for minimal matching. */
5500
5501             if (cur_curlyx->u.curlyx.minmod) {
5502                 ST.save_curlyx = cur_curlyx;
5503                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5504                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5505                             maxopenparen);
5506                 REGCP_SET(ST.lastcp);
5507                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5508                                     locinput);
5509                 assert(0); /* NOTREACHED */
5510             }
5511
5512             /* Prefer A over B for maximal matching. */
5513
5514             if (n < max) { /* More greed allowed? */
5515                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5516                             maxopenparen);
5517                 cur_curlyx->u.curlyx.lastloc = locinput;
5518                 REGCP_SET(ST.lastcp);
5519                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5520                 assert(0); /* NOTREACHED */
5521             }
5522             goto do_whilem_B_max;
5523         }
5524         assert(0); /* NOTREACHED */
5525
5526         case WHILEM_B_min: /* just matched B in a minimal match */
5527         case WHILEM_B_max: /* just matched B in a maximal match */
5528             cur_curlyx = ST.save_curlyx;
5529             sayYES;
5530             assert(0); /* NOTREACHED */
5531
5532         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5533             cur_curlyx = ST.save_curlyx;
5534             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5535             cur_curlyx->u.curlyx.count--;
5536             CACHEsayNO;
5537             assert(0); /* NOTREACHED */
5538
5539         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5540             /* FALL THROUGH */
5541         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5542             REGCP_UNWIND(ST.lastcp);
5543             regcppop(rex, &maxopenparen);
5544             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5545             cur_curlyx->u.curlyx.count--;
5546             CACHEsayNO;
5547             assert(0); /* NOTREACHED */
5548
5549         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5550             REGCP_UNWIND(ST.lastcp);
5551             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5552             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5553                 "%*s  whilem: failed, trying continuation...\n",
5554                 REPORT_CODE_OFF+depth*2, "")
5555             );
5556           do_whilem_B_max:
5557             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5558                 && ckWARN(WARN_REGEXP)
5559                 && !reginfo->warned)
5560             {
5561                 reginfo->warned = TRUE;
5562                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5563                      "Complex regular subexpression recursion limit (%d) "
5564                      "exceeded",
5565                      REG_INFTY - 1);
5566             }
5567
5568             /* now try B */
5569             ST.save_curlyx = cur_curlyx;
5570             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5571             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5572                                 locinput);
5573             assert(0); /* NOTREACHED */
5574
5575         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5576             cur_curlyx = ST.save_curlyx;
5577             REGCP_UNWIND(ST.lastcp);
5578             regcppop(rex, &maxopenparen);
5579
5580             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5581                 /* Maximum greed exceeded */
5582                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5583                     && ckWARN(WARN_REGEXP)
5584                     && !reginfo->warned)
5585                 {
5586                     reginfo->warned     = TRUE;
5587                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5588                         "Complex regular subexpression recursion "
5589                         "limit (%d) exceeded",
5590                         REG_INFTY - 1);
5591                 }
5592                 cur_curlyx->u.curlyx.count--;
5593                 CACHEsayNO;
5594             }
5595
5596             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5597                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5598             );
5599             /* Try grabbing another A and see if it helps. */
5600             cur_curlyx->u.curlyx.lastloc = locinput;
5601             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5602                             maxopenparen);
5603             REGCP_SET(ST.lastcp);
5604             PUSH_STATE_GOTO(WHILEM_A_min,
5605                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5606                 locinput);
5607             assert(0); /* NOTREACHED */
5608
5609 #undef  ST
5610 #define ST st->u.branch
5611
5612         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5613             next = scan + ARG(scan);
5614             if (next == scan)
5615                 next = NULL;
5616             scan = NEXTOPER(scan);
5617             /* FALL THROUGH */
5618
5619         case BRANCH:        /*  /(...|A|...)/ */
5620             scan = NEXTOPER(scan); /* scan now points to inner node */
5621             ST.lastparen = rex->lastparen;
5622             ST.lastcloseparen = rex->lastcloseparen;
5623             ST.next_branch = next;
5624             REGCP_SET(ST.cp);
5625
5626             /* Now go into the branch */
5627             if (has_cutgroup) {
5628                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5629             } else {
5630                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5631             }
5632             assert(0); /* NOTREACHED */
5633
5634         case CUTGROUP:  /*  /(*THEN)/  */
5635             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5636                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5637             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5638             assert(0); /* NOTREACHED */
5639
5640         case CUTGROUP_next_fail:
5641             do_cutgroup = 1;
5642             no_final = 1;
5643             if (st->u.mark.mark_name)
5644                 sv_commit = st->u.mark.mark_name;
5645             sayNO;
5646             assert(0); /* NOTREACHED */
5647
5648         case BRANCH_next:
5649             sayYES;
5650             assert(0); /* NOTREACHED */
5651
5652         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5653             if (do_cutgroup) {
5654                 do_cutgroup = 0;
5655                 no_final = 0;
5656             }
5657             REGCP_UNWIND(ST.cp);
5658             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5659             scan = ST.next_branch;
5660             /* no more branches? */
5661             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5662                 DEBUG_EXECUTE_r({
5663                     PerlIO_printf( Perl_debug_log,
5664                         "%*s  %sBRANCH failed...%s\n",
5665                         REPORT_CODE_OFF+depth*2, "",
5666                         PL_colors[4],
5667                         PL_colors[5] );
5668                 });
5669                 sayNO_SILENT;
5670             }
5671             continue; /* execute next BRANCH[J] op */
5672             assert(0); /* NOTREACHED */
5673
5674         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5675             minmod = 1;
5676             break;
5677
5678 #undef  ST
5679 #define ST st->u.curlym
5680
5681         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5682
5683             /* This is an optimisation of CURLYX that enables us to push
5684              * only a single backtracking state, no matter how many matches
5685              * there are in {m,n}. It relies on the pattern being constant
5686              * length, with no parens to influence future backrefs
5687              */
5688
5689             ST.me = scan;
5690             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5691
5692             ST.lastparen      = rex->lastparen;
5693             ST.lastcloseparen = rex->lastcloseparen;
5694
5695             /* if paren positive, emulate an OPEN/CLOSE around A */
5696             if (ST.me->flags) {
5697                 U32 paren = ST.me->flags;
5698                 if (paren > maxopenparen)
5699                     maxopenparen = paren;
5700                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5701             }
5702             ST.A = scan;
5703             ST.B = next;
5704             ST.alen = 0;
5705             ST.count = 0;
5706             ST.minmod = minmod;
5707             minmod = 0;
5708             ST.c1 = CHRTEST_UNINIT;
5709             REGCP_SET(ST.cp);
5710
5711             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5712                 goto curlym_do_B;
5713
5714           curlym_do_A: /* execute the A in /A{m,n}B/  */
5715             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5716             assert(0); /* NOTREACHED */
5717
5718         case CURLYM_A: /* we've just matched an A */
5719             ST.count++;
5720             /* after first match, determine A's length: u.curlym.alen */
5721             if (ST.count == 1) {
5722                 if (reginfo->is_utf8_target) {
5723                     char *s = st->locinput;
5724                     while (s < locinput) {
5725                         ST.alen++;
5726                         s += UTF8SKIP(s);
5727                     }
5728                 }
5729                 else {
5730                     ST.alen = locinput - st->locinput;
5731                 }
5732                 if (ST.alen == 0)
5733                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5734             }
5735             DEBUG_EXECUTE_r(
5736                 PerlIO_printf(Perl_debug_log,
5737                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5738                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5739                           (IV) ST.count, (IV)ST.alen)
5740             );
5741
5742             if (cur_eval && cur_eval->u.eval.close_paren &&
5743                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5744                 goto fake_end;
5745
5746             {
5747                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5748                 if ( max == REG_INFTY || ST.count < max )
5749                     goto curlym_do_A; /* try to match another A */
5750             }
5751             goto curlym_do_B; /* try to match B */
5752
5753         case CURLYM_A_fail: /* just failed to match an A */
5754             REGCP_UNWIND(ST.cp);
5755
5756             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5757                 || (cur_eval && cur_eval->u.eval.close_paren &&
5758                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5759                 sayNO;
5760
5761           curlym_do_B: /* execute the B in /A{m,n}B/  */
5762             if (ST.c1 == CHRTEST_UNINIT) {
5763                 /* calculate c1 and c2 for possible match of 1st char
5764                  * following curly */
5765                 ST.c1 = ST.c2 = CHRTEST_VOID;
5766                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5767                     regnode *text_node = ST.B;
5768                     if (! HAS_TEXT(text_node))
5769                         FIND_NEXT_IMPT(text_node);
5770                     /* this used to be
5771
5772                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5773
5774                         But the former is redundant in light of the latter.
5775
5776                         If this changes back, then the macros for
5777                         IS_TEXT and friends need to change.
5778                      */
5779                     if (PL_regkind[OP(text_node)] == EXACT) {
5780                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5781                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5782                            reginfo))
5783                         {
5784                             sayNO;
5785                         }
5786                     }
5787                 }
5788             }
5789
5790             DEBUG_EXECUTE_r(
5791                 PerlIO_printf(Perl_debug_log,
5792                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5793                     (int)(REPORT_CODE_OFF+(depth*2)),
5794                     "", (IV)ST.count)
5795                 );
5796             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5797                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5798                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5799                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5800                     {
5801                         /* simulate B failing */
5802                         DEBUG_OPTIMISE_r(
5803                             PerlIO_printf(Perl_debug_log,
5804                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5805                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5806                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5807                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5808                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5809                         );
5810                         state_num = CURLYM_B_fail;
5811                         goto reenter_switch;
5812                     }
5813                 }
5814                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5815                     /* simulate B failing */
5816                     DEBUG_OPTIMISE_r(
5817                         PerlIO_printf(Perl_debug_log,
5818                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5819                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5820                             (int) nextchr, ST.c1, ST.c2)
5821                     );
5822                     state_num = CURLYM_B_fail;
5823                     goto reenter_switch;
5824                 }
5825             }
5826
5827             if (ST.me->flags) {
5828                 /* emulate CLOSE: mark current A as captured */
5829                 I32 paren = ST.me->flags;
5830                 if (ST.count) {
5831                     rex->offs[paren].start
5832                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
5833                     rex->offs[paren].end = locinput - reginfo->strbeg;
5834                     if ((U32)paren > rex->lastparen)
5835                         rex->lastparen = paren;
5836                     rex->lastcloseparen = paren;
5837                 }
5838                 else
5839                     rex->offs[paren].end = -1;
5840                 if (cur_eval && cur_eval->u.eval.close_paren &&
5841                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5842                 {
5843                     if (ST.count)
5844                         goto fake_end;
5845                     else
5846                         sayNO;
5847                 }
5848             }
5849
5850             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5851             assert(0); /* NOTREACHED */
5852
5853         case CURLYM_B_fail: /* just failed to match a B */
5854             REGCP_UNWIND(ST.cp);
5855             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5856             if (ST.minmod) {
5857                 I32 max = ARG2(ST.me);
5858                 if (max != REG_INFTY && ST.count == max)
5859                     sayNO;
5860                 goto curlym_do_A; /* try to match a further A */
5861             }
5862             /* backtrack one A */
5863             if (ST.count == ARG1(ST.me) /* min */)
5864                 sayNO;
5865             ST.count--;
5866             SET_locinput(HOPc(locinput, -ST.alen));
5867             goto curlym_do_B; /* try to match B */
5868
5869 #undef ST
5870 #define ST st->u.curly
5871
5872 #define CURLY_SETPAREN(paren, success) \
5873     if (paren) { \
5874         if (success) { \
5875             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
5876             rex->offs[paren].end = locinput - reginfo->strbeg; \
5877             if (paren > rex->lastparen) \
5878                 rex->lastparen = paren; \
5879             rex->lastcloseparen = paren; \
5880         } \
5881         else { \
5882             rex->offs[paren].end = -1; \
5883             rex->lastparen      = ST.lastparen; \
5884             rex->lastcloseparen = ST.lastcloseparen; \
5885         } \
5886     }
5887
5888         case STAR:              /*  /A*B/ where A is width 1 char */
5889             ST.paren = 0;
5890             ST.min = 0;
5891             ST.max = REG_INFTY;
5892             scan = NEXTOPER(scan);
5893             goto repeat;
5894
5895         case PLUS:              /*  /A+B/ where A is width 1 char */
5896             ST.paren = 0;
5897             ST.min = 1;
5898             ST.max = REG_INFTY;
5899             scan = NEXTOPER(scan);
5900             goto repeat;
5901
5902         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5903             ST.paren = scan->flags;     /* Which paren to set */
5904             ST.lastparen      = rex->lastparen;
5905             ST.lastcloseparen = rex->lastcloseparen;
5906             if (ST.paren > maxopenparen)
5907                 maxopenparen = ST.paren;
5908             ST.min = ARG1(scan);  /* min to match */
5909             ST.max = ARG2(scan);  /* max to match */
5910             if (cur_eval && cur_eval->u.eval.close_paren &&
5911                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5912                 ST.min=1;
5913                 ST.max=1;
5914             }
5915             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5916             goto repeat;
5917
5918         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5919             ST.paren = 0;
5920             ST.min = ARG1(scan);  /* min to match */
5921             ST.max = ARG2(scan);  /* max to match */
5922             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5923           repeat:
5924             /*
5925             * Lookahead to avoid useless match attempts
5926             * when we know what character comes next.
5927             *
5928             * Used to only do .*x and .*?x, but now it allows
5929             * for )'s, ('s and (?{ ... })'s to be in the way
5930             * of the quantifier and the EXACT-like node.  -- japhy
5931             */
5932
5933             assert(ST.min <= ST.max);
5934             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5935                 ST.c1 = ST.c2 = CHRTEST_VOID;
5936             }
5937             else {
5938                 regnode *text_node = next;
5939
5940                 if (! HAS_TEXT(text_node))
5941                     FIND_NEXT_IMPT(text_node);
5942
5943                 if (! HAS_TEXT(text_node))
5944                     ST.c1 = ST.c2 = CHRTEST_VOID;
5945                 else {
5946                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5947                         ST.c1 = ST.c2 = CHRTEST_VOID;
5948                     }
5949                     else {
5950
5951                     /*  Currently we only get here when
5952
5953                         PL_regkind[OP(text_node)] == EXACT
5954
5955                         If this changes back, then the macros for IS_TEXT and
5956                         friends need to change. */
5957                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5958                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5959                            reginfo))
5960                         {
5961                             sayNO;
5962                         }
5963                     }
5964                 }
5965             }
5966
5967             ST.A = scan;
5968             ST.B = next;
5969             if (minmod) {
5970                 char *li = locinput;
5971                 minmod = 0;
5972                 if (ST.min &&
5973                         regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
5974                             < ST.min)
5975                     sayNO;
5976                 SET_locinput(li);
5977                 ST.count = ST.min;
5978                 REGCP_SET(ST.cp);
5979                 if (ST.c1 == CHRTEST_VOID)
5980                     goto curly_try_B_min;
5981
5982                 ST.oldloc = locinput;
5983
5984                 /* set ST.maxpos to the furthest point along the
5985                  * string that could possibly match */
5986                 if  (ST.max == REG_INFTY) {
5987                     ST.maxpos = reginfo->strend - 1;
5988                     if (utf8_target)
5989                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5990                             ST.maxpos--;
5991                 }
5992                 else if (utf8_target) {
5993                     int m = ST.max - ST.min;
5994                     for (ST.maxpos = locinput;
5995                          m >0 && ST.maxpos < reginfo->strend; m--)
5996                         ST.maxpos += UTF8SKIP(ST.maxpos);
5997                 }
5998                 else {
5999                     ST.maxpos = locinput + ST.max - ST.min;
6000                     if (ST.maxpos >= reginfo->strend)
6001                         ST.maxpos = reginfo->strend - 1;
6002                 }
6003                 goto curly_try_B_min_known;
6004
6005             }
6006             else {
6007                 /* avoid taking address of locinput, so it can remain
6008                  * a register var */
6009                 char *li = locinput;
6010                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6011                 if (ST.count < ST.min)
6012                     sayNO;
6013                 SET_locinput(li);
6014                 if ((ST.count > ST.min)
6015                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6016                 {
6017                     /* A{m,n} must come at the end of the string, there's
6018                      * no point in backing off ... */
6019                     ST.min = ST.count;
6020                     /* ...except that $ and \Z can match before *and* after
6021                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6022                        We may back off by one in this case. */
6023                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6024                         ST.min--;
6025                 }
6026                 REGCP_SET(ST.cp);
6027                 goto curly_try_B_max;
6028             }
6029             assert(0); /* NOTREACHED */
6030
6031
6032         case CURLY_B_min_known_fail:
6033             /* failed to find B in a non-greedy match where c1,c2 valid */
6034
6035             REGCP_UNWIND(ST.cp);
6036             if (ST.paren) {
6037                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6038             }
6039             /* Couldn't or didn't -- move forward. */
6040             ST.oldloc = locinput;
6041             if (utf8_target)
6042                 locinput += UTF8SKIP(locinput);
6043             else
6044                 locinput++;
6045             ST.count++;
6046           curly_try_B_min_known:
6047              /* find the next place where 'B' could work, then call B */
6048             {
6049                 int n;
6050                 if (utf8_target) {
6051                     n = (ST.oldloc == locinput) ? 0 : 1;
6052                     if (ST.c1 == ST.c2) {
6053                         /* set n to utf8_distance(oldloc, locinput) */
6054                         while (locinput <= ST.maxpos
6055                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6056                         {
6057                             locinput += UTF8SKIP(locinput);
6058                             n++;
6059                         }
6060                     }
6061                     else {
6062                         /* set n to utf8_distance(oldloc, locinput) */
6063                         while (locinput <= ST.maxpos
6064                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6065                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6066                         {
6067                             locinput += UTF8SKIP(locinput);
6068                             n++;
6069                         }
6070                     }
6071                 }
6072                 else {  /* Not utf8_target */
6073                     if (ST.c1 == ST.c2) {
6074                         while (locinput <= ST.maxpos &&
6075                                UCHARAT(locinput) != ST.c1)
6076                             locinput++;
6077                     }
6078                     else {
6079                         while (locinput <= ST.maxpos
6080                                && UCHARAT(locinput) != ST.c1
6081                                && UCHARAT(locinput) != ST.c2)
6082                             locinput++;
6083                     }
6084                     n = locinput - ST.oldloc;
6085                 }
6086                 if (locinput > ST.maxpos)
6087                     sayNO;
6088                 if (n) {
6089                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6090                      * at b; check that everything between oldloc and
6091                      * locinput matches */
6092                     char *li = ST.oldloc;
6093                     ST.count += n;
6094                     if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6095                         sayNO;
6096                     assert(n == REG_INFTY || locinput == li);
6097                 }
6098                 CURLY_SETPAREN(ST.paren, ST.count);
6099                 if (cur_eval && cur_eval->u.eval.close_paren &&
6100                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6101                     goto fake_end;
6102                 }
6103                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6104             }
6105             assert(0); /* NOTREACHED */
6106
6107
6108         case CURLY_B_min_fail:
6109             /* failed to find B in a non-greedy match where c1,c2 invalid */
6110
6111             REGCP_UNWIND(ST.cp);
6112             if (ST.paren) {
6113                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6114             }
6115             /* failed -- move forward one */
6116             {
6117                 char *li = locinput;
6118                 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6119                     sayNO;
6120                 }
6121                 locinput = li;
6122             }
6123             {
6124                 ST.count++;
6125                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6126                         ST.count > 0)) /* count overflow ? */
6127                 {
6128                   curly_try_B_min:
6129                     CURLY_SETPAREN(ST.paren, ST.count);
6130                     if (cur_eval && cur_eval->u.eval.close_paren &&
6131                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6132                         goto fake_end;
6133                     }
6134                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6135                 }
6136             }
6137             sayNO;
6138             assert(0); /* NOTREACHED */
6139
6140
6141         curly_try_B_max:
6142             /* a successful greedy match: now try to match B */
6143             if (cur_eval && cur_eval->u.eval.close_paren &&
6144                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6145                 goto fake_end;
6146             }
6147             {
6148                 bool could_match = locinput < reginfo->strend;
6149
6150                 /* If it could work, try it. */
6151                 if (ST.c1 != CHRTEST_VOID && could_match) {
6152                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6153                     {
6154                         could_match = memEQ(locinput,
6155                                             ST.c1_utf8,
6156                                             UTF8SKIP(locinput))
6157                                     || memEQ(locinput,
6158                                              ST.c2_utf8,
6159                                              UTF8SKIP(locinput));
6160                     }
6161                     else {
6162                         could_match = UCHARAT(locinput) == ST.c1
6163                                       || UCHARAT(locinput) == ST.c2;
6164                     }
6165                 }
6166                 if (ST.c1 == CHRTEST_VOID || could_match) {
6167                     CURLY_SETPAREN(ST.paren, ST.count);
6168                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6169                     assert(0); /* NOTREACHED */
6170                 }
6171             }
6172             /* FALL THROUGH */
6173
6174         case CURLY_B_max_fail:
6175             /* failed to find B in a greedy match */
6176
6177             REGCP_UNWIND(ST.cp);
6178             if (ST.paren) {
6179                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6180             }
6181             /*  back up. */
6182             if (--ST.count < ST.min)
6183                 sayNO;
6184             locinput = HOPc(locinput, -1);
6185             goto curly_try_B_max;
6186
6187 #undef ST
6188
6189         case END: /*  last op of main pattern  */
6190             fake_end:
6191             if (cur_eval) {
6192                 /* we've just finished A in /(??{A})B/; now continue with B */
6193
6194                 st->u.eval.prev_rex = rex_sv;           /* inner */
6195
6196                 /* Save *all* the positions. */
6197                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6198                 rex_sv = cur_eval->u.eval.prev_rex;
6199                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6200                 SET_reg_curpm(rex_sv);
6201                 rex = ReANY(rex_sv);
6202                 rexi = RXi_GET(rex);
6203                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6204
6205                 REGCP_SET(st->u.eval.lastcp);
6206
6207                 /* Restore parens of the outer rex without popping the
6208                  * savestack */
6209                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6210                                         &maxopenparen);
6211
6212                 st->u.eval.prev_eval = cur_eval;
6213                 cur_eval = cur_eval->u.eval.prev_eval;
6214                 DEBUG_EXECUTE_r(
6215                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6216                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6217                 if ( nochange_depth )
6218                     nochange_depth--;
6219
6220                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6221                                     locinput); /* match B */
6222             }
6223
6224             if (locinput < reginfo->till) {
6225                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6226                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6227                                       PL_colors[4],
6228                                       (long)(locinput - startpos),
6229                                       (long)(reginfo->till - startpos),
6230                                       PL_colors[5]));
6231
6232                 sayNO_SILENT;           /* Cannot match: too short. */
6233             }
6234             sayYES;                     /* Success! */
6235
6236         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6237             DEBUG_EXECUTE_r(
6238             PerlIO_printf(Perl_debug_log,
6239                 "%*s  %ssubpattern success...%s\n",
6240                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6241             sayYES;                     /* Success! */
6242
6243 #undef  ST
6244 #define ST st->u.ifmatch
6245
6246         {
6247             char *newstart;
6248
6249         case SUSPEND:   /* (?>A) */
6250             ST.wanted = 1;
6251             newstart = locinput;
6252             goto do_ifmatch;
6253
6254         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6255             ST.wanted = 0;
6256             goto ifmatch_trivial_fail_test;
6257
6258         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6259             ST.wanted = 1;
6260           ifmatch_trivial_fail_test:
6261             if (scan->flags) {
6262                 char * const s = HOPBACKc(locinput, scan->flags);
6263                 if (!s) {
6264                     /* trivial fail */
6265                     if (logical) {
6266                         logical = 0;
6267                         sw = 1 - cBOOL(ST.wanted);
6268                     }
6269                     else if (ST.wanted)
6270                         sayNO;
6271                     next = scan + ARG(scan);
6272                     if (next == scan)
6273                         next = NULL;
6274                     break;
6275                 }
6276                 newstart = s;
6277             }
6278             else
6279                 newstart = locinput;
6280
6281           do_ifmatch:
6282             ST.me = scan;
6283             ST.logical = logical;
6284             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6285
6286             /* execute body of (?...A) */
6287             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6288             assert(0); /* NOTREACHED */
6289         }
6290
6291         case IFMATCH_A_fail: /* body of (?...A) failed */
6292             ST.wanted = !ST.wanted;
6293             /* FALL THROUGH */
6294
6295         case IFMATCH_A: /* body of (?...A) succeeded */
6296             if (ST.logical) {
6297                 sw = cBOOL(ST.wanted);
6298             }
6299             else if (!ST.wanted)
6300                 sayNO;
6301
6302             if (OP(ST.me) != SUSPEND) {
6303                 /* restore old position except for (?>...) */
6304                 locinput = st->locinput;
6305             }
6306             scan = ST.me + ARG(ST.me);
6307             if (scan == ST.me)
6308                 scan = NULL;
6309             continue; /* execute B */
6310
6311 #undef ST
6312
6313         case LONGJMP: /*  alternative with many branches compiles to
6314                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6315             next = scan + ARG(scan);
6316             if (next == scan)
6317                 next = NULL;
6318             break;
6319
6320         case COMMIT:  /*  (*COMMIT)  */
6321             reginfo->cutpoint = reginfo->strend;
6322             /* FALLTHROUGH */
6323
6324         case PRUNE:   /*  (*PRUNE)   */
6325             if (!scan->flags)
6326                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6327             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6328             assert(0); /* NOTREACHED */
6329
6330         case COMMIT_next_fail:
6331             no_final = 1;
6332             /* FALLTHROUGH */
6333
6334         case OPFAIL:   /* (*FAIL)  */
6335             sayNO;
6336             assert(0); /* NOTREACHED */
6337
6338 #define ST st->u.mark
6339         case MARKPOINT: /*  (*MARK:foo)  */
6340             ST.prev_mark = mark_state;
6341             ST.mark_name = sv_commit = sv_yes_mark
6342                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6343             mark_state = st;
6344             ST.mark_loc = locinput;
6345             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6346             assert(0); /* NOTREACHED */
6347
6348         case MARKPOINT_next:
6349             mark_state = ST.prev_mark;
6350             sayYES;
6351             assert(0); /* NOTREACHED */
6352
6353         case MARKPOINT_next_fail:
6354             if (popmark && sv_eq(ST.mark_name,popmark))
6355             {
6356                 if (ST.mark_loc > startpoint)
6357                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6358                 popmark = NULL; /* we found our mark */
6359                 sv_commit = ST.mark_name;
6360
6361                 DEBUG_EXECUTE_r({
6362                         PerlIO_printf(Perl_debug_log,
6363                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6364                             REPORT_CODE_OFF+depth*2, "",
6365                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6366                 });
6367             }
6368             mark_state = ST.prev_mark;
6369             sv_yes_mark = mark_state ?
6370                 mark_state->u.mark.mark_name : NULL;
6371             sayNO;
6372             assert(0); /* NOTREACHED */
6373
6374         case SKIP:  /*  (*SKIP)  */
6375             if (scan->flags) {
6376                 /* (*SKIP) : if we fail we cut here*/
6377                 ST.mark_name = NULL;
6378                 ST.mark_loc = locinput;
6379                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6380             } else {
6381                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6382                    otherwise do nothing.  Meaning we need to scan
6383                  */
6384                 regmatch_state *cur = mark_state;
6385                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6386
6387                 while (cur) {
6388                     if ( sv_eq( cur->u.mark.mark_name,
6389                                 find ) )
6390                     {
6391                         ST.mark_name = find;
6392                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6393                     }
6394                     cur = cur->u.mark.prev_mark;
6395                 }
6396             }
6397             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6398             break;
6399
6400         case SKIP_next_fail:
6401             if (ST.mark_name) {
6402                 /* (*CUT:NAME) - Set up to search for the name as we
6403                    collapse the stack*/
6404                 popmark = ST.mark_name;
6405             } else {
6406                 /* (*CUT) - No name, we cut here.*/
6407                 if (ST.mark_loc > startpoint)
6408                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6409                 /* but we set sv_commit to latest mark_name if there
6410                    is one so they can test to see how things lead to this
6411                    cut */
6412                 if (mark_state)
6413                     sv_commit=mark_state->u.mark.mark_name;
6414             }
6415             no_final = 1;
6416             sayNO;
6417             assert(0); /* NOTREACHED */
6418 #undef ST
6419
6420         case LNBREAK: /* \R */
6421             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6422                 locinput += n;
6423             } else
6424                 sayNO;
6425             break;
6426
6427         default:
6428             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6429                           PTR2UV(scan), OP(scan));
6430             Perl_croak(aTHX_ "regexp memory corruption");
6431
6432         /* this is a point to jump to in order to increment
6433          * locinput by one character */
6434         increment_locinput:
6435             assert(!NEXTCHR_IS_EOS);
6436             if (utf8_target) {
6437                 locinput += PL_utf8skip[nextchr];
6438                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6439                 if (locinput > reginfo->strend)
6440                     sayNO;
6441             }
6442             else
6443                 locinput++;
6444             break;
6445
6446         } /* end switch */
6447
6448         /* switch break jumps here */
6449         scan = next; /* prepare to execute the next op and ... */
6450         continue;    /* ... jump back to the top, reusing st */
6451         assert(0); /* NOTREACHED */
6452
6453       push_yes_state:
6454         /* push a state that backtracks on success */
6455         st->u.yes.prev_yes_state = yes_state;
6456         yes_state = st;
6457         /* FALL THROUGH */
6458       push_state:
6459         /* push a new regex state, then continue at scan  */
6460         {
6461             regmatch_state *newst;
6462
6463             DEBUG_STACK_r({
6464                 regmatch_state *cur = st;
6465                 regmatch_state *curyes = yes_state;
6466                 int curd = depth;
6467                 regmatch_slab *slab = PL_regmatch_slab;
6468                 for (;curd > -1;cur--,curd--) {
6469                     if (cur < SLAB_FIRST(slab)) {
6470                         slab = slab->prev;
6471                         cur = SLAB_LAST(slab);
6472                     }
6473                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6474                         REPORT_CODE_OFF + 2 + depth * 2,"",
6475                         curd, PL_reg_name[cur->resume_state],
6476                         (curyes == cur) ? "yes" : ""
6477                     );
6478                     if (curyes == cur)
6479                         curyes = cur->u.yes.prev_yes_state;
6480                 }
6481             } else
6482                 DEBUG_STATE_pp("push")
6483             );
6484             depth++;
6485             st->locinput = locinput;
6486             newst = st+1;
6487             if (newst >  SLAB_LAST(PL_regmatch_slab))
6488                 newst = S_push_slab(aTHX);
6489             PL_regmatch_state = newst;
6490
6491             locinput = pushinput;
6492             st = newst;
6493             continue;
6494             assert(0); /* NOTREACHED */
6495         }
6496     }
6497
6498     /*
6499     * We get here only if there's trouble -- normally "case END" is
6500     * the terminating point.
6501     */
6502     Perl_croak(aTHX_ "corrupted regexp pointers");
6503     /*NOTREACHED*/
6504     sayNO;
6505
6506 yes:
6507     if (yes_state) {
6508         /* we have successfully completed a subexpression, but we must now
6509          * pop to the state marked by yes_state and continue from there */
6510         assert(st != yes_state);
6511 #ifdef DEBUGGING
6512         while (st != yes_state) {
6513             st--;
6514             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6515                 PL_regmatch_slab = PL_regmatch_slab->prev;
6516                 st = SLAB_LAST(PL_regmatch_slab);
6517             }
6518             DEBUG_STATE_r({
6519                 if (no_final) {
6520                     DEBUG_STATE_pp("pop (no final)");
6521                 } else {
6522                     DEBUG_STATE_pp("pop (yes)");
6523                 }
6524             });
6525             depth--;
6526         }
6527 #else
6528         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6529             || yes_state > SLAB_LAST(PL_regmatch_slab))
6530         {
6531             /* not in this slab, pop slab */
6532             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6533             PL_regmatch_slab = PL_regmatch_slab->prev;
6534             st = SLAB_LAST(PL_regmatch_slab);
6535         }
6536         depth -= (st - yes_state);
6537 #endif
6538         st = yes_state;
6539         yes_state = st->u.yes.prev_yes_state;
6540         PL_regmatch_state = st;
6541
6542         if (no_final)
6543             locinput= st->locinput;
6544         state_num = st->resume_state + no_final;
6545         goto reenter_switch;
6546     }
6547
6548     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6549                           PL_colors[4], PL_colors[5]));
6550
6551     if (reginfo->info_aux_eval) {
6552         /* each successfully executed (?{...}) block does the equivalent of
6553          *   local $^R = do {...}
6554          * When popping the save stack, all these locals would be undone;
6555          * bypass this by setting the outermost saved $^R to the latest
6556          * value */
6557         if (oreplsv != GvSV(PL_replgv))
6558             sv_setsv(oreplsv, GvSV(PL_replgv));
6559     }
6560     result = 1;
6561     goto final_exit;
6562
6563 no:
6564     DEBUG_EXECUTE_r(
6565         PerlIO_printf(Perl_debug_log,
6566             "%*s  %sfailed...%s\n",
6567             REPORT_CODE_OFF+depth*2, "",
6568             PL_colors[4], PL_colors[5])
6569         );
6570
6571 no_silent:
6572     if (no_final) {
6573         if (yes_state) {
6574             goto yes;
6575         } else {
6576             goto final_exit;
6577         }
6578     }
6579     if (depth) {
6580         /* there's a previous state to backtrack to */
6581         st--;
6582         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6583             PL_regmatch_slab = PL_regmatch_slab->prev;
6584             st = SLAB_LAST(PL_regmatch_slab);
6585         }
6586         PL_regmatch_state = st;
6587         locinput= st->locinput;
6588
6589         DEBUG_STATE_pp("pop");
6590         depth--;
6591         if (yes_state == st)
6592             yes_state = st->u.yes.prev_yes_state;
6593
6594         state_num = st->resume_state + 1; /* failure = success + 1 */
6595         goto reenter_switch;
6596     }
6597     result = 0;
6598
6599   final_exit:
6600     if (rex->intflags & PREGf_VERBARG_SEEN) {
6601         SV *sv_err = get_sv("REGERROR", 1);
6602         SV *sv_mrk = get_sv("REGMARK", 1);
6603         if (result) {
6604             sv_commit = &PL_sv_no;
6605             if (!sv_yes_mark)
6606                 sv_yes_mark = &PL_sv_yes;
6607         } else {
6608             if (!sv_commit)
6609                 sv_commit = &PL_sv_yes;
6610             sv_yes_mark = &PL_sv_no;
6611         }
6612         sv_setsv(sv_err, sv_commit);
6613         sv_setsv(sv_mrk, sv_yes_mark);
6614     }
6615
6616
6617     if (last_pushed_cv) {
6618         dSP;
6619         POP_MULTICALL;
6620         PERL_UNUSED_VAR(SP);
6621     }
6622
6623     assert(!result ||  locinput - reginfo->strbeg >= 0);
6624     return result ?  locinput - reginfo->strbeg : -1;
6625 }
6626
6627 /*
6628  - regrepeat - repeatedly match something simple, report how many
6629  *
6630  * What 'simple' means is a node which can be the operand of a quantifier like
6631  * '+', or {1,3}
6632  *
6633  * startposp - pointer to a pointer to the start position.  This is updated
6634  *             to point to the byte following the highest successful
6635  *             match.
6636  * p         - the regnode to be repeatedly matched against.
6637  * reginfo   - struct holding match state, such as strend
6638  * max       - maximum number of things to match.
6639  * depth     - (for debugging) backtracking depth.
6640  */
6641 STATIC I32
6642 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
6643             regmatch_info *const reginfo, I32 max, int depth)
6644 {
6645     dVAR;
6646     char *scan;     /* Pointer to current position in target string */
6647     I32 c;
6648     char *loceol = reginfo->strend;   /* local version */
6649     I32 hardcount = 0;  /* How many matches so far */
6650     bool utf8_target = reginfo->is_utf8_target;
6651     int to_complement = 0;  /* Invert the result? */
6652     UV utf8_flags;
6653     _char_class_number classnum;
6654 #ifndef DEBUGGING
6655     PERL_UNUSED_ARG(depth);
6656 #endif
6657
6658     PERL_ARGS_ASSERT_REGREPEAT;
6659
6660     scan = *startposp;
6661     if (max == REG_INFTY)
6662         max = I32_MAX;
6663     else if (! utf8_target && loceol - scan > max)
6664         loceol = scan + max;
6665
6666     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6667      * to the maximum of how far we should go in it (leaving it set to the real
6668      * end, if the maximum permissible would take us beyond that).  This allows
6669      * us to make the loop exit condition that we haven't gone past <loceol> to
6670      * also mean that we haven't exceeded the max permissible count, saving a
6671      * test each time through the loop.  But it assumes that the OP matches a
6672      * single byte, which is true for most of the OPs below when applied to a
6673      * non-UTF-8 target.  Those relatively few OPs that don't have this
6674      * characteristic will have to compensate.
6675      *
6676      * There is no adjustment for UTF-8 targets, as the number of bytes per
6677      * character varies.  OPs will have to test both that the count is less
6678      * than the max permissible (using <hardcount> to keep track), and that we
6679      * are still within the bounds of the string (using <loceol>.  A few OPs
6680      * match a single byte no matter what the encoding.  They can omit the max
6681      * test if, for the UTF-8 case, they do the adjustment that was skipped
6682      * above.
6683      *
6684      * Thus, the code above sets things up for the common case; and exceptional
6685      * cases need extra work; the common case is to make sure <scan> doesn't
6686      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6687      * count doesn't exceed the maximum permissible */
6688
6689     switch (OP(p)) {
6690     case REG_ANY:
6691         if (utf8_target) {
6692             while (scan < loceol && hardcount < max && *scan != '\n') {
6693                 scan += UTF8SKIP(scan);
6694                 hardcount++;
6695             }
6696         } else {
6697             while (scan < loceol && *scan != '\n')
6698                 scan++;
6699         }
6700         break;
6701     case SANY:
6702         if (utf8_target) {
6703             while (scan < loceol && hardcount < max) {
6704                 scan += UTF8SKIP(scan);
6705                 hardcount++;
6706             }
6707         }
6708         else
6709             scan = loceol;
6710         break;
6711     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6712         if (utf8_target && loceol - scan > max) {
6713
6714             /* <loceol> hadn't been adjusted in the UTF-8 case */
6715             scan +=  max;
6716         }
6717         else {
6718             scan = loceol;
6719         }
6720         break;
6721     case EXACT:
6722         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6723
6724         c = (U8)*STRING(p);
6725
6726         /* Can use a simple loop if the pattern char to match on is invariant
6727          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6728          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6729          * true iff it doesn't matter if the argument is in UTF-8 or not */
6730         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
6731             if (utf8_target && loceol - scan > max) {
6732                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6733                  * since here, to match at all, 1 char == 1 byte */
6734                 loceol = scan + max;
6735             }
6736             while (scan < loceol && UCHARAT(scan) == c) {
6737                 scan++;
6738             }
6739         }
6740         else if (reginfo->is_utf8_pat) {
6741             if (utf8_target) {
6742                 STRLEN scan_char_len;
6743
6744                 /* When both target and pattern are UTF-8, we have to do
6745                  * string EQ */
6746                 while (hardcount < max
6747                        && scan < loceol
6748                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6749                        && memEQ(scan, STRING(p), scan_char_len))
6750                 {
6751                     scan += scan_char_len;
6752                     hardcount++;
6753                 }
6754             }
6755             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6756
6757                 /* Target isn't utf8; convert the character in the UTF-8
6758                  * pattern to non-UTF8, and do a simple loop */
6759                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6760                 while (scan < loceol && UCHARAT(scan) == c) {
6761                     scan++;
6762                 }
6763             } /* else pattern char is above Latin1, can't possibly match the
6764                  non-UTF-8 target */
6765         }
6766         else {
6767
6768             /* Here, the string must be utf8; pattern isn't, and <c> is
6769              * different in utf8 than not, so can't compare them directly.
6770              * Outside the loop, find the two utf8 bytes that represent c, and
6771              * then look for those in sequence in the utf8 string */
6772             U8 high = UTF8_TWO_BYTE_HI(c);
6773             U8 low = UTF8_TWO_BYTE_LO(c);
6774
6775             while (hardcount < max
6776                     && scan + 1 < loceol
6777                     && UCHARAT(scan) == high
6778                     && UCHARAT(scan + 1) == low)
6779             {
6780                 scan += 2;
6781                 hardcount++;
6782             }
6783         }
6784         break;
6785
6786     case EXACTFA:
6787         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6788         goto do_exactf;
6789
6790     case EXACTFL:
6791         RXp_MATCH_TAINTED_on(prog);
6792         utf8_flags = FOLDEQ_UTF8_LOCALE;
6793         goto do_exactf;
6794
6795     case EXACTF:
6796             utf8_flags = 0;
6797             goto do_exactf;
6798
6799     case EXACTFU_SS:
6800     case EXACTFU_TRICKYFOLD:
6801     case EXACTFU:
6802         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6803
6804     do_exactf: {
6805         int c1, c2;
6806         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6807
6808         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6809
6810         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6811                                         reginfo))
6812         {
6813             if (c1 == CHRTEST_VOID) {
6814                 /* Use full Unicode fold matching */
6815                 char *tmpeol = reginfo->strend;
6816                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
6817                 while (hardcount < max
6818                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6819                                              STRING(p), NULL, pat_len,
6820                                              reginfo->is_utf8_pat, utf8_flags))
6821                 {
6822                     scan = tmpeol;
6823                     tmpeol = reginfo->strend;
6824                     hardcount++;
6825                 }
6826             }
6827             else if (utf8_target) {
6828                 if (c1 == c2) {
6829                     while (scan < loceol
6830                            && hardcount < max
6831                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6832                     {
6833                         scan += UTF8SKIP(scan);
6834                         hardcount++;
6835                     }
6836                 }
6837                 else {
6838                     while (scan < loceol
6839                            && hardcount < max
6840                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6841                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6842                     {
6843                         scan += UTF8SKIP(scan);
6844                         hardcount++;
6845                     }
6846                 }
6847             }
6848             else if (c1 == c2) {
6849                 while (scan < loceol && UCHARAT(scan) == c1) {
6850                     scan++;
6851                 }
6852             }
6853             else {
6854                 while (scan < loceol &&
6855                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6856                 {
6857                     scan++;
6858                 }
6859             }
6860         }
6861         break;
6862     }
6863     case ANYOF:
6864     case ANYOF_WARN_SUPER:
6865         if (utf8_target) {
6866             while (hardcount < max
6867                    && scan < loceol
6868                    && reginclass(prog, p, (U8*)scan, utf8_target))
6869             {
6870                 scan += UTF8SKIP(scan);
6871                 hardcount++;
6872             }
6873         } else {
6874             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6875                 scan++;
6876         }
6877         break;
6878
6879     /* The argument (FLAGS) to all the POSIX node types is the class number */
6880
6881     case NPOSIXL:
6882         to_complement = 1;
6883         /* FALLTHROUGH */
6884
6885     case POSIXL:
6886         RXp_MATCH_TAINTED_on(prog);
6887         if (! utf8_target) {
6888             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
6889                                                                    *scan)))
6890             {
6891                 scan++;
6892             }
6893         } else {
6894             while (hardcount < max && scan < loceol
6895                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
6896                                                                   (U8 *) scan)))
6897             {
6898                 scan += UTF8SKIP(scan);
6899                 hardcount++;
6900             }
6901         }
6902         break;
6903
6904     case POSIXD:
6905         if (utf8_target) {
6906             goto utf8_posix;
6907         }
6908         /* FALLTHROUGH */
6909
6910     case POSIXA:
6911         if (utf8_target && loceol - scan > max) {
6912
6913             /* We didn't adjust <loceol> at the beginning of this routine
6914              * because is UTF-8, but it is actually ok to do so, since here, to
6915              * match, 1 char == 1 byte. */
6916             loceol = scan + max;
6917         }
6918         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6919             scan++;
6920         }
6921         break;
6922
6923     case NPOSIXD:
6924         if (utf8_target) {
6925             to_complement = 1;
6926             goto utf8_posix;
6927         }
6928         /* FALL THROUGH */
6929
6930     case NPOSIXA:
6931         if (! utf8_target) {
6932             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6933                 scan++;
6934             }
6935         }
6936         else {
6937
6938             /* The complement of something that matches only ASCII matches all
6939              * UTF-8 variant code points, plus everything in ASCII that isn't
6940              * in the class. */
6941             while (hardcount < max && scan < loceol
6942                    && (! UTF8_IS_INVARIANT(*scan)
6943                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
6944             {
6945                 scan += UTF8SKIP(scan);
6946                 hardcount++;
6947             }
6948         }
6949         break;
6950
6951     case NPOSIXU:
6952         to_complement = 1;
6953         /* FALLTHROUGH */
6954
6955     case POSIXU:
6956         if (! utf8_target) {
6957             while (scan < loceol && to_complement
6958                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
6959             {
6960                 scan++;
6961             }
6962         }
6963         else {
6964       utf8_posix:
6965             classnum = (_char_class_number) FLAGS(p);
6966             if (classnum < _FIRST_NON_SWASH_CC) {
6967
6968                 /* Here, a swash is needed for above-Latin1 code points.
6969                  * Process as many Latin1 code points using the built-in rules.
6970                  * Go to another loop to finish processing upon encountering
6971                  * the first Latin1 code point.  We could do that in this loop
6972                  * as well, but the other way saves having to test if the swash
6973                  * has been loaded every time through the loop: extra space to
6974                  * save a test. */
6975                 while (hardcount < max && scan < loceol) {
6976                     if (UTF8_IS_INVARIANT(*scan)) {
6977                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
6978                                                                    classnum))))
6979                         {
6980                             break;
6981                         }
6982                         scan++;
6983                     }
6984                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
6985                         if (! (to_complement
6986                               ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
6987                                                                    *(scan + 1)),
6988                                                     classnum))))
6989                         {
6990                             break;
6991                         }
6992                         scan += 2;
6993                     }
6994                     else {
6995                         goto found_above_latin1;
6996                     }
6997
6998                     hardcount++;
6999                 }
7000             }
7001             else {
7002                 /* For these character classes, the knowledge of how to handle
7003                  * every code point is compiled in to Perl via a macro.  This
7004                  * code is written for making the loops as tight as possible.
7005                  * It could be refactored to save space instead */
7006                 switch (classnum) {
7007                     case _CC_ENUM_SPACE:    /* XXX would require separate code
7008                                                if we revert the change of \v
7009                                                matching this */
7010                         /* FALL THROUGH */
7011                     case _CC_ENUM_PSXSPC:
7012                         while (hardcount < max
7013                                && scan < loceol
7014                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7015                         {
7016                             scan += UTF8SKIP(scan);
7017                             hardcount++;
7018                         }
7019                         break;
7020                     case _CC_ENUM_BLANK:
7021                         while (hardcount < max
7022                                && scan < loceol
7023                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7024                         {
7025                             scan += UTF8SKIP(scan);
7026                             hardcount++;
7027                         }
7028                         break;
7029                     case _CC_ENUM_XDIGIT:
7030                         while (hardcount < max
7031                                && scan < loceol
7032                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7033                         {
7034                             scan += UTF8SKIP(scan);
7035                             hardcount++;
7036                         }
7037                         break;
7038                     case _CC_ENUM_VERTSPACE:
7039                         while (hardcount < max
7040                                && scan < loceol
7041                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7042                         {
7043                             scan += UTF8SKIP(scan);
7044                             hardcount++;
7045                         }
7046                         break;
7047                     case _CC_ENUM_CNTRL:
7048                         while (hardcount < max
7049                                && scan < loceol
7050                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7051                         {
7052                             scan += UTF8SKIP(scan);
7053                             hardcount++;
7054                         }
7055                         break;
7056                     default:
7057                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7058                 }
7059             }
7060         }
7061         break;
7062
7063       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7064
7065         /* Load the swash if not already present */
7066         if (! PL_utf8_swash_ptrs[classnum]) {
7067             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7068             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7069                                         "utf8", swash_property_names[classnum],
7070                                         &PL_sv_undef, 1, 0, NULL, &flags);
7071         }
7072
7073         while (hardcount < max && scan < loceol
7074                && to_complement ^ cBOOL(_generic_utf8(
7075                                        classnum,
7076                                        scan,
7077                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
7078                                                    (U8 *) scan,
7079                                                    TRUE))))
7080         {
7081             scan += UTF8SKIP(scan);
7082             hardcount++;
7083         }
7084         break;
7085
7086     case LNBREAK:
7087         if (utf8_target) {
7088             while (hardcount < max && scan < loceol &&
7089                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7090                 scan += c;
7091                 hardcount++;
7092             }
7093         } else {
7094             /* LNBREAK can match one or two latin chars, which is ok, but we
7095              * have to use hardcount in this situation, and throw away the
7096              * adjustment to <loceol> done before the switch statement */
7097             loceol = reginfo->strend;
7098             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7099                 scan+=c;
7100                 hardcount++;
7101             }
7102         }
7103         break;
7104
7105     case BOUND:
7106     case BOUNDA:
7107     case BOUNDL:
7108     case BOUNDU:
7109     case EOS:
7110     case GPOS:
7111     case KEEPS:
7112     case NBOUND:
7113     case NBOUNDA:
7114     case NBOUNDL:
7115     case NBOUNDU:
7116     case OPFAIL:
7117     case SBOL:
7118     case SEOL:
7119         /* These are all 0 width, so match right here or not at all. */
7120         break;
7121
7122     default:
7123         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7124         assert(0); /* NOTREACHED */
7125
7126     }
7127
7128     if (hardcount)
7129         c = hardcount;
7130     else
7131         c = scan - *startposp;
7132     *startposp = scan;
7133
7134     DEBUG_r({
7135         GET_RE_DEBUG_FLAGS_DECL;
7136         DEBUG_EXECUTE_r({
7137             SV * const prop = sv_newmortal();
7138             regprop(prog, prop, p);
7139             PerlIO_printf(Perl_debug_log,
7140                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7141                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7142         });
7143     });
7144
7145     return(c);
7146 }
7147
7148
7149 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7150 /*
7151 - regclass_swash - public wrapper around core_regclass_swash().  Hands back
7152 a fresh copy (newSVsv) of the swash, so the caller can't alter the shared one.
7153 <altsvp> exists only for back-compat: when non-NULL, NULL is stored through it.
7154  */
7155 SV *
7156 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7157 {
7158     SV *shared_swash;
7159
7160     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7161     if (altsvp != NULL)
7162         *altsvp = NULL;
7163     shared_swash = core_regclass_swash(prog, node, doinit, listsvp);
7164     return newSVsv(shared_swash);
7165 }
7166 #endif
7167
7168 STATIC SV *
7169 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7170 {
7171     /* Returns the swash for the input 'node' in the regex 'prog', or NULL if
7172      * none is found or created.  If <doinit> is true, will attempt to create
7173      * the swash if not already done, caching it in element [1] of the array.
7174      * If <listsvp> is non-null, will return in it a new SV holding the swash
7175      * initialization string and/or the contents of the inversion list.
7176      * Tied intimately to how regcomp.c sets up the data structure */
7177
7178     dVAR;
7179     SV *sw  = NULL;
7180     SV *si  = NULL;
7181     SV*  invlist = NULL;
7182
7183     RXi_GET_DECL(prog,progi);
7184     const struct reg_data * const data = prog ? progi->data : NULL;
7185
7186     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7187
7188     assert(ANYOF_NONBITMAP(node));
7189
7190     if (data && data->count) {
7191         const U32 n = ARG(node);
7192
7193         if (data->what[n] == 's') {
7194             SV * const rv = MUTABLE_SV(data->data[n]);
7195             AV * const av = MUTABLE_AV(SvRV(rv));
7196             SV **const ary = AvARRAY(av);
7197             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7198
7199             si = *ary;  /* ary[0] = the string to initialize the swash with */
7200             /* NOTE(review): only [2] is bounds-checked below; reading [3]
7201              * relies on elements 2 and 3 being both present or both absent.
7202              * [2] is any inversion list generated at compile time; [3]
7203              * indicates if it has any user-defined properties in it. */
7204             if (av_len(av) >= 2) {
7205                 invlist = ary[2];
7206                 if (SvUV(ary[3])) {
7207                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7208                 }
7209             }
7210             else {
7211                 invlist = NULL;
7212             }
7213
7214             /* Element [1] is reserved for the set-up swash.  If already there,
7215              * return it; if not (and allowed to), create and store it there */
7216             if (SvROK(ary[1])) {
7217                 sw = ary[1];
7218             }
7219             else if (si && doinit) {
7220                 /* Built from 'si' plus any compile-time inversion list */
7221                 sw = _core_swash_init("utf8", /* the utf8 package */
7222                                       "", /* nameless */
7223                                       si,
7224                                       1, /* binary */
7225                                       0, /* not from tr/// */
7226                                       invlist,
7227                                       &swash_init_flags);
7228                 (void)av_store(av, 1, sw);
7229             }
7230         }
7231     }
7232
7233     if (listsvp) {
7234         SV* matches_string = newSVpvn("", 0);
7235
7236         /* Use the swash's inversion list, if any (it replaces 'invlist'),
7237          * which has to have incorporated into it all possibilities */
7238         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7239             && (si && si != &PL_sv_undef))
7240         {
7241             /* If no swash, or no inversion list could be had from it, use
7242              * the input initialization string, if available */
7243             sv_catsv(matches_string, si);
7244         }
7245
7246         /* Add the inversion list to whatever we have.  This may have come from
7247          * the swash, or from the node's compile-time data (ary[2] above) */
7248         if (invlist) {
7249             sv_catsv(matches_string, _invlist_contents(invlist));
7250         }
7251         *listsvp = matches_string;
7252     }
7253
7254     return sw;
7255 }
7256
7257 /*
7258  - reginclass - determine if a character falls into a character class
7259
7260   prog is the regexp being run; its match is marked tainted if a locale
7261        test is reached
7262   n is the ANYOF regnode;  p points to the character in the target string
7263   utf8_target tells whether p is in UTF-8.
7264
7265   Returns true if matched; false otherwise.  Croaks on malformed UTF-8.
7266   Note that this can be a synthetic start class, a combination of various
7267   nodes, so things you think might be mutually exclusive, such as locale,
7268   aren't.  It can match both locale and non-locale
7269
7270  */
7271
7272 STATIC bool
7273 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7274 {
7275     dVAR;
7276     const char flags = ANYOF_FLAGS(n);
7277     bool match = FALSE;
7278     UV c = *p;
7279
7280     PERL_ARGS_ASSERT_REGINCLASS;
7281
7282     /* If c is not already the code point, get it.  Note that
7283      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7284     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7285         STRLEN c_len = 0;
7286         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7287                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7288                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7289                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7290                  * UTF8_ALLOW_FFFF */
7291         if (c_len == (STRLEN)-1)    /* how a failed decode is signalled */
7292             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7293     }
7294
7295     /* If this character is potentially in the bitmap, check it */
7296     if (c < 256) {
7297         if (ANYOF_BITMAP_TEST(n, c))
7298             match = TRUE;
7299         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7300                 && ! utf8_target
7301                 && ! isASCII(c))
7302         {
7303             match = TRUE;
7304         }
7305         else if (flags & ANYOF_LOCALE) {
7306             RXp_MATCH_TAINTED_on(prog);
7307             /* First try c's entry in PL_fold_locale[] against the bitmap */
7308             if ((flags & ANYOF_LOC_FOLD)
7309                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7310             {
7311                 match = TRUE;
7312             }
7313             else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7314
7315                 /* The data structure is arranged so bits 0, 2, 4, ... are set
7316                  * if the class includes the Posix character class given by
7317                  * bit/2; and 1, 3, 5, ... are set if the class includes the
7318                  * complemented Posix class given by int(bit/2).  So we loop
7319                  * through the bits, each time changing whether we complement
7320                  * the result or not.  Suppose for the sake of illustration
7321                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7322                  * is set, it means there is a match for this ANYOF node if the
7323                  * character is in the class given by the expression (0 / 2 = 0
7324                  * = \w).  If it is in that class, isFOO_lc() will return 1,
7325                  * and since 'to_complement' is 0, the result will stay TRUE,
7326                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
7327                  * bit 1 is 1.  That means there is a match if the character
7328                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7329                  * but will on bit 1.  On the second iteration 'to_complement'
7330                  * will be 1, so the exclusive or will reverse things, so we
7331                  * are testing for \W.  On the third iteration, 'to_complement'
7332                  * will be 0, and we would be testing for \s; the fourth
7333                  * iteration would test for \S, etc.
7334                  *
7335                  * Note that this code assumes that all the classes are closed
7336                  * under folding.  For example, if a character matches \w, then
7337                  * its fold does too; and vice versa.  This should be true for
7338                  * any well-behaved locale for all the currently defined Posix
7339                  * classes, except for :lower: and :upper:, which are handled
7340                  * by the pseudo-class :cased: which matches if either of the
7341                  * other two does.  To get rid of this assumption, an outer
7342                  * loop could be used below to iterate over both the source
7343                  * character, and its fold (if different) */
7344
7345                 int count = 0;
7346                 int to_complement = 0;
7347                 while (count < ANYOF_MAX) {
7348                     if (ANYOF_CLASS_TEST(n, count)
7349                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7350                     {
7351                         match = TRUE;
7352                         break;
7353                     }
7354                     count++;
7355                     to_complement ^= 1;
7356                 }
7357             }
7358         }
7359     }
7360
7361     /* If the bitmap didn't (or couldn't) match, and something outside the
7362      * bitmap could match, try that.  Locale nodes specify completely the
7363      * behavior of code points in the bit map (otherwise, a utf8 target would
7364      * cause them to be treated as Unicode and not locale), except in
7365      * the very unlikely event when this node is a synthetic start class, which
7366      * could be a combination of locale and non-locale nodes.  So allow locale
7367      * to match for the synthetic start class, which will give a false
7368      * positive that will be resolved when the match is done again as not part
7369      * of the synthetic start class */
7370     if (!match) {
7371         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7372             match = TRUE;       /* Everything above 255 matches */
7373         }
7374         else if (ANYOF_NONBITMAP(n)
7375                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7376                      || (utf8_target
7377                          && (c >=256
7378                              || (! (flags & ANYOF_LOCALE))
7379                              || OP(n) == ANYOF_SYNTHETIC))))
7380         {
7381             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7382             if (sw) {
7383                 U8 * utf8_p;
7384                 if (utf8_target) {
7385                     utf8_p = (U8 *) p;
7386                 } else { /* Convert to utf8 */
7387                     STRLEN len = 1;
7388                     utf8_p = bytes_to_utf8(p, &len);
7389                 }
7390
7391                 if (swash_fetch(sw, utf8_p, TRUE)) {
7392                     match = TRUE;
7393                 }
7394
7395                 /* If we allocated a string above, free it */
7396                 if (! utf8_target) Safefree(utf8_p);
7397             }
7398         }
7399         /* Being inside 'if (!match)', this warning is skipped on a match */
7400         if (UNICODE_IS_SUPER(c)
7401             && OP(n) == ANYOF_WARN_SUPER
7402             && ckWARN_d(WARN_NON_UNICODE))
7403         {
7404             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7405                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7406         }
7407     }
7408
7409     /* The xor inverts the result if ANYOF_INVERT is set: 1^1 = 0, 1^0 = 1 */
7410     return cBOOL(flags & ANYOF_INVERT) ^ match;
7411 }
7412
7413 STATIC U8 *
7414 S_reghop3(U8 *s, I32 off, const U8* lim)
7415 {
7416     /* Hop 'off' UTF-8 characters from 's' and return the resulting position:
7417      * forwards when 'off' >= 0, backwards when it is negative.  The hop stops
7418      * early rather than cross 'lim', which should be < s when off < 0 */
7419
7420     dVAR;
7421
7422     PERL_ARGS_ASSERT_REGHOP3;
7423
7424     if (off < 0) {
7425         for (; off != 0 && s > lim; off++) {
7426             /* Step back one byte, then over any continuation bytes, but
7427              * never below 'lim'.
7428              * XXX could check well-formedness here */
7429             s--;
7430             if (! UTF8_IS_CONTINUED(*s))
7431                 continue;
7432             while (s > lim && UTF8_IS_CONTINUATION(*s))
7433                 s--;
7434         }
7435         return s;
7436     }
7437
7438     for (; off != 0 && s < lim; off--)
7439         s += UTF8SKIP(s);   /* XXX could check well-formedness here */
7440     return s;
7441 }
7442
7443 #ifdef XXX_dmq
7444 /* there are a bunch of places where we use two reghop3's that should
7445    be replaced with this routine. but since that's not done yet
7446    we ifdef it out - dmq
7447 */
7448 STATIC U8 *
7449 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7450 {
7451     /* Hop 'off' UTF-8 characters from 's' and return where we ended up:
7452      * forwards (never past 'rlim') when 'off' >= 0, otherwise backwards
7453      * (never before 'llim') */
7454     dVAR;
7455
7456     PERL_ARGS_ASSERT_REGHOP4;
7457
7458     if (off < 0) {
7459         for (; off != 0 && s > llim; off++) {
7460             s--;    /* XXX could check well-formedness here */
7461             if (! UTF8_IS_CONTINUED(*s))
7462                 continue;
7463             while (s > llim && UTF8_IS_CONTINUATION(*s))
7464                 s--;
7465         }
7466     }
7467     else {
7468         for (; off != 0 && s < rlim; off--)
7469             s += UTF8SKIP(s);   /* XXX could check well-formedness here */
7470     }
7471     return s;
7472 }
7473 #endif
7474
7475 STATIC U8 *
7476 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7477 {
7478     /* Hop 'off' UTF-8 characters from 's' (backwards if 'off' is negative)
7479      * without crossing 'lim'.  Returns the new position, or NULL if 'lim'
7480      * was reached before all the requested characters could be hopped */
7481     dVAR;
7482
7483     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7484
7485     if (off < 0) {
7486         while (off < 0 && s > lim) {
7487             s--;    /* XXX could check well-formedness here */
7488             if (UTF8_IS_CONTINUED(*s)) {
7489                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7490                     s--;
7491             }
7492             off++;
7493         }
7494         return (off < 0) ? NULL : s;
7495     }
7496     while (off > 0 && s < lim) {
7497         s += UTF8SKIP(s);   /* XXX could check well-formedness here */
7498         off--;
7499     }
7500     if (off > 0)    /* ran into 'lim' with hops still owed */
7501         return NULL;
7502     return s;
7503 }
7504
7505
7506 /* when executing a regex that may have (?{}), extra stuff needs setting
7507    up that will be visible to the called code, even before the current
7508    match has finished. In particular:
7509
7510    * $_ is localised to the SV currently being matched;
7511    * pos($_) is created if necessary, ready to be updated on each call-out
7512      to code;
7513    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7514      isn't set until the current pattern is successfully finished), so that
7515      $1 etc of the match-so-far can be seen;
7516    * save the old values of subbeg etc of the current regex, and set them
7517      to the current string (again, this is normally only done at the end
7518      of execution)
7519 */
7520
7521 static void
7522 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7523 {
7524     MAGIC *mg;
7525     regexp *const rex = ReANY(reginfo->prog);
7526     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7527     /* eval_state records what is changed below, so it can be undone later */
7528     eval_state->rex = rex;
7529
7530     if (reginfo->sv) {
7531         /* Make $_ available to executed code. */
7532         if (reginfo->sv != DEFSV) {
7533             SAVE_DEFSV;
7534             DEFSV_set(reginfo->sv);
7535         }
7536         /* Find the SV's pos magic, attaching it (mg_len of -1) if absent */
7537         if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
7538               && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
7539             /* prepare for quick setting of pos */
7540 #ifdef PERL_OLD_COPY_ON_WRITE
7541             if (SvIsCOW(reginfo->sv))
7542                 sv_force_normal_flags(reginfo->sv, 0);
7543 #endif
7544             mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
7545                              &PL_vtbl_mglob, NULL, 0);
7546             mg->mg_len = -1;
7547         }
7548         eval_state->pos_magic = mg;
7549         eval_state->pos       = mg->mg_len;  /* value to put back afterwards */
7550     }
7551     else
7552         eval_state->pos_magic = NULL;
7553
7554     if (!PL_reg_curpm) {
7555         /* PL_reg_curpm is a fake PMOP that we can attach the current
7556          * regex to and point PL_curpm at, so that $1 et al are visible
7557          * within a /(?{})/. It's just allocated once per interpreter the
7558          * first time it's needed */
7559         Newxz(PL_reg_curpm, 1, PMOP);
7560 #ifdef USE_ITHREADS
7561         {
7562             SV* const repointer = &PL_sv_undef;
7563             /* this regexp is also owned by the new PL_reg_curpm, which
7564                will try to free it.  */
7565             av_push(PL_regex_padav, repointer);
7566             PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
7567             PL_regex_pad = AvARRAY(PL_regex_padav);
7568         }
7569 #endif
7570     }
7571     SET_reg_curpm(reginfo->prog);
7572     eval_state->curpm = PL_curpm;   /* remember the PL_curpm being replaced */
7573     PL_curpm = PL_reg_curpm;
7574     if (RXp_MATCH_COPIED(rex)) {
7575         /*  Here is a serious problem: we cannot rewrite subbeg,
7576             since it may be needed if this match fails.  Thus
7577             $` inside (?{}) could fail... */
7578         eval_state->subbeg     = rex->subbeg;
7579         eval_state->sublen     = rex->sublen;
7580         eval_state->suboffset  = rex->suboffset;
7581         eval_state->subcoffset = rex->subcoffset;
7582 #ifdef PERL_ANY_COW
7583         eval_state->saved_copy = rex->saved_copy;
7584 #endif
7585         RXp_MATCH_COPIED_off(rex);
7586     }
7587     else
7588         eval_state->subbeg = NULL;  /* marks "no sub* fields were saved" */
7589     rex->subbeg = (char *)reginfo->strbeg;
7590     rex->suboffset = 0;
7591     rex->subcoffset = 0;
7592     rex->sublen = reginfo->strend - reginfo->strbeg;
7593 }
7594
7595
7596 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
7597
7598 static void
7599 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
7600 {
7601     dVAR;
7602     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7603     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
7604     regmatch_slab *s;
7605
7606     Safefree(aux->poscache);
7607
7608     if (eval_state) {
7609
7610         /* undo the effects of S_setup_eval_state() */
7611
7612         if (eval_state->subbeg) {
7613             regexp * const rex = eval_state->rex;
7614             rex->subbeg     = eval_state->subbeg;
7615             rex->sublen     = eval_state->sublen;
7616             rex->suboffset  = eval_state->suboffset;
7617             rex->subcoffset = eval_state->subcoffset;
7618 #ifdef PERL_ANY_COW
7619             rex->saved_copy = eval_state->saved_copy;
7620 #endif
7621             RXp_MATCH_COPIED_on(rex);
7622         }
7623         if (eval_state->pos_magic)
7624             eval_state->pos_magic->mg_len = eval_state->pos;
7625
7626         PL_curpm = eval_state->curpm;
7627     }
7628
7629     PL_regmatch_state = aux->old_regmatch_state;
7630     PL_regmatch_slab  = aux->old_regmatch_slab;
7631
7632     /* free all slabs above current one - this must be the last action
7633      * of this function, as aux and eval_state are allocated within
7634      * slabs and may be freed here */
7635
7636     s = PL_regmatch_slab->next;
7637     if (s) {
7638         PL_regmatch_slab->next = NULL;
7639         while (s) {
7640             regmatch_slab * const osl = s;
7641             s = s->next;
7642             Safefree(osl);
7643         }
7644     }
7645 }
7646
7647
7648 STATIC void
7649 S_to_utf8_substr(pTHX_ regexp *prog)
7650 {
7651     /* For each of prog's substr entries that has a byte form but no UTF-8
7652      * form yet, creates the UTF-8 form: an upgraded copy, run through
7653      * fbm_compile() if the byte form was.  check_utf8 is set to match */
7654     int i;
7655
7656     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7657
7658     for (i = 1; i >= 0; i--) {
7659         SV * const bytes_sv = prog->substrs->data[i].substr;
7660         SV *sv;
7661
7662         if (! bytes_sv || prog->substrs->data[i].utf8_substr)
7663             continue;
7664
7665         sv = newSVsv(bytes_sv);
7666         prog->substrs->data[i].utf8_substr = sv;
7667         sv_utf8_upgrade(sv);
7668         if (SvVALID(bytes_sv)) {
7669             const bool had_tail = cBOOL(SvTAIL(bytes_sv));
7670             /* Drop the trailing \n that fbm_compile added last time.  The
7671              * buffer briefly lacks its closing "\0", but fbm_compile()
7672              * restores one when it puts the "\n" back */
7673             if (had_tail)
7674                 SvCUR_set(sv, SvCUR(sv) - 1);
7675             fbm_compile(sv, had_tail ? FBMcf_TAIL : 0);
7676         }
7677         if (bytes_sv == prog->check_substr)
7678             prog->check_utf8 = sv;
7679     }
7680 }
7681
7682 STATIC bool
7683 S_to_byte_substr(pTHX_ regexp *prog)
7684 {
7685     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7686      * on the converted value.  Returns TRUE on success, or FALSE as soon as
7687      * one of them can't be downgraded (that entry's byte form stays unset) */
7688     dVAR;
7689     int i = 1;
7690
7691     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7692
7693     do {
7694         if (prog->substrs->data[i].utf8_substr
7695             && !prog->substrs->data[i].substr) {
7696             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7697             if (! sv_utf8_downgrade(sv, TRUE)) {
7698                 /* The copy isn't stored anywhere yet: free it, don't leak */
7699                 SvREFCNT_dec(sv);
7700                 return FALSE;
7701             }
7702             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7703                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7704                     /* Trim the trailing \n that fbm_compile added last time */
7705                     SvCUR_set(sv, SvCUR(sv) - 1);
7706                     fbm_compile(sv, FBMcf_TAIL);
7707                 } else
7708                     fbm_compile(sv, 0);
7709             }
7710             prog->substrs->data[i].substr = sv;
7711             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7712                 prog->check_substr = sv;
7713         }
7714     } while (i--);
7715     return TRUE;
7716 }
7717
7718 /*
7719  * Local variables:
7720  * c-indentation-style: bsd
7721  * c-basic-offset: 4
7722  * indent-tabs-mode: nil
7723  * End:
7724  *
7725  * ex: set ts=8 sts=4 sw=4 et:
7726  */