src/5021002/orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "inline_invlist.c"
  84 #include "unicode_constants.h"
  85
  86 #ifdef DEBUGGING
  87 /* At least one required character in the target string is expressible only in
  88  * UTF-8. */
  89 static const char* const non_utf8_target_but_utf8_required
  90                 = "Can't match, because target string needs to be in UTF-8\n";
  91 #endif
  92
  93 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  94     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  95     goto target; \
  96 } STMT_END
  97
  98 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  99
 100 #ifndef STATIC
 101 #define STATIC  static
 102 #endif
 103
 104 /* Valid only for non-utf8 strings: avoids the reginclass
 105  * call if there are no complications: i.e., if everything matchable is
 106  * straight forward in the bitmap */
 107 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0)   \
 108                                               : ANYOF_BITMAP_TEST(p,*(c)))
 109
 110 /*
 111  * Forwards.
 112  */
 113
 114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 115 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 116
 117 #define HOPc(pos,off) \
 118         (char *)(reginfo->is_utf8_target \
 119             ? reghop3((U8*)pos, off, \
 120                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 121             : (U8*)(pos + off))
 122
 123 #define HOPBACKc(pos, off) \
 124         (char*)(reginfo->is_utf8_target \
 125             ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 126             : (pos - off >= reginfo->strbeg)    \
 127                 ? (U8*)pos - off                \
 128                 : NULL)
 129
 130 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 131 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 132
 133 /* lim must be +ve. Returns NULL on overshoot */
 134 #define HOPMAYBE3(pos,off,lim) \
 135         (reginfo->is_utf8_target                        \
 136             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 137             : ((U8*)pos + off <= lim)                   \
 138                 ? (U8*)pos + off                        \
 139                 : NULL)
 140
 141 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 142  * off must be >=0; args should be vars rather than expressions */
 143 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 144     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 145     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 146
 147 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 148     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 149     : (U8*)(pos + off))
 150 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 151
 152 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 153 #define NEXTCHR_IS_EOS (nextchr < 0)
 154
 155 #define SET_nextchr \
 156     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 157
 158 #define SET_locinput(p) \
 159     locinput = (p);  \
 160     SET_nextchr
 161
 162
 163 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 164         if (!swash_ptr) {                                                     \
 165             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 166             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 167                                          1, 0, invlist, &flags);              \
 168             assert(swash_ptr);                                                \
 169         }                                                                     \
 170     } STMT_END
 171
 172 /* If in debug mode, we test that a known character properly matches */
 173 #ifdef DEBUGGING
 174 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 175                                           property_name,                      \
 176                                           invlist,                            \
 177                                           utf8_char_in_property)              \
 178         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 179         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 180 #else
 181 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 182                                           property_name,                      \
 183                                           invlist,                            \
 184                                           utf8_char_in_property)              \
 185         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 186 #endif
 187
 188 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 189                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 190                                         "",                                   \
 191                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 192                                         LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
 193
 194 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 195     STMT_START {                                                              \
 196         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 197                                        "_X_regular_begin",                    \
 198                                        NULL,                                  \
 199                                        LATIN_CAPITAL_LETTER_SHARP_S_UTF8);    \
 200         LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 201                                        "_X_extend",                           \
 202                                        NULL,                                  \
 203                                        COMBINING_GRAVE_ACCENT_UTF8);          \
 204     } STMT_END
 205
 206 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 207 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 208
 209 /* for use after a quantifier and before an EXACT-like node -- japhy */
 210 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 211  *
 212  * NOTE that *nothing* that affects backtracking should be in here, specifically
 213  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 214  * node that is in between two EXACT like nodes when ascertaining what the required
 215  * "follow" character is. This should probably be moved to regex compile time
 216  * although it may be done at run time beause of the REF possibility - more
 217  * investigation required. -- demerphq
 218 */
 219 #define JUMPABLE(rn) (                                                             \
 220     OP(rn) == OPEN ||                                                              \
 221     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 222     OP(rn) == EVAL ||                                                              \
 223     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 224     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 225     OP(rn) == KEEPS ||                                                             \
 226     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 227 )
 228 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 229
 230 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 231
 232 #if 0
 233 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 234    we don't need this definition. */
 235 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 236 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 237 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 238
 239 #else
 240 /* ... so we use this as its faster. */
 241 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 242 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 243 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 244 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 245
 246 #endif
 247
 248 /*
 249   Search for mandatory following text node; for lookahead, the text must
 250   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 251 */
 252 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 253     while (JUMPABLE(rn)) { \
 254         const OPCODE type = OP(rn); \
 255         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 256             rn = NEXTOPER(NEXTOPER(rn)); \
 257         else if (type == PLUS) \
 258             rn = NEXTOPER(rn); \
 259         else if (type == IFMATCH) \
 260             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 261         else rn += NEXT_OFF(rn); \
 262     } \
 263 } STMT_END
 264
 265 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 266  * These are for the pre-composed Hangul syllables, which are all in a
 267  * contiguous block and arranged there in such a way so as to facilitate
 268  * alorithmic determination of their characteristics.  As such, they don't need
 269  * a swash, but can be determined by simple arithmetic.  Almost all are
 270  * GCB=LVT, but every 28th one is a GCB=LV */
 271 #define SBASE 0xAC00    /* Start of block */
 272 #define SCount 11172    /* Length of block */
 273 #define TCount 28
 274
 275 #define SLAB_FIRST(s) (&(s)->states[0])
 276 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 277
 278 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 279 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 280 static regmatch_state * S_push_slab(pTHX);
 281
 282 #define REGCP_PAREN_ELEMS 3
 283 #define REGCP_OTHER_ELEMS 3
 284 #define REGCP_FRAME_ELEMS 1
 285 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 286  * are needed for the regexp context stack bookkeeping. */
 287
 288 STATIC CHECKPOINT
 289 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 290 {
 291     const int retval = PL_savestack_ix;
 292     const int paren_elems_to_push =
 293                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 294     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 295     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 296     I32 p;
 297     GET_RE_DEBUG_FLAGS_DECL;
 298
 299     PERL_ARGS_ASSERT_REGCPPUSH;
 300
 301     if (paren_elems_to_push < 0)
 302         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 303                    (int)paren_elems_to_push, (int)maxopenparen,
 304                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 305
 306     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 307         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 308                    " out of range (%lu-%ld)",
 309                    total_elems,
 310                    (unsigned long)maxopenparen,
 311                    (long)parenfloor);
 312
 313     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 314
 315     DEBUG_BUFFERS_r(
 316         if ((int)maxopenparen > (int)parenfloor)
 317             PerlIO_printf(Perl_debug_log,
 318                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 319                 PTR2UV(rex),
 320                 PTR2UV(rex->offs)
 321             );
 322     );
 323     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 324 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 325         SSPUSHIV(rex->offs[p].end);
 326         SSPUSHIV(rex->offs[p].start);
 327         SSPUSHINT(rex->offs[p].start_tmp);
 328         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 329             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 330             (UV)p,
 331             (IV)rex->offs[p].start,
 332             (IV)rex->offs[p].start_tmp,
 333             (IV)rex->offs[p].end
 334         ));
 335     }
 336 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 337     SSPUSHINT(maxopenparen);
 338     SSPUSHINT(rex->lastparen);
 339     SSPUSHINT(rex->lastcloseparen);
 340     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 341
 342     return retval;
 343 }
 344
 345 /* These are needed since we do not localize EVAL nodes: */
 346 #define REGCP_SET(cp)                                           \
 347     DEBUG_STATE_r(                                              \
 348             PerlIO_printf(Perl_debug_log,                       \
 349                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 350                 (IV)PL_savestack_ix));                          \
 351     cp = PL_savestack_ix
 352
 353 #define REGCP_UNWIND(cp)                                        \
 354     DEBUG_STATE_r(                                              \
 355         if (cp != PL_savestack_ix)                              \
 356             PerlIO_printf(Perl_debug_log,                       \
 357                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 358                 (IV)(cp), (IV)PL_savestack_ix));                \
 359     regcpblow(cp)
 360
 361 #define UNWIND_PAREN(lp, lcp)               \
 362     for (n = rex->lastparen; n > lp; n--)   \
 363         rex->offs[n].end = -1;              \
 364     rex->lastparen = n;                     \
 365     rex->lastcloseparen = lcp;
 366
 367
 368 STATIC void
 369 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 370 {
 371     UV i;
 372     U32 paren;
 373     GET_RE_DEBUG_FLAGS_DECL;
 374
 375     PERL_ARGS_ASSERT_REGCPPOP;
 376
 377     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 378     i = SSPOPUV;
 379     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 380     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 381     rex->lastcloseparen = SSPOPINT;
 382     rex->lastparen = SSPOPINT;
 383     *maxopenparen_p = SSPOPINT;
 384
 385     i -= REGCP_OTHER_ELEMS;
 386     /* Now restore the parentheses context. */
 387     DEBUG_BUFFERS_r(
 388         if (i || rex->lastparen + 1 <= rex->nparens)
 389             PerlIO_printf(Perl_debug_log,
 390                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 391                 PTR2UV(rex),
 392                 PTR2UV(rex->offs)
 393             );
 394     );
 395     paren = *maxopenparen_p;
 396     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 397         SSize_t tmps;
 398         rex->offs[paren].start_tmp = SSPOPINT;
 399         rex->offs[paren].start = SSPOPIV;
 400         tmps = SSPOPIV;
 401         if (paren <= rex->lastparen)
 402             rex->offs[paren].end = tmps;
 403         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 404             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 405             (UV)paren,
 406             (IV)rex->offs[paren].start,
 407             (IV)rex->offs[paren].start_tmp,
 408             (IV)rex->offs[paren].end,
 409             (paren > rex->lastparen ? "(skipped)" : ""));
 410         );
 411         paren--;
 412     }
 413 #if 1
 414     /* It would seem that the similar code in regtry()
 415      * already takes care of this, and in fact it is in
 416      * a better location to since this code can #if 0-ed out
 417      * but the code in regtry() is needed or otherwise tests
 418      * requiring null fields (pat.t#187 and split.t#{13,14}
 419      * (as of patchlevel 7877)  will fail.  Then again,
 420      * this code seems to be necessary or otherwise
 421      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 422      * --jhi updated by dapm */
 423     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 424         if (i > *maxopenparen_p)
 425             rex->offs[i].start = -1;
 426         rex->offs[i].end = -1;
 427         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 428             "    \\%"UVuf": %s   ..-1 undeffing\n",
 429             (UV)i,
 430             (i > *maxopenparen_p) ? "-1" : "  "
 431         ));
 432     }
 433 #endif
 434 }
 435
 436 /* restore the parens and associated vars at savestack position ix,
 437  * but without popping the stack */
 438
 439 STATIC void
 440 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 441 {
 442     I32 tmpix = PL_savestack_ix;
 443     PL_savestack_ix = ix;
 444     regcppop(rex, maxopenparen_p);
 445     PL_savestack_ix = tmpix;
 446 }
 447
 448 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 449
 450 STATIC bool
 451 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 452 {
 453     /* Returns a boolean as to whether or not 'character' is a member of the
 454      * Posix character class given by 'classnum' that should be equivalent to a
 455      * value in the typedef '_char_class_number'.
 456      *
 457      * Ideally this could be replaced by a just an array of function pointers
 458      * to the C library functions that implement the macros this calls.
 459      * However, to compile, the precise function signatures are required, and
 460      * these may vary from platform to to platform.  To avoid having to figure
 461      * out what those all are on each platform, I (khw) am using this method,
 462      * which adds an extra layer of function call overhead (unless the C
 463      * optimizer strips it away).  But we don't particularly care about
 464      * performance with locales anyway. */
 465
 466     switch ((_char_class_number) classnum) {
 467         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 468         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 469         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 470         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 471         case _CC_ENUM_CASED:     return isLOWER_LC(character)
 472                                         || isUPPER_LC(character);
 473         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 474         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 475         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 476         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 477         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 478         case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 479         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 480         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 481         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 482         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 483         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 484         default:    /* VERTSPACE should never occur in locales */
 485             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 486     }
 487
 488     assert(0); /* NOTREACHED */
 489     return FALSE;
 490 }
 491
 492 STATIC bool
 493 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 494 {
 495     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 496      * 'character' is a member of the Posix character class given by 'classnum'
 497      * that should be equivalent to a value in the typedef
 498      * '_char_class_number'.
 499      *
 500      * This just calls isFOO_lc on the code point for the character if it is in
 501      * the range 0-255.  Outside that range, all characters avoid Unicode
 502      * rules, ignoring any locale.  So use the Unicode function if this class
 503      * requires a swash, and use the Unicode macro otherwise. */
 504
 505     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 506
 507     if (UTF8_IS_INVARIANT(*character)) {
 508         return isFOO_lc(classnum, *character);
 509     }
 510     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 511         return isFOO_lc(classnum,
 512                         TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
 513     }
 514
 515     if (classnum < _FIRST_NON_SWASH_CC) {
 516
 517         /* Initialize the swash unless done already */
 518         if (! PL_utf8_swash_ptrs[classnum]) {
 519             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 520             PL_utf8_swash_ptrs[classnum] =
 521                     _core_swash_init("utf8",
 522                                      "",
 523                                      &PL_sv_undef, 1, 0,
 524                                      PL_XPosix_ptrs[classnum], &flags);
 525         }
 526
 527         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 528                                  character,
 529                                  TRUE /* is UTF */ ));
 530     }
 531
 532     switch ((_char_class_number) classnum) {
 533         case _CC_ENUM_SPACE:
 534         case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 535
 536         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 537         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 538         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 539         default:                 return 0;  /* Things like CNTRL are always
 540                                                below 256 */
 541     }
 542
 543     assert(0); /* NOTREACHED */
 544     return FALSE;
 545 }
 546
 547 /*
 548  * pregexec and friends
 549  */
 550
 551 #ifndef PERL_IN_XSUB_RE
 552 /*
 553  - pregexec - match a regexp against a string
 554  */
 555 I32
 556 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 557          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 558 /* stringarg: the point in the string at which to begin matching */
 559 /* strend:    pointer to null at end of string */
 560 /* strbeg:    real beginning of string */
 561 /* minend:    end of match must be >= minend bytes after stringarg. */
 562 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 563  *            itself is accessed via the pointers above */
 564 /* nosave:    For optimizations. */
 565 {
 566     PERL_ARGS_ASSERT_PREGEXEC;
 567
 568     return
 569         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 570                       nosave ? 0 : REXEC_COPY_STR);
 571 }
 572 #endif
 573
 574
 575
 576 /* re_intuit_start():
 577  *
 578  * Based on some optimiser hints, try to find the earliest position in the
 579  * string where the regex could match.
 580  *
 581  *   rx:     the regex to match against
 582  *   sv:     the SV being matched: only used for utf8 flag; the string
 583  *           itself is accessed via the pointers below. Note that on
 584  *           something like an overloaded SV, SvPOK(sv) may be false
 585  *           and the string pointers may point to something unrelated to
 586  *           the SV itself.
 587  *   strbeg: real beginning of string
 588  *   strpos: the point in the string at which to begin matching
 589  *   strend: pointer to the byte following the last char of the string
 590  *   flags   currently unused; set to 0
 591  *   data:   currently unused; set to NULL
 592  *
 593  * The basic idea of re_intuit_start() is to use some known information
 594  * about the pattern, namely:
 595  *
 596  *   a) the longest known anchored substring (i.e. one that's at a
 597  *      constant offset from the beginning of the pattern; but not
 598  *      necessarily at a fixed offset from the beginning of the
 599  *      string);
 600  *   b) the longest floating substring (i.e. one that's not at a constant
 601  *      offset from the beginning of the pattern);
 602  *   c) Whether the pattern is anchored to the string; either
 603  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 604  *      or anchored to pos(): /\G/;
 605  *   d) A start class: a real or synthetic character class which
 606  *      represents which characters are legal at the start of the pattern;
 607  *
 608  * to either quickly reject the match, or to find the earliest position
 609  * within the string at which the pattern might match, thus avoiding
 610  * running the full NFA engine at those earlier locations, only to
 611  * eventually fail and retry further along.
 612  *
 613  * Returns NULL if the pattern can't match, or returns the address within
 614  * the string which is the earliest place the match could occur.
 615  *
 616  * The longest of the anchored and floating substrings is called 'check'
 617  * and is checked first. The other is called 'other' and is checked
 618  * second. The 'other' substring may not be present.  For example,
 619  *
 620  *    /(abc|xyz)ABC\d{0,3}DEFG/
 621  *
 622  * will have
 623  *
 624  *   check substr (float)    = "DEFG", offset 6..9 chars
 625  *   other substr (anchored) = "ABC",  offset 3..3 chars
 626  *   stclass = [ax]
 627  *
 628  * Be aware that during the course of this function, sometimes 'anchored'
 629  * refers to a substring being anchored relative to the start of the
 630  * pattern, and sometimes to the pattern itself being anchored relative to
 631  * the string. For example:
 632  *
 633  *   /\dabc/:   "abc" is anchored to the pattern;
 634  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 635  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 636  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 637  *                    but the pattern is anchored to the string.
 638  */
 639
 640 char *
 641 Perl_re_intuit_start(pTHX_
 642                     REGEXP * const rx,
 643                     SV *sv,
 644                     const char * const strbeg,
 645                     char *strpos,
 646                     char *strend,
 647                     const U32 flags,
 648                     re_scream_pos_data *data)
 649 {
 650     struct regexp *const prog = ReANY(rx);
 651     SSize_t start_shift = prog->check_offset_min;
 652     /* Should be nonnegative! */
 653     SSize_t end_shift   = 0;
 654     /* current lowest pos in string where the regex can start matching */
 655     char *rx_origin = strpos;
 656     SV *check;
 657     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 658     U8   other_ix = 1 - prog->substrs->check_ix;
 659     bool ml_anch = 0;
 660     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 661     char *check_at = NULL;              /* check substr found at this pos */
 662     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 663     RXi_GET_DECL(prog,progi);
 664     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 665     regmatch_info *const reginfo = &reginfo_buf;
 666     GET_RE_DEBUG_FLAGS_DECL;
 667
 668     PERL_ARGS_ASSERT_RE_INTUIT_START;
 669     PERL_UNUSED_ARG(flags);
 670     PERL_UNUSED_ARG(data);
 671
 672     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 673                 "Intuit: trying to determine minimum start position...\n"));
 674
 675     /* for now, assume that all substr offsets are positive. If at some point
 676      * in the future someone wants to do clever things with look-behind and
 677      * -ve offsets, they'll need to fix up any code in this function
 678      * which uses these offsets. See the thread beginning
 679      * <20140113145929.GF27210@iabyn.com>
 680      */
 681     assert(prog->substrs->data[0].min_offset >= 0);
 682     assert(prog->substrs->data[0].max_offset >= 0);
 683     assert(prog->substrs->data[1].min_offset >= 0);
 684     assert(prog->substrs->data[1].max_offset >= 0);
 685     assert(prog->substrs->data[2].min_offset >= 0);
 686     assert(prog->substrs->data[2].max_offset >= 0);
 687
 688     /* for now, assume that if both present, that the floating substring
 689      * doesn't start before the anchored substring.
 690      * If you break this assumption (e.g. doing better optimisations
 691      * with lookahead/behind), then you'll need to audit the code in this
 692      * function carefully first
 693      */
 694     assert(
 695             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 696               && (prog->float_utf8    || prog->float_substr))
 697            || (prog->float_min_offset >= prog->anchored_offset));
 698
 699     /* byte rather than char calculation for efficiency. It fails
 700      * to quickly reject some cases that can't match, but will reject
 701      * them later after doing full char arithmetic */
 702     if (prog->minlen > strend - strpos) {
 703         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 704                               "  String too short...\n"));
 705         goto fail;
 706     }
 707
 708     reginfo->is_utf8_target = cBOOL(utf8_target);
 709     reginfo->info_aux = NULL;
 710     reginfo->strbeg = strbeg;
 711     reginfo->strend = strend;
 712     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 713     reginfo->intuit = 1;
 714     /* not actually used within intuit, but zero for safety anyway */
 715     reginfo->poscache_maxiter = 0;
 716
 717     if (utf8_target) {
 718         if (!prog->check_utf8 && prog->check_substr)
 719             to_utf8_substr(prog);
 720         check = prog->check_utf8;
 721     } else {
 722         if (!prog->check_substr && prog->check_utf8) {
 723             if (! to_byte_substr(prog)) {
 724                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 725             }
 726         }
 727         check = prog->check_substr;
 728     }
 729
 730     /* dump the various substring data */
 731     DEBUG_OPTIMISE_MORE_r({
 732         int i;
 733         for (i=0; i<=2; i++) {
 734             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 735                                   : prog->substrs->data[i].substr);
 736             if (!sv)
 737                 continue;
 738
 739             PerlIO_printf(Perl_debug_log,
 740                 "  substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
 741                 " useful=%"IVdf" utf8=%d [%s]\n",
 742                 i,
 743                 (IV)prog->substrs->data[i].min_offset,
 744                 (IV)prog->substrs->data[i].max_offset,
 745                 (IV)prog->substrs->data[i].end_shift,
 746                 BmUSEFUL(sv),
 747                 utf8_target ? 1 : 0,
 748                 SvPEEK(sv));
 749         }
 750     });
 751
 752     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 753
 754         /* ml_anch: check after \n?
 755          *
 756          * A note about IMPLICIT: on an un-anchored pattern beginning
 757          * with /.*.../, these flags will have been added by the
 758          * compiler:
 759          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 760          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 761          */
 762         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 763                    && !(prog->intflags & PREGf_IMPLICIT);
 764
 765         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 766             /* we are only allowed to match at BOS or \G */
 767
 768             /* trivially reject if there's a BOS anchor and we're not at BOS.
 769              *
 770              * Note that we don't try to do a similar quick reject for
 771              * \G, since generally the caller will have calculated strpos
 772              * based on pos() and gofs, so the string is already correctly
 773              * anchored by definition; and handling the exceptions would
 774              * be too fiddly (e.g. REXEC_IGNOREPOS).
 775              */
 776             if (   strpos != strbeg
 777                 && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL)))
 778             {
 779                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 780                                 "  Not at start...\n"));
 781                 goto fail;
 782             }
 783
 784             /* in the presence of an anchor, the anchored (relative to the
 785              * start of the regex) substr must also be anchored relative
 786              * to strpos. So quickly reject if substr isn't found there.
 787              * This works for \G too, because the caller will already have
 788              * subtracted gofs from pos, and gofs is the offset from the
 789              * \G to the start of the regex. For example, in /.abc\Gdef/,
 790              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 791              * caller will have set strpos=pos()-4; we look for the substr
 792              * at position pos()-4+1, which lines up with the "a" */
 793
 794             if (prog->check_offset_min == prog->check_offset_max
 795                 && !(prog->intflags & PREGf_CANY_SEEN))
 796             {
 797                 /* Substring at constant offset from beg-of-str... */
 798                 SSize_t slen = SvCUR(check);
 799                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 800
 801                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 802                     "  Looking for check substr at fixed offset %"IVdf"...\n",
 803                     (IV)prog->check_offset_min));
 804
 805                 if (SvTAIL(check)) {
 806                     /* In this case, the regex is anchored at the end too.
 807                      * Unless it's a multiline match, the lengths must match
 808                      * exactly, give or take a \n.  NB: slen >= 1 since
 809                      * the last char of check is \n */
 810                     if (!multiline
 811                         && (   strend - s > slen
 812                             || strend - s < slen - 1
 813                             || (strend - s == slen && strend[-1] != '\n')))
 814                     {
 815                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 816                                             "  String too long...\n"));
 817                         goto fail_finish;
 818                     }
 819                     /* Now should match s[0..slen-2] */
 820                     slen--;
 821                 }
 822                 if (slen && (*SvPVX_const(check) != *s
 823                     || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
 824                 {
 825                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 826                                     "  String not equal...\n"));
 827                     goto fail_finish;
 828                 }
 829
 830                 check_at = s;
 831                 goto success_at_start;
 832             }
 833         }
 834     }
 835
 836     end_shift = prog->check_end_shift;
 837
 838 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 839     if (end_shift < 0)
 840         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 841                    (IV)end_shift, RX_PRECOMP(prog));
 842 #endif
 843
 844   restart:
 845
 846     /* This is the (re)entry point of the main loop in this function.
 847      * The goal of this loop is to:
 848      * 1) find the "check" substring in the region rx_origin..strend
 849      *    (adjusted by start_shift / end_shift). If not found, reject
 850      *    immediately.
 851      * 2) If it exists, look for the "other" substr too if defined; for
 852      *    example, if the check substr maps to the anchored substr, then
 853      *    check the floating substr, and vice-versa. If not found, go
 854      *    back to (1) with rx_origin suitably incremented.
 855      * 3) If we find an rx_origin position that doesn't contradict
 856      *    either of the substrings, then check the possible additional
 857      *    constraints on rx_origin of /^.../m or a known start class.
 858      *    If these fail, then depending on which constraints fail, jump
 859      *    back to here, or to various other re-entry points further along
 860      *    that skip some of the first steps.
 861      * 4) If we pass all those tests, update the BmUSEFUL() count on the
 862      *    substring. If the start position was determined to be at the
 863      *    beginning of the string  - so, not rejected, but not optimised,
 864      *    since we have to run regmatch from position 0 - decrement the
 865      *    BmUSEFUL() count. Otherwise increment it.
 866      */
 867
 868
 869     /* first, look for the 'check' substring */
 870
 871     {
 872         U8* start_point;
 873         U8* end_point;
 874
 875         DEBUG_OPTIMISE_MORE_r({
 876             PerlIO_printf(Perl_debug_log,
 877                 "  At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
 878                 " Start shift: %"IVdf" End shift %"IVdf
 879                 " Real end Shift: %"IVdf"\n",
 880                 (IV)(rx_origin - strpos),
 881                 (IV)prog->check_offset_min,
 882                 (IV)start_shift,
 883                 (IV)end_shift,
 884                 (IV)prog->check_end_shift);
 885         });
 886
 887         if (prog->intflags & PREGf_CANY_SEEN) {
 888             start_point= (U8*)(rx_origin + start_shift);
 889             end_point= (U8*)(strend - end_shift);
 890             if (start_point > end_point)
 891                 goto fail_finish;
 892         } else {
 893             end_point = HOP3(strend, -end_shift, strbeg);
 894             start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 895             if (!start_point)
 896                 goto fail_finish;
 897         }
 898
 899
 900         /* If the regex is absolutely anchored to either the start of the
 901          * string (BOL,SBOL) or to pos() (ANCH_GPOS), then
 902          * check_offset_max represents an upper bound on the string where
 903          * the substr could start. For the ANCH_GPOS case, we assume that
 904          * the caller of intuit will have already set strpos to
 905          * pos()-gofs, so in this case strpos + offset_max will still be
 906          * an upper bound on the substr.
 907          */
 908         if (!ml_anch
 909             && prog->intflags & PREGf_ANCH
 910             && prog->check_offset_max != SSize_t_MAX)
 911         {
 912             SSize_t len = SvCUR(check) - !!SvTAIL(check);
 913             const char * const anchor =
 914                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 915
 916             /* do a bytes rather than chars comparison. It's conservative;
 917              * so it skips doing the HOP if the result can't possibly end
 918              * up earlier than the old value of end_point.
 919              */
 920             if ((char*)end_point - anchor > prog->check_offset_max) {
 921                 end_point = HOP3lim((U8*)anchor,
 922                                 prog->check_offset_max,
 923                                 end_point -len)
 924                             + len;
 925             }
 926         }
 927
 928         DEBUG_OPTIMISE_MORE_r({
 929             PerlIO_printf(Perl_debug_log, "  fbm_instr len=%d str=<%.*s>\n",
 930                 (int)(end_point - start_point),
 931                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 932                 start_point);
 933         });
 934
 935         check_at = fbm_instr( start_point, end_point,
 936                       check, multiline ? FBMrf_MULTILINE : 0);
 937
 938         /* Update the count-of-usability, remove useless subpatterns,
 939             unshift s.  */
 940
 941         DEBUG_EXECUTE_r({
 942             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 943                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 944             PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s%s",
 945                               (check_at ? "Found" : "Did not find"),
 946                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 947                     ? "anchored" : "floating"),
 948                 quoted,
 949                 RE_SV_TAIL(check),
 950                 (check_at ? " at offset " : "...\n") );
 951         });
 952
 953         if (!check_at)
 954             goto fail_finish;
 955         /* Finish the diagnostic message */
 956         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
 957
 958         /* set rx_origin to the minimum position where the regex could start
 959          * matching, given the constraint of the just-matched check substring.
 960          * But don't set it lower than previously.
 961          */
 962
 963         if (check_at - rx_origin > prog->check_offset_max)
 964             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 965     }
 966
 967
 968     /* now look for the 'other' substring if defined */
 969
 970     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 971                     : prog->substrs->data[other_ix].substr)
 972     {
 973         /* Take into account the "other" substring. */
 974         char *last, *last1;
 975         char *s;
 976         SV* must;
 977         struct reg_substr_datum *other;
 978
 979       do_other_substr:
 980         other = &prog->substrs->data[other_ix];
 981
 982         /* if "other" is anchored:
 983          * we've previously found a floating substr starting at check_at.
 984          * This means that the regex origin must lie somewhere
 985          * between min (rx_origin): HOP3(check_at, -check_offset_max)
 986          * and max:                 HOP3(check_at, -check_offset_min)
 987          * (except that min will be >= strpos)
 988          * So the fixed  substr must lie somewhere between
 989          *  HOP3(min, anchored_offset)
 990          *  HOP3(max, anchored_offset) + SvCUR(substr)
 991          */
 992
 993         /* if "other" is floating
 994          * Calculate last1, the absolute latest point where the
 995          * floating substr could start in the string, ignoring any
 996          * constraints from the earlier fixed match. It is calculated
 997          * as follows:
 998          *
 999          * strend - prog->minlen (in chars) is the absolute latest
1000          * position within the string where the origin of the regex
1001          * could appear. The latest start point for the floating
1002          * substr is float_min_offset(*) on from the start of the
1003          * regex.  last1 simply combines thee two offsets.
1004          *
1005          * (*) You might think the latest start point should be
1006          * float_max_offset from the regex origin, and technically
1007          * you'd be correct. However, consider
1008          *    /a\d{2,4}bcd\w/
1009          * Here, float min, max are 3,5 and minlen is 7.
1010          * This can match either
1011          *    /a\d\dbcd\w/
1012          *    /a\d\d\dbcd\w/
1013          *    /a\d\d\d\dbcd\w/
1014          * In the first case, the regex matches minlen chars; in the
1015          * second, minlen+1, in the third, minlen+2.
1016          * In the first case, the floating offset is 3 (which equals
1017          * float_min), in the second, 4, and in the third, 5 (which
1018          * equals float_max). In all cases, the floating string bcd
1019          * can never start more than 4 chars from the end of the
1020          * string, which equals minlen - float_min. As the substring
1021          * starts to match more than float_min from the start of the
1022          * regex, it makes the regex match more than minlen chars,
1023          * and the two cancel each other out. So we can always use
1024          * float_min - minlen, rather than float_max - minlen for the
1025          * latest position in the string.
1026          *
1027          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1028          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1029          */
1030
1031         assert(prog->minlen >= other->min_offset);
1032         last1 = HOP3c(strend,
1033                         other->min_offset - prog->minlen, strbeg);
1034
1035         if (other_ix) {/* i.e. if (other-is-float) */
1036             /* last is the latest point where the floating substr could
1037              * start, *given* any constraints from the earlier fixed
1038              * match. This constraint is that the floating string starts
1039              * <= float_max_offset chars from the regex origin (rx_origin).
1040              * If this value is less than last1, use it instead.
1041              */
1042             assert(rx_origin <= last1);
1043             last =
1044                 /* this condition handles the offset==infinity case, and
1045                  * is a short-cut otherwise. Although it's comparing a
1046                  * byte offset to a char length, it does so in a safe way,
1047                  * since 1 char always occupies 1 or more bytes,
1048                  * so if a string range is  (last1 - rx_origin) bytes,
1049                  * it will be less than or equal to  (last1 - rx_origin)
1050                  * chars; meaning it errs towards doing the accurate HOP3
1051                  * rather than just using last1 as a short-cut */
1052                 (last1 - rx_origin) < other->max_offset
1053                     ? last1
1054                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1055         }
1056         else {
1057             assert(strpos + start_shift <= check_at);
1058             last = HOP4c(check_at, other->min_offset - start_shift,
1059                         strbeg, strend);
1060         }
1061
1062         s = HOP3c(rx_origin, other->min_offset, strend);
1063         if (s < other_last)     /* These positions already checked */
1064             s = other_last;
1065
1066         must = utf8_target ? other->utf8_substr : other->substr;
1067         assert(SvPOK(must));
1068         s = fbm_instr(
1069             (unsigned char*)s,
1070             (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1071             must,
1072             multiline ? FBMrf_MULTILINE : 0
1073         );
1074         DEBUG_EXECUTE_r({
1075             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1076                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1077             PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s",
1078                 s ? "Found" : "Contradicts",
1079                 other_ix ? "floating" : "anchored",
1080                 quoted, RE_SV_TAIL(must));
1081         });
1082
1083
1084         if (!s) {
1085             /* last1 is latest possible substr location. If we didn't
1086              * find it before there, we never will */
1087             if (last >= last1) {
1088                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1089                                         ", giving up...\n"));
1090                 goto fail_finish;
1091             }
1092
1093             /* try to find the check substr again at a later
1094              * position. Maybe next time we'll find the "other" substr
1095              * in range too */
1096             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1097                 ", trying %s at offset %ld...\n",
1098                 (other_ix ? "floating" : "anchored"),
1099                 (long)(HOP3c(check_at, 1, strend) - strpos)));
1100
1101             other_last = HOP3c(last, 1, strend) /* highest failure */;
1102             rx_origin =
1103                 other_ix /* i.e. if other-is-float */
1104                     ? HOP3c(rx_origin, 1, strend)
1105                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1106             goto restart;
1107         }
1108         else {
1109             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
1110                   (long)(s - strpos)));
1111
1112             if (other_ix) { /* if (other-is-float) */
1113                 /* other_last is set to s, not s+1, since its possible for
1114                  * a floating substr to fail first time, then succeed
1115                  * second time at the same floating position; e.g.:
1116                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1117                  * The first time round, anchored and float match at
1118                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1119                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1120                  */
1121                 other_last = s;
1122             }
1123             else {
1124                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1125                 other_last = HOP3c(s, 1, strend);
1126             }
1127         }
1128     }
1129     else {
1130         DEBUG_OPTIMISE_MORE_r(
1131             PerlIO_printf(Perl_debug_log,
1132                 "  Check-only match: offset min:%"IVdf" max:%"IVdf
1133                 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1134                 " strend-strpos:%"IVdf"\n",
1135                 (IV)prog->check_offset_min,
1136                 (IV)prog->check_offset_max,
1137                 (IV)(check_at-strpos),
1138                 (IV)(rx_origin-strpos),
1139                 (IV)(rx_origin-check_at),
1140                 (IV)(strend-strpos)
1141             )
1142         );
1143     }
1144
1145   postprocess_substr_matches:
1146
1147     /* handle the extra constraint of /^.../m if present */
1148
1149     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1150         char *s;
1151
1152         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1153                         "  looking for /^/m anchor"));
1154
1155         /* we have failed the constraint of a \n before rx_origin.
1156          * Find the next \n, if any, even if it's beyond the current
1157          * anchored and/or floating substrings. Whether we should be
1158          * scanning ahead for the next \n or the next substr is debatable.
1159          * On the one hand you'd expect rare substrings to appear less
1160          * often than \n's. On the other hand, searching for \n means
1161          * we're effectively flipping been check_substr and "\n" on each
1162          * iteration as the current "rarest" string candidate, which
1163          * means for example that we'll quickly reject the whole string if
1164          * hasn't got a \n, rather than trying every substr position
1165          * first
1166          */
1167
1168         s = HOP3c(strend, - prog->minlen, strpos);
1169         if (s <= rx_origin ||
1170             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1171         {
1172             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1173                             "  Did not find /%s^%s/m...\n",
1174                             PL_colors[0], PL_colors[1]));
1175             goto fail_finish;
1176         }
1177
1178         /* earliest possible origin is 1 char after the \n.
1179          * (since *rx_origin == '\n', it's safe to ++ here rather than
1180          * HOP(rx_origin, 1)) */
1181         rx_origin++;
1182
1183         if (prog->substrs->check_ix == 0  /* check is anchored */
1184             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1185         {
1186             /* Position contradicts check-string; either because
1187              * check was anchored (and thus has no wiggle room),
1188              * or check was float and rx_origin is above the float range */
1189             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1190                 "  Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1191                 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1192             goto restart;
1193         }
1194
1195         /* if we get here, the check substr must have been float,
1196          * is in range, and we may or may not have had an anchored
1197          * "other" substr which still contradicts */
1198         assert(prog->substrs->check_ix); /* check is float */
1199
1200         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1201             /* whoops, the anchored "other" substr exists, so we still
1202              * contradict. On the other hand, the float "check" substr
1203              * didn't contradict, so just retry the anchored "other"
1204              * substr */
1205             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1206                 "  Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1207                 PL_colors[0], PL_colors[1],
1208                 (long)(rx_origin - strpos),
1209                 (long)(rx_origin - strpos + prog->anchored_offset)));
1210             goto do_other_substr;
1211         }
1212
1213         /* success: we don't contradict the found floating substring
1214          * (and there's no anchored substr). */
1215         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1216             "  Found /%s^%s/m at offset %ld...\n",
1217             PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1218     }
1219     else {
1220         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1221             "  (multiline anchor test skipped)\n"));
1222     }
1223
1224   success_at_start:
1225
1226
1227     /* if we have a starting character class, then test that extra constraint.
1228      * (trie stclasses are too expensive to use here, we are better off to
1229      * leave it to regmatch itself) */
1230
1231     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1232         const U8* const str = (U8*)STRING(progi->regstclass);
1233
1234         /* XXX this value could be pre-computed */
1235         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1236                     ?  (reginfo->is_utf8_pat
1237                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1238                         : STR_LEN(progi->regstclass))
1239                     : 1);
1240         char * endpos;
1241         char *s;
1242         /* latest pos that a matching float substr constrains rx start to */
1243         char *rx_max_float = NULL;
1244
1245         /* if the current rx_origin is anchored, either by satisfying an
1246          * anchored substring constraint, or a /^.../m constraint, then we
1247          * can reject the current origin if the start class isn't found
1248          * at the current position. If we have a float-only match, then
1249          * rx_origin is constrained to a range; so look for the start class
1250          * in that range. if neither, then look for the start class in the
1251          * whole rest of the string */
1252
1253         /* XXX DAPM it's not clear what the minlen test is for, and why
1254          * it's not used in the floating case. Nothing in the test suite
1255          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1256          * Here are some old comments, which may or may not be correct:
1257          *
1258          *   minlen == 0 is possible if regstclass is \b or \B,
1259          *   and the fixed substr is ''$.
1260          *   Since minlen is already taken into account, rx_origin+1 is
1261          *   before strend; accidentally, minlen >= 1 guaranties no false
1262          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1263          *   0) below assumes that regstclass does not come from lookahead...
1264          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1265          *   This leaves EXACTF-ish only, which are dealt with in
1266          *   find_byclass().
1267          */
1268
1269         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1270             endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1271         else if (prog->float_substr || prog->float_utf8) {
1272             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1273             endpos= HOP3c(rx_max_float, cl_l, strend);
1274         }
1275         else
1276             endpos= strend;
1277
1278         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1279             "  looking for class: start_shift: %"IVdf" check_at: %"IVdf
1280             " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1281               (IV)start_shift, (IV)(check_at - strbeg),
1282               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1283
1284         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1285                             reginfo);
1286         if (!s) {
1287             if (endpos == strend) {
1288                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1289                                 "  Could not match STCLASS...\n") );
1290                 goto fail;
1291             }
1292             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1293                                "  This position contradicts STCLASS...\n") );
1294             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1295                         && !(prog->intflags & PREGf_IMPLICIT))
1296                 goto fail;
1297
1298             /* Contradict one of substrings */
1299             if (prog->anchored_substr || prog->anchored_utf8) {
1300                 if (prog->substrs->check_ix == 1) { /* check is float */
1301                     /* Have both, check_string is floating */
1302                     assert(rx_origin + start_shift <= check_at);
1303                     if (rx_origin + start_shift != check_at) {
1304                         /* not at latest position float substr could match:
1305                          * Recheck anchored substring, but not floating.
1306                          * The condition above is in bytes rather than
1307                          * chars for efficiency. It's conservative, in
1308                          * that it errs on the side of doing 'goto
1309                          * do_other_substr', where a more accurate
1310                          * char-based calculation will be done */
1311                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1312                                   "  Looking for anchored substr starting at offset %ld...\n",
1313                                   (long)(other_last - strpos)) );
1314                         goto do_other_substr;
1315                     }
1316                 }
1317             }
1318             else {
1319                 /* float-only */
1320
1321                 if (ml_anch) {
1322                     /* In the presence of ml_anch, we might be able to
1323                      * find another \n without breaking the current float
1324                      * constraint. */
1325
1326                     /* strictly speaking this should be HOP3c(..., 1, ...),
1327                      * but since we goto a block of code that's going to
1328                      * search for the next \n if any, its safe here */
1329                     rx_origin++;
1330                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1331                               "  Looking for /%s^%s/m starting at offset %ld...\n",
1332                               PL_colors[0], PL_colors[1],
1333                               (long)(rx_origin - strpos)) );
1334                     goto postprocess_substr_matches;
1335                 }
1336
1337                 /* strictly speaking this can never be true; but might
1338                  * be if we ever allow intuit without substrings */
1339                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1340                     goto fail;
1341
1342                 rx_origin = rx_max_float;
1343             }
1344
1345             /* at this point, any matching substrings have been
1346              * contradicted. Start again... */
1347
1348             rx_origin = HOP3c(rx_origin, 1, strend);
1349
1350             /* uses bytes rather than char calculations for efficiency.
1351              * It's conservative: it errs on the side of doing 'goto restart',
1352              * where there is code that does a proper char-based test */
1353             if (rx_origin + start_shift + end_shift > strend) {
1354                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1355                                        "  Could not match STCLASS...\n") );
1356                 goto fail;
1357             }
1358             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1359                 "  Looking for %s substr starting at offset %ld...\n",
1360                 (prog->substrs->check_ix ? "floating" : "anchored"),
1361                 (long)(rx_origin + start_shift - strpos)) );
1362             goto restart;
1363         }
1364
1365         /* Success !!! */
1366
1367         if (rx_origin != s) {
1368             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1369                         "  By STCLASS: moving %ld --> %ld\n",
1370                                   (long)(rx_origin - strpos), (long)(s - strpos))
1371                    );
1372         }
1373         else {
1374             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1375                                   "  Does not contradict STCLASS...\n");
1376                    );
1377         }
1378     }
1379
1380     /* Decide whether using the substrings helped */
1381
1382     if (rx_origin != strpos) {
1383         /* Fixed substring is found far enough so that the match
1384            cannot start at strpos. */
1385
1386         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  try at offset...\n"));
1387         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1388     }
1389     else {
1390         /* The found rx_origin position does not prohibit matching at
1391          * strpos, so calling intuit didn't gain us anything. Decrement
1392          * the BmUSEFUL() count on the check substring, and if we reach
1393          * zero, free it.  */
1394         if (!(prog->intflags & PREGf_NAUGHTY)
1395             && (utf8_target ? (
1396                 prog->check_utf8                /* Could be deleted already */
1397                 && --BmUSEFUL(prog->check_utf8) < 0
1398                 && (prog->check_utf8 == prog->float_utf8)
1399             ) : (
1400                 prog->check_substr              /* Could be deleted already */
1401                 && --BmUSEFUL(prog->check_substr) < 0
1402                 && (prog->check_substr == prog->float_substr)
1403             )))
1404         {
1405             /* If flags & SOMETHING - do not do it many times on the same match */
1406             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  ... Disabling check substring...\n"));
1407             /* XXX Does the destruction order has to change with utf8_target? */
1408             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1409             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1410             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1411             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1412             check = NULL;                       /* abort */
1413             /* XXXX This is a remnant of the old implementation.  It
1414                     looks wasteful, since now INTUIT can use many
1415                     other heuristics. */
1416             prog->extflags &= ~RXf_USE_INTUIT;
1417         }
1418     }
1419
1420     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1421             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1422              PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
1423
1424     return rx_origin;
1425
1426   fail_finish:                          /* Substring not found */
1427     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1428         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1429   fail:
1430     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1431                           PL_colors[4], PL_colors[5]));
1432     return NULL;
1433 }
1434
1435
1436 #define DECL_TRIE_TYPE(scan) \
1437     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1438                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
1439                     trie_type = ((scan->flags == EXACT) \
1440                               ? (utf8_target ? trie_utf8 : trie_plain) \
1441                               : (scan->flags == EXACTFA) \
1442                                 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1443                                 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1444
1445 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1446 STMT_START {                                                                        \
1447     STRLEN skiplen;                                                                 \
1448     U8 flags = FOLD_FLAGS_FULL;                                                     \
1449     switch (trie_type) {                                                            \
1450     case trie_utf8_exactfa_fold:                                                    \
1451         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1452         /* FALLTHROUGH */                                                          \
1453     case trie_utf8_fold:                                                            \
1454         if ( foldlen>0 ) {                                                          \
1455             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1456             foldlen -= len;                                                         \
1457             uscan += len;                                                           \
1458             len=0;                                                                  \
1459         } else {                                                                    \
1460             uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
1461             len = UTF8SKIP(uc);                                                     \
1462             skiplen = UNISKIP( uvc );                                               \
1463             foldlen -= skiplen;                                                     \
1464             uscan = foldbuf + skiplen;                                              \
1465         }                                                                           \
1466         break;                                                                      \
1467     case trie_latin_utf8_exactfa_fold:                                              \
1468         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1469         /* FALLTHROUGH */                                                          \
1470     case trie_latin_utf8_fold:                                                      \
1471         if ( foldlen>0 ) {                                                          \
1472             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1473             foldlen -= len;                                                         \
1474             uscan += len;                                                           \
1475             len=0;                                                                  \
1476         } else {                                                                    \
1477             len = 1;                                                                \
1478             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1479             skiplen = UNISKIP( uvc );                                               \
1480             foldlen -= skiplen;                                                     \
1481             uscan = foldbuf + skiplen;                                              \
1482         }                                                                           \
1483         break;                                                                      \
1484     case trie_utf8:                                                                 \
1485         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1486         break;                                                                      \
1487     case trie_plain:                                                                \
1488         uvc = (UV)*uc;                                                              \
1489         len = 1;                                                                    \
1490     }                                                                               \
1491     if (uvc < 256) {                                                                \
1492         charid = trie->charmap[ uvc ];                                              \
1493     }                                                                               \
1494     else {                                                                          \
1495         charid = 0;                                                                 \
1496         if (widecharmap) {                                                          \
1497             SV** const svpp = hv_fetch(widecharmap,                                 \
1498                         (char*)&uvc, sizeof(UV), 0);                                \
1499             if (svpp)                                                               \
1500                 charid = (U16)SvIV(*svpp);                                          \
1501         }                                                                           \
1502     }                                                                               \
1503 } STMT_END
1504
1505 #define DUMP_EXEC_POS(li,s,doutf8)                          \
1506     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1507                 startpos, doutf8)
1508
1509 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1510 STMT_START {                                              \
1511     while (s <= e) {                                      \
1512         if ( (COND)                                       \
1513              && (ln == 1 || folder(s, pat_string, ln))    \
1514              && (reginfo->intuit || regtry(reginfo, &s)) )\
1515             goto got_it;                                  \
1516         s++;                                              \
1517     }                                                     \
1518 } STMT_END
1519
1520 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1521 STMT_START {                                          \
1522     while (s < strend) {                              \
1523         CODE                                          \
1524         s += UTF8SKIP(s);                             \
1525     }                                                 \
1526 } STMT_END
1527
1528 #define REXEC_FBC_SCAN(CODE)                          \
1529 STMT_START {                                          \
1530     while (s < strend) {                              \
1531         CODE                                          \
1532         s++;                                          \
1533     }                                                 \
1534 } STMT_END
1535
1536 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1537 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1538     if (COND) {                                                \
1539         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1540             goto got_it;                                       \
1541         else                                                   \
1542             tmp = doevery;                                     \
1543     }                                                          \
1544     else                                                       \
1545         tmp = 1;                                               \
1546 )
1547
1548 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1549 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1550     if (COND) {                                                \
1551         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1552             goto got_it;                                       \
1553         else                                                   \
1554             tmp = doevery;                                     \
1555     }                                                          \
1556     else                                                       \
1557         tmp = 1;                                               \
1558 )
1559
1560 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1561     if (utf8_target) {                                         \
1562         REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1563     }                                                          \
1564     else {                                                     \
1565         REXEC_FBC_CLASS_SCAN(COND);                            \
1566     }
1567
1568 /* The three macros below are slightly different versions of the same logic.
1569  *
1570  * The first is for /a and /aa when the target string is UTF-8.  This can only
1571  * match ascii, but it must advance based on UTF-8.   The other two handle the
1572  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1573  * for the boundary (or non-boundary) between a word and non-word character.
1574  * The utf8 and non-utf8 cases have the same logic, but the details must be
1575  * different.  Find the "wordness" of the character just prior to this one, and
1576  * compare it with the wordness of this one.  If they differ, we have a
1577  * boundary.  At the beginning of the string, pretend that the previous
1578  * character was a new-line.
1579  *
1580  * All these macros uncleanly have side-effects with each other and outside
1581  * variables.  So far it's been too much trouble to clean-up
1582  *
1583  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1584  *               a word character or not.
1585  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1586  *               word/non-word
1587  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1588  *
1589  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1590  * are looking for a boundary or for a non-boundary.  If we are looking for a
1591  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1592  * see if this tentative match actually works, and if so, to quit the loop
1593  * here.  And vice-versa if we are looking for a non-boundary.
1594  *
1595  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1596  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1597  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1598  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1599  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1600  * complement.  But in that branch we complement tmp, meaning that at the
1601  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1602  * which means at the top of the loop in the next iteration, it is
1603  * TEST_NON_UTF8(s-1) */
1604 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1605     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1606     tmp = TEST_NON_UTF8(tmp);                                                  \
1607     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1608         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1609             tmp = !tmp;                                                        \
1610             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1611         }                                                                      \
1612         else {                                                                 \
1613             IF_FAIL;                                                           \
1614         }                                                                      \
1615     );                                                                         \
1616
1617 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1618  * TEST_UTF8 is a macro that for the same input code points returns identically
1619  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1620 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1621     if (s == reginfo->strbeg) {                                                \
1622         tmp = '\n';                                                            \
1623     }                                                                          \
1624     else { /* Back-up to the start of the previous character */                \
1625         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1626         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1627                                                        0, UTF8_ALLOW_DEFAULT); \
1628     }                                                                          \
1629     tmp = TEST_UV(tmp);                                                        \
1630     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1631     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1632         if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
1633             tmp = !tmp;                                                        \
1634             IF_SUCCESS;                                                        \
1635         }                                                                      \
1636         else {                                                                 \
1637             IF_FAIL;                                                           \
1638         }                                                                      \
1639     );
1640
1641 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1642  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1643  * macros below */
1644 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1645     if (utf8_target) {                                                         \
1646         UTF8_CODE                                                              \
1647     }                                                                          \
1648     else {  /* Not utf8 */                                                     \
1649         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1650         tmp = TEST_NON_UTF8(tmp);                                              \
1651         REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1652             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1653                 IF_SUCCESS;                                                    \
1654                 tmp = !tmp;                                                    \
1655             }                                                                  \
1656             else {                                                             \
1657                 IF_FAIL;                                                       \
1658             }                                                                  \
1659         );                                                                     \
1660     }                                                                          \
1661     /* Here, things have been set up by the previous code so that tmp is the   \
1662      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1663      * utf8ness of the target).  We also have to check if this matches against \
1664      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1665      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1666      * string */                                                               \
1667     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1668         IF_SUCCESS;                                                            \
1669     }                                                                          \
1670     else {                                                                     \
1671         IF_FAIL;                                                               \
1672     }
1673
1674 /* This is the macro to use when we want to see if something that looks like it
1675  * could match, actually does, and if so exits the loop */
1676 #define REXEC_FBC_TRYIT                            \
1677     if ((reginfo->intuit || regtry(reginfo, &s)))  \
1678         goto got_it
1679
1680 /* The only difference between the BOUND and NBOUND cases is that
1681  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1682  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1683  * with the other one being empty (PLACEHOLDER is defined as empty).
1684  *
1685  * The TEST_FOO parameters are for operating on different forms of input, but
1686  * all should be ones that return identically for the same underlying code
1687  * points */
1688 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1689     FBC_BOUND_COMMON(                                                          \
1690           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1691           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1692
1693 #define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                         \
1694     FBC_BOUND_COMMON(                                                          \
1695             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1696             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1697
1698 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1699     FBC_BOUND_COMMON(                                                          \
1700           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1701           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1702
1703 #define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                        \
1704     FBC_BOUND_COMMON(                                                          \
1705             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1706             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1707
1708
1709 /* We know what class REx starts with.  Try to find this position... */
1710 /* if reginfo->intuit, its a dryrun */
1711 /* annoyingly all the vars in this routine have different names from their counterparts
1712    in regmatch. /grrr */
1713 STATIC char *
1714 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1715     const char *strend, regmatch_info *reginfo)
1716 {
1717     dVAR;
1718     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1719     char *pat_string;   /* The pattern's exactish string */
1720     char *pat_end;          /* ptr to end char of pat_string */
1721     re_fold_t folder;   /* Function for computing non-utf8 folds */
1722     const U8 *fold_array;   /* array for folding ords < 256 */
1723     STRLEN ln;
1724     STRLEN lnc;
1725     U8 c1;
1726     U8 c2;
1727     char *e;
1728     I32 tmp = 1;        /* Scratch variable? */
1729     const bool utf8_target = reginfo->is_utf8_target;
1730     UV utf8_fold_flags = 0;
1731     const bool is_utf8_pat = reginfo->is_utf8_pat;
1732     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1733                                    with a result inverts that result, as 0^1 =
1734                                    1 and 1^1 = 0 */
1735     _char_class_number classnum;
1736
1737     RXi_GET_DECL(prog,progi);
1738
1739     PERL_ARGS_ASSERT_FIND_BYCLASS;
1740
1741     /* We know what class it must start with. */
1742     switch (OP(c)) {
1743     case ANYOF:
1744         if (utf8_target) {
1745             REXEC_FBC_UTF8_CLASS_SCAN(
1746                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1747         }
1748         else {
1749             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1750         }
1751         break;
1752     case CANY:
1753         REXEC_FBC_SCAN(
1754             if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1755                 goto got_it;
1756             else
1757                 tmp = doevery;
1758         );
1759         break;
1760
1761     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1762         assert(! is_utf8_pat);
1763         /* FALLTHROUGH */
1764     case EXACTFA:
1765         if (is_utf8_pat || utf8_target) {
1766             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1767             goto do_exactf_utf8;
1768         }
1769         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1770         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1771         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1772
1773     case EXACTF:   /* This node only generated for non-utf8 patterns */
1774         assert(! is_utf8_pat);
1775         if (utf8_target) {
1776             utf8_fold_flags = 0;
1777             goto do_exactf_utf8;
1778         }
1779         fold_array = PL_fold;
1780         folder = foldEQ;
1781         goto do_exactf_non_utf8;
1782
1783     case EXACTFL:
1784         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1785             utf8_fold_flags = FOLDEQ_LOCALE;
1786             goto do_exactf_utf8;
1787         }
1788         fold_array = PL_fold_locale;
1789         folder = foldEQ_locale;
1790         goto do_exactf_non_utf8;
1791
1792     case EXACTFU_SS:
1793         if (is_utf8_pat) {
1794             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1795         }
1796         goto do_exactf_utf8;
1797
1798     case EXACTFU:
1799         if (is_utf8_pat || utf8_target) {
1800             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1801             goto do_exactf_utf8;
1802         }
1803
1804         /* Any 'ss' in the pattern should have been replaced by regcomp,
1805          * so we don't have to worry here about this single special case
1806          * in the Latin1 range */
1807         fold_array = PL_fold_latin1;
1808         folder = foldEQ_latin1;
1809
1810         /* FALLTHROUGH */
1811
1812     do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1813                            are no glitches with fold-length differences
1814                            between the target string and pattern */
1815
1816         /* The idea in the non-utf8 EXACTF* cases is to first find the
1817          * first character of the EXACTF* node and then, if necessary,
1818          * case-insensitively compare the full text of the node.  c1 is the
1819          * first character.  c2 is its fold.  This logic will not work for
1820          * Unicode semantics and the german sharp ss, which hence should
1821          * not be compiled into a node that gets here. */
1822         pat_string = STRING(c);
1823         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1824
1825         /* We know that we have to match at least 'ln' bytes (which is the
1826          * same as characters, since not utf8).  If we have to match 3
1827          * characters, and there are only 2 availabe, we know without
1828          * trying that it will fail; so don't start a match past the
1829          * required minimum number from the far end */
1830         e = HOP3c(strend, -((SSize_t)ln), s);
1831
1832         if (reginfo->intuit && e < s) {
1833             e = s;                      /* Due to minlen logic of intuit() */
1834         }
1835
1836         c1 = *pat_string;
1837         c2 = fold_array[c1];
1838         if (c1 == c2) { /* If char and fold are the same */
1839             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1840         }
1841         else {
1842             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1843         }
1844         break;
1845
1846     do_exactf_utf8:
1847     {
1848         unsigned expansion;
1849
1850         /* If one of the operands is in utf8, we can't use the simpler folding
1851          * above, due to the fact that many different characters can have the
1852          * same fold, or portion of a fold, or different- length fold */
1853         pat_string = STRING(c);
1854         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1855         pat_end = pat_string + ln;
1856         lnc = is_utf8_pat       /* length to match in characters */
1857                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1858                 : ln;
1859
1860         /* We have 'lnc' characters to match in the pattern, but because of
1861          * multi-character folding, each character in the target can match
1862          * up to 3 characters (Unicode guarantees it will never exceed
1863          * this) if it is utf8-encoded; and up to 2 if not (based on the
1864          * fact that the Latin 1 folds are already determined, and the
1865          * only multi-char fold in that range is the sharp-s folding to
1866          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1867          * string character.  Adjust lnc accordingly, rounding up, so that
1868          * if we need to match at least 4+1/3 chars, that really is 5. */
1869         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1870         lnc = (lnc + expansion - 1) / expansion;
1871
1872         /* As in the non-UTF8 case, if we have to match 3 characters, and
1873          * only 2 are left, it's guaranteed to fail, so don't start a
1874          * match that would require us to go beyond the end of the string
1875          */
1876         e = HOP3c(strend, -((SSize_t)lnc), s);
1877
1878         if (reginfo->intuit && e < s) {
1879             e = s;                      /* Due to minlen logic of intuit() */
1880         }
1881
1882         /* XXX Note that we could recalculate e to stop the loop earlier,
1883          * as the worst case expansion above will rarely be met, and as we
1884          * go along we would usually find that e moves further to the left.
1885          * This would happen only after we reached the point in the loop
1886          * where if there were no expansion we should fail.  Unclear if
1887          * worth the expense */
1888
1889         while (s <= e) {
1890             char *my_strend= (char *)strend;
1891             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1892                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1893                 && (reginfo->intuit || regtry(reginfo, &s)) )
1894             {
1895                 goto got_it;
1896             }
1897             s += (utf8_target) ? UTF8SKIP(s) : 1;
1898         }
1899         break;
1900     }
1901
1902     case BOUNDL:
1903         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1904         break;
1905     case NBOUNDL:
1906         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1907         break;
1908     case BOUND:
1909         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
1910         break;
1911     case BOUNDA:
1912         FBC_BOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
1913         break;
1914     case NBOUND:
1915         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
1916         break;
1917     case NBOUNDA:
1918         FBC_NBOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
1919         break;
1920     case BOUNDU:
1921         FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
1922         break;
1923     case NBOUNDU:
1924         FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
1925         break;
1926     case LNBREAK:
1927         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1928                         is_LNBREAK_latin1_safe(s, strend)
1929         );
1930         break;
1931
1932     /* The argument to all the POSIX node types is the class number to pass to
1933      * _generic_isCC() to build a mask for searching in PL_charclass[] */
1934
1935     case NPOSIXL:
1936         to_complement = 1;
1937         /* FALLTHROUGH */
1938
1939     case POSIXL:
1940         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1941                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1942         break;
1943
1944     case NPOSIXD:
1945         to_complement = 1;
1946         /* FALLTHROUGH */
1947
1948     case POSIXD:
1949         if (utf8_target) {
1950             goto posix_utf8;
1951         }
1952         goto posixa;
1953
1954     case NPOSIXA:
1955         if (utf8_target) {
1956             /* The complement of something that matches only ASCII matches all
1957              * non-ASCII, plus everything in ASCII that isn't in the class. */
1958             REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
1959                                       || ! _generic_isCC_A(*s, FLAGS(c)));
1960             break;
1961         }
1962
1963         to_complement = 1;
1964         /* FALLTHROUGH */
1965
1966     case POSIXA:
1967       posixa:
1968         /* Don't need to worry about utf8, as it can match only a single
1969          * byte invariant character. */
1970         REXEC_FBC_CLASS_SCAN(
1971                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1972         break;
1973
1974     case NPOSIXU:
1975         to_complement = 1;
1976         /* FALLTHROUGH */
1977
1978     case POSIXU:
1979         if (! utf8_target) {
1980             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1981                                                                     FLAGS(c))));
1982         }
1983         else {
1984
1985       posix_utf8:
1986             classnum = (_char_class_number) FLAGS(c);
1987             if (classnum < _FIRST_NON_SWASH_CC) {
1988                 while (s < strend) {
1989
1990                     /* We avoid loading in the swash as long as possible, but
1991                      * should we have to, we jump to a separate loop.  This
1992                      * extra 'if' statement is what keeps this code from being
1993                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1994                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
1995                         goto found_above_latin1;
1996                     }
1997                     if ((UTF8_IS_INVARIANT(*s)
1998                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1999                                                                 classnum)))
2000                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
2001                             && to_complement ^ cBOOL(
2002                                 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
2003                                                                       *(s + 1)),
2004                                               classnum))))
2005                     {
2006                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2007                             goto got_it;
2008                         else {
2009                             tmp = doevery;
2010                         }
2011                     }
2012                     else {
2013                         tmp = 1;
2014                     }
2015                     s += UTF8SKIP(s);
2016                 }
2017             }
2018             else switch (classnum) {    /* These classes are implemented as
2019                                            macros */
2020                 case _CC_ENUM_SPACE: /* XXX would require separate code if we
2021                                         revert the change of \v matching this */
2022                     /* FALLTHROUGH */
2023
2024                 case _CC_ENUM_PSXSPC:
2025                     REXEC_FBC_UTF8_CLASS_SCAN(
2026                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
2027                     break;
2028
2029                 case _CC_ENUM_BLANK:
2030                     REXEC_FBC_UTF8_CLASS_SCAN(
2031                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
2032                     break;
2033
2034                 case _CC_ENUM_XDIGIT:
2035                     REXEC_FBC_UTF8_CLASS_SCAN(
2036                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2037                     break;
2038
2039                 case _CC_ENUM_VERTSPACE:
2040                     REXEC_FBC_UTF8_CLASS_SCAN(
2041                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
2042                     break;
2043
2044                 case _CC_ENUM_CNTRL:
2045                     REXEC_FBC_UTF8_CLASS_SCAN(
2046                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
2047                     break;
2048
2049                 default:
2050                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2051                     assert(0); /* NOTREACHED */
2052             }
2053         }
2054         break;
2055
2056       found_above_latin1:   /* Here we have to load a swash to get the result
2057                                for the current code point */
2058         if (! PL_utf8_swash_ptrs[classnum]) {
2059             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2060             PL_utf8_swash_ptrs[classnum] =
2061                     _core_swash_init("utf8",
2062                                      "",
2063                                      &PL_sv_undef, 1, 0,
2064                                      PL_XPosix_ptrs[classnum], &flags);
2065         }
2066
2067         /* This is a copy of the loop above for swash classes, though using the
2068          * FBC macro instead of being expanded out.  Since we've loaded the
2069          * swash, we don't have to check for that each time through the loop */
2070         REXEC_FBC_UTF8_CLASS_SCAN(
2071                 to_complement ^ cBOOL(_generic_utf8(
2072                                       classnum,
2073                                       s,
2074                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2075                                                   (U8 *) s, TRUE))));
2076         break;
2077
2078     case AHOCORASICKC:
2079     case AHOCORASICK:
2080         {
2081             DECL_TRIE_TYPE(c);
2082             /* what trie are we using right now */
2083             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2084             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2085             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2086
2087             const char *last_start = strend - trie->minlen;
2088 #ifdef DEBUGGING
2089             const char *real_start = s;
2090 #endif
2091             STRLEN maxlen = trie->maxlen;
2092             SV *sv_points;
2093             U8 **points; /* map of where we were in the input string
2094                             when reading a given char. For ASCII this
2095                             is unnecessary overhead as the relationship
2096                             is always 1:1, but for Unicode, especially
2097                             case folded Unicode this is not true. */
2098             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2099             U8 *bitmap=NULL;
2100
2101
2102             GET_RE_DEBUG_FLAGS_DECL;
2103
2104             /* We can't just allocate points here. We need to wrap it in
2105              * an SV so it gets freed properly if there is a croak while
2106              * running the match */
2107             ENTER;
2108             SAVETMPS;
2109             sv_points=newSV(maxlen * sizeof(U8 *));
2110             SvCUR_set(sv_points,
2111                 maxlen * sizeof(U8 *));
2112             SvPOK_on(sv_points);
2113             sv_2mortal(sv_points);
2114             points=(U8**)SvPV_nolen(sv_points );
2115             if ( trie_type != trie_utf8_fold
2116                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2117             {
2118                 if (trie->bitmap)
2119                     bitmap=(U8*)trie->bitmap;
2120                 else
2121                     bitmap=(U8*)ANYOF_BITMAP(c);
2122             }
2123             /* this is the Aho-Corasick algorithm modified a touch
2124                to include special handling for long "unknown char" sequences.
2125                The basic idea being that we use AC as long as we are dealing
2126                with a possible matching char, when we encounter an unknown char
2127                (and we have not encountered an accepting state) we scan forward
2128                until we find a legal starting char.
2129                AC matching is basically that of trie matching, except that when
2130                we encounter a failing transition, we fall back to the current
2131                states "fail state", and try the current char again, a process
2132                we repeat until we reach the root state, state 1, or a legal
2133                transition. If we fail on the root state then we can either
2134                terminate if we have reached an accepting state previously, or
2135                restart the entire process from the beginning if we have not.
2136
2137              */
2138             while (s <= last_start) {
2139                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2140                 U8 *uc = (U8*)s;
2141                 U16 charid = 0;
2142                 U32 base = 1;
2143                 U32 state = 1;
2144                 UV uvc = 0;
2145                 STRLEN len = 0;
2146                 STRLEN foldlen = 0;
2147                 U8 *uscan = (U8*)NULL;
2148                 U8 *leftmost = NULL;
2149 #ifdef DEBUGGING
2150                 U32 accepted_word= 0;
2151 #endif
2152                 U32 pointpos = 0;
2153
2154                 while ( state && uc <= (U8*)strend ) {
2155                     int failed=0;
2156                     U32 word = aho->states[ state ].wordnum;
2157
2158                     if( state==1 ) {
2159                         if ( bitmap ) {
2160                             DEBUG_TRIE_EXECUTE_r(
2161                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2162                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2163                                         (char *)uc, utf8_target );
2164                                     PerlIO_printf( Perl_debug_log,
2165                                         " Scanning for legal start char...\n");
2166                                 }
2167                             );
2168                             if (utf8_target) {
2169                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2170                                     uc += UTF8SKIP(uc);
2171                                 }
2172                             } else {
2173                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2174                                     uc++;
2175                                 }
2176                             }
2177                             s= (char *)uc;
2178                         }
2179                         if (uc >(U8*)last_start) break;
2180                     }
2181
2182                     if ( word ) {
2183                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2184                         if (!leftmost || lpos < leftmost) {
2185                             DEBUG_r(accepted_word=word);
2186                             leftmost= lpos;
2187                         }
2188                         if (base==0) break;
2189
2190                     }
2191                     points[pointpos++ % maxlen]= uc;
2192                     if (foldlen || uc < (U8*)strend) {
2193                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2194                                          widecharmap, uc,
2195                                          uscan, len, uvc, charid, foldlen,
2196                                          foldbuf, uniflags);
2197                         DEBUG_TRIE_EXECUTE_r({
2198                             dump_exec_pos( (char *)uc, c, strend,
2199                                         real_start, s, utf8_target);
2200                             PerlIO_printf(Perl_debug_log,
2201                                 " Charid:%3u CP:%4"UVxf" ",
2202                                  charid, uvc);
2203                         });
2204                     }
2205                     else {
2206                         len = 0;
2207                         charid = 0;
2208                     }
2209
2210
2211                     do {
2212 #ifdef DEBUGGING
2213                         word = aho->states[ state ].wordnum;
2214 #endif
2215                         base = aho->states[ state ].trans.base;
2216
2217                         DEBUG_TRIE_EXECUTE_r({
2218                             if (failed)
2219                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2220                                     s,   utf8_target );
2221                             PerlIO_printf( Perl_debug_log,
2222                                 "%sState: %4"UVxf", word=%"UVxf,
2223                                 failed ? " Fail transition to " : "",
2224                                 (UV)state, (UV)word);
2225                         });
2226                         if ( base ) {
2227                             U32 tmp;
2228                             I32 offset;
2229                             if (charid &&
2230                                  ( ((offset = base + charid
2231                                     - 1 - trie->uniquecharcount)) >= 0)
2232                                  && ((U32)offset < trie->lasttrans)
2233                                  && trie->trans[offset].check == state
2234                                  && (tmp=trie->trans[offset].next))
2235                             {
2236                                 DEBUG_TRIE_EXECUTE_r(
2237                                     PerlIO_printf( Perl_debug_log," - legal\n"));
2238                                 state = tmp;
2239                                 break;
2240                             }
2241                             else {
2242                                 DEBUG_TRIE_EXECUTE_r(
2243                                     PerlIO_printf( Perl_debug_log," - fail\n"));
2244                                 failed = 1;
2245                                 state = aho->fail[state];
2246                             }
2247                         }
2248                         else {
2249                             /* we must be accepting here */
2250                             DEBUG_TRIE_EXECUTE_r(
2251                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
2252                             failed = 1;
2253                             break;
2254                         }
2255                     } while(state);
2256                     uc += len;
2257                     if (failed) {
2258                         if (leftmost)
2259                             break;
2260                         if (!state) state = 1;
2261                     }
2262                 }
2263                 if ( aho->states[ state ].wordnum ) {
2264                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2265                     if (!leftmost || lpos < leftmost) {
2266                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2267                         leftmost = lpos;
2268                     }
2269                 }
2270                 if (leftmost) {
2271                     s = (char*)leftmost;
2272                     DEBUG_TRIE_EXECUTE_r({
2273                         PerlIO_printf(
2274                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2275                             (UV)accepted_word, (IV)(s - real_start)
2276                         );
2277                     });
2278                     if (reginfo->intuit || regtry(reginfo, &s)) {
2279                         FREETMPS;
2280                         LEAVE;
2281                         goto got_it;
2282                     }
2283                     s = HOPc(s,1);
2284                     DEBUG_TRIE_EXECUTE_r({
2285                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2286                     });
2287                 } else {
2288                     DEBUG_TRIE_EXECUTE_r(
2289                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2290                     break;
2291                 }
2292             }
2293             FREETMPS;
2294             LEAVE;
2295         }
2296         break;
2297     default:
2298         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2299     }
2300     return 0;
2301   got_it:
2302     return s;
2303 }
2304
2305 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2306  * flags have same meanings as with regexec_flags() */
2307
2308 static void
2309 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2310                             char *strbeg,
2311                             char *strend,
2312                             SV *sv,
2313                             U32 flags,
2314                             bool utf8_target)
2315 {
2316     struct regexp *const prog = ReANY(rx);
2317
2318     if (flags & REXEC_COPY_STR) {
2319 #ifdef PERL_ANY_COW
2320         if (SvCANCOW(sv)) {
2321             if (DEBUG_C_TEST) {
2322                 PerlIO_printf(Perl_debug_log,
2323                               "Copy on write: regexp capture, type %d\n",
2324                               (int) SvTYPE(sv));
2325             }
2326             /* Create a new COW SV to share the match string and store
2327              * in saved_copy, unless the current COW SV in saved_copy
2328              * is valid and suitable for our purpose */
2329             if ((   prog->saved_copy
2330                  && SvIsCOW(prog->saved_copy)
2331                  && SvPOKp(prog->saved_copy)
2332                  && SvIsCOW(sv)
2333                  && SvPOKp(sv)
2334                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2335             {
2336                 /* just reuse saved_copy SV */
2337                 if (RXp_MATCH_COPIED(prog)) {
2338                     Safefree(prog->subbeg);
2339                     RXp_MATCH_COPIED_off(prog);
2340                 }
2341             }
2342             else {
2343                 /* create new COW SV to share string */
2344                 RX_MATCH_COPY_FREE(rx);
2345                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2346             }
2347             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2348             assert (SvPOKp(prog->saved_copy));
2349             prog->sublen  = strend - strbeg;
2350             prog->suboffset = 0;
2351             prog->subcoffset = 0;
2352         } else
2353 #endif
2354         {
2355             SSize_t min = 0;
2356             SSize_t max = strend - strbeg;
2357             SSize_t sublen;
2358
2359             if (    (flags & REXEC_COPY_SKIP_POST)
2360                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2361                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2362             ) { /* don't copy $' part of string */
2363                 U32 n = 0;
2364                 max = -1;
2365                 /* calculate the right-most part of the string covered
2366                  * by a capture. Due to look-ahead, this may be to
2367                  * the right of $&, so we have to scan all captures */
2368                 while (n <= prog->lastparen) {
2369                     if (prog->offs[n].end > max)
2370                         max = prog->offs[n].end;
2371                     n++;
2372                 }
2373                 if (max == -1)
2374                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2375                             ? prog->offs[0].start
2376                             : 0;
2377                 assert(max >= 0 && max <= strend - strbeg);
2378             }
2379
2380             if (    (flags & REXEC_COPY_SKIP_PRE)
2381                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2382                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2383             ) { /* don't copy $` part of string */
2384                 U32 n = 0;
2385                 min = max;
2386                 /* calculate the left-most part of the string covered
2387                  * by a capture. Due to look-behind, this may be to
2388                  * the left of $&, so we have to scan all captures */
2389                 while (min && n <= prog->lastparen) {
2390                     if (   prog->offs[n].start != -1
2391                         && prog->offs[n].start < min)
2392                     {
2393                         min = prog->offs[n].start;
2394                     }
2395                     n++;
2396                 }
2397                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2398                     && min >  prog->offs[0].end
2399                 )
2400                     min = prog->offs[0].end;
2401
2402             }
2403
2404             assert(min >= 0 && min <= max && min <= strend - strbeg);
2405             sublen = max - min;
2406
2407             if (RX_MATCH_COPIED(rx)) {
2408                 if (sublen > prog->sublen)
2409                     prog->subbeg =
2410                             (char*)saferealloc(prog->subbeg, sublen+1);
2411             }
2412             else
2413                 prog->subbeg = (char*)safemalloc(sublen+1);
2414             Copy(strbeg + min, prog->subbeg, sublen, char);
2415             prog->subbeg[sublen] = '\0';
2416             prog->suboffset = min;
2417             prog->sublen = sublen;
2418             RX_MATCH_COPIED_on(rx);
2419         }
2420         prog->subcoffset = prog->suboffset;
2421         if (prog->suboffset && utf8_target) {
2422             /* Convert byte offset to chars.
2423              * XXX ideally should only compute this if @-/@+
2424              * has been seen, a la PL_sawampersand ??? */
2425
2426             /* If there's a direct correspondence between the
2427              * string which we're matching and the original SV,
2428              * then we can use the utf8 len cache associated with
2429              * the SV. In particular, it means that under //g,
2430              * sv_pos_b2u() will use the previously cached
2431              * position to speed up working out the new length of
2432              * subcoffset, rather than counting from the start of
2433              * the string each time. This stops
2434              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2435              * from going quadratic */
2436             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2437                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2438                                                 SV_GMAGIC|SV_CONST_RETURN);
2439             else
2440                 prog->subcoffset = utf8_length((U8*)strbeg,
2441                                     (U8*)(strbeg+prog->suboffset));
2442         }
2443     }
2444     else {
2445         RX_MATCH_COPY_FREE(rx);
2446         prog->subbeg = strbeg;
2447         prog->suboffset = 0;
2448         prog->subcoffset = 0;
2449         prog->sublen = strend - strbeg;
2450     }
2451 }
2452
2453
2454
2455
2456 /*
2457  - regexec_flags - match a regexp against a string
2458  */
2459 I32
2460 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2461               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2462 /* stringarg: the point in the string at which to begin matching */
2463 /* strend:    pointer to null at end of string */
2464 /* strbeg:    real beginning of string */
2465 /* minend:    end of match must be >= minend bytes after stringarg. */
2466 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2467  *            itself is accessed via the pointers above */
2468 /* data:      May be used for some additional optimizations.
2469               Currently unused. */
2470 /* flags:     For optimizations. See REXEC_* in regexp.h */
2471
2472 {
2473     struct regexp *const prog = ReANY(rx);
2474     char *s;
2475     regnode *c;
2476     char *startpos;
2477     SSize_t minlen;             /* must match at least this many chars */
2478     SSize_t dontbother = 0;     /* how many characters not to try at end */
2479     const bool utf8_target = cBOOL(DO_UTF8(sv));
2480     I32 multiline;
2481     RXi_GET_DECL(prog,progi);
2482     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2483     regmatch_info *const reginfo = &reginfo_buf;
2484     regexp_paren_pair *swap = NULL;
2485     I32 oldsave;
2486     GET_RE_DEBUG_FLAGS_DECL;
2487
2488     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2489     PERL_UNUSED_ARG(data);
2490
2491     /* Be paranoid... */
2492     if (prog == NULL || stringarg == NULL) {
2493         Perl_croak(aTHX_ "NULL regexp parameter");
2494     }
2495
2496     DEBUG_EXECUTE_r(
2497         debug_start_match(rx, utf8_target, stringarg, strend,
2498         "Matching");
2499     );
2500
2501     startpos = stringarg;
2502
2503     if (prog->intflags & PREGf_GPOS_SEEN) {
2504         MAGIC *mg;
2505
2506         /* set reginfo->ganch, the position where \G can match */
2507
2508         reginfo->ganch =
2509             (flags & REXEC_IGNOREPOS)
2510             ? stringarg /* use start pos rather than pos() */
2511             : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2512               /* Defined pos(): */
2513             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2514             : strbeg; /* pos() not defined; use start of string */
2515
2516         DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2517             "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2518
2519         /* in the presence of \G, we may need to start looking earlier in
2520          * the string than the suggested start point of stringarg:
2521          * if prog->gofs is set, then that's a known, fixed minimum
2522          * offset, such as
2523          * /..\G/:   gofs = 2
2524          * /ab|c\G/: gofs = 1
2525          * or if the minimum offset isn't known, then we have to go back
2526          * to the start of the string, e.g. /w+\G/
2527          */
2528
2529         if (prog->intflags & PREGf_ANCH_GPOS) {
2530             startpos  = reginfo->ganch - prog->gofs;
2531             if (startpos <
2532                 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2533             {
2534                 DEBUG_r(PerlIO_printf(Perl_debug_log,
2535                         "fail: ganch-gofs before earliest possible start\n"));
2536                 return 0;
2537             }
2538         }
2539         else if (prog->gofs) {
2540             if (startpos - prog->gofs < strbeg)
2541                 startpos = strbeg;
2542             else
2543                 startpos -= prog->gofs;
2544         }
2545         else if (prog->intflags & PREGf_GPOS_FLOAT)
2546             startpos = strbeg;
2547     }
2548
2549     minlen = prog->minlen;
2550     if ((startpos + minlen) > strend || startpos < strbeg) {
2551         DEBUG_r(PerlIO_printf(Perl_debug_log,
2552                     "Regex match can't succeed, so not even tried\n"));
2553         return 0;
2554     }
2555
2556     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2557      * which will call destuctors to reset PL_regmatch_state, free higher
2558      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2559      * regmatch_info_aux_eval */
2560
2561     oldsave = PL_savestack_ix;
2562
2563     s = startpos;
2564
2565     if ((prog->extflags & RXf_USE_INTUIT)
2566         && !(flags & REXEC_CHECKED))
2567     {
2568         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2569                                     flags, NULL);
2570         if (!s)
2571             return 0;
2572
2573         if (prog->extflags & RXf_CHECK_ALL) {
2574             /* we can match based purely on the result of INTUIT.
2575              * Set up captures etc just for $& and $-[0]
2576              * (an intuit-only match wont have $1,$2,..) */
2577             assert(!prog->nparens);
2578
2579             /* s/// doesn't like it if $& is earlier than where we asked it to
2580              * start searching (which can happen on something like /.\G/) */
2581             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2582                     && (s < stringarg))
2583             {
2584                 /* this should only be possible under \G */
2585                 assert(prog->intflags & PREGf_GPOS_SEEN);
2586                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2587                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2588                 goto phooey;
2589             }
2590
2591             /* match via INTUIT shouldn't have any captures.
2592              * Let @-, @+, $^N know */
2593             prog->lastparen = prog->lastcloseparen = 0;
2594             RX_MATCH_UTF8_set(rx, utf8_target);
2595             prog->offs[0].start = s - strbeg;
2596             prog->offs[0].end = utf8_target
2597                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2598                 : s - strbeg + prog->minlenret;
2599             if ( !(flags & REXEC_NOT_FIRST) )
2600                 S_reg_set_capture_string(aTHX_ rx,
2601                                         strbeg, strend,
2602                                         sv, flags, utf8_target);
2603
2604             return 1;
2605         }
2606     }
2607
2608     multiline = prog->extflags & RXf_PMf_MULTILINE;
2609
2610     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2611         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2612                               "String too short [regexec_flags]...\n"));
2613         goto phooey;
2614     }
2615
2616     /* Check validity of program. */
2617     if (UCHARAT(progi->program) != REG_MAGIC) {
2618         Perl_croak(aTHX_ "corrupted regexp program");
2619     }
2620
2621     RX_MATCH_TAINTED_off(rx);
2622
2623     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2624     reginfo->intuit = 0;
2625     reginfo->is_utf8_target = cBOOL(utf8_target);
2626     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2627     reginfo->warned = FALSE;
2628     reginfo->strbeg  = strbeg;
2629     reginfo->sv = sv;
2630     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2631     reginfo->strend = strend;
2632     /* see how far we have to get to not match where we matched before */
2633     reginfo->till = stringarg + minend;
2634
2635     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
2636         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2637            S_cleanup_regmatch_info_aux has executed (registered by
2638            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
2639            magic belonging to this SV.
2640            Not newSVsv, either, as it does not COW.
2641         */
2642         assert(!IS_PADGV(sv));
2643         reginfo->sv = newSV(0);
2644         SvSetSV_nosteal(reginfo->sv, sv);
2645         SAVEFREESV(reginfo->sv);
2646     }
2647
2648     /* reserve next 2 or 3 slots in PL_regmatch_state:
2649      * slot N+0: may currently be in use: skip it
2650      * slot N+1: use for regmatch_info_aux struct
2651      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2652      * slot N+3: ready for use by regmatch()
2653      */
2654
2655     {
2656         regmatch_state *old_regmatch_state;
2657         regmatch_slab  *old_regmatch_slab;
2658         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2659
2660         /* on first ever match, allocate first slab */
2661         if (!PL_regmatch_slab) {
2662             Newx(PL_regmatch_slab, 1, regmatch_slab);
2663             PL_regmatch_slab->prev = NULL;
2664             PL_regmatch_slab->next = NULL;
2665             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2666         }
2667
2668         old_regmatch_state = PL_regmatch_state;
2669         old_regmatch_slab  = PL_regmatch_slab;
2670
2671         for (i=0; i <= max; i++) {
2672             if (i == 1)
2673                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2674             else if (i ==2)
2675                 reginfo->info_aux_eval =
2676                 reginfo->info_aux->info_aux_eval =
2677                             &(PL_regmatch_state->u.info_aux_eval);
2678
2679             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2680                 PL_regmatch_state = S_push_slab(aTHX);
2681         }
2682
2683         /* note initial PL_regmatch_state position; at end of match we'll
2684          * pop back to there and free any higher slabs */
2685
2686         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2687         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
2688         reginfo->info_aux->poscache = NULL;
2689
2690         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2691
2692         if ((prog->extflags & RXf_EVAL_SEEN))
2693             S_setup_eval_state(aTHX_ reginfo);
2694         else
2695             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2696     }
2697
2698     /* If there is a "must appear" string, look for it. */
2699
2700     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2701         /* We have to be careful. If the previous successful match
2702            was from this regex we don't want a subsequent partially
2703            successful match to clobber the old results.
2704            So when we detect this possibility we add a swap buffer
2705            to the re, and switch the buffer each match. If we fail,
2706            we switch it back; otherwise we leave it swapped.
2707         */
2708         swap = prog->offs;
2709         /* do we need a save destructor here for eval dies? */
2710         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2711         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2712             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2713             PTR2UV(prog),
2714             PTR2UV(swap),
2715             PTR2UV(prog->offs)
2716         ));
2717     }
2718
2719     /* Simplest case:  anchored match need be tried only once. */
2720     /*  [unless only anchor is BOL and multiline is set] */
2721     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
2722         if (s == startpos && regtry(reginfo, &s))
2723             goto got_it;
2724         else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
2725         {
2726             char *end;
2727
2728             if (minlen)
2729                 dontbother = minlen - 1;
2730             end = HOP3c(strend, -dontbother, strbeg) - 1;
2731             /* for multiline we only have to try after newlines */
2732             if (prog->check_substr || prog->check_utf8) {
2733                 /* because of the goto we can not easily reuse the macros for bifurcating the
2734                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2735                 if (utf8_target) {
2736                     if (s == startpos)
2737                         goto after_try_utf8;
2738                     while (1) {
2739                         if (regtry(reginfo, &s)) {
2740                             goto got_it;
2741                         }
2742                       after_try_utf8:
2743                         if (s > end) {
2744                             goto phooey;
2745                         }
2746                         if (prog->extflags & RXf_USE_INTUIT) {
2747                             s = re_intuit_start(rx, sv, strbeg,
2748                                     s + UTF8SKIP(s), strend, flags, NULL);
2749                             if (!s) {
2750                                 goto phooey;
2751                             }
2752                         }
2753                         else {
2754                             s += UTF8SKIP(s);
2755                         }
2756                     }
2757                 } /* end search for check string in unicode */
2758                 else {
2759                     if (s == startpos) {
2760                         goto after_try_latin;
2761                     }
2762                     while (1) {
2763                         if (regtry(reginfo, &s)) {
2764                             goto got_it;
2765                         }
2766                       after_try_latin:
2767                         if (s > end) {
2768                             goto phooey;
2769                         }
2770                         if (prog->extflags & RXf_USE_INTUIT) {
2771                             s = re_intuit_start(rx, sv, strbeg,
2772                                         s + 1, strend, flags, NULL);
2773                             if (!s) {
2774                                 goto phooey;
2775                             }
2776                         }
2777                         else {
2778                             s++;
2779                         }
2780                     }
2781                 } /* end search for check string in latin*/
2782             } /* end search for check string */
2783             else { /* search for newline */
2784                 if (s > startpos) {
2785                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2786                     s--;
2787                 }
2788                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2789                 while (s <= end) { /* note it could be possible to match at the end of the string */
2790                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2791                         if (regtry(reginfo, &s))
2792                             goto got_it;
2793                     }
2794                 }
2795             } /* end search for newline */
2796         } /* end anchored/multiline check string search */
2797         goto phooey;
2798     } else if (prog->intflags & PREGf_ANCH_GPOS)
2799     {
2800         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2801         assert(prog->intflags & PREGf_GPOS_SEEN);
2802         /* For anchored \G, the only position it can match from is
2803          * (ganch-gofs); we already set startpos to this above; if intuit
2804          * moved us on from there, we can't possibly succeed */
2805         assert(startpos == reginfo->ganch - prog->gofs);
2806         if (s == startpos && regtry(reginfo, &s))
2807             goto got_it;
2808         goto phooey;
2809     }
2810
2811     /* Messy cases:  unanchored match. */
2812     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2813         /* we have /x+whatever/ */
2814         /* it must be a one character string (XXXX Except is_utf8_pat?) */
2815         char ch;
2816 #ifdef DEBUGGING
2817         int did_match = 0;
2818 #endif
2819         if (utf8_target) {
2820             if (! prog->anchored_utf8) {
2821                 to_utf8_substr(prog);
2822             }
2823             ch = SvPVX_const(prog->anchored_utf8)[0];
2824             REXEC_FBC_SCAN(
2825                 if (*s == ch) {
2826                     DEBUG_EXECUTE_r( did_match = 1 );
2827                     if (regtry(reginfo, &s)) goto got_it;
2828                     s += UTF8SKIP(s);
2829                     while (s < strend && *s == ch)
2830                         s += UTF8SKIP(s);
2831                 }
2832             );
2833
2834         }
2835         else {
2836             if (! prog->anchored_substr) {
2837                 if (! to_byte_substr(prog)) {
2838                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2839                 }
2840             }
2841             ch = SvPVX_const(prog->anchored_substr)[0];
2842             REXEC_FBC_SCAN(
2843                 if (*s == ch) {
2844                     DEBUG_EXECUTE_r( did_match = 1 );
2845                     if (regtry(reginfo, &s)) goto got_it;
2846                     s++;
2847                     while (s < strend && *s == ch)
2848                         s++;
2849                 }
2850             );
2851         }
2852         DEBUG_EXECUTE_r(if (!did_match)
2853                 PerlIO_printf(Perl_debug_log,
2854                                   "Did not find anchored character...\n")
2855                );
2856     }
2857     else if (prog->anchored_substr != NULL
2858               || prog->anchored_utf8 != NULL
2859               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2860                   && prog->float_max_offset < strend - s)) {
2861         SV *must;
2862         SSize_t back_max;
2863         SSize_t back_min;
2864         char *last;
2865         char *last1;            /* Last position checked before */
2866 #ifdef DEBUGGING
2867         int did_match = 0;
2868 #endif
2869         if (prog->anchored_substr || prog->anchored_utf8) {
2870             if (utf8_target) {
2871                 if (! prog->anchored_utf8) {
2872                     to_utf8_substr(prog);
2873                 }
2874                 must = prog->anchored_utf8;
2875             }
2876             else {
2877                 if (! prog->anchored_substr) {
2878                     if (! to_byte_substr(prog)) {
2879                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2880                     }
2881                 }
2882                 must = prog->anchored_substr;
2883             }
2884             back_max = back_min = prog->anchored_offset;
2885         } else {
2886             if (utf8_target) {
2887                 if (! prog->float_utf8) {
2888                     to_utf8_substr(prog);
2889                 }
2890                 must = prog->float_utf8;
2891             }
2892             else {
2893                 if (! prog->float_substr) {
2894                     if (! to_byte_substr(prog)) {
2895                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2896                     }
2897                 }
2898                 must = prog->float_substr;
2899             }
2900             back_max = prog->float_max_offset;
2901             back_min = prog->float_min_offset;
2902         }
2903
2904         if (back_min<0) {
2905             last = strend;
2906         } else {
2907             last = HOP3c(strend,        /* Cannot start after this */
2908                   -(SSize_t)(CHR_SVLEN(must)
2909                          - (SvTAIL(must) != 0) + back_min), strbeg);
2910         }
2911         if (s > reginfo->strbeg)
2912             last1 = HOPc(s, -1);
2913         else
2914             last1 = s - 1;      /* bogus */
2915
2916         /* XXXX check_substr already used to find "s", can optimize if
2917            check_substr==must. */
2918         dontbother = 0;
2919         strend = HOPc(strend, -dontbother);
2920         while ( (s <= last) &&
2921                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
2922                                   (unsigned char*)strend, must,
2923                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2924             DEBUG_EXECUTE_r( did_match = 1 );
2925             if (HOPc(s, -back_max) > last1) {
2926                 last1 = HOPc(s, -back_min);
2927                 s = HOPc(s, -back_max);
2928             }
2929             else {
2930                 char * const t = (last1 >= reginfo->strbeg)
2931                                     ? HOPc(last1, 1) : last1 + 1;
2932
2933                 last1 = HOPc(s, -back_min);
2934                 s = t;
2935             }
2936             if (utf8_target) {
2937                 while (s <= last1) {
2938                     if (regtry(reginfo, &s))
2939                         goto got_it;
2940                     if (s >= last1) {
2941                         s++; /* to break out of outer loop */
2942                         break;
2943                     }
2944                     s += UTF8SKIP(s);
2945                 }
2946             }
2947             else {
2948                 while (s <= last1) {
2949                     if (regtry(reginfo, &s))
2950                         goto got_it;
2951                     s++;
2952                 }
2953             }
2954         }
2955         DEBUG_EXECUTE_r(if (!did_match) {
2956             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2957                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2958             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2959                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2960                                ? "anchored" : "floating"),
2961                 quoted, RE_SV_TAIL(must));
2962         });
2963         goto phooey;
2964     }
2965     else if ( (c = progi->regstclass) ) {
2966         if (minlen) {
2967             const OPCODE op = OP(progi->regstclass);
2968             /* don't bother with what can't match */
2969             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2970                 strend = HOPc(strend, -(minlen - 1));
2971         }
2972         DEBUG_EXECUTE_r({
2973             SV * const prop = sv_newmortal();
2974             regprop(prog, prop, c, reginfo);
2975             {
2976                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2977                     s,strend-s,60);
2978                 PerlIO_printf(Perl_debug_log,
2979                     "Matching stclass %.*s against %s (%d bytes)\n",
2980                     (int)SvCUR(prop), SvPVX_const(prop),
2981                      quoted, (int)(strend - s));
2982             }
2983         });
2984         if (find_byclass(prog, c, s, strend, reginfo))
2985             goto got_it;
2986         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2987     }
2988     else {
2989         dontbother = 0;
2990         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2991             /* Trim the end. */
2992             char *last= NULL;
2993             SV* float_real;
2994             STRLEN len;
2995             const char *little;
2996
2997             if (utf8_target) {
2998                 if (! prog->float_utf8) {
2999                     to_utf8_substr(prog);
3000                 }
3001                 float_real = prog->float_utf8;
3002             }
3003             else {
3004                 if (! prog->float_substr) {
3005                     if (! to_byte_substr(prog)) {
3006                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3007                     }
3008                 }
3009                 float_real = prog->float_substr;
3010             }
3011
3012             little = SvPV_const(float_real, len);
3013             if (SvTAIL(float_real)) {
3014                     /* This means that float_real contains an artificial \n on
3015                      * the end due to the presence of something like this:
3016                      * /foo$/ where we can match both "foo" and "foo\n" at the
3017                      * end of the string.  So we have to compare the end of the
3018                      * string first against the float_real without the \n and
3019                      * then against the full float_real with the string.  We
3020                      * have to watch out for cases where the string might be
3021                      * smaller than the float_real or the float_real without
3022                      * the \n. */
3023                     char *checkpos= strend - len;
3024                     DEBUG_OPTIMISE_r(
3025                         PerlIO_printf(Perl_debug_log,
3026                             "%sChecking for float_real.%s\n",
3027                             PL_colors[4], PL_colors[5]));
3028                     if (checkpos + 1 < strbeg) {
3029                         /* can't match, even if we remove the trailing \n
3030                          * string is too short to match */
3031                         DEBUG_EXECUTE_r(
3032                             PerlIO_printf(Perl_debug_log,
3033                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3034                                 PL_colors[4], PL_colors[5]));
3035                         goto phooey;
3036                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3037                         /* can match, the end of the string matches without the
3038                          * "\n" */
3039                         last = checkpos + 1;
3040                     } else if (checkpos < strbeg) {
3041                         /* cant match, string is too short when the "\n" is
3042                          * included */
3043                         DEBUG_EXECUTE_r(
3044                             PerlIO_printf(Perl_debug_log,
3045                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3046                                 PL_colors[4], PL_colors[5]));
3047                         goto phooey;
3048                     } else if (!multiline) {
3049                         /* non multiline match, so compare with the "\n" at the
3050                          * end of the string */
3051                         if (memEQ(checkpos, little, len)) {
3052                             last= checkpos;
3053                         } else {
3054                             DEBUG_EXECUTE_r(
3055                                 PerlIO_printf(Perl_debug_log,
3056                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3057                                     PL_colors[4], PL_colors[5]));
3058                             goto phooey;
3059                         }
3060                     } else {
3061                         /* multiline match, so we have to search for a place
3062                          * where the full string is located */
3063                         goto find_last;
3064                     }
3065             } else {
3066                   find_last:
3067                     if (len)
3068                         last = rninstr(s, strend, little, little + len);
3069                     else
3070                         last = strend;  /* matching "$" */
3071             }
3072             if (!last) {
3073                 /* at one point this block contained a comment which was
3074                  * probably incorrect, which said that this was a "should not
3075                  * happen" case.  Even if it was true when it was written I am
3076                  * pretty sure it is not anymore, so I have removed the comment
3077                  * and replaced it with this one. Yves */
3078                 DEBUG_EXECUTE_r(
3079                     PerlIO_printf(Perl_debug_log,
3080                         "String does not contain required substring, cannot match.\n"
3081                     ));
3082                 goto phooey;
3083             }
3084             dontbother = strend - last + prog->float_min_offset;
3085         }
3086         if (minlen && (dontbother < minlen))
3087             dontbother = minlen - 1;
3088         strend -= dontbother;              /* this one's always in bytes! */
3089         /* We don't know much -- general case. */
3090         if (utf8_target) {
3091             for (;;) {
3092                 if (regtry(reginfo, &s))
3093                     goto got_it;
3094                 if (s >= strend)
3095                     break;
3096                 s += UTF8SKIP(s);
3097             };
3098         }
3099         else {
3100             do {
3101                 if (regtry(reginfo, &s))
3102                     goto got_it;
3103             } while (s++ < strend);
3104         }
3105     }
3106
3107     /* Failure. */
3108     goto phooey;
3109
3110 got_it:
3111     /* s/// doesn't like it if $& is earlier than where we asked it to
3112      * start searching (which can happen on something like /.\G/) */
3113     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3114             && (prog->offs[0].start < stringarg - strbeg))
3115     {
3116         /* this should only be possible under \G */
3117         assert(prog->intflags & PREGf_GPOS_SEEN);
3118         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3119             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3120         goto phooey;
3121     }
3122
3123     DEBUG_BUFFERS_r(
3124         if (swap)
3125             PerlIO_printf(Perl_debug_log,
3126                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3127                 PTR2UV(prog),
3128                 PTR2UV(swap)
3129             );
3130     );
3131     Safefree(swap);
3132
3133     /* clean up; this will trigger destructors that will free all slabs
3134      * above the current one, and cleanup the regmatch_info_aux
3135      * and regmatch_info_aux_eval sructs */
3136
3137     LEAVE_SCOPE(oldsave);
3138
3139     if (RXp_PAREN_NAMES(prog))
3140         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3141
3142     RX_MATCH_UTF8_set(rx, utf8_target);
3143
3144     /* make sure $`, $&, $', and $digit will work later */
3145     if ( !(flags & REXEC_NOT_FIRST) )
3146         S_reg_set_capture_string(aTHX_ rx,
3147                                     strbeg, reginfo->strend,
3148                                     sv, flags, utf8_target);
3149
3150     return 1;
3151
3152 phooey:
3153     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3154                           PL_colors[4], PL_colors[5]));
3155
3156     /* clean up; this will trigger destructors that will free all slabs
3157      * above the current one, and cleanup the regmatch_info_aux
3158      * and regmatch_info_aux_eval sructs */
3159
3160     LEAVE_SCOPE(oldsave);
3161
3162     if (swap) {
3163         /* we failed :-( roll it back */
3164         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3165             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3166             PTR2UV(prog),
3167             PTR2UV(prog->offs),
3168             PTR2UV(swap)
3169         ));
3170         Safefree(prog->offs);
3171         prog->offs = swap;
3172     }
3173     return 0;
3174 }
3175
3176
3177 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3178  * Do inc before dec, in case old and new rex are the same */
3179 #define SET_reg_curpm(Re2)                          \
3180     if (reginfo->info_aux_eval) {                   \
3181         (void)ReREFCNT_inc(Re2);                    \
3182         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3183         PM_SETRE((PL_reg_curpm), (Re2));            \
3184     }
3185
3186
3187 /*
3188  - regtry - try match at specific point
3189  */
3190 STATIC I32                      /* 0 failure, 1 success */
3191 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3192 {
3193     CHECKPOINT lastcp;
3194     REGEXP *const rx = reginfo->prog;
3195     regexp *const prog = ReANY(rx);
3196     SSize_t result;
3197     RXi_GET_DECL(prog,progi);
3198     GET_RE_DEBUG_FLAGS_DECL;
3199
3200     PERL_ARGS_ASSERT_REGTRY;
3201
3202     reginfo->cutpoint=NULL;
3203
3204     prog->offs[0].start = *startposp - reginfo->strbeg;
3205     prog->lastparen = 0;
3206     prog->lastcloseparen = 0;
3207
3208     /* XXXX What this code is doing here?!!!  There should be no need
3209        to do this again and again, prog->lastparen should take care of
3210        this!  --ilya*/
3211
3212     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3213      * Actually, the code in regcppop() (which Ilya may be meaning by
3214      * prog->lastparen), is not needed at all by the test suite
3215      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3216      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3217      * Meanwhile, this code *is* needed for the
3218      * above-mentioned test suite tests to succeed.  The common theme
3219      * on those tests seems to be returning null fields from matches.
3220      * --jhi updated by dapm */
3221 #if 1
3222     if (prog->nparens) {
3223         regexp_paren_pair *pp = prog->offs;
3224         I32 i;
3225         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3226             ++pp;
3227             pp->start = -1;
3228             pp->end = -1;
3229         }
3230     }
3231 #endif
3232     REGCP_SET(lastcp);
3233     result = regmatch(reginfo, *startposp, progi->program + 1);
3234     if (result != -1) {
3235         prog->offs[0].end = result;
3236         return 1;
3237     }
3238     if (reginfo->cutpoint)
3239         *startposp= reginfo->cutpoint;
3240     REGCP_UNWIND(lastcp);
3241     return 0;
3242 }
3243
3244
3245 #define sayYES goto yes
3246 #define sayNO goto no
3247 #define sayNO_SILENT goto no_silent
3248
3249 /* we dont use STMT_START/END here because it leads to
3250    "unreachable code" warnings, which are bogus, but distracting. */
3251 #define CACHEsayNO \
3252     if (ST.cache_mask) \
3253        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3254     sayNO
3255
3256 /* this is used to determine how far from the left messages like
3257    'failed...' are printed. It should be set such that messages
3258    are inline with the regop output that created them.
3259 */
3260 #define REPORT_CODE_OFF 32
3261
3262
3263 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3264 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3265 #define CHRTEST_NOT_A_CP_1 -999
3266 #define CHRTEST_NOT_A_CP_2 -998
3267
3268 /* grab a new slab and return the first slot in it */
3269
3270 STATIC regmatch_state *
3271 S_push_slab(pTHX)
3272 {
3273 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3274     dMY_CXT;
3275 #endif
3276     regmatch_slab *s = PL_regmatch_slab->next;
3277     if (!s) {
3278         Newx(s, 1, regmatch_slab);
3279         s->prev = PL_regmatch_slab;
3280         s->next = NULL;
3281         PL_regmatch_slab->next = s;
3282     }
3283     PL_regmatch_slab = s;
3284     return SLAB_FIRST(s);
3285 }
3286
3287
3288 /* push a new state then goto it */
3289
3290 #define PUSH_STATE_GOTO(state, node, input) \
3291     pushinput = input; \
3292     scan = node; \
3293     st->resume_state = state; \
3294     goto push_state;
3295
3296 /* push a new state with success backtracking, then goto it */
3297
3298 #define PUSH_YES_STATE_GOTO(state, node, input) \
3299     pushinput = input; \
3300     scan = node; \
3301     st->resume_state = state; \
3302     goto push_yes_state;
3303
3304
3305
3306
3307 /*
3308
3309 regmatch() - main matching routine
3310
3311 This is basically one big switch statement in a loop. We execute an op,
3312 set 'next' to point the next op, and continue. If we come to a point which
3313 we may need to backtrack to on failure such as (A|B|C), we push a
3314 backtrack state onto the backtrack stack. On failure, we pop the top
3315 state, and re-enter the loop at the state indicated. If there are no more
3316 states to pop, we return failure.
3317
3318 Sometimes we also need to backtrack on success; for example /A+/, where
3319 after successfully matching one A, we need to go back and try to
3320 match another one; similarly for lookahead assertions: if the assertion
3321 completes successfully, we backtrack to the state just before the assertion
3322 and then carry on.  In these cases, the pushed state is marked as
3323 'backtrack on success too'. This marking is in fact done by a chain of
3324 pointers, each pointing to the previous 'yes' state. On success, we pop to
3325 the nearest yes state, discarding any intermediate failure-only states.
3326 Sometimes a yes state is pushed just to force some cleanup code to be
3327 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3328 it to free the inner regex.
3329
3330 Note that failure backtracking rewinds the cursor position, while
3331 success backtracking leaves it alone.
3332
3333 A pattern is complete when the END op is executed, while a subpattern
3334 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3335 ops trigger the "pop to last yes state if any, otherwise return true"
3336 behaviour.
3337
3338 A common convention in this function is to use A and B to refer to the two
3339 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3340 the subpattern to be matched possibly multiple times, while B is the entire
3341 rest of the pattern. Variable and state names reflect this convention.
3342
3343 The states in the main switch are the union of ops and failure/success of
3344 substates associated with with that op.  For example, IFMATCH is the op
3345 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3346 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3347 successfully matched A and IFMATCH_A_fail is a state saying that we have
3348 just failed to match A. Resume states always come in pairs. The backtrack
3349 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3350 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3351 on success or failure.
3352
3353 The struct that holds a backtracking state is actually a big union, with
3354 one variant for each major type of op. The variable st points to the
3355 top-most backtrack struct. To make the code clearer, within each
3356 block of code we #define ST to alias the relevant union.
3357
3358 Here's a concrete example of a (vastly oversimplified) IFMATCH
3359 implementation:
3360
3361     switch (state) {
3362     ....
3363
3364 #define ST st->u.ifmatch
3365
3366     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3367         ST.foo = ...; // some state we wish to save
3368         ...
3369         // push a yes backtrack state with a resume value of
3370         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3371         // first node of A:
3372         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3373         // NOTREACHED
3374
3375     case IFMATCH_A: // we have successfully executed A; now continue with B
3376         next = B;
3377         bar = ST.foo; // do something with the preserved value
3378         break;
3379
3380     case IFMATCH_A_fail: // A failed, so the assertion failed
3381         ...;   // do some housekeeping, then ...
3382         sayNO; // propagate the failure
3383
3384 #undef ST
3385
3386     ...
3387     }
3388
3389 For any old-timers reading this who are familiar with the old recursive
3390 approach, the code above is equivalent to:
3391
3392     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3393     {
3394         int foo = ...
3395         ...
3396         if (regmatch(A)) {
3397             next = B;
3398             bar = foo;
3399             break;
3400         }
3401         ...;   // do some housekeeping, then ...
3402         sayNO; // propagate the failure
3403     }
3404
3405 The topmost backtrack state, pointed to by st, is usually free. If you
3406 want to claim it, populate any ST.foo fields in it with values you wish to
3407 save, then do one of
3408
3409         PUSH_STATE_GOTO(resume_state, node, newinput);
3410         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3411
3412 which sets that backtrack state's resume value to 'resume_state', pushes a
3413 new free entry to the top of the backtrack stack, then goes to 'node'.
3414 On backtracking, the free slot is popped, and the saved state becomes the
3415 new free state. An ST.foo field in this new top state can be temporarily
3416 accessed to retrieve values, but once the main loop is re-entered, it
3417 becomes available for reuse.
3418
3419 Note that the depth of the backtrack stack constantly increases during the
3420 left-to-right execution of the pattern, rather than going up and down with
3421 the pattern nesting. For example the stack is at its maximum at Z at the
3422 end of the pattern, rather than at X in the following:
3423
3424     /(((X)+)+)+....(Y)+....Z/
3425
3426 The only exceptions to this are lookahead/behind assertions and the cut,
3427 (?>A), which pop all the backtrack states associated with A before
3428 continuing.
3429
3430 Backtrack state structs are allocated in slabs of about 4K in size.
3431 PL_regmatch_state and st always point to the currently active state,
3432 and PL_regmatch_slab points to the slab currently containing
3433 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3434 allocated, and is never freed until interpreter destruction. When the slab
3435 is full, a new one is allocated and chained to the end. At exit from
3436 regmatch(), slabs allocated since entry are freed.
3437
3438 */
3439
3440
3441 #define DEBUG_STATE_pp(pp)                                  \
3442     DEBUG_STATE_r({                                         \
3443         DUMP_EXEC_POS(locinput, scan, utf8_target);         \
3444         PerlIO_printf(Perl_debug_log,                       \
3445             "    %*s"pp" %s%s%s%s%s\n",                     \
3446             depth*2, "",                                    \
3447             PL_reg_name[st->resume_state],                  \
3448             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3449             ((st==yes_state) ? "Y" : ""),                   \
3450             ((st==mark_state) ? "M" : ""),                  \
3451             ((st==yes_state||st==mark_state) ? "]" : "")    \
3452         );                                                  \
3453     });
3454
3455
3456 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3457
3458 #ifdef DEBUGGING
3459
3460 STATIC void
3461 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3462     const char *start, const char *end, const char *blurb)
3463 {
3464     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3465
3466     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3467
3468     if (!PL_colorset)
3469             reginitcolors();
3470     {
3471         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3472             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3473
3474         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3475             start, end - start, 60);
3476
3477         PerlIO_printf(Perl_debug_log,
3478             "%s%s REx%s %s against %s\n",
3479                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3480
3481         if (utf8_target||utf8_pat)
3482             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3483                 utf8_pat ? "pattern" : "",
3484                 utf8_pat && utf8_target ? " and " : "",
3485                 utf8_target ? "string" : ""
3486             );
3487     }
3488 }
3489
3490 STATIC void
3491 S_dump_exec_pos(pTHX_ const char *locinput,
3492                       const regnode *scan,
3493                       const char *loc_regeol,
3494                       const char *loc_bostr,
3495                       const char *loc_reg_starttry,
3496                       const bool utf8_target)
3497 {
3498     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3499     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3500     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3501     /* The part of the string before starttry has one color
3502        (pref0_len chars), between starttry and current
3503        position another one (pref_len - pref0_len chars),
3504        after the current position the third one.
3505        We assume that pref0_len <= pref_len, otherwise we
3506        decrease pref0_len.  */
3507     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3508         ? (5 + taill) - l : locinput - loc_bostr;
3509     int pref0_len;
3510
3511     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3512
3513     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3514         pref_len++;
3515     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3516     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3517         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3518               ? (5 + taill) - pref_len : loc_regeol - locinput);
3519     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3520         l--;
3521     if (pref0_len < 0)
3522         pref0_len = 0;
3523     if (pref0_len > pref_len)
3524         pref0_len = pref_len;
3525     {
3526         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3527
3528         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3529             (locinput - pref_len),pref0_len, 60, 4, 5);
3530
3531         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3532                     (locinput - pref_len + pref0_len),
3533                     pref_len - pref0_len, 60, 2, 3);
3534
3535         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3536                     locinput, loc_regeol - locinput, 10, 0, 1);
3537
3538         const STRLEN tlen=len0+len1+len2;
3539         PerlIO_printf(Perl_debug_log,
3540                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3541                     (IV)(locinput - loc_bostr),
3542                     len0, s0,
3543                     len1, s1,
3544                     (docolor ? "" : "> <"),
3545                     len2, s2,
3546                     (int)(tlen > 19 ? 0 :  19 - tlen),
3547                     "");
3548     }
3549 }
3550
3551 #endif
3552
3553 /* reg_check_named_buff_matched()
3554  * Checks to see if a named buffer has matched. The data array of
3555  * buffer numbers corresponding to the buffer is expected to reside
3556  * in the regexp->data->data array in the slot stored in the ARG() of
3557  * node involved. Note that this routine doesn't actually care about the
3558  * name, that information is not preserved from compilation to execution.
3559  * Returns the index of the leftmost defined buffer with the given name
3560  * or 0 if non of the buffers matched.
3561  */
3562 STATIC I32
3563 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3564 {
3565     I32 n;
3566     RXi_GET_DECL(rex,rexi);
3567     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3568     I32 *nums=(I32*)SvPVX(sv_dat);
3569
3570     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3571
3572     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3573         if ((I32)rex->lastparen >= nums[n] &&
3574             rex->offs[nums[n]].end != -1)
3575         {
3576             return nums[n];
3577         }
3578     }
3579     return 0;
3580 }
3581
3582
3583 static bool
3584 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3585         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3586 {
3587     /* This function determines if there are one or two characters that match
3588      * the first character of the passed-in EXACTish node <text_node>, and if
3589      * so, returns them in the passed-in pointers.
3590      *
3591      * If it determines that no possible character in the target string can
3592      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3593      * the first character in <text_node> requires UTF-8 to represent, and the
3594      * target string isn't in UTF-8.)
3595      *
3596      * If there are more than two characters that could match the beginning of
3597      * <text_node>, or if more context is required to determine a match or not,
3598      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3599      *
3600      * The motiviation behind this function is to allow the caller to set up
3601      * tight loops for matching.  If <text_node> is of type EXACT, there is
3602      * only one possible character that can match its first character, and so
3603      * the situation is quite simple.  But things get much more complicated if
3604      * folding is involved.  It may be that the first character of an EXACTFish
3605      * node doesn't participate in any possible fold, e.g., punctuation, so it
3606      * can be matched only by itself.  The vast majority of characters that are
3607      * in folds match just two things, their lower and upper-case equivalents.
3608      * But not all are like that; some have multiple possible matches, or match
3609      * sequences of more than one character.  This function sorts all that out.
3610      *
3611      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3612      * loop of trying to match A*, we know we can't exit where the thing
3613      * following it isn't a B.  And something can't be a B unless it is the
3614      * beginning of B.  By putting a quick test for that beginning in a tight
3615      * loop, we can rule out things that can't possibly be B without having to
3616      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3617      * character, we can make a tight loop matching A*, using the outputs of
3618      * this function.
3619      *
3620      * If the target string to match isn't in UTF-8, and there aren't
3621      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3622      * the one or two possible octets (which are characters in this situation)
3623      * that can match.  In all cases, if there is only one character that can
3624      * match, *<c1p> and *<c2p> will be identical.
3625      *
3626      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3627      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3628      * can match the beginning of <text_node>.  They should be declared with at
3629      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3630      * undefined what these contain.)  If one or both of the buffers are
3631      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3632      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3633      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3634      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3635      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3636
3637     const bool utf8_target = reginfo->is_utf8_target;
3638
3639     UV c1 = CHRTEST_NOT_A_CP_1;
3640     UV c2 = CHRTEST_NOT_A_CP_2;
3641     bool use_chrtest_void = FALSE;
3642     const bool is_utf8_pat = reginfo->is_utf8_pat;
3643
3644     /* Used when we have both utf8 input and utf8 output, to avoid converting
3645      * to/from code points */
3646     bool utf8_has_been_setup = FALSE;
3647
3648     dVAR;
3649
3650     U8 *pat = (U8*)STRING(text_node);
3651     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3652
3653     if (OP(text_node) == EXACT) {
3654
3655         /* In an exact node, only one thing can be matched, that first
3656          * character.  If both the pat and the target are UTF-8, we can just
3657          * copy the input to the output, avoiding finding the code point of
3658          * that character */
3659         if (!is_utf8_pat) {
3660             c2 = c1 = *pat;
3661         }
3662         else if (utf8_target) {
3663             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3664             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3665             utf8_has_been_setup = TRUE;
3666         }
3667         else {
3668             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3669         }
3670     }
3671     else { /* an EXACTFish node */
3672         U8 *pat_end = pat + STR_LEN(text_node);
3673
3674         /* An EXACTFL node has at least some characters unfolded, because what
3675          * they match is not known until now.  So, now is the time to fold
3676          * the first few of them, as many as are needed to determine 'c1' and
3677          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
3678          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3679          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
3680          * need to fold as many characters as a single character can fold to,
3681          * so that later we can check if the first ones are such a multi-char
3682          * fold.  But, in such a pattern only locale-problematic characters
3683          * aren't folded, so we can skip this completely if the first character
3684          * in the node isn't one of the tricky ones */
3685         if (OP(text_node) == EXACTFL) {
3686
3687             if (! is_utf8_pat) {
3688                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3689                 {
3690                     folded[0] = folded[1] = 's';
3691                     pat = folded;
3692                     pat_end = folded + 2;
3693                 }
3694             }
3695             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3696                 U8 *s = pat;
3697                 U8 *d = folded;
3698                 int i;
3699
3700                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3701                     if (isASCII(*s)) {
3702                         *(d++) = (U8) toFOLD_LC(*s);
3703                         s++;
3704                     }
3705                     else {
3706                         STRLEN len;
3707                         _to_utf8_fold_flags(s,
3708                                             d,
3709                                             &len,
3710                                             FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3711                         d += len;
3712                         s += UTF8SKIP(s);
3713                     }
3714                 }
3715
3716                 pat = folded;
3717                 pat_end = d;
3718             }
3719         }
3720
3721         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
3722              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
3723         {
3724             /* Multi-character folds require more context to sort out.  Also
3725              * PL_utf8_foldclosures used below doesn't handle them, so have to
3726              * be handled outside this routine */
3727             use_chrtest_void = TRUE;
3728         }
3729         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3730             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3731             if (c1 > 255) {
3732                 /* Load the folds hash, if not already done */
3733                 SV** listp;
3734                 if (! PL_utf8_foldclosures) {
3735                     _load_PL_utf8_foldclosures();
3736                 }
3737
3738                 /* The fold closures data structure is a hash with the keys
3739                  * being the UTF-8 of every character that is folded to, like
3740                  * 'k', and the values each an array of all code points that
3741                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
3742                  * Multi-character folds are not included */
3743                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3744                                         (char *) pat,
3745                                         UTF8SKIP(pat),
3746                                         FALSE))))
3747                 {
3748                     /* Not found in the hash, therefore there are no folds
3749                     * containing it, so there is only a single character that
3750                     * could match */
3751                     c2 = c1;
3752                 }
3753                 else {  /* Does participate in folds */
3754                     AV* list = (AV*) *listp;
3755                     if (av_tindex(list) != 1) {
3756
3757                         /* If there aren't exactly two folds to this, it is
3758                          * outside the scope of this function */
3759                         use_chrtest_void = TRUE;
3760                     }
3761                     else {  /* There are two.  Get them */
3762                         SV** c_p = av_fetch(list, 0, FALSE);
3763                         if (c_p == NULL) {
3764                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3765                         }
3766                         c1 = SvUV(*c_p);
3767
3768                         c_p = av_fetch(list, 1, FALSE);
3769                         if (c_p == NULL) {
3770                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3771                         }
3772                         c2 = SvUV(*c_p);
3773
3774                         /* Folds that cross the 255/256 boundary are forbidden
3775                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
3776                          * one is ASCIII.  Since the pattern character is above
3777                          * 255, and its only other match is below 256, the only
3778                          * legal match will be to itself.  We have thrown away
3779                          * the original, so have to compute which is the one
3780                          * above 255. */
3781                         if ((c1 < 256) != (c2 < 256)) {
3782                             if ((OP(text_node) == EXACTFL
3783                                  && ! IN_UTF8_CTYPE_LOCALE)
3784                                 || ((OP(text_node) == EXACTFA
3785                                     || OP(text_node) == EXACTFA_NO_TRIE)
3786                                     && (isASCII(c1) || isASCII(c2))))
3787                             {
3788                                 if (c1 < 256) {
3789                                     c1 = c2;
3790                                 }
3791                                 else {
3792                                     c2 = c1;
3793                                 }
3794                             }
3795                         }
3796                     }
3797                 }
3798             }
3799             else /* Here, c1 is <= 255 */
3800                 if (utf8_target
3801                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3802                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
3803                     && ((OP(text_node) != EXACTFA
3804                         && OP(text_node) != EXACTFA_NO_TRIE)
3805                         || ! isASCII(c1)))
3806             {
3807                 /* Here, there could be something above Latin1 in the target
3808                  * which folds to this character in the pattern.  All such
3809                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
3810                  * than two characters involved in their folds, so are outside
3811                  * the scope of this function */
3812                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3813                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3814                 }
3815                 else {
3816                     use_chrtest_void = TRUE;
3817                 }
3818             }
3819             else { /* Here nothing above Latin1 can fold to the pattern
3820                       character */
3821                 switch (OP(text_node)) {
3822
3823                     case EXACTFL:   /* /l rules */
3824                         c2 = PL_fold_locale[c1];
3825                         break;
3826
3827                     case EXACTF:   /* This node only generated for non-utf8
3828                                     patterns */
3829                         assert(! is_utf8_pat);
3830                         if (! utf8_target) {    /* /d rules */
3831                             c2 = PL_fold[c1];
3832                             break;
3833                         }
3834                         /* FALLTHROUGH */
3835                         /* /u rules for all these.  This happens to work for
3836                         * EXACTFA as nothing in Latin1 folds to ASCII */
3837                     case EXACTFA_NO_TRIE:   /* This node only generated for
3838                                             non-utf8 patterns */
3839                         assert(! is_utf8_pat);
3840                         /* FALLTHROUGH */
3841                     case EXACTFA:
3842                     case EXACTFU_SS:
3843                     case EXACTFU:
3844                         c2 = PL_fold_latin1[c1];
3845                         break;
3846
3847                     default:
3848                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3849                         assert(0); /* NOTREACHED */
3850                 }
3851             }
3852         }
3853     }
3854
3855     /* Here have figured things out.  Set up the returns */
3856     if (use_chrtest_void) {
3857         *c2p = *c1p = CHRTEST_VOID;
3858     }
3859     else if (utf8_target) {
3860         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3861             uvchr_to_utf8(c1_utf8, c1);
3862             uvchr_to_utf8(c2_utf8, c2);
3863         }
3864
3865         /* Invariants are stored in both the utf8 and byte outputs; Use
3866          * negative numbers otherwise for the byte ones.  Make sure that the
3867          * byte ones are the same iff the utf8 ones are the same */
3868         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3869         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3870                 ? *c2_utf8
3871                 : (c1 == c2)
3872                   ? CHRTEST_NOT_A_CP_1
3873                   : CHRTEST_NOT_A_CP_2;
3874     }
3875     else if (c1 > 255) {
3876        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3877                            can represent */
3878            return FALSE;
3879        }
3880
3881        *c1p = *c2p = c2;    /* c2 is the only representable value */
3882     }
3883     else {  /* c1 is representable; see about c2 */
3884        *c1p = c1;
3885        *c2p = (c2 < 256) ? c2 : c1;
3886     }
3887
3888     return TRUE;
3889 }
3890
3891 /* returns -1 on failure, $+[0] on success */
3892 STATIC SSize_t
3893 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3894 {
3895 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3896     dMY_CXT;
3897 #endif
3898     dVAR;
3899     const bool utf8_target = reginfo->is_utf8_target;
3900     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3901     REGEXP *rex_sv = reginfo->prog;
3902     regexp *rex = ReANY(rex_sv);
3903     RXi_GET_DECL(rex,rexi);
3904     /* the current state. This is a cached copy of PL_regmatch_state */
3905     regmatch_state *st;
3906     /* cache heavy used fields of st in registers */
3907     regnode *scan;
3908     regnode *next;
3909     U32 n = 0;  /* general value; init to avoid compiler warning */
3910     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
3911     char *locinput = startpos;
3912     char *pushinput; /* where to continue after a PUSH */
3913     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3914
3915     bool result = 0;        /* return value of S_regmatch */
3916     int depth = 0;          /* depth of backtrack stack */
3917     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3918     const U32 max_nochange_depth =
3919         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3920         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3921     regmatch_state *yes_state = NULL; /* state to pop to on success of
3922                                                             subpattern */
3923     /* mark_state piggy backs on the yes_state logic so that when we unwind
3924        the stack on success we can update the mark_state as we go */
3925     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3926     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3927     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3928     U32 state_num;
3929     bool no_final = 0;      /* prevent failure from backtracking? */
3930     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3931     char *startpoint = locinput;
3932     SV *popmark = NULL;     /* are we looking for a mark? */
3933     SV *sv_commit = NULL;   /* last mark name seen in failure */
3934     SV *sv_yes_mark = NULL; /* last mark name we have seen
3935                                during a successful match */
3936     U32 lastopen = 0;       /* last open we saw */
3937     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3938     SV* const oreplsv = GvSVn(PL_replgv);
3939     /* these three flags are set by various ops to signal information to
3940      * the very next op. They have a useful lifetime of exactly one loop
3941      * iteration, and are not preserved or restored by state pushes/pops
3942      */
3943     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3944     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3945     int logical = 0;        /* the following EVAL is:
3946                                 0: (?{...})
3947                                 1: (?(?{...})X|Y)
3948                                 2: (??{...})
3949                                or the following IFMATCH/UNLESSM is:
3950                                 false: plain (?=foo)
3951                                 true:  used as a condition: (?(?=foo))
3952                             */
3953     PAD* last_pad = NULL;
3954     dMULTICALL;
3955     I32 gimme = G_SCALAR;
3956     CV *caller_cv = NULL;       /* who called us */
3957     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3958     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3959     U32 maxopenparen = 0;       /* max '(' index seen so far */
3960     int to_complement;  /* Invert the result? */
3961     _char_class_number classnum;
3962     bool is_utf8_pat = reginfo->is_utf8_pat;
3963
3964 #ifdef DEBUGGING
3965     GET_RE_DEBUG_FLAGS_DECL;
3966 #endif
3967
3968     /* protect against undef(*^R) */
3969     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
3970
3971     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3972     multicall_oldcatch = 0;
3973     multicall_cv = NULL;
3974     cx = NULL;
3975     PERL_UNUSED_VAR(multicall_cop);
3976     PERL_UNUSED_VAR(newsp);
3977
3978
3979     PERL_ARGS_ASSERT_REGMATCH;
3980
3981     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3982             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3983     }));
3984
3985     st = PL_regmatch_state;
3986
3987     /* Note that nextchr is a byte even in UTF */
3988     SET_nextchr;
3989     scan = prog;
3990     while (scan != NULL) {
3991
3992         DEBUG_EXECUTE_r( {
3993             SV * const prop = sv_newmortal();
3994             regnode *rnext=regnext(scan);
3995             DUMP_EXEC_POS( locinput, scan, utf8_target );
3996             regprop(rex, prop, scan, reginfo);
3997
3998             PerlIO_printf(Perl_debug_log,
3999                     "%3"IVdf":%*s%s(%"IVdf")\n",
4000                     (IV)(scan - rexi->program), depth*2, "",
4001                     SvPVX_const(prop),
4002                     (PL_regkind[OP(scan)] == END || !rnext) ?
4003                         0 : (IV)(rnext - rexi->program));
4004         });
4005
4006         next = scan + NEXT_OFF(scan);
4007         if (next == scan)
4008             next = NULL;
4009         state_num = OP(scan);
4010
4011       reenter_switch:
4012         to_complement = 0;
4013
4014         SET_nextchr;
4015         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
4016
4017         switch (state_num) {
4018         case BOL:  /*  /^../   */
4019         case SBOL: /*  /^../s  */
4020             if (locinput == reginfo->strbeg)
4021                 break;
4022             sayNO;
4023
4024         case MBOL: /*  /^../m  */
4025             if (locinput == reginfo->strbeg ||
4026                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
4027             {
4028                 break;
4029             }
4030             sayNO;
4031
4032         case GPOS: /*  \G  */
4033             if (locinput == reginfo->ganch)
4034                 break;
4035             sayNO;
4036
4037         case KEEPS: /*   \K  */
4038             /* update the startpoint */
4039             st->u.keeper.val = rex->offs[0].start;
4040             rex->offs[0].start = locinput - reginfo->strbeg;
4041             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
4042             assert(0); /*NOTREACHED*/
4043         case KEEPS_next_fail:
4044             /* rollback the start point change */
4045             rex->offs[0].start = st->u.keeper.val;
4046             sayNO_SILENT;
4047             assert(0); /*NOTREACHED*/
4048
4049         case MEOL: /* /..$/m  */
4050             if (!NEXTCHR_IS_EOS && nextchr != '\n')
4051                 sayNO;
4052             break;
4053
4054         case EOL: /* /..$/  */
4055             /* FALLTHROUGH */
4056         case SEOL: /* /..$/s  */
4057             if (!NEXTCHR_IS_EOS && nextchr != '\n')
4058                 sayNO;
4059             if (reginfo->strend - locinput > 1)
4060                 sayNO;
4061             break;
4062
4063         case EOS: /*  \z  */
4064             if (!NEXTCHR_IS_EOS)
4065                 sayNO;
4066             break;
4067
4068         case SANY: /*  /./s  */
4069             if (NEXTCHR_IS_EOS)
4070                 sayNO;
4071             goto increment_locinput;
4072
4073         case CANY: /*  \C  */
4074             if (NEXTCHR_IS_EOS)
4075                 sayNO;
4076             locinput++;
4077             break;
4078
4079         case REG_ANY: /*  /./  */
4080             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
4081                 sayNO;
4082             goto increment_locinput;
4083
4084
4085 #undef  ST
4086 #define ST st->u.trie
4087         case TRIEC: /* (ab|cd) with known charclass */
4088             /* In this case the charclass data is available inline so
4089                we can fail fast without a lot of extra overhead.
4090              */
4091             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
4092                 DEBUG_EXECUTE_r(
4093                     PerlIO_printf(Perl_debug_log,
4094                               "%*s  %sfailed to match trie start class...%s\n",
4095                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4096                 );
4097                 sayNO_SILENT;
4098                 assert(0); /* NOTREACHED */
4099             }
4100             /* FALLTHROUGH */
4101         case TRIE:  /* (ab|cd)  */
4102             /* the basic plan of execution of the trie is:
4103              * At the beginning, run though all the states, and
4104              * find the longest-matching word. Also remember the position
4105              * of the shortest matching word. For example, this pattern:
4106              *    1  2 3 4    5
4107              *    ab|a|x|abcd|abc
4108              * when matched against the string "abcde", will generate
4109              * accept states for all words except 3, with the longest
4110              * matching word being 4, and the shortest being 2 (with
4111              * the position being after char 1 of the string).
4112              *
4113              * Then for each matching word, in word order (i.e. 1,2,4,5),
4114              * we run the remainder of the pattern; on each try setting
4115              * the current position to the character following the word,
4116              * returning to try the next word on failure.
4117              *
4118              * We avoid having to build a list of words at runtime by
4119              * using a compile-time structure, wordinfo[].prev, which
4120              * gives, for each word, the previous accepting word (if any).
4121              * In the case above it would contain the mappings 1->2, 2->0,
4122              * 3->0, 4->5, 5->1.  We can use this table to generate, from
4123              * the longest word (4 above), a list of all words, by
4124              * following the list of prev pointers; this gives us the
4125              * unordered list 4,5,1,2. Then given the current word we have
4126              * just tried, we can go through the list and find the
4127              * next-biggest word to try (so if we just failed on word 2,
4128              * the next in the list is 4).
4129              *
4130              * Since at runtime we don't record the matching position in
4131              * the string for each word, we have to work that out for
4132              * each word we're about to process. The wordinfo table holds
4133              * the character length of each word; given that we recorded
4134              * at the start: the position of the shortest word and its
4135              * length in chars, we just need to move the pointer the
4136              * difference between the two char lengths. Depending on
4137              * Unicode status and folding, that's cheap or expensive.
4138              *
4139              * This algorithm is optimised for the case where are only a
4140              * small number of accept states, i.e. 0,1, or maybe 2.
4141              * With lots of accepts states, and having to try all of them,
4142              * it becomes quadratic on number of accept states to find all
4143              * the next words.
4144              */
4145
4146             {
4147                 /* what type of TRIE am I? (utf8 makes this contextual) */
4148                 DECL_TRIE_TYPE(scan);
4149
4150                 /* what trie are we using right now */
4151                 reg_trie_data * const trie
4152                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
4153                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
4154                 U32 state = trie->startstate;
4155
4156                 if (   trie->bitmap
4157                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
4158                 {
4159                     if (trie->states[ state ].wordnum) {
4160                          DEBUG_EXECUTE_r(
4161                             PerlIO_printf(Perl_debug_log,
4162                                           "%*s  %smatched empty string...%s\n",
4163                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4164                         );
4165                         if (!trie->jump)
4166                             break;
4167                     } else {
4168                         DEBUG_EXECUTE_r(
4169                             PerlIO_printf(Perl_debug_log,
4170                                           "%*s  %sfailed to match trie start class...%s\n",
4171                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4172                         );
4173                         sayNO_SILENT;
4174                    }
4175                 }
4176
4177             {
4178                 U8 *uc = ( U8* )locinput;
4179
4180                 STRLEN len = 0;
4181                 STRLEN foldlen = 0;
4182                 U8 *uscan = (U8*)NULL;
4183                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
4184                 U32 charcount = 0; /* how many input chars we have matched */
4185                 U32 accepted = 0; /* have we seen any accepting states? */
4186
4187                 ST.jump = trie->jump;
4188                 ST.me = scan;
4189                 ST.firstpos = NULL;
4190                 ST.longfold = FALSE; /* char longer if folded => it's harder */
4191                 ST.nextword = 0;
4192
4193                 /* fully traverse the TRIE; note the position of the
4194                    shortest accept state and the wordnum of the longest
4195                    accept state */
4196
4197                 while ( state && uc <= (U8*)(reginfo->strend) ) {
4198                     U32 base = trie->states[ state ].trans.base;
4199                     UV uvc = 0;
4200                     U16 charid = 0;
4201                     U16 wordnum;
4202                     wordnum = trie->states[ state ].wordnum;
4203
4204                     if (wordnum) { /* it's an accept state */
4205                         if (!accepted) {
4206                             accepted = 1;
4207                             /* record first match position */
4208                             if (ST.longfold) {
4209                                 ST.firstpos = (U8*)locinput;
4210                                 ST.firstchars = 0;
4211                             }
4212                             else {
4213                                 ST.firstpos = uc;
4214                                 ST.firstchars = charcount;
4215                             }
4216                         }
4217                         if (!ST.nextword || wordnum < ST.nextword)
4218                             ST.nextword = wordnum;
4219                         ST.topword = wordnum;
4220                     }
4221
4222                     DEBUG_TRIE_EXECUTE_r({
4223                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
4224                                 PerlIO_printf( Perl_debug_log,
4225                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
4226                                     2+depth * 2, "", PL_colors[4],
4227                                     (UV)state, (accepted ? 'Y' : 'N'));
4228                     });
4229
4230                     /* read a char and goto next state */
4231                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
4232                         I32 offset;
4233                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
4234                                              uscan, len, uvc, charid, foldlen,
4235                                              foldbuf, uniflags);
4236                         charcount++;
4237                         if (foldlen>0)
4238                             ST.longfold = TRUE;
4239                         if (charid &&
4240                              ( ((offset =
4241                               base + charid - 1 - trie->uniquecharcount)) >= 0)
4242
4243                              && ((U32)offset < trie->lasttrans)
4244                              && trie->trans[offset].check == state)
4245                         {
4246                             state = trie->trans[offset].next;
4247                         }
4248                         else {
4249                             state = 0;
4250                         }
4251                         uc += len;
4252
4253                     }
4254                     else {
4255                         state = 0;
4256                     }
4257                     DEBUG_TRIE_EXECUTE_r(
4258                         PerlIO_printf( Perl_debug_log,
4259                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
4260                             charid, uvc, (UV)state, PL_colors[5] );
4261                     );
4262                 }
4263                 if (!accepted)
4264                    sayNO;
4265
4266                 /* calculate total number of accept states */
4267                 {
4268                     U16 w = ST.topword;
4269                     accepted = 0;
4270                     while (w) {
4271                         w = trie->wordinfo[w].prev;
4272                         accepted++;
4273                     }
4274                     ST.accepted = accepted;
4275                 }
4276
4277                 DEBUG_EXECUTE_r(
4278                     PerlIO_printf( Perl_debug_log,
4279                         "%*s  %sgot %"IVdf" possible matches%s\n",
4280                         REPORT_CODE_OFF + depth * 2, "",
4281                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4282                 );
4283                 goto trie_first_try; /* jump into the fail handler */
4284             }}
4285             assert(0); /* NOTREACHED */
4286
4287         case TRIE_next_fail: /* we failed - try next alternative */
4288         {
4289             U8 *uc;
4290             if ( ST.jump) {
4291                 REGCP_UNWIND(ST.cp);
4292                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
4293             }
4294             if (!--ST.accepted) {
4295                 DEBUG_EXECUTE_r({
4296                     PerlIO_printf( Perl_debug_log,
4297                         "%*s  %sTRIE failed...%s\n",
4298                         REPORT_CODE_OFF+depth*2, "",
4299                         PL_colors[4],
4300                         PL_colors[5] );
4301                 });
4302                 sayNO_SILENT;
4303             }
4304             {
4305                 /* Find next-highest word to process.  Note that this code
4306                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
4307                 U16 min = 0;
4308                 U16 word;
4309                 U16 const nextword = ST.nextword;
4310                 reg_trie_wordinfo * const wordinfo
4311                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4312                 for (word=ST.topword; word; word=wordinfo[word].prev) {
4313                     if (word > nextword && (!min || word < min))
4314                         min = word;
4315                 }
4316                 ST.nextword = min;
4317             }
4318
4319           trie_first_try:
4320             if (do_cutgroup) {
4321                 do_cutgroup = 0;
4322                 no_final = 0;
4323             }
4324
4325             if ( ST.jump) {
4326                 ST.lastparen = rex->lastparen;
4327                 ST.lastcloseparen = rex->lastcloseparen;
4328                 REGCP_SET(ST.cp);
4329             }
4330
4331             /* find start char of end of current word */
4332             {
4333                 U32 chars; /* how many chars to skip */
4334                 reg_trie_data * const trie
4335                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4336
4337                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4338                             >=  ST.firstchars);
4339                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4340                             - ST.firstchars;
4341                 uc = ST.firstpos;
4342
4343                 if (ST.longfold) {
4344                     /* the hard option - fold each char in turn and find
4345                      * its folded length (which may be different */
4346                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4347                     STRLEN foldlen;
4348                     STRLEN len;
4349                     UV uvc;
4350                     U8 *uscan;
4351
4352                     while (chars) {
4353                         if (utf8_target) {
4354                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
4355                                                     uniflags);
4356                             uc += len;
4357                         }
4358                         else {
4359                             uvc = *uc;
4360                             uc++;
4361                         }
4362                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4363                         uscan = foldbuf;
4364                         while (foldlen) {
4365                             if (!--chars)
4366                                 break;
4367                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
4368                                             uniflags);
4369                             uscan += len;
4370                             foldlen -= len;
4371                         }
4372                     }
4373                 }
4374                 else {
4375                     if (utf8_target)
4376                         while (chars--)
4377                             uc += UTF8SKIP(uc);
4378                     else
4379                         uc += chars;
4380                 }
4381             }
4382
4383             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4384                             ? ST.jump[ST.nextword]
4385                             : NEXT_OFF(ST.me));
4386
4387             DEBUG_EXECUTE_r({
4388                 PerlIO_printf( Perl_debug_log,
4389                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4390                     REPORT_CODE_OFF+depth*2, "",
4391                     PL_colors[4],
4392                     ST.nextword,
4393                     PL_colors[5]
4394                     );
4395             });
4396
4397             if (ST.accepted > 1 || has_cutgroup) {
4398                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4399                 assert(0); /* NOTREACHED */
4400             }
4401             /* only one choice left - just continue */
4402             DEBUG_EXECUTE_r({
4403                 AV *const trie_words
4404                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4405                 SV ** const tmp = av_fetch( trie_words,
4406                     ST.nextword-1, 0 );
4407                 SV *sv= tmp ? sv_newmortal() : NULL;
4408
4409                 PerlIO_printf( Perl_debug_log,
4410                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4411                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4412                     ST.nextword,
4413                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4414                             PL_colors[0], PL_colors[1],
4415                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4416                         )
4417                     : "not compiled under -Dr",
4418                     PL_colors[5] );
4419             });
4420
4421             locinput = (char*)uc;
4422             continue; /* execute rest of RE */
4423             assert(0); /* NOTREACHED */
4424         }
4425 #undef  ST
4426
4427         case EXACT: {            /*  /abc/        */
4428             char *s = STRING(scan);
4429             ln = STR_LEN(scan);
4430             if (utf8_target != is_utf8_pat) {
4431                 /* The target and the pattern have differing utf8ness. */
4432                 char *l = locinput;
4433                 const char * const e = s + ln;
4434
4435                 if (utf8_target) {
4436                     /* The target is utf8, the pattern is not utf8.
4437                      * Above-Latin1 code points can't match the pattern;
4438                      * invariants match exactly, and the other Latin1 ones need
4439                      * to be downgraded to a single byte in order to do the
4440                      * comparison.  (If we could be confident that the target
4441                      * is not malformed, this could be refactored to have fewer
4442                      * tests by just assuming that if the first bytes match, it
4443                      * is an invariant, but there are tests in the test suite
4444                      * dealing with (??{...}) which violate this) */
4445                     while (s < e) {
4446                         if (l >= reginfo->strend
4447                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4448                         {
4449                             sayNO;
4450                         }
4451                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4452                             if (*l != *s) {
4453                                 sayNO;
4454                             }
4455                             l++;
4456                         }
4457                         else {
4458                             if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4459                             {
4460                                 sayNO;
4461                             }
4462                             l += 2;
4463                         }
4464                         s++;
4465                     }
4466                 }
4467                 else {
4468                     /* The target is not utf8, the pattern is utf8. */
4469                     while (s < e) {
4470                         if (l >= reginfo->strend
4471                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4472                         {
4473                             sayNO;
4474                         }
4475                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4476                             if (*s != *l) {
4477                                 sayNO;
4478                             }
4479                             s++;
4480                         }
4481                         else {
4482                             if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4483                             {
4484                                 sayNO;
4485                             }
4486                             s += 2;
4487                         }
4488                         l++;
4489                     }
4490                 }
4491                 locinput = l;
4492             }
4493             else {
4494                 /* The target and the pattern have the same utf8ness. */
4495                 /* Inline the first character, for speed. */
4496                 if (reginfo->strend - locinput < ln
4497                     || UCHARAT(s) != nextchr
4498                     || (ln > 1 && memNE(s, locinput, ln)))
4499                 {
4500                     sayNO;
4501                 }
4502                 locinput += ln;
4503             }
4504             break;
4505             }
4506
4507         case EXACTFL: {          /*  /abc/il      */
4508             re_fold_t folder;
4509             const U8 * fold_array;
4510             const char * s;
4511             U32 fold_utf8_flags;
4512
4513             folder = foldEQ_locale;
4514             fold_array = PL_fold_locale;
4515             fold_utf8_flags = FOLDEQ_LOCALE;
4516             goto do_exactf;
4517
4518         case EXACTFU_SS:         /*  /\x{df}/iu   */
4519         case EXACTFU:            /*  /abc/iu      */
4520             folder = foldEQ_latin1;
4521             fold_array = PL_fold_latin1;
4522             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4523             goto do_exactf;
4524
4525         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
4526                                    patterns */
4527             assert(! is_utf8_pat);
4528             /* FALLTHROUGH */
4529         case EXACTFA:            /*  /abc/iaa     */
4530             folder = foldEQ_latin1;
4531             fold_array = PL_fold_latin1;
4532             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4533             goto do_exactf;
4534
4535         case EXACTF:             /*  /abc/i    This node only generated for
4536                                                non-utf8 patterns */
4537             assert(! is_utf8_pat);
4538             folder = foldEQ;
4539             fold_array = PL_fold;
4540             fold_utf8_flags = 0;
4541
4542           do_exactf:
4543             s = STRING(scan);
4544             ln = STR_LEN(scan);
4545
4546             if (utf8_target
4547                 || is_utf8_pat
4548                 || state_num == EXACTFU_SS
4549                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4550             {
4551               /* Either target or the pattern are utf8, or has the issue where
4552                * the fold lengths may differ. */
4553                 const char * const l = locinput;
4554                 char *e = reginfo->strend;
4555
4556                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4557                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4558                 {
4559                     sayNO;
4560                 }
4561                 locinput = e;
4562                 break;
4563             }
4564
4565             /* Neither the target nor the pattern are utf8 */
4566             if (UCHARAT(s) != nextchr
4567                 && !NEXTCHR_IS_EOS
4568                 && UCHARAT(s) != fold_array[nextchr])
4569             {
4570                 sayNO;
4571             }
4572             if (reginfo->strend - locinput < ln)
4573                 sayNO;
4574             if (ln > 1 && ! folder(s, locinput, ln))
4575                 sayNO;
4576             locinput += ln;
4577             break;
4578         }
4579
4580         /* XXX Could improve efficiency by separating these all out using a
4581          * macro or in-line function.  At that point regcomp.c would no longer
4582          * have to set the FLAGS fields of these */
4583         case BOUNDL:  /*  /\b/l  */
4584         case NBOUNDL: /*  /\B/l  */
4585         case BOUND:   /*  /\b/   */
4586         case BOUNDU:  /*  /\b/u  */
4587         case BOUNDA:  /*  /\b/a  */
4588         case NBOUND:  /*  /\B/   */
4589         case NBOUNDU: /*  /\B/u  */
4590         case NBOUNDA: /*  /\B/a  */
4591             /* was last char in word? */
4592             if (utf8_target
4593                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4594                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4595             {
4596                 if (locinput == reginfo->strbeg)
4597                     ln = '\n';
4598                 else {
4599                     const U8 * const r =
4600                             reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4601
4602                     ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,
4603                                                                    0, uniflags);
4604                 }
4605                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4606                     ln = isWORDCHAR_uni(ln);
4607                     if (NEXTCHR_IS_EOS)
4608                         n = 0;
4609                     else {
4610                         LOAD_UTF8_CHARCLASS_ALNUM();
4611                         n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4612                                                                 utf8_target);
4613                     }
4614                 }
4615                 else {
4616                     ln = isWORDCHAR_LC_uvchr(ln);
4617                     n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4618                 }
4619             }
4620             else {
4621
4622                 /* Here the string isn't utf8, or is utf8 and only ascii
4623                  * characters are to match \w.  In the latter case looking at
4624                  * the byte just prior to the current one may be just the final
4625                  * byte of a multi-byte character.  This is ok.  There are two
4626                  * cases:
4627                  * 1) it is a single byte character, and then the test is doing
4628                  *      just what it's supposed to.
4629                  * 2) it is a multi-byte character, in which case the final
4630                  *      byte is never mistakable for ASCII, and so the test
4631                  *      will say it is not a word character, which is the
4632                  *      correct answer. */
4633                 ln = (locinput != reginfo->strbeg) ?
4634                     UCHARAT(locinput - 1) : '\n';
4635                 switch (FLAGS(scan)) {
4636                     case REGEX_UNICODE_CHARSET:
4637                         ln = isWORDCHAR_L1(ln);
4638                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4639                         break;
4640                     case REGEX_LOCALE_CHARSET:
4641                         ln = isWORDCHAR_LC(ln);
4642                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4643                         break;
4644                     case REGEX_DEPENDS_CHARSET:
4645                         ln = isWORDCHAR(ln);
4646                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4647                         break;
4648                     case REGEX_ASCII_RESTRICTED_CHARSET:
4649                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4650                         ln = isWORDCHAR_A(ln);
4651                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4652                         break;
4653                     default:
4654                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4655                 }
4656             }
4657             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4658              * regcomp.sym */
4659             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4660                     sayNO;
4661             break;
4662
4663         case ANYOF:  /*  /[abc]/       */
4664             if (NEXTCHR_IS_EOS)
4665                 sayNO;
4666             if (utf8_target) {
4667                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
4668                                                                    utf8_target))
4669                     sayNO;
4670                 locinput += UTF8SKIP(locinput);
4671             }
4672             else {
4673                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4674                     sayNO;
4675                 locinput++;
4676             }
4677             break;
4678
4679         /* The argument (FLAGS) to all the POSIX node types is the class number
4680          * */
4681
4682         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4683             to_complement = 1;
4684             /* FALLTHROUGH */
4685
4686         case POSIXL:    /* \w or [:punct:] etc. under /l */
4687             if (NEXTCHR_IS_EOS)
4688                 sayNO;
4689
4690             /* Use isFOO_lc() for characters within Latin1.  (Note that
4691              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4692              * wouldn't be invariant) */
4693             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4694                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4695                     sayNO;
4696                 }
4697             }
4698             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4699                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4700                                            (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4701                                                             *(locinput + 1))))))
4702                 {
4703                     sayNO;
4704                 }
4705             }
4706             else { /* Here, must be an above Latin-1 code point */
4707                 goto utf8_posix_not_eos;
4708             }
4709
4710             /* Here, must be utf8 */
4711             locinput += UTF8SKIP(locinput);
4712             break;
4713
4714         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4715             to_complement = 1;
4716             /* FALLTHROUGH */
4717
4718         case POSIXD:    /* \w or [:punct:] etc. under /d */
4719             if (utf8_target) {
4720                 goto utf8_posix;
4721             }
4722             goto posixa;
4723
4724         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4725
4726             if (NEXTCHR_IS_EOS) {
4727                 sayNO;
4728             }
4729
4730             /* All UTF-8 variants match */
4731             if (! UTF8_IS_INVARIANT(nextchr)) {
4732                 goto increment_locinput;
4733             }
4734
4735             to_complement = 1;
4736             /* FALLTHROUGH */
4737
4738         case POSIXA:    /* \w or [:punct:] etc. under /a */
4739
4740           posixa:
4741             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4742              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4743              * character is a single byte */
4744
4745             if (NEXTCHR_IS_EOS
4746                 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4747                                                             FLAGS(scan)))))
4748             {
4749                 sayNO;
4750             }
4751
4752             /* Here we are either not in utf8, or we matched a utf8-invariant,
4753              * so the next char is the next byte */
4754             locinput++;
4755             break;
4756
4757         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4758             to_complement = 1;
4759             /* FALLTHROUGH */
4760
4761         case POSIXU:    /* \w or [:punct:] etc. under /u */
4762           utf8_posix:
4763             if (NEXTCHR_IS_EOS) {
4764                 sayNO;
4765             }
4766           utf8_posix_not_eos:
4767
4768             /* Use _generic_isCC() for characters within Latin1.  (Note that
4769              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4770              * wouldn't be invariant) */
4771             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4772                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4773                                                            FLAGS(scan)))))
4774                 {
4775                     sayNO;
4776                 }
4777                 locinput++;
4778             }
4779             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4780                 if (! (to_complement
4781                        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4782                                                                *(locinput + 1)),
4783                                              FLAGS(scan)))))
4784                 {
4785                     sayNO;
4786                 }
4787                 locinput += 2;
4788             }
4789             else {  /* Handle above Latin-1 code points */
4790                 classnum = (_char_class_number) FLAGS(scan);
4791                 if (classnum < _FIRST_NON_SWASH_CC) {
4792
4793                     /* Here, uses a swash to find such code points.  Load if if
4794                      * not done already */
4795                     if (! PL_utf8_swash_ptrs[classnum]) {
4796                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4797                         PL_utf8_swash_ptrs[classnum]
4798                                 = _core_swash_init("utf8",
4799                                         "",
4800                                         &PL_sv_undef, 1, 0,
4801                                         PL_XPosix_ptrs[classnum], &flags);
4802                     }
4803                     if (! (to_complement
4804                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4805                                                (U8 *) locinput, TRUE))))
4806                     {
4807                         sayNO;
4808                     }
4809                 }
4810                 else {  /* Here, uses macros to find above Latin-1 code points */
4811                     switch (classnum) {
4812                         case _CC_ENUM_SPACE:    /* XXX would require separate
4813                                                    code if we revert the change
4814                                                    of \v matching this */
4815                         case _CC_ENUM_PSXSPC:
4816                             if (! (to_complement
4817                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
4818                             {
4819                                 sayNO;
4820                             }
4821                             break;
4822                         case _CC_ENUM_BLANK:
4823                             if (! (to_complement
4824                                             ^ cBOOL(is_HORIZWS_high(locinput))))
4825                             {
4826                                 sayNO;
4827                             }
4828                             break;
4829                         case _CC_ENUM_XDIGIT:
4830                             if (! (to_complement
4831                                             ^ cBOOL(is_XDIGIT_high(locinput))))
4832                             {
4833                                 sayNO;
4834                             }
4835                             break;
4836                         case _CC_ENUM_VERTSPACE:
4837                             if (! (to_complement
4838                                             ^ cBOOL(is_VERTWS_high(locinput))))
4839                             {
4840                                 sayNO;
4841                             }
4842                             break;
4843                         default:    /* The rest, e.g. [:cntrl:], can't match
4844                                        above Latin1 */
4845                             if (! to_complement) {
4846                                 sayNO;
4847                             }
4848                             break;
4849                     }
4850                 }
4851                 locinput += UTF8SKIP(locinput);
4852             }
4853             break;
4854
4855         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4856                        a Unicode extended Grapheme Cluster */
4857             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4858               extended Grapheme Cluster is:
4859
4860             CR LF
4861             | Prepend* Begin Extend*
4862             | .
4863
4864             Begin is:           ( Special_Begin | ! Control )
4865             Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4866             Extend is:          ( Grapheme_Extend | Spacing_Mark )
4867             Control is:         [ GCB_Control | CR | LF ]
4868             Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4869
4870                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4871                we can rewrite
4872
4873                    Begin is ( Regular_Begin + Special Begin )
4874
4875                It turns out that 98.4% of all Unicode code points match
4876                Regular_Begin.  Doing it this way eliminates a table match in
4877                the previous implementation for almost all Unicode code points.
4878
4879                There is a subtlety with Prepend* which showed up in testing.
4880                Note that the Begin, and only the Begin is required in:
4881                 | Prepend* Begin Extend*
4882                Also, Begin contains '! Control'.  A Prepend must be a
4883                '!  Control', which means it must also be a Begin.  What it
4884                comes down to is that if we match Prepend* and then find no
4885                suitable Begin afterwards, that if we backtrack the last
4886                Prepend, that one will be a suitable Begin.
4887             */
4888
4889             if (NEXTCHR_IS_EOS)
4890                 sayNO;
4891             if  (! utf8_target) {
4892
4893                 /* Match either CR LF  or '.', as all the other possibilities
4894                  * require utf8 */
4895                 locinput++;         /* Match the . or CR */
4896                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4897                                        match the LF */
4898                     && locinput < reginfo->strend
4899                     && UCHARAT(locinput) == '\n')
4900                 {
4901                     locinput++;
4902                 }
4903             }
4904             else {
4905
4906                 /* Utf8: See if is ( CR LF ); already know that locinput <
4907                  * reginfo->strend, so locinput+1 is in bounds */
4908                 if ( nextchr == '\r' && locinput+1 < reginfo->strend
4909                      && UCHARAT(locinput + 1) == '\n')
4910                 {
4911                     locinput += 2;
4912                 }
4913                 else {
4914                     STRLEN len;
4915
4916                     /* In case have to backtrack to beginning, then match '.' */
4917                     char *starting = locinput;
4918
4919                     /* In case have to backtrack the last prepend */
4920                     char *previous_prepend = NULL;
4921
4922                     LOAD_UTF8_CHARCLASS_GCB();
4923
4924                     /* Match (prepend)*   */
4925                     while (locinput < reginfo->strend
4926                            && (len = is_GCB_Prepend_utf8(locinput)))
4927                     {
4928                         previous_prepend = locinput;
4929                         locinput += len;
4930                     }
4931
4932                     /* As noted above, if we matched a prepend character, but
4933                      * the next thing won't match, back off the last prepend we
4934                      * matched, as it is guaranteed to match the begin */
4935                     if (previous_prepend
4936                         && (locinput >=  reginfo->strend
4937                             || (! swash_fetch(PL_utf8_X_regular_begin,
4938                                              (U8*)locinput, utf8_target)
4939                                  && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4940                         )
4941                     {
4942                         locinput = previous_prepend;
4943                     }
4944
4945                     /* Note that here we know reginfo->strend > locinput, as we
4946                      * tested that upon input to this switch case, and if we
4947                      * moved locinput forward, we tested the result just above
4948                      * and it either passed, or we backed off so that it will
4949                      * now pass */
4950                     if (swash_fetch(PL_utf8_X_regular_begin,
4951                                     (U8*)locinput, utf8_target)) {
4952                         locinput += UTF8SKIP(locinput);
4953                     }
4954                     else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4955
4956                         /* Here did not match the required 'Begin' in the
4957                          * second term.  So just match the very first
4958                          * character, the '.' of the final term of the regex */
4959                         locinput = starting + UTF8SKIP(starting);
4960                         goto exit_utf8;
4961                     } else {
4962
4963                         /* Here is a special begin.  It can be composed of
4964                          * several individual characters.  One possibility is
4965                          * RI+ */
4966                         if ((len = is_GCB_RI_utf8(locinput))) {
4967                             locinput += len;
4968                             while (locinput < reginfo->strend
4969                                    && (len = is_GCB_RI_utf8(locinput)))
4970                             {
4971                                 locinput += len;
4972                             }
4973                         } else if ((len = is_GCB_T_utf8(locinput))) {
4974                             /* Another possibility is T+ */
4975                             locinput += len;
4976                             while (locinput < reginfo->strend
4977                                 && (len = is_GCB_T_utf8(locinput)))
4978                             {
4979                                 locinput += len;
4980                             }
4981                         } else {
4982
4983                             /* Here, neither RI+ nor T+; must be some other
4984                              * Hangul.  That means it is one of the others: L,
4985                              * LV, LVT or V, and matches:
4986                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4987
4988                             /* Match L*           */
4989                             while (locinput < reginfo->strend
4990                                    && (len = is_GCB_L_utf8(locinput)))
4991                             {
4992                                 locinput += len;
4993                             }
4994
4995                             /* Here, have exhausted L*.  If the next character
4996                              * is not an LV, LVT nor V, it means we had to have
4997                              * at least one L, so matches L+ in the original
4998                              * equation, we have a complete hangul syllable.
4999                              * Are done. */
5000
5001                             if (locinput < reginfo->strend
5002                                 && is_GCB_LV_LVT_V_utf8(locinput))
5003                             {
5004                                 /* Otherwise keep going.  Must be LV, LVT or V.
5005                                  * See if LVT, by first ruling out V, then LV */
5006                                 if (! is_GCB_V_utf8(locinput)
5007                                         /* All but every TCount one is LV */
5008                                     && (valid_utf8_to_uvchr((U8 *) locinput,
5009                                                                          NULL)
5010                                                                         - SBASE)
5011                                         % TCount != 0)
5012                                 {
5013                                     locinput += UTF8SKIP(locinput);
5014                                 } else {
5015
5016                                     /* Must be  V or LV.  Take it, then match
5017                                      * V*     */
5018                                     locinput += UTF8SKIP(locinput);
5019                                     while (locinput < reginfo->strend
5020                                            && (len = is_GCB_V_utf8(locinput)))
5021                                     {
5022                                         locinput += len;
5023                                     }
5024                                 }
5025
5026                                 /* And any of LV, LVT, or V can be followed
5027                                  * by T*            */
5028                                 while (locinput < reginfo->strend
5029                                        && (len = is_GCB_T_utf8(locinput)))
5030                                 {
5031                                     locinput += len;
5032                                 }
5033                             }
5034                         }
5035                     }
5036
5037                     /* Match any extender */
5038                     while (locinput < reginfo->strend
5039                             && swash_fetch(PL_utf8_X_extend,
5040                                             (U8*)locinput, utf8_target))
5041                     {
5042                         locinput += UTF8SKIP(locinput);
5043                     }
5044                 }
5045             exit_utf8:
5046                 if (locinput > reginfo->strend) sayNO;
5047             }
5048             break;
5049
5050         case NREFFL:  /*  /\g{name}/il  */
5051         {   /* The capture buffer cases.  The ones beginning with N for the
5052                named buffers just convert to the equivalent numbered and
5053                pretend they were called as the corresponding numbered buffer
5054                op.  */
5055             /* don't initialize these in the declaration, it makes C++
5056                unhappy */
5057             const char *s;
5058             char type;
5059             re_fold_t folder;
5060             const U8 *fold_array;
5061             UV utf8_fold_flags;
5062
5063             folder = foldEQ_locale;
5064             fold_array = PL_fold_locale;
5065             type = REFFL;
5066             utf8_fold_flags = FOLDEQ_LOCALE;
5067             goto do_nref;
5068
5069         case NREFFA:  /*  /\g{name}/iaa  */
5070             folder = foldEQ_latin1;
5071             fold_array = PL_fold_latin1;
5072             type = REFFA;
5073             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5074             goto do_nref;
5075
5076         case NREFFU:  /*  /\g{name}/iu  */
5077             folder = foldEQ_latin1;
5078             fold_array = PL_fold_latin1;
5079             type = REFFU;
5080             utf8_fold_flags = 0;
5081             goto do_nref;
5082
5083         case NREFF:  /*  /\g{name}/i  */
5084             folder = foldEQ;
5085             fold_array = PL_fold;
5086             type = REFF;
5087             utf8_fold_flags = 0;
5088             goto do_nref;
5089
5090         case NREF:  /*  /\g{name}/   */
5091             type = REF;
5092             folder = NULL;
5093             fold_array = NULL;
5094             utf8_fold_flags = 0;
5095           do_nref:
5096
5097             /* For the named back references, find the corresponding buffer
5098              * number */
5099             n = reg_check_named_buff_matched(rex,scan);
5100
5101             if ( ! n ) {
5102                 sayNO;
5103             }
5104             goto do_nref_ref_common;
5105
5106         case REFFL:  /*  /\1/il  */
5107             folder = foldEQ_locale;
5108             fold_array = PL_fold_locale;
5109             utf8_fold_flags = FOLDEQ_LOCALE;
5110             goto do_ref;
5111
5112         case REFFA:  /*  /\1/iaa  */
5113             folder = foldEQ_latin1;
5114             fold_array = PL_fold_latin1;
5115             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5116             goto do_ref;
5117
5118         case REFFU:  /*  /\1/iu  */
5119             folder = foldEQ_latin1;
5120             fold_array = PL_fold_latin1;
5121             utf8_fold_flags = 0;
5122             goto do_ref;
5123
5124         case REFF:  /*  /\1/i  */
5125             folder = foldEQ;
5126             fold_array = PL_fold;
5127             utf8_fold_flags = 0;
5128             goto do_ref;
5129
5130         case REF:  /*  /\1/    */
5131             folder = NULL;
5132             fold_array = NULL;
5133             utf8_fold_flags = 0;
5134
5135           do_ref:
5136             type = OP(scan);
5137             n = ARG(scan);  /* which paren pair */
5138
5139           do_nref_ref_common:
5140             ln = rex->offs[n].start;
5141             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5142             if (rex->lastparen < n || ln == -1)
5143                 sayNO;                  /* Do not match unless seen CLOSEn. */
5144             if (ln == rex->offs[n].end)
5145                 break;
5146
5147             s = reginfo->strbeg + ln;
5148             if (type != REF     /* REF can do byte comparison */
5149                 && (utf8_target || type == REFFU || type == REFFL))
5150             {
5151                 char * limit = reginfo->strend;
5152
5153                 /* This call case insensitively compares the entire buffer
5154                     * at s, with the current input starting at locinput, but
5155                     * not going off the end given by reginfo->strend, and
5156                     * returns in <limit> upon success, how much of the
5157                     * current input was matched */
5158                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
5159                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
5160                 {
5161                     sayNO;
5162                 }
5163                 locinput = limit;
5164                 break;
5165             }
5166
5167             /* Not utf8:  Inline the first character, for speed. */
5168             if (!NEXTCHR_IS_EOS &&
5169                 UCHARAT(s) != nextchr &&
5170                 (type == REF ||
5171                  UCHARAT(s) != fold_array[nextchr]))
5172                 sayNO;
5173             ln = rex->offs[n].end - ln;
5174             if (locinput + ln > reginfo->strend)
5175                 sayNO;
5176             if (ln > 1 && (type == REF
5177                            ? memNE(s, locinput, ln)
5178                            : ! folder(s, locinput, ln)))
5179                 sayNO;
5180             locinput += ln;
5181             break;
5182         }
5183
5184         case NOTHING: /* null op; e.g. the 'nothing' following
5185                        * the '*' in m{(a+|b)*}' */
5186             break;
5187         case TAIL: /* placeholder while compiling (A|B|C) */
5188             break;
5189
5190         case BACK: /* ??? doesn't appear to be used ??? */
5191             break;
5192
5193 #undef  ST
5194 #define ST st->u.eval
5195         {
5196             SV *ret;
5197             REGEXP *re_sv;
5198             regexp *re;
5199             regexp_internal *rei;
5200             regnode *startpoint;
5201
5202         case GOSTART: /*  (?R)  */
5203         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
5204             if (cur_eval && cur_eval->locinput==locinput) {
5205                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
5206                     Perl_croak(aTHX_ "Infinite recursion in regex");
5207                 if ( ++nochange_depth > max_nochange_depth )
5208                     Perl_croak(aTHX_
5209                         "Pattern subroutine nesting without pos change"
5210                         " exceeded limit in regex");
5211             } else {
5212                 nochange_depth = 0;
5213             }
5214             re_sv = rex_sv;
5215             re = rex;
5216             rei = rexi;
5217             if (OP(scan)==GOSUB) {
5218                 startpoint = scan + ARG2L(scan);
5219                 ST.close_paren = ARG(scan);
5220             } else {
5221                 startpoint = rei->program+1;
5222                 ST.close_paren = 0;
5223             }
5224
5225             /* Save all the positions seen so far. */
5226             ST.cp = regcppush(rex, 0, maxopenparen);
5227             REGCP_SET(ST.lastcp);
5228
5229             /* and then jump to the code we share with EVAL */
5230             goto eval_recurse_doit;
5231
5232             assert(0); /* NOTREACHED */
5233
5234         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
5235             if (cur_eval && cur_eval->locinput==locinput) {
5236                 if ( ++nochange_depth > max_nochange_depth )
5237                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
5238             } else {
5239                 nochange_depth = 0;
5240             }
5241             {
5242                 /* execute the code in the {...} */
5243
5244                 dSP;
5245                 IV before;
5246                 OP * const oop = PL_op;
5247                 COP * const ocurcop = PL_curcop;
5248                 OP *nop;
5249                 CV *newcv;
5250
5251                 /* save *all* paren positions */
5252                 regcppush(rex, 0, maxopenparen);
5253                 REGCP_SET(runops_cp);
5254
5255                 if (!caller_cv)
5256                     caller_cv = find_runcv(NULL);
5257
5258                 n = ARG(scan);
5259
5260                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
5261                     newcv = (ReANY(
5262                                                 (REGEXP*)(rexi->data->data[n])
5263                                             ))->qr_anoncv
5264                                         ;
5265                     nop = (OP*)rexi->data->data[n+1];
5266                 }
5267                 else if (rexi->data->what[n] == 'l') { /* literal code */
5268                     newcv = caller_cv;
5269                     nop = (OP*)rexi->data->data[n];
5270                     assert(CvDEPTH(newcv));
5271                 }
5272                 else {
5273                     /* literal with own CV */
5274                     assert(rexi->data->what[n] == 'L');
5275                     newcv = rex->qr_anoncv;
5276                     nop = (OP*)rexi->data->data[n];
5277                 }
5278
5279                 /* normally if we're about to execute code from the same
5280                  * CV that we used previously, we just use the existing
5281                  * CX stack entry. However, its possible that in the
5282                  * meantime we may have backtracked, popped from the save
5283                  * stack, and undone the SAVECOMPPAD(s) associated with
5284                  * PUSH_MULTICALL; in which case PL_comppad no longer
5285                  * points to newcv's pad. */
5286                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
5287                 {
5288                     U8 flags = (CXp_SUB_RE |
5289                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
5290                     if (last_pushed_cv) {
5291                         CHANGE_MULTICALL_FLAGS(newcv, flags);
5292                     }
5293                     else {
5294                         PUSH_MULTICALL_FLAGS(newcv, flags);
5295                     }
5296                     last_pushed_cv = newcv;
5297                 }
5298                 else {
5299                     /* these assignments are just to silence compiler
5300                      * warnings */
5301                     multicall_cop = NULL;
5302                     newsp = NULL;
5303                 }
5304                 last_pad = PL_comppad;
5305
5306                 /* the initial nextstate you would normally execute
5307                  * at the start of an eval (which would cause error
5308                  * messages to come from the eval), may be optimised
5309                  * away from the execution path in the regex code blocks;
5310                  * so manually set PL_curcop to it initially */
5311                 {
5312                     OP *o = cUNOPx(nop)->op_first;
5313                     assert(o->op_type == OP_NULL);
5314                     if (o->op_targ == OP_SCOPE) {
5315                         o = cUNOPo->op_first;
5316                     }
5317                     else {
5318                         assert(o->op_targ == OP_LEAVE);
5319                         o = cUNOPo->op_first;
5320                         assert(o->op_type == OP_ENTER);
5321                         o = OP_SIBLING(o);
5322                     }
5323
5324                     if (o->op_type != OP_STUB) {
5325                         assert(    o->op_type == OP_NEXTSTATE
5326                                 || o->op_type == OP_DBSTATE
5327                                 || (o->op_type == OP_NULL
5328                                     &&  (  o->op_targ == OP_NEXTSTATE
5329                                         || o->op_targ == OP_DBSTATE
5330                                         )
5331                                     )
5332                         );
5333                         PL_curcop = (COP*)o;
5334                     }
5335                 }
5336                 nop = nop->op_next;
5337
5338                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
5339                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5340
5341                 rex->offs[0].end = locinput - reginfo->strbeg;
5342                 if (reginfo->info_aux_eval->pos_magic)
5343                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5344                                   reginfo->sv, reginfo->strbeg,
5345                                   locinput - reginfo->strbeg);
5346
5347                 if (sv_yes_mark) {
5348                     SV *sv_mrk = get_sv("REGMARK", 1);
5349                     sv_setsv(sv_mrk, sv_yes_mark);
5350                 }
5351
5352                 /* we don't use MULTICALL here as we want to call the
5353                  * first op of the block of interest, rather than the
5354                  * first op of the sub */
5355                 before = (IV)(SP-PL_stack_base);
5356                 PL_op = nop;
5357                 CALLRUNOPS(aTHX);                       /* Scalar context. */
5358                 SPAGAIN;
5359                 if ((IV)(SP-PL_stack_base) == before)
5360                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
5361                 else {
5362                     ret = POPs;
5363                     PUTBACK;
5364                 }
5365
5366                 /* before restoring everything, evaluate the returned
5367                  * value, so that 'uninit' warnings don't use the wrong
5368                  * PL_op or pad. Also need to process any magic vars
5369                  * (e.g. $1) *before* parentheses are restored */
5370
5371                 PL_op = NULL;
5372
5373                 re_sv = NULL;
5374                 if (logical == 0)        /*   (?{})/   */
5375                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5376                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
5377                     sw = cBOOL(SvTRUE(ret));
5378                     logical = 0;
5379                 }
5380                 else {                   /*  /(??{})  */
5381                     /*  if its overloaded, let the regex compiler handle
5382                      *  it; otherwise extract regex, or stringify  */
5383                     if (SvGMAGICAL(ret))
5384                         ret = sv_mortalcopy(ret);
5385                     if (!SvAMAGIC(ret)) {
5386                         SV *sv = ret;
5387                         if (SvROK(sv))
5388                             sv = SvRV(sv);
5389                         if (SvTYPE(sv) == SVt_REGEXP)
5390                             re_sv = (REGEXP*) sv;
5391                         else if (SvSMAGICAL(ret)) {
5392                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
5393                             if (mg)
5394                                 re_sv = (REGEXP *) mg->mg_obj;
5395                         }
5396
5397                         /* force any undef warnings here */
5398                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
5399                             ret = sv_mortalcopy(ret);
5400                             (void) SvPV_force_nolen(ret);
5401                         }
5402                     }
5403
5404                 }
5405
5406                 /* *** Note that at this point we don't restore
5407                  * PL_comppad, (or pop the CxSUB) on the assumption it may
5408                  * be used again soon. This is safe as long as nothing
5409                  * in the regexp code uses the pad ! */
5410                 PL_op = oop;
5411                 PL_curcop = ocurcop;
5412                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5413                 PL_curpm = PL_reg_curpm;
5414
5415                 if (logical != 2)
5416                     break;
5417             }
5418
5419                 /* only /(??{})/  from now on */
5420                 logical = 0;
5421                 {
5422                     /* extract RE object from returned value; compiling if
5423                      * necessary */
5424
5425                     if (re_sv) {
5426                         re_sv = reg_temp_copy(NULL, re_sv);
5427                     }
5428                     else {
5429                         U32 pm_flags = 0;
5430
5431                         if (SvUTF8(ret) && IN_BYTES) {
5432                             /* In use 'bytes': make a copy of the octet
5433                              * sequence, but without the flag on */
5434                             STRLEN len;
5435                             const char *const p = SvPV(ret, len);
5436                             ret = newSVpvn_flags(p, len, SVs_TEMP);
5437                         }
5438                         if (rex->intflags & PREGf_USE_RE_EVAL)
5439                             pm_flags |= PMf_USE_RE_EVAL;
5440
5441                         /* if we got here, it should be an engine which
5442                          * supports compiling code blocks and stuff */
5443                         assert(rex->engine && rex->engine->op_comp);
5444                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5445                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5446                                     rex->engine, NULL, NULL,
5447                                     /* copy /msix etc to inner pattern */
5448                                     scan->flags,
5449                                     pm_flags);
5450
5451                         if (!(SvFLAGS(ret)
5452                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
5453                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
5454                             /* This isn't a first class regexp. Instead, it's
5455                                caching a regexp onto an existing, Perl visible
5456                                scalar.  */
5457                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5458                         }
5459                     }
5460                     SAVEFREESV(re_sv);
5461                     re = ReANY(re_sv);
5462                 }
5463                 RXp_MATCH_COPIED_off(re);
5464                 re->subbeg = rex->subbeg;
5465                 re->sublen = rex->sublen;
5466                 re->suboffset = rex->suboffset;
5467                 re->subcoffset = rex->subcoffset;
5468                 re->lastparen = 0;
5469                 re->lastcloseparen = 0;
5470                 rei = RXi_GET(re);
5471                 DEBUG_EXECUTE_r(
5472                     debug_start_match(re_sv, utf8_target, locinput,
5473                                     reginfo->strend, "Matching embedded");
5474                 );
5475                 startpoint = rei->program + 1;
5476                 ST.close_paren = 0; /* only used for GOSUB */
5477                 /* Save all the seen positions so far. */
5478                 ST.cp = regcppush(rex, 0, maxopenparen);
5479                 REGCP_SET(ST.lastcp);
5480                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
5481                 maxopenparen = 0;
5482                 /* run the pattern returned from (??{...}) */
5483
5484         eval_recurse_doit: /* Share code with GOSUB below this line
5485                             * At this point we expect the stack context to be
5486                             * set up correctly */
5487
5488                 /* invalidate the S-L poscache. We're now executing a
5489                  * different set of WHILEM ops (and their associated
5490                  * indexes) against the same string, so the bits in the
5491                  * cache are meaningless. Setting maxiter to zero forces
5492                  * the cache to be invalidated and zeroed before reuse.
5493                  * XXX This is too dramatic a measure. Ideally we should
5494                  * save the old cache and restore when running the outer
5495                  * pattern again */
5496                 reginfo->poscache_maxiter = 0;
5497
5498                 /* the new regexp might have a different is_utf8_pat than we do */
5499                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5500
5501                 ST.prev_rex = rex_sv;
5502                 ST.prev_curlyx = cur_curlyx;
5503                 rex_sv = re_sv;
5504                 SET_reg_curpm(rex_sv);
5505                 rex = re;
5506                 rexi = rei;
5507                 cur_curlyx = NULL;
5508                 ST.B = next;
5509                 ST.prev_eval = cur_eval;
5510                 cur_eval = st;
5511                 /* now continue from first node in postoned RE */
5512                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5513                 assert(0); /* NOTREACHED */
5514         }
5515
5516         case EVAL_AB: /* cleanup after a successful (??{A})B */
5517             /* note: this is called twice; first after popping B, then A */
5518             rex_sv = ST.prev_rex;
5519             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5520             SET_reg_curpm(rex_sv);
5521             rex = ReANY(rex_sv);
5522             rexi = RXi_GET(rex);
5523             {
5524                 /* preserve $^R across LEAVE's. See Bug 121070. */
5525                 SV *save_sv= GvSV(PL_replgv);
5526                 SvREFCNT_inc(save_sv);
5527                 regcpblow(ST.cp); /* LEAVE in disguise */
5528                 sv_setsv(GvSV(PL_replgv), save_sv);
5529                 SvREFCNT_dec(save_sv);
5530             }
5531             cur_eval = ST.prev_eval;
5532             cur_curlyx = ST.prev_curlyx;
5533
5534             /* Invalidate cache. See "invalidate" comment above. */
5535             reginfo->poscache_maxiter = 0;
5536             if ( nochange_depth )
5537                 nochange_depth--;
5538             sayYES;
5539
5540
5541         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5542             /* note: this is called twice; first after popping B, then A */
5543             rex_sv = ST.prev_rex;
5544             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5545             SET_reg_curpm(rex_sv);
5546             rex = ReANY(rex_sv);
5547             rexi = RXi_GET(rex);
5548
5549             REGCP_UNWIND(ST.lastcp);
5550             regcppop(rex, &maxopenparen);
5551             cur_eval = ST.prev_eval;
5552             cur_curlyx = ST.prev_curlyx;
5553             /* Invalidate cache. See "invalidate" comment above. */
5554             reginfo->poscache_maxiter = 0;
5555             if ( nochange_depth )
5556                 nochange_depth--;
5557             sayNO_SILENT;
5558 #undef ST
5559
5560         case OPEN: /*  (  */
5561             n = ARG(scan);  /* which paren pair */
5562             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5563             if (n > maxopenparen)
5564                 maxopenparen = n;
5565             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5566                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5567                 PTR2UV(rex),
5568                 PTR2UV(rex->offs),
5569                 (UV)n,
5570                 (IV)rex->offs[n].start_tmp,
5571                 (UV)maxopenparen
5572             ));
5573             lastopen = n;
5574             break;
5575
5576 /* XXX really need to log other places start/end are set too */
5577 #define CLOSE_CAPTURE \
5578     rex->offs[n].start = rex->offs[n].start_tmp; \
5579     rex->offs[n].end = locinput - reginfo->strbeg; \
5580     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5581         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5582         PTR2UV(rex), \
5583         PTR2UV(rex->offs), \
5584         (UV)n, \
5585         (IV)rex->offs[n].start, \
5586         (IV)rex->offs[n].end \
5587     ))
5588
5589         case CLOSE:  /*  )  */
5590             n = ARG(scan);  /* which paren pair */
5591             CLOSE_CAPTURE;
5592             if (n > rex->lastparen)
5593                 rex->lastparen = n;
5594             rex->lastcloseparen = n;
5595             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5596                 goto fake_end;
5597             }
5598             break;
5599
5600         case ACCEPT:  /*  (*ACCEPT)  */
5601             if (ARG(scan)){
5602                 regnode *cursor;
5603                 for (cursor=scan;
5604                      cursor && OP(cursor)!=END;
5605                      cursor=regnext(cursor))
5606                 {
5607                     if ( OP(cursor)==CLOSE ){
5608                         n = ARG(cursor);
5609                         if ( n <= lastopen ) {
5610                             CLOSE_CAPTURE;
5611                             if (n > rex->lastparen)
5612                                 rex->lastparen = n;
5613                             rex->lastcloseparen = n;
5614                             if ( n == ARG(scan) || (cur_eval &&
5615                                 cur_eval->u.eval.close_paren == n))
5616                                 break;
5617                         }
5618                     }
5619                 }
5620             }
5621             goto fake_end;
5622             /*NOTREACHED*/
5623
5624         case GROUPP:  /*  (?(1))  */
5625             n = ARG(scan);  /* which paren pair */
5626             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5627             break;
5628
5629         case NGROUPP:  /*  (?(<name>))  */
5630             /* reg_check_named_buff_matched returns 0 for no match */
5631             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5632             break;
5633
5634         case INSUBP:   /*  (?(R))  */
5635             n = ARG(scan);
5636             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5637             break;
5638
5639         case DEFINEP:  /*  (?(DEFINE))  */
5640             sw = 0;
5641             break;
5642
5643         case IFTHEN:   /*  (?(cond)A|B)  */
5644             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5645             if (sw)
5646                 next = NEXTOPER(NEXTOPER(scan));
5647             else {
5648                 next = scan + ARG(scan);
5649                 if (OP(next) == IFTHEN) /* Fake one. */
5650                     next = NEXTOPER(NEXTOPER(next));
5651             }
5652             break;
5653
5654         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5655             logical = scan->flags;
5656             break;
5657
5658 /*******************************************************************
5659
5660 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5661 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5662 STAR/PLUS/CURLY/CURLYN are used instead.)
5663
5664 A*B is compiled as <CURLYX><A><WHILEM><B>
5665
5666 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5667 state, which contains the current count, initialised to -1. It also sets
5668 cur_curlyx to point to this state, with any previous value saved in the
5669 state block.
5670
5671 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5672 since the pattern may possibly match zero times (i.e. it's a while {} loop
5673 rather than a do {} while loop).
5674
5675 Each entry to WHILEM represents a successful match of A. The count in the
5676 CURLYX block is incremented, another WHILEM state is pushed, and execution
5677 passes to A or B depending on greediness and the current count.
5678
5679 For example, if matching against the string a1a2a3b (where the aN are
5680 substrings that match /A/), then the match progresses as follows: (the
5681 pushed states are interspersed with the bits of strings matched so far):
5682
5683     <CURLYX cnt=-1>
5684     <CURLYX cnt=0><WHILEM>
5685     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5686     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5687     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5688     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5689
5690 (Contrast this with something like CURLYM, which maintains only a single
5691 backtrack state:
5692
5693     <CURLYM cnt=0> a1
5694     a1 <CURLYM cnt=1> a2
5695     a1 a2 <CURLYM cnt=2> a3
5696     a1 a2 a3 <CURLYM cnt=3> b
5697 )
5698
5699 Each WHILEM state block marks a point to backtrack to upon partial failure
5700 of A or B, and also contains some minor state data related to that
5701 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5702 overall state, such as the count, and pointers to the A and B ops.
5703
5704 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5705 must always point to the *current* CURLYX block, the rules are:
5706
5707 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5708 and set cur_curlyx to point the new block.
5709
5710 When popping the CURLYX block after a successful or unsuccessful match,
5711 restore the previous cur_curlyx.
5712
5713 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5714 to the outer one saved in the CURLYX block.
5715
5716 When popping the WHILEM block after a successful or unsuccessful B match,
5717 restore the previous cur_curlyx.
5718
5719 Here's an example for the pattern (AI* BI)*BO
5720 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5721
5722 cur_
5723 curlyx backtrack stack
5724 ------ ---------------
5725 NULL
5726 CO     <CO prev=NULL> <WO>
5727 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5728 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5729 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5730
5731 At this point the pattern succeeds, and we work back down the stack to
5732 clean up, restoring as we go:
5733
5734 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5735 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5736 CO     <CO prev=NULL> <WO>
5737 NULL
5738
5739 *******************************************************************/
5740
5741 #define ST st->u.curlyx
5742
5743         case CURLYX:    /* start of /A*B/  (for complex A) */
5744         {
5745             /* No need to save/restore up to this paren */
5746             I32 parenfloor = scan->flags;
5747
5748             assert(next); /* keep Coverity happy */
5749             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5750                 next += ARG(next);
5751
5752             /* XXXX Probably it is better to teach regpush to support
5753                parenfloor > maxopenparen ... */
5754             if (parenfloor > (I32)rex->lastparen)
5755                 parenfloor = rex->lastparen; /* Pessimization... */
5756
5757             ST.prev_curlyx= cur_curlyx;
5758             cur_curlyx = st;
5759             ST.cp = PL_savestack_ix;
5760
5761             /* these fields contain the state of the current curly.
5762              * they are accessed by subsequent WHILEMs */
5763             ST.parenfloor = parenfloor;
5764             ST.me = scan;
5765             ST.B = next;
5766             ST.minmod = minmod;
5767             minmod = 0;
5768             ST.count = -1;      /* this will be updated by WHILEM */
5769             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5770
5771             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5772             assert(0); /* NOTREACHED */
5773         }
5774
5775         case CURLYX_end: /* just finished matching all of A*B */
5776             cur_curlyx = ST.prev_curlyx;
5777             sayYES;
5778             assert(0); /* NOTREACHED */
5779
5780         case CURLYX_end_fail: /* just failed to match all of A*B */
5781             regcpblow(ST.cp);
5782             cur_curlyx = ST.prev_curlyx;
5783             sayNO;
5784             assert(0); /* NOTREACHED */
5785
5786
5787 #undef ST
5788 #define ST st->u.whilem
5789
5790         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5791         {
5792             /* see the discussion above about CURLYX/WHILEM */
5793             I32 n;
5794             int min, max;
5795             regnode *A;
5796
5797             assert(cur_curlyx); /* keep Coverity happy */
5798
5799             min = ARG1(cur_curlyx->u.curlyx.me);
5800             max = ARG2(cur_curlyx->u.curlyx.me);
5801             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5802             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5803             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5804             ST.cache_offset = 0;
5805             ST.cache_mask = 0;
5806
5807
5808             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5809                   "%*s  whilem: matched %ld out of %d..%d\n",
5810                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5811             );
5812
5813             /* First just match a string of min A's. */
5814
5815             if (n < min) {
5816                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5817                                     maxopenparen);
5818                 cur_curlyx->u.curlyx.lastloc = locinput;
5819                 REGCP_SET(ST.lastcp);
5820
5821                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5822                 assert(0); /* NOTREACHED */
5823             }
5824
5825             /* If degenerate A matches "", assume A done. */
5826
5827             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5828                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5829                    "%*s  whilem: empty match detected, trying continuation...\n",
5830                    REPORT_CODE_OFF+depth*2, "")
5831                 );
5832                 goto do_whilem_B_max;
5833             }
5834
5835             /* super-linear cache processing.
5836              *
5837              * The idea here is that for certain types of CURLYX/WHILEM -
5838              * principally those whose upper bound is infinity (and
5839              * excluding regexes that have things like \1 and other very
5840              * non-regular expresssiony things), then if a pattern like
5841              * /....A*.../ fails and we backtrack to the WHILEM, then we
5842              * make a note that this particular WHILEM op was at string
5843              * position 47 (say) when the rest of pattern failed. Then, if
5844              * we ever find ourselves back at that WHILEM, and at string
5845              * position 47 again, we can just fail immediately rather than
5846              * running the rest of the pattern again.
5847              *
5848              * This is very handy when patterns start to go
5849              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5850              * with a combinatorial explosion of backtracking.
5851              *
5852              * The cache is implemented as a bit array, with one bit per
5853              * string byte position per WHILEM op (up to 16) - so its
5854              * between 0.25 and 2x the string size.
5855              *
5856              * To avoid allocating a poscache buffer every time, we do an
5857              * initially countdown; only after we have  executed a WHILEM
5858              * op (string-length x #WHILEMs) times do we allocate the
5859              * cache.
5860              *
5861              * The top 4 bits of scan->flags byte say how many different
5862              * relevant CURLLYX/WHILEM op pairs there are, while the
5863              * bottom 4-bits is the identifying index number of this
5864              * WHILEM.
5865              */
5866
5867             if (scan->flags) {
5868
5869                 if (!reginfo->poscache_maxiter) {
5870                     /* start the countdown: Postpone detection until we
5871                      * know the match is not *that* much linear. */
5872                     reginfo->poscache_maxiter
5873                         =    (reginfo->strend - reginfo->strbeg + 1)
5874                            * (scan->flags>>4);
5875                     /* possible overflow for long strings and many CURLYX's */
5876                     if (reginfo->poscache_maxiter < 0)
5877                         reginfo->poscache_maxiter = I32_MAX;
5878                     reginfo->poscache_iter = reginfo->poscache_maxiter;
5879                 }
5880
5881                 if (reginfo->poscache_iter-- == 0) {
5882                     /* initialise cache */
5883                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
5884                     regmatch_info_aux *const aux = reginfo->info_aux;
5885                     if (aux->poscache) {
5886                         if ((SSize_t)reginfo->poscache_size < size) {
5887                             Renew(aux->poscache, size, char);
5888                             reginfo->poscache_size = size;
5889                         }
5890                         Zero(aux->poscache, size, char);
5891                     }
5892                     else {
5893                         reginfo->poscache_size = size;
5894                         Newxz(aux->poscache, size, char);
5895                     }
5896                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5897       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5898                               PL_colors[4], PL_colors[5])
5899                     );
5900                 }
5901
5902                 if (reginfo->poscache_iter < 0) {
5903                     /* have we already failed at this position? */
5904                     SSize_t offset, mask;
5905
5906                     reginfo->poscache_iter = -1; /* stop eventual underflow */
5907                     offset  = (scan->flags & 0xf) - 1
5908                                 +   (locinput - reginfo->strbeg)
5909                                   * (scan->flags>>4);
5910                     mask    = 1 << (offset % 8);
5911                     offset /= 8;
5912                     if (reginfo->info_aux->poscache[offset] & mask) {
5913                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5914                             "%*s  whilem: (cache) already tried at this position...\n",
5915                             REPORT_CODE_OFF+depth*2, "")
5916                         );
5917                         sayNO; /* cache records failure */
5918                     }
5919                     ST.cache_offset = offset;
5920                     ST.cache_mask   = mask;
5921                 }
5922             }
5923
5924             /* Prefer B over A for minimal matching. */
5925
5926             if (cur_curlyx->u.curlyx.minmod) {
5927                 ST.save_curlyx = cur_curlyx;
5928                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5929                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5930                             maxopenparen);
5931                 REGCP_SET(ST.lastcp);
5932                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5933                                     locinput);
5934                 assert(0); /* NOTREACHED */
5935             }
5936
5937             /* Prefer A over B for maximal matching. */
5938
5939             if (n < max) { /* More greed allowed? */
5940                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5941                             maxopenparen);
5942                 cur_curlyx->u.curlyx.lastloc = locinput;
5943                 REGCP_SET(ST.lastcp);
5944                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5945                 assert(0); /* NOTREACHED */
5946             }
5947             goto do_whilem_B_max;
5948         }
5949         assert(0); /* NOTREACHED */
5950
5951         case WHILEM_B_min: /* just matched B in a minimal match */
5952         case WHILEM_B_max: /* just matched B in a maximal match */
5953             cur_curlyx = ST.save_curlyx;
5954             sayYES;
5955             assert(0); /* NOTREACHED */
5956
5957         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5958             cur_curlyx = ST.save_curlyx;
5959             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5960             cur_curlyx->u.curlyx.count--;
5961             CACHEsayNO;
5962             assert(0); /* NOTREACHED */
5963
5964         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5965             /* FALLTHROUGH */
5966         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5967             REGCP_UNWIND(ST.lastcp);
5968             regcppop(rex, &maxopenparen);
5969             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5970             cur_curlyx->u.curlyx.count--;
5971             CACHEsayNO;
5972             assert(0); /* NOTREACHED */
5973
5974         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5975             REGCP_UNWIND(ST.lastcp);
5976             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5977             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5978                 "%*s  whilem: failed, trying continuation...\n",
5979                 REPORT_CODE_OFF+depth*2, "")
5980             );
5981           do_whilem_B_max:
5982             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5983                 && ckWARN(WARN_REGEXP)
5984                 && !reginfo->warned)
5985             {
5986                 reginfo->warned = TRUE;
5987                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5988                      "Complex regular subexpression recursion limit (%d) "
5989                      "exceeded",
5990                      REG_INFTY - 1);
5991             }
5992
5993             /* now try B */
5994             ST.save_curlyx = cur_curlyx;
5995             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5996             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5997                                 locinput);
5998             assert(0); /* NOTREACHED */
5999
6000         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
6001             cur_curlyx = ST.save_curlyx;
6002             REGCP_UNWIND(ST.lastcp);
6003             regcppop(rex, &maxopenparen);
6004
6005             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
6006                 /* Maximum greed exceeded */
6007                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
6008                     && ckWARN(WARN_REGEXP)
6009                     && !reginfo->warned)
6010                 {
6011                     reginfo->warned     = TRUE;
6012                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6013                         "Complex regular subexpression recursion "
6014                         "limit (%d) exceeded",
6015                         REG_INFTY - 1);
6016                 }
6017                 cur_curlyx->u.curlyx.count--;
6018                 CACHEsayNO;
6019             }
6020
6021             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6022                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6023             );
6024             /* Try grabbing another A and see if it helps. */
6025             cur_curlyx->u.curlyx.lastloc = locinput;
6026             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6027                             maxopenparen);
6028             REGCP_SET(ST.lastcp);
6029             PUSH_STATE_GOTO(WHILEM_A_min,
6030                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6031                 locinput);
6032             assert(0); /* NOTREACHED */
6033
6034 #undef  ST
6035 #define ST st->u.branch
6036
6037         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
6038             next = scan + ARG(scan);
6039             if (next == scan)
6040                 next = NULL;
6041             scan = NEXTOPER(scan);
6042             /* FALLTHROUGH */
6043
6044         case BRANCH:        /*  /(...|A|...)/ */
6045             scan = NEXTOPER(scan); /* scan now points to inner node */
6046             ST.lastparen = rex->lastparen;
6047             ST.lastcloseparen = rex->lastcloseparen;
6048             ST.next_branch = next;
6049             REGCP_SET(ST.cp);
6050
6051             /* Now go into the branch */
6052             if (has_cutgroup) {
6053                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
6054             } else {
6055                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
6056             }
6057             assert(0); /* NOTREACHED */
6058
6059         case CUTGROUP:  /*  /(*THEN)/  */
6060             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
6061                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6062             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
6063             assert(0); /* NOTREACHED */
6064
6065         case CUTGROUP_next_fail:
6066             do_cutgroup = 1;
6067             no_final = 1;
6068             if (st->u.mark.mark_name)
6069                 sv_commit = st->u.mark.mark_name;
6070             sayNO;
6071             assert(0); /* NOTREACHED */
6072
6073         case BRANCH_next:
6074             sayYES;
6075             assert(0); /* NOTREACHED */
6076
6077         case BRANCH_next_fail: /* that branch failed; try the next, if any */
6078             if (do_cutgroup) {
6079                 do_cutgroup = 0;
6080                 no_final = 0;
6081             }
6082             REGCP_UNWIND(ST.cp);
6083             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6084             scan = ST.next_branch;
6085             /* no more branches? */
6086             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
6087                 DEBUG_EXECUTE_r({
6088                     PerlIO_printf( Perl_debug_log,
6089                         "%*s  %sBRANCH failed...%s\n",
6090                         REPORT_CODE_OFF+depth*2, "",
6091                         PL_colors[4],
6092                         PL_colors[5] );
6093                 });
6094                 sayNO_SILENT;
6095             }
6096             continue; /* execute next BRANCH[J] op */
6097             assert(0); /* NOTREACHED */
6098
6099         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
6100             minmod = 1;
6101             break;
6102
6103 #undef  ST
6104 #define ST st->u.curlym
6105
6106         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
6107
6108             /* This is an optimisation of CURLYX that enables us to push
6109              * only a single backtracking state, no matter how many matches
6110              * there are in {m,n}. It relies on the pattern being constant
6111              * length, with no parens to influence future backrefs
6112              */
6113
6114             ST.me = scan;
6115             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6116
6117             ST.lastparen      = rex->lastparen;
6118             ST.lastcloseparen = rex->lastcloseparen;
6119
6120             /* if paren positive, emulate an OPEN/CLOSE around A */
6121             if (ST.me->flags) {
6122                 U32 paren = ST.me->flags;
6123                 if (paren > maxopenparen)
6124                     maxopenparen = paren;
6125                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6126             }
6127             ST.A = scan;
6128             ST.B = next;
6129             ST.alen = 0;
6130             ST.count = 0;
6131             ST.minmod = minmod;
6132             minmod = 0;
6133             ST.c1 = CHRTEST_UNINIT;
6134             REGCP_SET(ST.cp);
6135
6136             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
6137                 goto curlym_do_B;
6138
6139           curlym_do_A: /* execute the A in /A{m,n}B/  */
6140             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
6141             assert(0); /* NOTREACHED */
6142
6143         case CURLYM_A: /* we've just matched an A */
6144             ST.count++;
6145             /* after first match, determine A's length: u.curlym.alen */
6146             if (ST.count == 1) {
6147                 if (reginfo->is_utf8_target) {
6148                     char *s = st->locinput;
6149                     while (s < locinput) {
6150                         ST.alen++;
6151                         s += UTF8SKIP(s);
6152                     }
6153                 }
6154                 else {
6155                     ST.alen = locinput - st->locinput;
6156                 }
6157                 if (ST.alen == 0)
6158                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
6159             }
6160             DEBUG_EXECUTE_r(
6161                 PerlIO_printf(Perl_debug_log,
6162                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
6163                           (int)(REPORT_CODE_OFF+(depth*2)), "",
6164                           (IV) ST.count, (IV)ST.alen)
6165             );
6166
6167             if (cur_eval && cur_eval->u.eval.close_paren &&
6168                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6169                 goto fake_end;
6170
6171             {
6172                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
6173                 if ( max == REG_INFTY || ST.count < max )
6174                     goto curlym_do_A; /* try to match another A */
6175             }
6176             goto curlym_do_B; /* try to match B */
6177
6178         case CURLYM_A_fail: /* just failed to match an A */
6179             REGCP_UNWIND(ST.cp);
6180
6181             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
6182                 || (cur_eval && cur_eval->u.eval.close_paren &&
6183                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
6184                 sayNO;
6185
6186           curlym_do_B: /* execute the B in /A{m,n}B/  */
6187             if (ST.c1 == CHRTEST_UNINIT) {
6188                 /* calculate c1 and c2 for possible match of 1st char
6189                  * following curly */
6190                 ST.c1 = ST.c2 = CHRTEST_VOID;
6191                 assert(ST.B);
6192                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
6193                     regnode *text_node = ST.B;
6194                     if (! HAS_TEXT(text_node))
6195                         FIND_NEXT_IMPT(text_node);
6196                     /* this used to be
6197
6198                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
6199
6200                         But the former is redundant in light of the latter.
6201
6202                         if this changes back then the macro for
6203                         IS_TEXT and friends need to change.
6204                      */
6205                     if (PL_regkind[OP(text_node)] == EXACT) {
6206                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6207                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6208                            reginfo))
6209                         {
6210                             sayNO;
6211                         }
6212                     }
6213                 }
6214             }
6215
6216             DEBUG_EXECUTE_r(
6217                 PerlIO_printf(Perl_debug_log,
6218                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
6219                     (int)(REPORT_CODE_OFF+(depth*2)),
6220                     "", (IV)ST.count)
6221                 );
6222             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
6223                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
6224                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6225                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6226                     {
6227                         /* simulate B failing */
6228                         DEBUG_OPTIMISE_r(
6229                             PerlIO_printf(Perl_debug_log,
6230                                 "%*s  CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
6231                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
6232                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
6233                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
6234                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
6235                         );
6236                         state_num = CURLYM_B_fail;
6237                         goto reenter_switch;
6238                     }
6239                 }
6240                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
6241                     /* simulate B failing */
6242                     DEBUG_OPTIMISE_r(
6243                         PerlIO_printf(Perl_debug_log,
6244                             "%*s  CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
6245                             (int)(REPORT_CODE_OFF+(depth*2)),"",
6246                             (int) nextchr, ST.c1, ST.c2)
6247                     );
6248                     state_num = CURLYM_B_fail;
6249                     goto reenter_switch;
6250                 }
6251             }
6252
6253             if (ST.me->flags) {
6254                 /* emulate CLOSE: mark current A as captured */
6255                 I32 paren = ST.me->flags;
6256                 if (ST.count) {
6257                     rex->offs[paren].start
6258                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
6259                     rex->offs[paren].end = locinput - reginfo->strbeg;
6260                     if ((U32)paren > rex->lastparen)
6261                         rex->lastparen = paren;
6262                     rex->lastcloseparen = paren;
6263                 }
6264                 else
6265                     rex->offs[paren].end = -1;
6266                 if (cur_eval && cur_eval->u.eval.close_paren &&
6267                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6268                 {
6269                     if (ST.count)
6270                         goto fake_end;
6271                     else
6272                         sayNO;
6273                 }
6274             }
6275
6276             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
6277             assert(0); /* NOTREACHED */
6278
6279         case CURLYM_B_fail: /* just failed to match a B */
6280             REGCP_UNWIND(ST.cp);
6281             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6282             if (ST.minmod) {
6283                 I32 max = ARG2(ST.me);
6284                 if (max != REG_INFTY && ST.count == max)
6285                     sayNO;
6286                 goto curlym_do_A; /* try to match a further A */
6287             }
6288             /* backtrack one A */
6289             if (ST.count == ARG1(ST.me) /* min */)
6290                 sayNO;
6291             ST.count--;
6292             SET_locinput(HOPc(locinput, -ST.alen));
6293             goto curlym_do_B; /* try to match B */
6294
6295 #undef ST
6296 #define ST st->u.curly
6297
6298 #define CURLY_SETPAREN(paren, success) \
6299     if (paren) { \
6300         if (success) { \
6301             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6302             rex->offs[paren].end = locinput - reginfo->strbeg; \
6303             if (paren > rex->lastparen) \
6304                 rex->lastparen = paren; \
6305             rex->lastcloseparen = paren; \
6306         } \
6307         else { \
6308             rex->offs[paren].end = -1; \
6309             rex->lastparen      = ST.lastparen; \
6310             rex->lastcloseparen = ST.lastcloseparen; \
6311         } \
6312     }
6313
6314         case STAR:              /*  /A*B/ where A is width 1 char */
6315             ST.paren = 0;
6316             ST.min = 0;
6317             ST.max = REG_INFTY;
6318             scan = NEXTOPER(scan);
6319             goto repeat;
6320
6321         case PLUS:              /*  /A+B/ where A is width 1 char */
6322             ST.paren = 0;
6323             ST.min = 1;
6324             ST.max = REG_INFTY;
6325             scan = NEXTOPER(scan);
6326             goto repeat;
6327
6328         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
6329             ST.paren = scan->flags;     /* Which paren to set */
6330             ST.lastparen      = rex->lastparen;
6331             ST.lastcloseparen = rex->lastcloseparen;
6332             if (ST.paren > maxopenparen)
6333                 maxopenparen = ST.paren;
6334             ST.min = ARG1(scan);  /* min to match */
6335             ST.max = ARG2(scan);  /* max to match */
6336             if (cur_eval && cur_eval->u.eval.close_paren &&
6337                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6338                 ST.min=1;
6339                 ST.max=1;
6340             }
6341             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6342             goto repeat;
6343
6344         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
6345             ST.paren = 0;
6346             ST.min = ARG1(scan);  /* min to match */
6347             ST.max = ARG2(scan);  /* max to match */
6348             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6349           repeat:
6350             /*
6351             * Lookahead to avoid useless match attempts
6352             * when we know what character comes next.
6353             *
6354             * Used to only do .*x and .*?x, but now it allows
6355             * for )'s, ('s and (?{ ... })'s to be in the way
6356             * of the quantifier and the EXACT-like node.  -- japhy
6357             */
6358
6359             assert(ST.min <= ST.max);
6360             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6361                 ST.c1 = ST.c2 = CHRTEST_VOID;
6362             }
6363             else {
6364                 regnode *text_node = next;
6365
6366                 if (! HAS_TEXT(text_node))
6367                     FIND_NEXT_IMPT(text_node);
6368
6369                 if (! HAS_TEXT(text_node))
6370                     ST.c1 = ST.c2 = CHRTEST_VOID;
6371                 else {
6372                     if ( PL_regkind[OP(text_node)] != EXACT ) {
6373                         ST.c1 = ST.c2 = CHRTEST_VOID;
6374                     }
6375                     else {
6376
6377                     /*  Currently we only get here when
6378
6379                         PL_rekind[OP(text_node)] == EXACT
6380
6381                         if this changes back then the macro for IS_TEXT and
6382                         friends need to change. */
6383                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6384                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6385                            reginfo))
6386                         {
6387                             sayNO;
6388                         }
6389                     }
6390                 }
6391             }
6392
6393             ST.A = scan;
6394             ST.B = next;
6395             if (minmod) {
6396                 char *li = locinput;
6397                 minmod = 0;
6398                 if (ST.min &&
6399                         regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
6400                             < ST.min)
6401                     sayNO;
6402                 SET_locinput(li);
6403                 ST.count = ST.min;
6404                 REGCP_SET(ST.cp);
6405                 if (ST.c1 == CHRTEST_VOID)
6406                     goto curly_try_B_min;
6407
6408                 ST.oldloc = locinput;
6409
6410                 /* set ST.maxpos to the furthest point along the
6411                  * string that could possibly match */
6412                 if  (ST.max == REG_INFTY) {
6413                     ST.maxpos = reginfo->strend - 1;
6414                     if (utf8_target)
6415                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6416                             ST.maxpos--;
6417                 }
6418                 else if (utf8_target) {
6419                     int m = ST.max - ST.min;
6420                     for (ST.maxpos = locinput;
6421                          m >0 && ST.maxpos < reginfo->strend; m--)
6422                         ST.maxpos += UTF8SKIP(ST.maxpos);
6423                 }
6424                 else {
6425                     ST.maxpos = locinput + ST.max - ST.min;
6426                     if (ST.maxpos >= reginfo->strend)
6427                         ST.maxpos = reginfo->strend - 1;
6428                 }
6429                 goto curly_try_B_min_known;
6430
6431             }
6432             else {
6433                 /* avoid taking address of locinput, so it can remain
6434                  * a register var */
6435                 char *li = locinput;
6436                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6437                 if (ST.count < ST.min)
6438                     sayNO;
6439                 SET_locinput(li);
6440                 if ((ST.count > ST.min)
6441                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6442                 {
6443                     /* A{m,n} must come at the end of the string, there's
6444                      * no point in backing off ... */
6445                     ST.min = ST.count;
6446                     /* ...except that $ and \Z can match before *and* after
6447                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6448                        We may back off by one in this case. */
6449                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6450                         ST.min--;
6451                 }
6452                 REGCP_SET(ST.cp);
6453                 goto curly_try_B_max;
6454             }
6455             assert(0); /* NOTREACHED */
6456
6457
6458         case CURLY_B_min_known_fail:
6459             /* failed to find B in a non-greedy match where c1,c2 valid */
6460
6461             REGCP_UNWIND(ST.cp);
6462             if (ST.paren) {
6463                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6464             }
6465             /* Couldn't or didn't -- move forward. */
6466             ST.oldloc = locinput;
6467             if (utf8_target)
6468                 locinput += UTF8SKIP(locinput);
6469             else
6470                 locinput++;
6471             ST.count++;
6472           curly_try_B_min_known:
6473              /* find the next place where 'B' could work, then call B */
6474             {
6475                 int n;
6476                 if (utf8_target) {
6477                     n = (ST.oldloc == locinput) ? 0 : 1;
6478                     if (ST.c1 == ST.c2) {
6479                         /* set n to utf8_distance(oldloc, locinput) */
6480                         while (locinput <= ST.maxpos
6481                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6482                         {
6483                             locinput += UTF8SKIP(locinput);
6484                             n++;
6485                         }
6486                     }
6487                     else {
6488                         /* set n to utf8_distance(oldloc, locinput) */
6489                         while (locinput <= ST.maxpos
6490                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6491                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6492                         {
6493                             locinput += UTF8SKIP(locinput);
6494                             n++;
6495                         }
6496                     }
6497                 }
6498                 else {  /* Not utf8_target */
6499                     if (ST.c1 == ST.c2) {
6500                         while (locinput <= ST.maxpos &&
6501                                UCHARAT(locinput) != ST.c1)
6502                             locinput++;
6503                     }
6504                     else {
6505                         while (locinput <= ST.maxpos
6506                                && UCHARAT(locinput) != ST.c1
6507                                && UCHARAT(locinput) != ST.c2)
6508                             locinput++;
6509                     }
6510                     n = locinput - ST.oldloc;
6511                 }
6512                 if (locinput > ST.maxpos)
6513                     sayNO;
6514                 if (n) {
6515                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6516                      * at b; check that everything between oldloc and
6517                      * locinput matches */
6518                     char *li = ST.oldloc;
6519                     ST.count += n;
6520                     if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6521                         sayNO;
6522                     assert(n == REG_INFTY || locinput == li);
6523                 }
6524                 CURLY_SETPAREN(ST.paren, ST.count);
6525                 if (cur_eval && cur_eval->u.eval.close_paren &&
6526                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6527                     goto fake_end;
6528                 }
6529                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6530             }
6531             assert(0); /* NOTREACHED */
6532
6533
6534         case CURLY_B_min_fail:
6535             /* failed to find B in a non-greedy match where c1,c2 invalid */
6536
6537             REGCP_UNWIND(ST.cp);
6538             if (ST.paren) {
6539                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6540             }
6541             /* failed -- move forward one */
6542             {
6543                 char *li = locinput;
6544                 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6545                     sayNO;
6546                 }
6547                 locinput = li;
6548             }
6549             {
6550                 ST.count++;
6551                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6552                         ST.count > 0)) /* count overflow ? */
6553                 {
6554                   curly_try_B_min:
6555                     CURLY_SETPAREN(ST.paren, ST.count);
6556                     if (cur_eval && cur_eval->u.eval.close_paren &&
6557                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6558                         goto fake_end;
6559                     }
6560                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6561                 }
6562             }
6563             sayNO;
6564             assert(0); /* NOTREACHED */
6565
6566
6567         curly_try_B_max:
6568             /* a successful greedy match: now try to match B */
6569             if (cur_eval && cur_eval->u.eval.close_paren &&
6570                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6571                 goto fake_end;
6572             }
6573             {
6574                 bool could_match = locinput < reginfo->strend;
6575
6576                 /* If it could work, try it. */
6577                 if (ST.c1 != CHRTEST_VOID && could_match) {
6578                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6579                     {
6580                         could_match = memEQ(locinput,
6581                                             ST.c1_utf8,
6582                                             UTF8SKIP(locinput))
6583                                     || memEQ(locinput,
6584                                              ST.c2_utf8,
6585                                              UTF8SKIP(locinput));
6586                     }
6587                     else {
6588                         could_match = UCHARAT(locinput) == ST.c1
6589                                       || UCHARAT(locinput) == ST.c2;
6590                     }
6591                 }
6592                 if (ST.c1 == CHRTEST_VOID || could_match) {
6593                     CURLY_SETPAREN(ST.paren, ST.count);
6594                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6595                     assert(0); /* NOTREACHED */
6596                 }
6597             }
6598             /* FALLTHROUGH */
6599
6600         case CURLY_B_max_fail:
6601             /* failed to find B in a greedy match */
6602
6603             REGCP_UNWIND(ST.cp);
6604             if (ST.paren) {
6605                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6606             }
6607             /*  back up. */
6608             if (--ST.count < ST.min)
6609                 sayNO;
6610             locinput = HOPc(locinput, -1);
6611             goto curly_try_B_max;
6612
6613 #undef ST
6614
6615         case END: /*  last op of main pattern  */
6616             fake_end:
6617             if (cur_eval) {
6618                 /* we've just finished A in /(??{A})B/; now continue with B */
6619
6620                 st->u.eval.prev_rex = rex_sv;           /* inner */
6621
6622                 /* Save *all* the positions. */
6623                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6624                 rex_sv = cur_eval->u.eval.prev_rex;
6625                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6626                 SET_reg_curpm(rex_sv);
6627                 rex = ReANY(rex_sv);
6628                 rexi = RXi_GET(rex);
6629                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6630
6631                 REGCP_SET(st->u.eval.lastcp);
6632
6633                 /* Restore parens of the outer rex without popping the
6634                  * savestack */
6635                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6636                                         &maxopenparen);
6637
6638                 st->u.eval.prev_eval = cur_eval;
6639                 cur_eval = cur_eval->u.eval.prev_eval;
6640                 DEBUG_EXECUTE_r(
6641                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6642                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6643                 if ( nochange_depth )
6644                     nochange_depth--;
6645
6646                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6647                                     locinput); /* match B */
6648             }
6649
6650             if (locinput < reginfo->till) {
6651                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6652                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6653                                       PL_colors[4],
6654                                       (long)(locinput - startpos),
6655                                       (long)(reginfo->till - startpos),
6656                                       PL_colors[5]));
6657
6658                 sayNO_SILENT;           /* Cannot match: too short. */
6659             }
6660             sayYES;                     /* Success! */
6661
6662         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6663             DEBUG_EXECUTE_r(
6664             PerlIO_printf(Perl_debug_log,
6665                 "%*s  %ssubpattern success...%s\n",
6666                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6667             sayYES;                     /* Success! */
6668
6669 #undef  ST
6670 #define ST st->u.ifmatch
6671
6672         {
6673             char *newstart;
6674
6675         case SUSPEND:   /* (?>A) */
6676             ST.wanted = 1;
6677             newstart = locinput;
6678             goto do_ifmatch;
6679
6680         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6681             ST.wanted = 0;
6682             goto ifmatch_trivial_fail_test;
6683
6684         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6685             ST.wanted = 1;
6686           ifmatch_trivial_fail_test:
6687             if (scan->flags) {
6688                 char * const s = HOPBACKc(locinput, scan->flags);
6689                 if (!s) {
6690                     /* trivial fail */
6691                     if (logical) {
6692                         logical = 0;
6693                         sw = 1 - cBOOL(ST.wanted);
6694                     }
6695                     else if (ST.wanted)
6696                         sayNO;
6697                     next = scan + ARG(scan);
6698                     if (next == scan)
6699                         next = NULL;
6700                     break;
6701                 }
6702                 newstart = s;
6703             }
6704             else
6705                 newstart = locinput;
6706
6707           do_ifmatch:
6708             ST.me = scan;
6709             ST.logical = logical;
6710             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6711
6712             /* execute body of (?...A) */
6713             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6714             assert(0); /* NOTREACHED */
6715         }
6716
6717         case IFMATCH_A_fail: /* body of (?...A) failed */
6718             ST.wanted = !ST.wanted;
6719             /* FALLTHROUGH */
6720
6721         case IFMATCH_A: /* body of (?...A) succeeded */
6722             if (ST.logical) {
6723                 sw = cBOOL(ST.wanted);
6724             }
6725             else if (!ST.wanted)
6726                 sayNO;
6727
6728             if (OP(ST.me) != SUSPEND) {
6729                 /* restore old position except for (?>...) */
6730                 locinput = st->locinput;
6731             }
6732             scan = ST.me + ARG(ST.me);
6733             if (scan == ST.me)
6734                 scan = NULL;
6735             continue; /* execute B */
6736
6737 #undef ST
6738
6739         case LONGJMP: /*  alternative with many branches compiles to
6740                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6741             next = scan + ARG(scan);
6742             if (next == scan)
6743                 next = NULL;
6744             break;
6745
6746         case COMMIT:  /*  (*COMMIT)  */
6747             reginfo->cutpoint = reginfo->strend;
6748             /* FALLTHROUGH */
6749
6750         case PRUNE:   /*  (*PRUNE)   */
6751             if (!scan->flags)
6752                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6753             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6754             assert(0); /* NOTREACHED */
6755
6756         case COMMIT_next_fail:
6757             no_final = 1;
6758             /* FALLTHROUGH */
6759
6760         case OPFAIL:   /* (*FAIL)  */
6761             sayNO;
6762             assert(0); /* NOTREACHED */
6763
6764 #define ST st->u.mark
6765         case MARKPOINT: /*  (*MARK:foo)  */
6766             ST.prev_mark = mark_state;
6767             ST.mark_name = sv_commit = sv_yes_mark
6768                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6769             mark_state = st;
6770             ST.mark_loc = locinput;
6771             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6772             assert(0); /* NOTREACHED */
6773
6774         case MARKPOINT_next:
6775             mark_state = ST.prev_mark;
6776             sayYES;
6777             assert(0); /* NOTREACHED */
6778
6779         case MARKPOINT_next_fail:
6780             if (popmark && sv_eq(ST.mark_name,popmark))
6781             {
6782                 if (ST.mark_loc > startpoint)
6783                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6784                 popmark = NULL; /* we found our mark */
6785                 sv_commit = ST.mark_name;
6786
6787                 DEBUG_EXECUTE_r({
6788                         PerlIO_printf(Perl_debug_log,
6789                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6790                             REPORT_CODE_OFF+depth*2, "",
6791                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6792                 });
6793             }
6794             mark_state = ST.prev_mark;
6795             sv_yes_mark = mark_state ?
6796                 mark_state->u.mark.mark_name : NULL;
6797             sayNO;
6798             assert(0); /* NOTREACHED */
6799
6800         case SKIP:  /*  (*SKIP)  */
6801             if (scan->flags) {
6802                 /* (*SKIP) : if we fail we cut here*/
6803                 ST.mark_name = NULL;
6804                 ST.mark_loc = locinput;
6805                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6806             } else {
6807                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6808                    otherwise do nothing.  Meaning we need to scan
6809                  */
6810                 regmatch_state *cur = mark_state;
6811                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6812
6813                 while (cur) {
6814                     if ( sv_eq( cur->u.mark.mark_name,
6815                                 find ) )
6816                     {
6817                         ST.mark_name = find;
6818                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6819                     }
6820                     cur = cur->u.mark.prev_mark;
6821                 }
6822             }
6823             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6824             break;
6825
6826         case SKIP_next_fail:
6827             if (ST.mark_name) {
6828                 /* (*CUT:NAME) - Set up to search for the name as we
6829                    collapse the stack*/
6830                 popmark = ST.mark_name;
6831             } else {
6832                 /* (*CUT) - No name, we cut here.*/
6833                 if (ST.mark_loc > startpoint)
6834                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6835                 /* but we set sv_commit to latest mark_name if there
6836                    is one so they can test to see how things lead to this
6837                    cut */
6838                 if (mark_state)
6839                     sv_commit=mark_state->u.mark.mark_name;
6840             }
6841             no_final = 1;
6842             sayNO;
6843             assert(0); /* NOTREACHED */
6844 #undef ST
6845
6846         case LNBREAK: /* \R */
6847             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6848                 locinput += n;
6849             } else
6850                 sayNO;
6851             break;
6852
6853         default:
6854             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6855                           PTR2UV(scan), OP(scan));
6856             Perl_croak(aTHX_ "regexp memory corruption");
6857
6858         /* this is a point to jump to in order to increment
6859          * locinput by one character */
6860         increment_locinput:
6861             assert(!NEXTCHR_IS_EOS);
6862             if (utf8_target) {
6863                 locinput += PL_utf8skip[nextchr];
6864                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6865                 if (locinput > reginfo->strend)
6866                     sayNO;
6867             }
6868             else
6869                 locinput++;
6870             break;
6871
6872         } /* end switch */
6873
6874         /* switch break jumps here */
6875         scan = next; /* prepare to execute the next op and ... */
6876         continue;    /* ... jump back to the top, reusing st */
6877         assert(0); /* NOTREACHED */
6878
6879       push_yes_state:
6880         /* push a state that backtracks on success */
6881         st->u.yes.prev_yes_state = yes_state;
6882         yes_state = st;
6883         /* FALLTHROUGH */
6884       push_state:
6885         /* push a new regex state, then continue at scan  */
6886         {
6887             regmatch_state *newst;
6888
6889             DEBUG_STACK_r({
6890                 regmatch_state *cur = st;
6891                 regmatch_state *curyes = yes_state;
6892                 int curd = depth;
6893                 regmatch_slab *slab = PL_regmatch_slab;
6894                 for (;curd > -1;cur--,curd--) {
6895                     if (cur < SLAB_FIRST(slab)) {
6896                         slab = slab->prev;
6897                         cur = SLAB_LAST(slab);
6898                     }
6899                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6900                         REPORT_CODE_OFF + 2 + depth * 2,"",
6901                         curd, PL_reg_name[cur->resume_state],
6902                         (curyes == cur) ? "yes" : ""
6903                     );
6904                     if (curyes == cur)
6905                         curyes = cur->u.yes.prev_yes_state;
6906                 }
6907             } else
6908                 DEBUG_STATE_pp("push")
6909             );
6910             depth++;
6911             st->locinput = locinput;
6912             newst = st+1;
6913             if (newst >  SLAB_LAST(PL_regmatch_slab))
6914                 newst = S_push_slab(aTHX);
6915             PL_regmatch_state = newst;
6916
6917             locinput = pushinput;
6918             st = newst;
6919             continue;
6920             assert(0); /* NOTREACHED */
6921         }
6922     }
6923
6924     /*
6925     * We get here only if there's trouble -- normally "case END" is
6926     * the terminating point.
6927     */
6928     Perl_croak(aTHX_ "corrupted regexp pointers");
6929     /*NOTREACHED*/
6930     sayNO;
6931
6932 yes:
6933     if (yes_state) {
6934         /* we have successfully completed a subexpression, but we must now
6935          * pop to the state marked by yes_state and continue from there */
6936         assert(st != yes_state);
6937 #ifdef DEBUGGING
6938         while (st != yes_state) {
6939             st--;
6940             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6941                 PL_regmatch_slab = PL_regmatch_slab->prev;
6942                 st = SLAB_LAST(PL_regmatch_slab);
6943             }
6944             DEBUG_STATE_r({
6945                 if (no_final) {
6946                     DEBUG_STATE_pp("pop (no final)");
6947                 } else {
6948                     DEBUG_STATE_pp("pop (yes)");
6949                 }
6950             });
6951             depth--;
6952         }
6953 #else
6954         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6955             || yes_state > SLAB_LAST(PL_regmatch_slab))
6956         {
6957             /* not in this slab, pop slab */
6958             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6959             PL_regmatch_slab = PL_regmatch_slab->prev;
6960             st = SLAB_LAST(PL_regmatch_slab);
6961         }
6962         depth -= (st - yes_state);
6963 #endif
6964         st = yes_state;
6965         yes_state = st->u.yes.prev_yes_state;
6966         PL_regmatch_state = st;
6967
6968         if (no_final)
6969             locinput= st->locinput;
6970         state_num = st->resume_state + no_final;
6971         goto reenter_switch;
6972     }
6973
6974     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6975                           PL_colors[4], PL_colors[5]));
6976
6977     if (reginfo->info_aux_eval) {
6978         /* each successfully executed (?{...}) block does the equivalent of
6979          *   local $^R = do {...}
6980          * When popping the save stack, all these locals would be undone;
6981          * bypass this by setting the outermost saved $^R to the latest
6982          * value */
6983         /* I dont know if this is needed or works properly now.
6984          * see code related to PL_replgv elsewhere in this file.
6985          * Yves
6986          */
6987         if (oreplsv != GvSV(PL_replgv))
6988             sv_setsv(oreplsv, GvSV(PL_replgv));
6989     }
6990     result = 1;
6991     goto final_exit;
6992
6993 no:
6994     DEBUG_EXECUTE_r(
6995         PerlIO_printf(Perl_debug_log,
6996             "%*s  %sfailed...%s\n",
6997             REPORT_CODE_OFF+depth*2, "",
6998             PL_colors[4], PL_colors[5])
6999         );
7000
7001 no_silent:
7002     if (no_final) {
7003         if (yes_state) {
7004             goto yes;
7005         } else {
7006             goto final_exit;
7007         }
7008     }
7009     if (depth) {
7010         /* there's a previous state to backtrack to */
7011         st--;
7012         if (st < SLAB_FIRST(PL_regmatch_slab)) {
7013             PL_regmatch_slab = PL_regmatch_slab->prev;
7014             st = SLAB_LAST(PL_regmatch_slab);
7015         }
7016         PL_regmatch_state = st;
7017         locinput= st->locinput;
7018
7019         DEBUG_STATE_pp("pop");
7020         depth--;
7021         if (yes_state == st)
7022             yes_state = st->u.yes.prev_yes_state;
7023
7024         state_num = st->resume_state + 1; /* failure = success + 1 */
7025         goto reenter_switch;
7026     }
7027     result = 0;
7028
7029   final_exit:
7030     if (rex->intflags & PREGf_VERBARG_SEEN) {
7031         SV *sv_err = get_sv("REGERROR", 1);
7032         SV *sv_mrk = get_sv("REGMARK", 1);
7033         if (result) {
7034             sv_commit = &PL_sv_no;
7035             if (!sv_yes_mark)
7036                 sv_yes_mark = &PL_sv_yes;
7037         } else {
7038             if (!sv_commit)
7039                 sv_commit = &PL_sv_yes;
7040             sv_yes_mark = &PL_sv_no;
7041         }
7042         assert(sv_err);
7043         assert(sv_mrk);
7044         sv_setsv(sv_err, sv_commit);
7045         sv_setsv(sv_mrk, sv_yes_mark);
7046     }
7047
7048
7049     if (last_pushed_cv) {
7050         dSP;
7051         POP_MULTICALL;
7052         PERL_UNUSED_VAR(SP);
7053     }
7054
7055     assert(!result ||  locinput - reginfo->strbeg >= 0);
7056     return result ?  locinput - reginfo->strbeg : -1;
7057 }
7058
7059 /*
7060  - regrepeat - repeatedly match something simple, report how many
7061  *
7062  * What 'simple' means is a node which can be the operand of a quantifier like
7063  * '+', or {1,3}
7064  *
7065  * startposp - pointer a pointer to the start position.  This is updated
7066  *             to point to the byte following the highest successful
7067  *             match.
7068  * p         - the regnode to be repeatedly matched against.
7069  * reginfo   - struct holding match state, such as strend
7070  * max       - maximum number of things to match.
7071  * depth     - (for debugging) backtracking depth.
7072  */
7073 STATIC I32
7074 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
7075             regmatch_info *const reginfo, I32 max, int depth)
7076 {
7077     char *scan;     /* Pointer to current position in target string */
7078     I32 c;
7079     char *loceol = reginfo->strend;   /* local version */
7080     I32 hardcount = 0;  /* How many matches so far */
7081     bool utf8_target = reginfo->is_utf8_target;
7082     int to_complement = 0;  /* Invert the result? */
7083     UV utf8_flags;
7084     _char_class_number classnum;
7085 #ifndef DEBUGGING
7086     PERL_UNUSED_ARG(depth);
7087 #endif
7088
7089     PERL_ARGS_ASSERT_REGREPEAT;
7090
7091     scan = *startposp;
7092     if (max == REG_INFTY)
7093         max = I32_MAX;
7094     else if (! utf8_target && loceol - scan > max)
7095         loceol = scan + max;
7096
7097     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
7098      * to the maximum of how far we should go in it (leaving it set to the real
7099      * end, if the maximum permissible would take us beyond that).  This allows
7100      * us to make the loop exit condition that we haven't gone past <loceol> to
7101      * also mean that we haven't exceeded the max permissible count, saving a
7102      * test each time through the loop.  But it assumes that the OP matches a
7103      * single byte, which is true for most of the OPs below when applied to a
7104      * non-UTF-8 target.  Those relatively few OPs that don't have this
7105      * characteristic will have to compensate.
7106      *
7107      * There is no adjustment for UTF-8 targets, as the number of bytes per
7108      * character varies.  OPs will have to test both that the count is less
7109      * than the max permissible (using <hardcount> to keep track), and that we
7110      * are still within the bounds of the string (using <loceol>.  A few OPs
7111      * match a single byte no matter what the encoding.  They can omit the max
7112      * test if, for the UTF-8 case, they do the adjustment that was skipped
7113      * above.
7114      *
7115      * Thus, the code above sets things up for the common case; and exceptional
7116      * cases need extra work; the common case is to make sure <scan> doesn't
7117      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
7118      * count doesn't exceed the maximum permissible */
7119
7120     switch (OP(p)) {
7121     case REG_ANY:
7122         if (utf8_target) {
7123             while (scan < loceol && hardcount < max && *scan != '\n') {
7124                 scan += UTF8SKIP(scan);
7125                 hardcount++;
7126             }
7127         } else {
7128             while (scan < loceol && *scan != '\n')
7129                 scan++;
7130         }
7131         break;
7132     case SANY:
7133         if (utf8_target) {
7134             while (scan < loceol && hardcount < max) {
7135                 scan += UTF8SKIP(scan);
7136                 hardcount++;
7137             }
7138         }
7139         else
7140             scan = loceol;
7141         break;
7142     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
7143         if (utf8_target && loceol - scan > max) {
7144
7145             /* <loceol> hadn't been adjusted in the UTF-8 case */
7146             scan +=  max;
7147         }
7148         else {
7149             scan = loceol;
7150         }
7151         break;
7152     case EXACT:
7153         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7154
7155         c = (U8)*STRING(p);
7156
7157         /* Can use a simple loop if the pattern char to match on is invariant
7158          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
7159          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
7160          * true iff it doesn't matter if the argument is in UTF-8 or not */
7161         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
7162             if (utf8_target && loceol - scan > max) {
7163                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7164                  * since here, to match at all, 1 char == 1 byte */
7165                 loceol = scan + max;
7166             }
7167             while (scan < loceol && UCHARAT(scan) == c) {
7168                 scan++;
7169             }
7170         }
7171         else if (reginfo->is_utf8_pat) {
7172             if (utf8_target) {
7173                 STRLEN scan_char_len;
7174
7175                 /* When both target and pattern are UTF-8, we have to do
7176                  * string EQ */
7177                 while (hardcount < max
7178                        && scan < loceol
7179                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
7180                        && memEQ(scan, STRING(p), scan_char_len))
7181                 {
7182                     scan += scan_char_len;
7183                     hardcount++;
7184                 }
7185             }
7186             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
7187
7188                 /* Target isn't utf8; convert the character in the UTF-8
7189                  * pattern to non-UTF8, and do a simple loop */
7190                 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
7191                 while (scan < loceol && UCHARAT(scan) == c) {
7192                     scan++;
7193                 }
7194             } /* else pattern char is above Latin1, can't possibly match the
7195                  non-UTF-8 target */
7196         }
7197         else {
7198
7199             /* Here, the string must be utf8; pattern isn't, and <c> is
7200              * different in utf8 than not, so can't compare them directly.
7201              * Outside the loop, find the two utf8 bytes that represent c, and
7202              * then look for those in sequence in the utf8 string */
7203             U8 high = UTF8_TWO_BYTE_HI(c);
7204             U8 low = UTF8_TWO_BYTE_LO(c);
7205
7206             while (hardcount < max
7207                     && scan + 1 < loceol
7208                     && UCHARAT(scan) == high
7209                     && UCHARAT(scan + 1) == low)
7210             {
7211                 scan += 2;
7212                 hardcount++;
7213             }
7214         }
7215         break;
7216
7217     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
7218         assert(! reginfo->is_utf8_pat);
7219         /* FALLTHROUGH */
7220     case EXACTFA:
7221         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7222         goto do_exactf;
7223
7224     case EXACTFL:
7225         utf8_flags = FOLDEQ_LOCALE;
7226         goto do_exactf;
7227
7228     case EXACTF:   /* This node only generated for non-utf8 patterns */
7229         assert(! reginfo->is_utf8_pat);
7230         utf8_flags = 0;
7231         goto do_exactf;
7232
7233     case EXACTFU_SS:
7234     case EXACTFU:
7235         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
7236
7237     do_exactf: {
7238         int c1, c2;
7239         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
7240
7241         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7242
7243         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
7244                                         reginfo))
7245         {
7246             if (c1 == CHRTEST_VOID) {
7247                 /* Use full Unicode fold matching */
7248                 char *tmpeol = reginfo->strend;
7249                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
7250                 while (hardcount < max
7251                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
7252                                              STRING(p), NULL, pat_len,
7253                                              reginfo->is_utf8_pat, utf8_flags))
7254                 {
7255                     scan = tmpeol;
7256                     tmpeol = reginfo->strend;
7257                     hardcount++;
7258                 }
7259             }
7260             else if (utf8_target) {
7261                 if (c1 == c2) {
7262                     while (scan < loceol
7263                            && hardcount < max
7264                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
7265                     {
7266                         scan += UTF8SKIP(scan);
7267                         hardcount++;
7268                     }
7269                 }
7270                 else {
7271                     while (scan < loceol
7272                            && hardcount < max
7273                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
7274                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
7275                     {
7276                         scan += UTF8SKIP(scan);
7277                         hardcount++;
7278                     }
7279                 }
7280             }
7281             else if (c1 == c2) {
7282                 while (scan < loceol && UCHARAT(scan) == c1) {
7283                     scan++;
7284                 }
7285             }
7286             else {
7287                 while (scan < loceol &&
7288                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
7289                 {
7290                     scan++;
7291                 }
7292             }
7293         }
7294         break;
7295     }
7296     case ANYOF:
7297         if (utf8_target) {
7298             while (hardcount < max
7299                    && scan < loceol
7300                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
7301             {
7302                 scan += UTF8SKIP(scan);
7303                 hardcount++;
7304             }
7305         } else {
7306             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
7307                 scan++;
7308         }
7309         break;
7310
7311     /* The argument (FLAGS) to all the POSIX node types is the class number */
7312
7313     case NPOSIXL:
7314         to_complement = 1;
7315         /* FALLTHROUGH */
7316
7317     case POSIXL:
7318         if (! utf8_target) {
7319             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7320                                                                    *scan)))
7321             {
7322                 scan++;
7323             }
7324         } else {
7325             while (hardcount < max && scan < loceol
7326                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7327                                                                   (U8 *) scan)))
7328             {
7329                 scan += UTF8SKIP(scan);
7330                 hardcount++;
7331             }
7332         }
7333         break;
7334
7335     case POSIXD:
7336         if (utf8_target) {
7337             goto utf8_posix;
7338         }
7339         /* FALLTHROUGH */
7340
7341     case POSIXA:
7342         if (utf8_target && loceol - scan > max) {
7343
7344             /* We didn't adjust <loceol> at the beginning of this routine
7345              * because is UTF-8, but it is actually ok to do so, since here, to
7346              * match, 1 char == 1 byte. */
7347             loceol = scan + max;
7348         }
7349         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
7350             scan++;
7351         }
7352         break;
7353
7354     case NPOSIXD:
7355         if (utf8_target) {
7356             to_complement = 1;
7357             goto utf8_posix;
7358         }
7359         /* FALLTHROUGH */
7360
7361     case NPOSIXA:
7362         if (! utf8_target) {
7363             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
7364                 scan++;
7365             }
7366         }
7367         else {
7368
7369             /* The complement of something that matches only ASCII matches all
7370              * non-ASCII, plus everything in ASCII that isn't in the class. */
7371             while (hardcount < max && scan < loceol
7372                    && (! isASCII_utf8(scan)
7373                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
7374             {
7375                 scan += UTF8SKIP(scan);
7376                 hardcount++;
7377             }
7378         }
7379         break;
7380
7381     case NPOSIXU:
7382         to_complement = 1;
7383         /* FALLTHROUGH */
7384
7385     case POSIXU:
7386         if (! utf8_target) {
7387             while (scan < loceol && to_complement
7388                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
7389             {
7390                 scan++;
7391             }
7392         }
7393         else {
7394       utf8_posix:
7395             classnum = (_char_class_number) FLAGS(p);
7396             if (classnum < _FIRST_NON_SWASH_CC) {
7397
7398                 /* Here, a swash is needed for above-Latin1 code points.
7399                  * Process as many Latin1 code points using the built-in rules.
7400                  * Go to another loop to finish processing upon encountering
7401                  * the first Latin1 code point.  We could do that in this loop
7402                  * as well, but the other way saves having to test if the swash
7403                  * has been loaded every time through the loop: extra space to
7404                  * save a test. */
7405                 while (hardcount < max && scan < loceol) {
7406                     if (UTF8_IS_INVARIANT(*scan)) {
7407                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7408                                                                    classnum))))
7409                         {
7410                             break;
7411                         }
7412                         scan++;
7413                     }
7414                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7415                         if (! (to_complement
7416                               ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7417                                                                      *(scan + 1)),
7418                                                     classnum))))
7419                         {
7420                             break;
7421                         }
7422                         scan += 2;
7423                     }
7424                     else {
7425                         goto found_above_latin1;
7426                     }
7427
7428                     hardcount++;
7429                 }
7430             }
7431             else {
7432                 /* For these character classes, the knowledge of how to handle
7433                  * every code point is compiled in to Perl via a macro.  This
7434                  * code is written for making the loops as tight as possible.
7435                  * It could be refactored to save space instead */
7436                 switch (classnum) {
7437                     case _CC_ENUM_SPACE:    /* XXX would require separate code
7438                                                if we revert the change of \v
7439                                                matching this */
7440                         /* FALLTHROUGH */
7441                     case _CC_ENUM_PSXSPC:
7442                         while (hardcount < max
7443                                && scan < loceol
7444                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7445                         {
7446                             scan += UTF8SKIP(scan);
7447                             hardcount++;
7448                         }
7449                         break;
7450                     case _CC_ENUM_BLANK:
7451                         while (hardcount < max
7452                                && scan < loceol
7453                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7454                         {
7455                             scan += UTF8SKIP(scan);
7456                             hardcount++;
7457                         }
7458                         break;
7459                     case _CC_ENUM_XDIGIT:
7460                         while (hardcount < max
7461                                && scan < loceol
7462                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7463                         {
7464                             scan += UTF8SKIP(scan);
7465                             hardcount++;
7466                         }
7467                         break;
7468                     case _CC_ENUM_VERTSPACE:
7469                         while (hardcount < max
7470                                && scan < loceol
7471                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7472                         {
7473                             scan += UTF8SKIP(scan);
7474                             hardcount++;
7475                         }
7476                         break;
7477                     case _CC_ENUM_CNTRL:
7478                         while (hardcount < max
7479                                && scan < loceol
7480                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7481                         {
7482                             scan += UTF8SKIP(scan);
7483                             hardcount++;
7484                         }
7485                         break;
7486                     default:
7487                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7488                 }
7489             }
7490         }
7491         break;
7492
7493       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7494
7495         /* Load the swash if not already present */
7496         if (! PL_utf8_swash_ptrs[classnum]) {
7497             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7498             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7499                                         "utf8",
7500                                         "",
7501                                         &PL_sv_undef, 1, 0,
7502                                         PL_XPosix_ptrs[classnum], &flags);
7503         }
7504
7505         while (hardcount < max && scan < loceol
7506                && to_complement ^ cBOOL(_generic_utf8(
7507                                        classnum,
7508                                        scan,
7509                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
7510                                                    (U8 *) scan,
7511                                                    TRUE))))
7512         {
7513             scan += UTF8SKIP(scan);
7514             hardcount++;
7515         }
7516         break;
7517
7518     case LNBREAK:
7519         if (utf8_target) {
7520             while (hardcount < max && scan < loceol &&
7521                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7522                 scan += c;
7523                 hardcount++;
7524             }
7525         } else {
7526             /* LNBREAK can match one or two latin chars, which is ok, but we
7527              * have to use hardcount in this situation, and throw away the
7528              * adjustment to <loceol> done before the switch statement */
7529             loceol = reginfo->strend;
7530             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7531                 scan+=c;
7532                 hardcount++;
7533             }
7534         }
7535         break;
7536
7537     case BOUND:
7538     case BOUNDA:
7539     case BOUNDL:
7540     case BOUNDU:
7541     case EOS:
7542     case GPOS:
7543     case KEEPS:
7544     case NBOUND:
7545     case NBOUNDA:
7546     case NBOUNDL:
7547     case NBOUNDU:
7548     case OPFAIL:
7549     case SBOL:
7550     case SEOL:
7551         /* These are all 0 width, so match right here or not at all. */
7552         break;
7553
7554     default:
7555         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7556         assert(0); /* NOTREACHED */
7557
7558     }
7559
7560     if (hardcount)
7561         c = hardcount;
7562     else
7563         c = scan - *startposp;
7564     *startposp = scan;
7565
7566     DEBUG_r({
7567         GET_RE_DEBUG_FLAGS_DECL;
7568         DEBUG_EXECUTE_r({
7569             SV * const prop = sv_newmortal();
7570             regprop(prog, prop, p, reginfo);
7571             PerlIO_printf(Perl_debug_log,
7572                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7573                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7574         });
7575     });
7576
7577     return(c);
7578 }
7579
7580
7581 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7582 /*
7583 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7584 create a copy so that changes the caller makes won't change the shared one.
7585 If <altsvp> is non-null, will return NULL in it, for back-compat.
7586  */
7587 SV *
7588 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7589 {
7590     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7591
7592     if (altsvp) {
7593         *altsvp = NULL;
7594     }
7595
7596     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL));
7597 }
7598
7599 SV *
7600 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
7601                                         const regnode* node,
7602                                         bool doinit,
7603                                         SV** listsvp,
7604                                         SV** only_utf8_locale_ptr)
7605 {
7606     /* For internal core use only.
7607      * Returns the swash for the input 'node' in the regex 'prog'.
7608      * If <doinit> is 'true', will attempt to create the swash if not already
7609      *    done.
7610      * If <listsvp> is non-null, will return the printable contents of the
7611      *    swash.  This can be used to get debugging information even before the
7612      *    swash exists, by calling this function with 'doinit' set to false, in
7613      *    which case the components that will be used to eventually create the
7614      *    swash are returned  (in a printable form).
7615      * Tied intimately to how regcomp.c sets up the data structure */
7616
7617     SV *sw  = NULL;
7618     SV *si  = NULL;         /* Input swash initialization string */
7619     SV*  invlist = NULL;
7620
7621     RXi_GET_DECL(prog,progi);
7622     const struct reg_data * const data = prog ? progi->data : NULL;
7623
7624     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
7625
7626     assert(ANYOF_FLAGS(node)
7627                         & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
7628
7629     if (data && data->count) {
7630         const U32 n = ARG(node);
7631
7632         if (data->what[n] == 's') {
7633             SV * const rv = MUTABLE_SV(data->data[n]);
7634             AV * const av = MUTABLE_AV(SvRV(rv));
7635             SV **const ary = AvARRAY(av);
7636             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7637
7638             si = *ary;  /* ary[0] = the string to initialize the swash with */
7639
7640             /* Elements 3 and 4 are either both present or both absent. [3] is
7641              * any inversion list generated at compile time; [4] indicates if
7642              * that inversion list has any user-defined properties in it. */
7643             if (av_tindex(av) >= 2) {
7644                 if (only_utf8_locale_ptr
7645                     && ary[2]
7646                     && ary[2] != &PL_sv_undef)
7647                 {
7648                     *only_utf8_locale_ptr = ary[2];
7649                 }
7650                 else {
7651                     assert(only_utf8_locale_ptr);
7652                     *only_utf8_locale_ptr = NULL;
7653                 }
7654
7655                 if (av_tindex(av) >= 3) {
7656                     invlist = ary[3];
7657                     if (SvUV(ary[4])) {
7658                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7659                     }
7660                 }
7661                 else {
7662                     invlist = NULL;
7663                 }
7664             }
7665
7666             /* Element [1] is reserved for the set-up swash.  If already there,
7667              * return it; if not, create it and store it there */
7668             if (ary[1] && SvROK(ary[1])) {
7669                 sw = ary[1];
7670             }
7671             else if (doinit && ((si && si != &PL_sv_undef)
7672                                  || (invlist && invlist != &PL_sv_undef))) {
7673                 assert(si);
7674                 sw = _core_swash_init("utf8", /* the utf8 package */
7675                                       "", /* nameless */
7676                                       si,
7677                                       1, /* binary */
7678                                       0, /* not from tr/// */
7679                                       invlist,
7680                                       &swash_init_flags);
7681                 (void)av_store(av, 1, sw);
7682             }
7683         }
7684     }
7685
7686     /* If requested, return a printable version of what this swash matches */
7687     if (listsvp) {
7688         SV* matches_string = newSVpvs("");
7689
7690         /* The swash should be used, if possible, to get the data, as it
7691          * contains the resolved data.  But this function can be called at
7692          * compile-time, before everything gets resolved, in which case we
7693          * return the currently best available information, which is the string
7694          * that will eventually be used to do that resolving, 'si' */
7695         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7696             && (si && si != &PL_sv_undef))
7697         {
7698             sv_catsv(matches_string, si);
7699         }
7700
7701         /* Add the inversion list to whatever we have.  This may have come from
7702          * the swash, or from an input parameter */
7703         if (invlist) {
7704             sv_catsv(matches_string, _invlist_contents(invlist));
7705         }
7706         *listsvp = matches_string;
7707     }
7708
7709     return sw;
7710 }
7711 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
7712
7713 /*
7714  - reginclass - determine if a character falls into a character class
7715
7716   n is the ANYOF regnode
7717   p is the target string
7718   p_end points to one byte beyond the end of the target string
7719   utf8_target tells whether p is in UTF-8.
7720
7721   Returns true if matched; false otherwise.
7722
7723   Note that this can be a synthetic start class, a combination of various
7724   nodes, so things you think might be mutually exclusive, such as locale,
7725   aren't.  It can match both locale and non-locale
7726
7727  */
7728
7729 STATIC bool
7730 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
7731 {
7732     dVAR;
7733     const char flags = ANYOF_FLAGS(n);
7734     bool match = FALSE;
7735     UV c = *p;
7736
7737     PERL_ARGS_ASSERT_REGINCLASS;
7738
7739     /* If c is not already the code point, get it.  Note that
7740      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7741     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7742         STRLEN c_len = 0;
7743         c = utf8n_to_uvchr(p, p_end - p, &c_len,
7744                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7745                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7746                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7747                  * UTF8_ALLOW_FFFF */
7748         if (c_len == (STRLEN)-1)
7749             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7750     }
7751
7752     /* If this character is potentially in the bitmap, check it */
7753     if (c < 256) {
7754         if (ANYOF_BITMAP_TEST(n, c))
7755             match = TRUE;
7756         else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL
7757                 && ! utf8_target
7758                 && ! isASCII(c))
7759         {
7760             match = TRUE;
7761         }
7762         else if (flags & ANYOF_LOCALE_FLAGS) {
7763             if (flags & ANYOF_LOC_FOLD) {
7764                  if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
7765                     match = TRUE;
7766                 }
7767             }
7768             if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) {
7769
7770                 /* The data structure is arranged so bits 0, 2, 4, ... are set
7771                  * if the class includes the Posix character class given by
7772                  * bit/2; and 1, 3, 5, ... are set if the class includes the
7773                  * complemented Posix class given by int(bit/2).  So we loop
7774                  * through the bits, each time changing whether we complement
7775                  * the result or not.  Suppose for the sake of illustration
7776                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7777                  * is set, it means there is a match for this ANYOF node if the
7778                  * character is in the class given by the expression (0 / 2 = 0
7779                  * = \w).  If it is in that class, isFOO_lc() will return 1,
7780                  * and since 'to_complement' is 0, the result will stay TRUE,
7781                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
7782                  * bit 1 is 1.  That means there is a match if the character
7783                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7784                  * but will on bit 1.  On the second iteration 'to_complement'
7785                  * will be 1, so the exclusive or will reverse things, so we
7786                  * are testing for \W.  On the third iteration, 'to_complement'
7787                  * will be 0, and we would be testing for \s; the fourth
7788                  * iteration would test for \S, etc.
7789                  *
7790                  * Note that this code assumes that all the classes are closed
7791                  * under folding.  For example, if a character matches \w, then
7792                  * its fold does too; and vice versa.  This should be true for
7793                  * any well-behaved locale for all the currently defined Posix
7794                  * classes, except for :lower: and :upper:, which are handled
7795                  * by the pseudo-class :cased: which matches if either of the
7796                  * other two does.  To get rid of this assumption, an outer
7797                  * loop could be used below to iterate over both the source
7798                  * character, and its fold (if different) */
7799
7800                 int count = 0;
7801                 int to_complement = 0;
7802
7803                 while (count < ANYOF_MAX) {
7804                     if (ANYOF_POSIXL_TEST(n, count)
7805                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7806                     {
7807                         match = TRUE;
7808                         break;
7809                     }
7810                     count++;
7811                     to_complement ^= 1;
7812                 }
7813             }
7814         }
7815     }
7816
7817
7818     /* If the bitmap didn't (or couldn't) match, and something outside the
7819      * bitmap could match, try that. */
7820     if (!match) {
7821         if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) {
7822             match = TRUE;       /* Everything above 255 matches */
7823         }
7824         else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
7825                   || (utf8_target && (flags & ANYOF_UTF8))
7826                   || ((flags & ANYOF_LOC_FOLD)
7827                        && IN_UTF8_CTYPE_LOCALE
7828                        && ARG(n) != ANYOF_NONBITMAP_EMPTY))
7829         {
7830             SV* only_utf8_locale = NULL;
7831             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
7832                                                             &only_utf8_locale);
7833             if (sw) {
7834                 U8 utf8_buffer[2];
7835                 U8 * utf8_p;
7836                 if (utf8_target) {
7837                     utf8_p = (U8 *) p;
7838                 } else { /* Convert to utf8 */
7839                     utf8_p = utf8_buffer;
7840                     append_utf8_from_native_byte(*p, &utf8_p);
7841                     utf8_p = utf8_buffer;
7842                 }
7843
7844                 if (swash_fetch(sw, utf8_p, TRUE)) {
7845                     match = TRUE;
7846                 }
7847             }
7848             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
7849                 match = _invlist_contains_cp(only_utf8_locale, c);
7850             }
7851         }
7852
7853         if (UNICODE_IS_SUPER(c)
7854             && (flags & ANYOF_WARN_SUPER)
7855             && ckWARN_d(WARN_NON_UNICODE))
7856         {
7857             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7858                 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
7859         }
7860     }
7861
7862 #if ANYOF_INVERT != 1
7863     /* Depending on compiler optimization cBOOL takes time, so if don't have to
7864      * use it, don't */
7865 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
7866 #endif
7867
7868     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7869     return (flags & ANYOF_INVERT) ^ match;
7870 }
7871
7872 STATIC U8 *
7873 S_reghop3(U8 *s, SSize_t off, const U8* lim)
7874 {
7875     /* return the position 'off' UTF-8 characters away from 's', forward if
7876      * 'off' >= 0, backwards if negative.  But don't go outside of position
7877      * 'lim', which better be < s  if off < 0 */
7878
7879     PERL_ARGS_ASSERT_REGHOP3;
7880
7881     if (off >= 0) {
7882         while (off-- && s < lim) {
7883             /* XXX could check well-formedness here */
7884             s += UTF8SKIP(s);
7885         }
7886     }
7887     else {
7888         while (off++ && s > lim) {
7889             s--;
7890             if (UTF8_IS_CONTINUED(*s)) {
7891                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7892                     s--;
7893             }
7894             /* XXX could check well-formedness here */
7895         }
7896     }
7897     return s;
7898 }
7899
7900 STATIC U8 *
7901 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
7902 {
7903     PERL_ARGS_ASSERT_REGHOP4;
7904
7905     if (off >= 0) {
7906         while (off-- && s < rlim) {
7907             /* XXX could check well-formedness here */
7908             s += UTF8SKIP(s);
7909         }
7910     }
7911     else {
7912         while (off++ && s > llim) {
7913             s--;
7914             if (UTF8_IS_CONTINUED(*s)) {
7915                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7916                     s--;
7917             }
7918             /* XXX could check well-formedness here */
7919         }
7920     }
7921     return s;
7922 }
7923
7924 /* like reghop3, but returns NULL on overrun, rather than returning last
7925  * char pos */
7926
7927 STATIC U8 *
7928 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
7929 {
7930     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7931
7932     if (off >= 0) {
7933         while (off-- && s < lim) {
7934             /* XXX could check well-formedness here */
7935             s += UTF8SKIP(s);
7936         }
7937         if (off >= 0)
7938             return NULL;
7939     }
7940     else {
7941         while (off++ && s > lim) {
7942             s--;
7943             if (UTF8_IS_CONTINUED(*s)) {
7944                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7945                     s--;
7946             }
7947             /* XXX could check well-formedness here */
7948         }
7949         if (off <= 0)
7950             return NULL;
7951     }
7952     return s;
7953 }
7954
7955
7956 /* when executing a regex that may have (?{}), extra stuff needs setting
7957    up that will be visible to the called code, even before the current
7958    match has finished. In particular:
7959
7960    * $_ is localised to the SV currently being matched;
7961    * pos($_) is created if necessary, ready to be updated on each call-out
7962      to code;
7963    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7964      isn't set until the current pattern is successfully finished), so that
7965      $1 etc of the match-so-far can be seen;
7966    * save the old values of subbeg etc of the current regex, and  set then
7967      to the current string (again, this is normally only done at the end
7968      of execution)
7969 */
7970
7971 static void
7972 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7973 {
7974     MAGIC *mg;
7975     regexp *const rex = ReANY(reginfo->prog);
7976     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7977
7978     eval_state->rex = rex;
7979
7980     if (reginfo->sv) {
7981         /* Make $_ available to executed code. */
7982         if (reginfo->sv != DEFSV) {
7983             SAVE_DEFSV;
7984             DEFSV_set(reginfo->sv);
7985         }
7986
7987         if (!(mg = mg_find_mglob(reginfo->sv))) {
7988             /* prepare for quick setting of pos */
7989             mg = sv_magicext_mglob(reginfo->sv);
7990             mg->mg_len = -1;
7991         }
7992         eval_state->pos_magic = mg;
7993         eval_state->pos       = mg->mg_len;
7994         eval_state->pos_flags = mg->mg_flags;
7995     }
7996     else
7997         eval_state->pos_magic = NULL;
7998
7999     if (!PL_reg_curpm) {
8000         /* PL_reg_curpm is a fake PMOP that we can attach the current
8001          * regex to and point PL_curpm at, so that $1 et al are visible
8002          * within a /(?{})/. It's just allocated once per interpreter the
8003          * first time its needed */
8004         Newxz(PL_reg_curpm, 1, PMOP);
8005 #ifdef USE_ITHREADS
8006         {
8007             SV* const repointer = &PL_sv_undef;
8008             /* this regexp is also owned by the new PL_reg_curpm, which
8009                will try to free it.  */
8010             av_push(PL_regex_padav, repointer);
8011             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
8012             PL_regex_pad = AvARRAY(PL_regex_padav);
8013         }
8014 #endif
8015     }
8016     SET_reg_curpm(reginfo->prog);
8017     eval_state->curpm = PL_curpm;
8018     PL_curpm = PL_reg_curpm;
8019     if (RXp_MATCH_COPIED(rex)) {
8020         /*  Here is a serious problem: we cannot rewrite subbeg,
8021             since it may be needed if this match fails.  Thus
8022             $` inside (?{}) could fail... */
8023         eval_state->subbeg     = rex->subbeg;
8024         eval_state->sublen     = rex->sublen;
8025         eval_state->suboffset  = rex->suboffset;
8026         eval_state->subcoffset = rex->subcoffset;
8027 #ifdef PERL_ANY_COW
8028         eval_state->saved_copy = rex->saved_copy;
8029 #endif
8030         RXp_MATCH_COPIED_off(rex);
8031     }
8032     else
8033         eval_state->subbeg = NULL;
8034     rex->subbeg = (char *)reginfo->strbeg;
8035     rex->suboffset = 0;
8036     rex->subcoffset = 0;
8037     rex->sublen = reginfo->strend - reginfo->strbeg;
8038 }
8039
8040
8041 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
8042
8043 static void
8044 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
8045 {
8046     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
8047     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
8048     regmatch_slab *s;
8049
8050     Safefree(aux->poscache);
8051
8052     if (eval_state) {
8053
8054         /* undo the effects of S_setup_eval_state() */
8055
8056         if (eval_state->subbeg) {
8057             regexp * const rex = eval_state->rex;
8058             rex->subbeg     = eval_state->subbeg;
8059             rex->sublen     = eval_state->sublen;
8060             rex->suboffset  = eval_state->suboffset;
8061             rex->subcoffset = eval_state->subcoffset;
8062 #ifdef PERL_ANY_COW
8063             rex->saved_copy = eval_state->saved_copy;
8064 #endif
8065             RXp_MATCH_COPIED_on(rex);
8066         }
8067         if (eval_state->pos_magic)
8068         {
8069             eval_state->pos_magic->mg_len = eval_state->pos;
8070             eval_state->pos_magic->mg_flags =
8071                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8072                | (eval_state->pos_flags & MGf_BYTES);
8073         }
8074
8075         PL_curpm = eval_state->curpm;
8076     }
8077
8078     PL_regmatch_state = aux->old_regmatch_state;
8079     PL_regmatch_slab  = aux->old_regmatch_slab;
8080
8081     /* free all slabs above current one - this must be the last action
8082      * of this function, as aux and eval_state are allocated within
8083      * slabs and may be freed here */
8084
8085     s = PL_regmatch_slab->next;
8086     if (s) {
8087         PL_regmatch_slab->next = NULL;
8088         while (s) {
8089             regmatch_slab * const osl = s;
8090             s = s->next;
8091             Safefree(osl);
8092         }
8093     }
8094 }
8095
8096
8097 STATIC void
8098 S_to_utf8_substr(pTHX_ regexp *prog)
8099 {
8100     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8101      * on the converted value */
8102
8103     int i = 1;
8104
8105     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8106
8107     do {
8108         if (prog->substrs->data[i].substr
8109             && !prog->substrs->data[i].utf8_substr) {
8110             SV* const sv = newSVsv(prog->substrs->data[i].substr);
8111             prog->substrs->data[i].utf8_substr = sv;
8112             sv_utf8_upgrade(sv);
8113             if (SvVALID(prog->substrs->data[i].substr)) {
8114                 if (SvTAIL(prog->substrs->data[i].substr)) {
8115                     /* Trim the trailing \n that fbm_compile added last
8116                        time.  */
8117                     SvCUR_set(sv, SvCUR(sv) - 1);
8118                     /* Whilst this makes the SV technically "invalid" (as its
8119                        buffer is no longer followed by "\0") when fbm_compile()
8120                        adds the "\n" back, a "\0" is restored.  */
8121                     fbm_compile(sv, FBMcf_TAIL);
8122                 } else
8123                     fbm_compile(sv, 0);
8124             }
8125             if (prog->substrs->data[i].substr == prog->check_substr)
8126                 prog->check_utf8 = sv;
8127         }
8128     } while (i--);
8129 }
8130
8131 STATIC bool
8132 S_to_byte_substr(pTHX_ regexp *prog)
8133 {
8134     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
8135      * on the converted value; returns FALSE if can't be converted. */
8136
8137     int i = 1;
8138
8139     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
8140
8141     do {
8142         if (prog->substrs->data[i].utf8_substr
8143             && !prog->substrs->data[i].substr) {
8144             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
8145             if (! sv_utf8_downgrade(sv, TRUE)) {
8146                 return FALSE;
8147             }
8148             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
8149                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
8150                     /* Trim the trailing \n that fbm_compile added last
8151                         time.  */
8152                     SvCUR_set(sv, SvCUR(sv) - 1);
8153                     fbm_compile(sv, FBMcf_TAIL);
8154                 } else
8155                     fbm_compile(sv, 0);
8156             }
8157             prog->substrs->data[i].substr = sv;
8158             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
8159                 prog->check_substr = sv;
8160         }
8161     } while (i--);
8162
8163     return TRUE;
8164 }
8165
8166 /*
8167  * Local variables:
8168  * c-indentation-style: bsd
8169  * c-basic-offset: 4
8170  * indent-tabs-mode: nil
8171  * End:
8172  *
8173  * ex: set ts=8 sts=4 sw=4 et:
8174  */