regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  41       "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  42
  43 /*
  44  * pregcomp and pregexec -- regsub and regerror are not used in perl
  45  *
  46  *      Copyright (c) 1986 by University of Toronto.
  47  *      Written by Henry Spencer.  Not derived from licensed software.
  48  *
  49  *      Permission is granted to anyone to use this software for any
  50  *      purpose on any computer system, and to redistribute it freely,
  51  *      subject to the following restrictions:
  52  *
  53  *      1. The author is not responsible for the consequences of use of
  54  *              this software, no matter how awful, even if they arise
  55  *              from defects in it.
  56  *
  57  *      2. The origin of this software must not be misrepresented, either
  58  *              by explicit claim or by omission.
  59  *
  60  *      3. Altered versions must be plainly marked as such, and must not
  61  *              be misrepresented as being the original software.
  62  *
  63  ****    Alterations to Henry's code are...
  64  ****
  65  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  66  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  67  ****    by Larry Wall and others
  68  ****
  69  ****    You may distribute under the terms of either the GNU General Public
  70  ****    License or the Artistic License, as specified in the README file.
  71  *
  72  * Beware that some of this code is subtly aware of the way operator
  73  * precedence is structured in regular expressions.  Serious changes in
  74  * regular-expression syntax might require a total rethink.
  75  */
  76 #include "EXTERN.h"
  77 #define PERL_IN_REGEXEC_C
  78 #include "perl.h"
  79
  80 #ifdef PERL_IN_XSUB_RE
  81 #  include "re_comp.h"
  82 #else
  83 #  include "regcomp.h"
  84 #endif
  85
  86 #include "inline_invlist.c"
  87 #include "unicode_constants.h"
  88
  89 #ifdef DEBUGGING
  90 /* At least one required character in the target string is expressible only in
  91  * UTF-8. */
  92 static const char* const non_utf8_target_but_utf8_required
  93                 = "Can't match, because target string needs to be in UTF-8\n";
  94 #endif
  95
  96 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  97     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  98     goto target; \
  99 } STMT_END
 100
 101 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 /* Valid only for non-utf8 strings: avoids the reginclass
 108  * call if there are no complications: i.e., if everything matchable is
 109  * straight forward in the bitmap */
 110 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0)   \
 111                                               : ANYOF_BITMAP_TEST(p,*(c)))
 112
 113 /*
 114  * Forwards.
 115  */
 116
 117 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 118 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 119
 120 #define HOPc(pos,off) \
 121         (char *)(reginfo->is_utf8_target \
 122             ? reghop3((U8*)pos, off, \
 123                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 124             : (U8*)(pos + off))
 125
 126 #define HOPBACKc(pos, off) \
 127         (char*)(reginfo->is_utf8_target \
 128             ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 129             : (pos - off >= reginfo->strbeg)    \
 130                 ? (U8*)pos - off                \
 131                 : NULL)
 132
 133 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 134 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 135
 136 /* lim must be +ve. Returns NULL on overshoot */
 137 #define HOPMAYBE3(pos,off,lim) \
 138         (reginfo->is_utf8_target                        \
 139             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 140             : ((U8*)pos + off <= lim)                   \
 141                 ? (U8*)pos + off                        \
 142                 : NULL)
 143
 144 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 145  * off must be >=0; args should be vars rather than expressions */
 146 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 147     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 148     : (U8*)((pos + off) > lim ? lim : (pos + off)))
 149
 150 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 151     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 152     : (U8*)(pos + off))
 153 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 154
 155 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 156 #define NEXTCHR_IS_EOS (nextchr < 0)
 157
 158 #define SET_nextchr \
 159     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 160
 161 #define SET_locinput(p) \
 162     locinput = (p);  \
 163     SET_nextchr
 164
 165
 166 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 167         if (!swash_ptr) {                                                     \
 168             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 169             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 170                                          1, 0, invlist, &flags);              \
 171             assert(swash_ptr);                                                \
 172         }                                                                     \
 173     } STMT_END
 174
 175 /* If in debug mode, we test that a known character properly matches */
 176 #ifdef DEBUGGING
 177 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 178                                           property_name,                      \
 179                                           invlist,                            \
 180                                           utf8_char_in_property)              \
 181         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 182         assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 183 #else
 184 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 185                                           property_name,                      \
 186                                           invlist,                            \
 187                                           utf8_char_in_property)              \
 188         LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 189 #endif
 190
 191 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 192                                         PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 193                                         "",                                   \
 194                                         PL_XPosix_ptrs[_CC_WORDCHAR],         \
 195                                         LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
 196
 197 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 198 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 199
 200 /* for use after a quantifier and before an EXACT-like node -- japhy */
 201 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 202  *
 203  * NOTE that *nothing* that affects backtracking should be in here, specifically
 204  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 205  * node that is in between two EXACT like nodes when ascertaining what the required
 206  * "follow" character is. This should probably be moved to regex compile time
 207  * although it may be done at run time beause of the REF possibility - more
 208  * investigation required. -- demerphq
 209 */
 210 #define JUMPABLE(rn) (                                                             \
 211     OP(rn) == OPEN ||                                                              \
 212     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 213     OP(rn) == EVAL ||                                                              \
 214     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 215     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 216     OP(rn) == KEEPS ||                                                             \
 217     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 218 )
 219 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 220
 221 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 222
 223 #if 0
 224 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 225    we don't need this definition.  XXX These are now out-of-sync*/
 226 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 227 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 228 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 229
 230 #else
 231 /* ... so we use this as its faster. */
 232 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 233 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 234 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 235 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 236
 237 #endif
 238
 239 /*
 240   Search for mandatory following text node; for lookahead, the text must
 241   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 242 */
 243 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 244     while (JUMPABLE(rn)) { \
 245         const OPCODE type = OP(rn); \
 246         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 247             rn = NEXTOPER(NEXTOPER(rn)); \
 248         else if (type == PLUS) \
 249             rn = NEXTOPER(rn); \
 250         else if (type == IFMATCH) \
 251             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 252         else rn += NEXT_OFF(rn); \
 253     } \
 254 } STMT_END
 255
 256 #define SLAB_FIRST(s) (&(s)->states[0])
 257 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 258
 259 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 260 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 261 static regmatch_state * S_push_slab(pTHX);
 262
 263 #define REGCP_PAREN_ELEMS 3
 264 #define REGCP_OTHER_ELEMS 3
 265 #define REGCP_FRAME_ELEMS 1
 266 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 267  * are needed for the regexp context stack bookkeeping. */
 268
 269 STATIC CHECKPOINT
 270 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 271 {
 272     const int retval = PL_savestack_ix;
 273     const int paren_elems_to_push =
 274                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 275     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 276     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 277     I32 p;
 278     GET_RE_DEBUG_FLAGS_DECL;
 279
 280     PERL_ARGS_ASSERT_REGCPPUSH;
 281
 282     if (paren_elems_to_push < 0)
 283         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 284                    (int)paren_elems_to_push, (int)maxopenparen,
 285                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 286
 287     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 288         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 289                    " out of range (%lu-%ld)",
 290                    total_elems,
 291                    (unsigned long)maxopenparen,
 292                    (long)parenfloor);
 293
 294     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 295
 296     DEBUG_BUFFERS_r(
 297         if ((int)maxopenparen > (int)parenfloor)
 298             PerlIO_printf(Perl_debug_log,
 299                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 300                 PTR2UV(rex),
 301                 PTR2UV(rex->offs)
 302             );
 303     );
 304     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 305 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 306         SSPUSHIV(rex->offs[p].end);
 307         SSPUSHIV(rex->offs[p].start);
 308         SSPUSHINT(rex->offs[p].start_tmp);
 309         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 310             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 311             (UV)p,
 312             (IV)rex->offs[p].start,
 313             (IV)rex->offs[p].start_tmp,
 314             (IV)rex->offs[p].end
 315         ));
 316     }
 317 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 318     SSPUSHINT(maxopenparen);
 319     SSPUSHINT(rex->lastparen);
 320     SSPUSHINT(rex->lastcloseparen);
 321     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 322
 323     return retval;
 324 }
 325
 326 /* These are needed since we do not localize EVAL nodes: */
 327 #define REGCP_SET(cp)                                           \
 328     DEBUG_STATE_r(                                              \
 329             PerlIO_printf(Perl_debug_log,                       \
 330                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 331                 (IV)PL_savestack_ix));                          \
 332     cp = PL_savestack_ix
 333
 334 #define REGCP_UNWIND(cp)                                        \
 335     DEBUG_STATE_r(                                              \
 336         if (cp != PL_savestack_ix)                              \
 337             PerlIO_printf(Perl_debug_log,                       \
 338                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 339                 (IV)(cp), (IV)PL_savestack_ix));                \
 340     regcpblow(cp)
 341
 342 #define UNWIND_PAREN(lp, lcp)               \
 343     for (n = rex->lastparen; n > lp; n--)   \
 344         rex->offs[n].end = -1;              \
 345     rex->lastparen = n;                     \
 346     rex->lastcloseparen = lcp;
 347
 348
 349 STATIC void
 350 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 351 {
 352     UV i;
 353     U32 paren;
 354     GET_RE_DEBUG_FLAGS_DECL;
 355
 356     PERL_ARGS_ASSERT_REGCPPOP;
 357
 358     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 359     i = SSPOPUV;
 360     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 361     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 362     rex->lastcloseparen = SSPOPINT;
 363     rex->lastparen = SSPOPINT;
 364     *maxopenparen_p = SSPOPINT;
 365
 366     i -= REGCP_OTHER_ELEMS;
 367     /* Now restore the parentheses context. */
 368     DEBUG_BUFFERS_r(
 369         if (i || rex->lastparen + 1 <= rex->nparens)
 370             PerlIO_printf(Perl_debug_log,
 371                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 372                 PTR2UV(rex),
 373                 PTR2UV(rex->offs)
 374             );
 375     );
 376     paren = *maxopenparen_p;
 377     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 378         SSize_t tmps;
 379         rex->offs[paren].start_tmp = SSPOPINT;
 380         rex->offs[paren].start = SSPOPIV;
 381         tmps = SSPOPIV;
 382         if (paren <= rex->lastparen)
 383             rex->offs[paren].end = tmps;
 384         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 385             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 386             (UV)paren,
 387             (IV)rex->offs[paren].start,
 388             (IV)rex->offs[paren].start_tmp,
 389             (IV)rex->offs[paren].end,
 390             (paren > rex->lastparen ? "(skipped)" : ""));
 391         );
 392         paren--;
 393     }
 394 #if 1
 395     /* It would seem that the similar code in regtry()
 396      * already takes care of this, and in fact it is in
 397      * a better location to since this code can #if 0-ed out
 398      * but the code in regtry() is needed or otherwise tests
 399      * requiring null fields (pat.t#187 and split.t#{13,14}
 400      * (as of patchlevel 7877)  will fail.  Then again,
 401      * this code seems to be necessary or otherwise
 402      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 403      * --jhi updated by dapm */
 404     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 405         if (i > *maxopenparen_p)
 406             rex->offs[i].start = -1;
 407         rex->offs[i].end = -1;
 408         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 409             "    \\%"UVuf": %s   ..-1 undeffing\n",
 410             (UV)i,
 411             (i > *maxopenparen_p) ? "-1" : "  "
 412         ));
 413     }
 414 #endif
 415 }
 416
 417 /* restore the parens and associated vars at savestack position ix,
 418  * but without popping the stack */
 419
 420 STATIC void
 421 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 422 {
 423     I32 tmpix = PL_savestack_ix;
 424     PL_savestack_ix = ix;
 425     regcppop(rex, maxopenparen_p);
 426     PL_savestack_ix = tmpix;
 427 }
 428
 429 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 430
 431 STATIC bool
 432 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 433 {
 434     /* Returns a boolean as to whether or not 'character' is a member of the
 435      * Posix character class given by 'classnum' that should be equivalent to a
 436      * value in the typedef '_char_class_number'.
 437      *
 438      * Ideally this could be replaced by a just an array of function pointers
 439      * to the C library functions that implement the macros this calls.
 440      * However, to compile, the precise function signatures are required, and
 441      * these may vary from platform to to platform.  To avoid having to figure
 442      * out what those all are on each platform, I (khw) am using this method,
 443      * which adds an extra layer of function call overhead (unless the C
 444      * optimizer strips it away).  But we don't particularly care about
 445      * performance with locales anyway. */
 446
 447     switch ((_char_class_number) classnum) {
 448         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 449         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 450         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 451         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 452         case _CC_ENUM_CASED:     return isLOWER_LC(character)
 453                                         || isUPPER_LC(character);
 454         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 455         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 456         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 457         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 458         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 459         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 460         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 461         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 462         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 463         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 464         default:    /* VERTSPACE should never occur in locales */
 465             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 466     }
 467
 468     NOT_REACHED; /* NOTREACHED */
 469     return FALSE;
 470 }
 471
 472 STATIC bool
 473 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 474 {
 475     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 476      * 'character' is a member of the Posix character class given by 'classnum'
 477      * that should be equivalent to a value in the typedef
 478      * '_char_class_number'.
 479      *
 480      * This just calls isFOO_lc on the code point for the character if it is in
 481      * the range 0-255.  Outside that range, all characters use Unicode
 482      * rules, ignoring any locale.  So use the Unicode function if this class
 483      * requires a swash, and use the Unicode macro otherwise. */
 484
 485     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 486
 487     if (UTF8_IS_INVARIANT(*character)) {
 488         return isFOO_lc(classnum, *character);
 489     }
 490     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 491         return isFOO_lc(classnum,
 492                         TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
 493     }
 494
 495     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 496
 497     if (classnum < _FIRST_NON_SWASH_CC) {
 498
 499         /* Initialize the swash unless done already */
 500         if (! PL_utf8_swash_ptrs[classnum]) {
 501             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 502             PL_utf8_swash_ptrs[classnum] =
 503                     _core_swash_init("utf8",
 504                                      "",
 505                                      &PL_sv_undef, 1, 0,
 506                                      PL_XPosix_ptrs[classnum], &flags);
 507         }
 508
 509         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 510                                  character,
 511                                  TRUE /* is UTF */ ));
 512     }
 513
 514     switch ((_char_class_number) classnum) {
 515         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 516         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 517         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 518         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 519         default:                 break;
 520     }
 521
 522     return FALSE; /* Things like CNTRL are always below 256 */
 523 }
 524
 525 /*
 526  * pregexec and friends
 527  */
 528
 529 #ifndef PERL_IN_XSUB_RE
 530 /*
 531  - pregexec - match a regexp against a string
 532  */
 533 I32
 534 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 535          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 536 /* stringarg: the point in the string at which to begin matching */
 537 /* strend:    pointer to null at end of string */
 538 /* strbeg:    real beginning of string */
 539 /* minend:    end of match must be >= minend bytes after stringarg. */
 540 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 541  *            itself is accessed via the pointers above */
 542 /* nosave:    For optimizations. */
 543 {
 544     PERL_ARGS_ASSERT_PREGEXEC;
 545
 546     return
 547         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 548                       nosave ? 0 : REXEC_COPY_STR);
 549 }
 550 #endif
 551
 552
 553
 554 /* re_intuit_start():
 555  *
 556  * Based on some optimiser hints, try to find the earliest position in the
 557  * string where the regex could match.
 558  *
 559  *   rx:     the regex to match against
 560  *   sv:     the SV being matched: only used for utf8 flag; the string
 561  *           itself is accessed via the pointers below. Note that on
 562  *           something like an overloaded SV, SvPOK(sv) may be false
 563  *           and the string pointers may point to something unrelated to
 564  *           the SV itself.
 565  *   strbeg: real beginning of string
 566  *   strpos: the point in the string at which to begin matching
 567  *   strend: pointer to the byte following the last char of the string
 568  *   flags   currently unused; set to 0
 569  *   data:   currently unused; set to NULL
 570  *
 571  * The basic idea of re_intuit_start() is to use some known information
 572  * about the pattern, namely:
 573  *
 574  *   a) the longest known anchored substring (i.e. one that's at a
 575  *      constant offset from the beginning of the pattern; but not
 576  *      necessarily at a fixed offset from the beginning of the
 577  *      string);
 578  *   b) the longest floating substring (i.e. one that's not at a constant
 579  *      offset from the beginning of the pattern);
 580  *   c) Whether the pattern is anchored to the string; either
 581  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 582  *      or anchored to pos(): /\G/;
 583  *   d) A start class: a real or synthetic character class which
 584  *      represents which characters are legal at the start of the pattern;
 585  *
 586  * to either quickly reject the match, or to find the earliest position
 587  * within the string at which the pattern might match, thus avoiding
 588  * running the full NFA engine at those earlier locations, only to
 589  * eventually fail and retry further along.
 590  *
 591  * Returns NULL if the pattern can't match, or returns the address within
 592  * the string which is the earliest place the match could occur.
 593  *
 594  * The longest of the anchored and floating substrings is called 'check'
 595  * and is checked first. The other is called 'other' and is checked
 596  * second. The 'other' substring may not be present.  For example,
 597  *
 598  *    /(abc|xyz)ABC\d{0,3}DEFG/
 599  *
 600  * will have
 601  *
 602  *   check substr (float)    = "DEFG", offset 6..9 chars
 603  *   other substr (anchored) = "ABC",  offset 3..3 chars
 604  *   stclass = [ax]
 605  *
 606  * Be aware that during the course of this function, sometimes 'anchored'
 607  * refers to a substring being anchored relative to the start of the
 608  * pattern, and sometimes to the pattern itself being anchored relative to
 609  * the string. For example:
 610  *
 611  *   /\dabc/:   "abc" is anchored to the pattern;
 612  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 613  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 614  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 615  *                    but the pattern is anchored to the string.
 616  */
 617
 618 char *
 619 Perl_re_intuit_start(pTHX_
 620                     REGEXP * const rx,
 621                     SV *sv,
 622                     const char * const strbeg,
 623                     char *strpos,
 624                     char *strend,
 625                     const U32 flags,
 626                     re_scream_pos_data *data)
 627 {
 628     struct regexp *const prog = ReANY(rx);
 629     SSize_t start_shift = prog->check_offset_min;
 630     /* Should be nonnegative! */
 631     SSize_t end_shift   = 0;
 632     /* current lowest pos in string where the regex can start matching */
 633     char *rx_origin = strpos;
 634     SV *check;
 635     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 636     U8   other_ix = 1 - prog->substrs->check_ix;
 637     bool ml_anch = 0;
 638     char *other_last = strpos;/* latest pos 'other' substr already checked to */
 639     char *check_at = NULL;              /* check substr found at this pos */
 640     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 641     RXi_GET_DECL(prog,progi);
 642     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 643     regmatch_info *const reginfo = &reginfo_buf;
 644     GET_RE_DEBUG_FLAGS_DECL;
 645
 646     PERL_ARGS_ASSERT_RE_INTUIT_START;
 647     PERL_UNUSED_ARG(flags);
 648     PERL_UNUSED_ARG(data);
 649
 650     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 651                 "Intuit: trying to determine minimum start position...\n"));
 652
 653     /* for now, assume that all substr offsets are positive. If at some point
 654      * in the future someone wants to do clever things with look-behind and
 655      * -ve offsets, they'll need to fix up any code in this function
 656      * which uses these offsets. See the thread beginning
 657      * <20140113145929.GF27210@iabyn.com>
 658      */
 659     assert(prog->substrs->data[0].min_offset >= 0);
 660     assert(prog->substrs->data[0].max_offset >= 0);
 661     assert(prog->substrs->data[1].min_offset >= 0);
 662     assert(prog->substrs->data[1].max_offset >= 0);
 663     assert(prog->substrs->data[2].min_offset >= 0);
 664     assert(prog->substrs->data[2].max_offset >= 0);
 665
 666     /* for now, assume that if both present, that the floating substring
 667      * doesn't start before the anchored substring.
 668      * If you break this assumption (e.g. doing better optimisations
 669      * with lookahead/behind), then you'll need to audit the code in this
 670      * function carefully first
 671      */
 672     assert(
 673             ! (  (prog->anchored_utf8 || prog->anchored_substr)
 674               && (prog->float_utf8    || prog->float_substr))
 675            || (prog->float_min_offset >= prog->anchored_offset));
 676
 677     /* byte rather than char calculation for efficiency. It fails
 678      * to quickly reject some cases that can't match, but will reject
 679      * them later after doing full char arithmetic */
 680     if (prog->minlen > strend - strpos) {
 681         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 682                               "  String too short...\n"));
 683         goto fail;
 684     }
 685
 686     RX_MATCH_UTF8_set(rx,utf8_target);
 687     reginfo->is_utf8_target = cBOOL(utf8_target);
 688     reginfo->info_aux = NULL;
 689     reginfo->strbeg = strbeg;
 690     reginfo->strend = strend;
 691     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 692     reginfo->intuit = 1;
 693     /* not actually used within intuit, but zero for safety anyway */
 694     reginfo->poscache_maxiter = 0;
 695
 696     if (utf8_target) {
 697         if (!prog->check_utf8 && prog->check_substr)
 698             to_utf8_substr(prog);
 699         check = prog->check_utf8;
 700     } else {
 701         if (!prog->check_substr && prog->check_utf8) {
 702             if (! to_byte_substr(prog)) {
 703                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 704             }
 705         }
 706         check = prog->check_substr;
 707     }
 708
 709     /* dump the various substring data */
 710     DEBUG_OPTIMISE_MORE_r({
 711         int i;
 712         for (i=0; i<=2; i++) {
 713             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 714                                   : prog->substrs->data[i].substr);
 715             if (!sv)
 716                 continue;
 717
 718             PerlIO_printf(Perl_debug_log,
 719                 "  substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
 720                 " useful=%"IVdf" utf8=%d [%s]\n",
 721                 i,
 722                 (IV)prog->substrs->data[i].min_offset,
 723                 (IV)prog->substrs->data[i].max_offset,
 724                 (IV)prog->substrs->data[i].end_shift,
 725                 BmUSEFUL(sv),
 726                 utf8_target ? 1 : 0,
 727                 SvPEEK(sv));
 728         }
 729     });
 730
 731     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 732
 733         /* ml_anch: check after \n?
 734          *
 735          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 736          * with /.*.../, these flags will have been added by the
 737          * compiler:
 738          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 739          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 740          */
 741         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 742                    && !(prog->intflags & PREGf_IMPLICIT);
 743
 744         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 745             /* we are only allowed to match at BOS or \G */
 746
 747             /* trivially reject if there's a BOS anchor and we're not at BOS.
 748              *
 749              * Note that we don't try to do a similar quick reject for
 750              * \G, since generally the caller will have calculated strpos
 751              * based on pos() and gofs, so the string is already correctly
 752              * anchored by definition; and handling the exceptions would
 753              * be too fiddly (e.g. REXEC_IGNOREPOS).
 754              */
 755             if (   strpos != strbeg
 756                 && (prog->intflags & PREGf_ANCH_SBOL))
 757             {
 758                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 759                                 "  Not at start...\n"));
 760                 goto fail;
 761             }
 762
 763             /* in the presence of an anchor, the anchored (relative to the
 764              * start of the regex) substr must also be anchored relative
 765              * to strpos. So quickly reject if substr isn't found there.
 766              * This works for \G too, because the caller will already have
 767              * subtracted gofs from pos, and gofs is the offset from the
 768              * \G to the start of the regex. For example, in /.abc\Gdef/,
 769              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 770              * caller will have set strpos=pos()-4; we look for the substr
 771              * at position pos()-4+1, which lines up with the "a" */
 772
 773             if (prog->check_offset_min == prog->check_offset_max) {
 774                 /* Substring at constant offset from beg-of-str... */
 775                 SSize_t slen = SvCUR(check);
 776                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
 777
 778                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 779                     "  Looking for check substr at fixed offset %"IVdf"...\n",
 780                     (IV)prog->check_offset_min));
 781
 782                 if (SvTAIL(check)) {
 783                     /* In this case, the regex is anchored at the end too.
 784                      * Unless it's a multiline match, the lengths must match
 785                      * exactly, give or take a \n.  NB: slen >= 1 since
 786                      * the last char of check is \n */
 787                     if (!multiline
 788                         && (   strend - s > slen
 789                             || strend - s < slen - 1
 790                             || (strend - s == slen && strend[-1] != '\n')))
 791                     {
 792                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 793                                             "  String too long...\n"));
 794                         goto fail_finish;
 795                     }
 796                     /* Now should match s[0..slen-2] */
 797                     slen--;
 798                 }
 799                 if (slen && (*SvPVX_const(check) != *s
 800                     || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
 801                 {
 802                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 803                                     "  String not equal...\n"));
 804                     goto fail_finish;
 805                 }
 806
 807                 check_at = s;
 808                 goto success_at_start;
 809             }
 810         }
 811     }
 812
 813     end_shift = prog->check_end_shift;
 814
 815 #ifdef DEBUGGING        /* 7/99: reports of failure (with the older version) */
 816     if (end_shift < 0)
 817         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 818                    (IV)end_shift, RX_PRECOMP(prog));
 819 #endif
 820
 821   restart:
 822
 823     /* This is the (re)entry point of the main loop in this function.
 824      * The goal of this loop is to:
 825      * 1) find the "check" substring in the region rx_origin..strend
 826      *    (adjusted by start_shift / end_shift). If not found, reject
 827      *    immediately.
 828      * 2) If it exists, look for the "other" substr too if defined; for
 829      *    example, if the check substr maps to the anchored substr, then
 830      *    check the floating substr, and vice-versa. If not found, go
 831      *    back to (1) with rx_origin suitably incremented.
 832      * 3) If we find an rx_origin position that doesn't contradict
 833      *    either of the substrings, then check the possible additional
 834      *    constraints on rx_origin of /^.../m or a known start class.
 835      *    If these fail, then depending on which constraints fail, jump
 836      *    back to here, or to various other re-entry points further along
 837      *    that skip some of the first steps.
 838      * 4) If we pass all those tests, update the BmUSEFUL() count on the
 839      *    substring. If the start position was determined to be at the
 840      *    beginning of the string  - so, not rejected, but not optimised,
 841      *    since we have to run regmatch from position 0 - decrement the
 842      *    BmUSEFUL() count. Otherwise increment it.
 843      */
 844
 845
 846     /* first, look for the 'check' substring */
 847
 848     {
 849         U8* start_point;
 850         U8* end_point;
 851
 852         DEBUG_OPTIMISE_MORE_r({
 853             PerlIO_printf(Perl_debug_log,
 854                 "  At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
 855                 " Start shift: %"IVdf" End shift %"IVdf
 856                 " Real end Shift: %"IVdf"\n",
 857                 (IV)(rx_origin - strbeg),
 858                 (IV)prog->check_offset_min,
 859                 (IV)start_shift,
 860                 (IV)end_shift,
 861                 (IV)prog->check_end_shift);
 862         });
 863
 864         end_point = HOP3(strend, -end_shift, strbeg);
 865         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 866         if (!start_point)
 867             goto fail_finish;
 868
 869
 870         /* If the regex is absolutely anchored to either the start of the
 871          * string (SBOL) or to pos() (ANCH_GPOS), then
 872          * check_offset_max represents an upper bound on the string where
 873          * the substr could start. For the ANCH_GPOS case, we assume that
 874          * the caller of intuit will have already set strpos to
 875          * pos()-gofs, so in this case strpos + offset_max will still be
 876          * an upper bound on the substr.
 877          */
 878         if (!ml_anch
 879             && prog->intflags & PREGf_ANCH
 880             && prog->check_offset_max != SSize_t_MAX)
 881         {
 882             SSize_t len = SvCUR(check) - !!SvTAIL(check);
 883             const char * const anchor =
 884                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 885
 886             /* do a bytes rather than chars comparison. It's conservative;
 887              * so it skips doing the HOP if the result can't possibly end
 888              * up earlier than the old value of end_point.
 889              */
 890             if ((char*)end_point - anchor > prog->check_offset_max) {
 891                 end_point = HOP3lim((U8*)anchor,
 892                                 prog->check_offset_max,
 893                                 end_point -len)
 894                             + len;
 895             }
 896         }
 897
 898         check_at = fbm_instr( start_point, end_point,
 899                       check, multiline ? FBMrf_MULTILINE : 0);
 900
 901         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 902             "  doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
 903             (IV)((char*)start_point - strbeg),
 904             (IV)((char*)end_point   - strbeg),
 905             (IV)(check_at ? check_at - strbeg : -1)
 906         ));
 907
 908         /* Update the count-of-usability, remove useless subpatterns,
 909             unshift s.  */
 910
 911         DEBUG_EXECUTE_r({
 912             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 913                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 914             PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s%s",
 915                               (check_at ? "Found" : "Did not find"),
 916                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 917                     ? "anchored" : "floating"),
 918                 quoted,
 919                 RE_SV_TAIL(check),
 920                 (check_at ? " at offset " : "...\n") );
 921         });
 922
 923         if (!check_at)
 924             goto fail_finish;
 925         /* set rx_origin to the minimum position where the regex could start
 926          * matching, given the constraint of the just-matched check substring.
 927          * But don't set it lower than previously.
 928          */
 929
 930         if (check_at - rx_origin > prog->check_offset_max)
 931             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 932         /* Finish the diagnostic message */
 933         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 934             "%ld (rx_origin now %"IVdf")...\n",
 935             (long)(check_at - strbeg),
 936             (IV)(rx_origin - strbeg)
 937         ));
 938     }
 939
 940
 941     /* now look for the 'other' substring if defined */
 942
 943     if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 944                     : prog->substrs->data[other_ix].substr)
 945     {
 946         /* Take into account the "other" substring. */
 947         char *last, *last1;
 948         char *s;
 949         SV* must;
 950         struct reg_substr_datum *other;
 951
 952       do_other_substr:
 953         other = &prog->substrs->data[other_ix];
 954
 955         /* if "other" is anchored:
 956          * we've previously found a floating substr starting at check_at.
 957          * This means that the regex origin must lie somewhere
 958          * between min (rx_origin): HOP3(check_at, -check_offset_max)
 959          * and max:                 HOP3(check_at, -check_offset_min)
 960          * (except that min will be >= strpos)
 961          * So the fixed  substr must lie somewhere between
 962          *  HOP3(min, anchored_offset)
 963          *  HOP3(max, anchored_offset) + SvCUR(substr)
 964          */
 965
 966         /* if "other" is floating
 967          * Calculate last1, the absolute latest point where the
 968          * floating substr could start in the string, ignoring any
 969          * constraints from the earlier fixed match. It is calculated
 970          * as follows:
 971          *
 972          * strend - prog->minlen (in chars) is the absolute latest
 973          * position within the string where the origin of the regex
 974          * could appear. The latest start point for the floating
 975          * substr is float_min_offset(*) on from the start of the
 976          * regex.  last1 simply combines thee two offsets.
 977          *
 978          * (*) You might think the latest start point should be
 979          * float_max_offset from the regex origin, and technically
 980          * you'd be correct. However, consider
 981          *    /a\d{2,4}bcd\w/
 982          * Here, float min, max are 3,5 and minlen is 7.
 983          * This can match either
 984          *    /a\d\dbcd\w/
 985          *    /a\d\d\dbcd\w/
 986          *    /a\d\d\d\dbcd\w/
 987          * In the first case, the regex matches minlen chars; in the
 988          * second, minlen+1, in the third, minlen+2.
 989          * In the first case, the floating offset is 3 (which equals
 990          * float_min), in the second, 4, and in the third, 5 (which
 991          * equals float_max). In all cases, the floating string bcd
 992          * can never start more than 4 chars from the end of the
 993          * string, which equals minlen - float_min. As the substring
 994          * starts to match more than float_min from the start of the
 995          * regex, it makes the regex match more than minlen chars,
 996          * and the two cancel each other out. So we can always use
 997          * float_min - minlen, rather than float_max - minlen for the
 998          * latest position in the string.
 999          *
1000          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1001          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1002          */
1003
1004         assert(prog->minlen >= other->min_offset);
1005         last1 = HOP3c(strend,
1006                         other->min_offset - prog->minlen, strbeg);
1007
1008         if (other_ix) {/* i.e. if (other-is-float) */
1009             /* last is the latest point where the floating substr could
1010              * start, *given* any constraints from the earlier fixed
1011              * match. This constraint is that the floating string starts
1012              * <= float_max_offset chars from the regex origin (rx_origin).
1013              * If this value is less than last1, use it instead.
1014              */
1015             assert(rx_origin <= last1);
1016             last =
1017                 /* this condition handles the offset==infinity case, and
1018                  * is a short-cut otherwise. Although it's comparing a
1019                  * byte offset to a char length, it does so in a safe way,
1020                  * since 1 char always occupies 1 or more bytes,
1021                  * so if a string range is  (last1 - rx_origin) bytes,
1022                  * it will be less than or equal to  (last1 - rx_origin)
1023                  * chars; meaning it errs towards doing the accurate HOP3
1024                  * rather than just using last1 as a short-cut */
1025                 (last1 - rx_origin) < other->max_offset
1026                     ? last1
1027                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1028         }
1029         else {
1030             assert(strpos + start_shift <= check_at);
1031             last = HOP4c(check_at, other->min_offset - start_shift,
1032                         strbeg, strend);
1033         }
1034
1035         s = HOP3c(rx_origin, other->min_offset, strend);
1036         if (s < other_last)     /* These positions already checked */
1037             s = other_last;
1038
1039         must = utf8_target ? other->utf8_substr : other->substr;
1040         assert(SvPOK(must));
1041         {
1042             char *from = s;
1043             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1044
1045             if (from > to) {
1046                 s = NULL;
1047                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1048                     "  skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
1049                     (IV)(from - strbeg),
1050                     (IV)(to   - strbeg)
1051                 ));
1052             }
1053             else {
1054                 s = fbm_instr(
1055                     (unsigned char*)from,
1056                     (unsigned char*)to,
1057                     must,
1058                     multiline ? FBMrf_MULTILINE : 0
1059                 );
1060                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1061                     "  doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
1062                     (IV)(from - strbeg),
1063                     (IV)(to   - strbeg),
1064                     (IV)(s ? s - strbeg : -1)
1065                 ));
1066             }
1067         }
1068
1069         DEBUG_EXECUTE_r({
1070             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1071                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1072             PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s",
1073                 s ? "Found" : "Contradicts",
1074                 other_ix ? "floating" : "anchored",
1075                 quoted, RE_SV_TAIL(must));
1076         });
1077
1078
1079         if (!s) {
1080             /* last1 is latest possible substr location. If we didn't
1081              * find it before there, we never will */
1082             if (last >= last1) {
1083                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1084                                         "; giving up...\n"));
1085                 goto fail_finish;
1086             }
1087
1088             /* try to find the check substr again at a later
1089              * position. Maybe next time we'll find the "other" substr
1090              * in range too */
1091             other_last = HOP3c(last, 1, strend) /* highest failure */;
1092             rx_origin =
1093                 other_ix /* i.e. if other-is-float */
1094                     ? HOP3c(rx_origin, 1, strend)
1095                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1096             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1097                 "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
1098                 (other_ix ? "floating" : "anchored"),
1099                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1100                 (IV)(rx_origin - strbeg)
1101             ));
1102             goto restart;
1103         }
1104         else {
1105             if (other_ix) { /* if (other-is-float) */
1106                 /* other_last is set to s, not s+1, since its possible for
1107                  * a floating substr to fail first time, then succeed
1108                  * second time at the same floating position; e.g.:
1109                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1110                  * The first time round, anchored and float match at
1111                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1112                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1113                  */
1114                 other_last = s;
1115             }
1116             else {
1117                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1118                 other_last = HOP3c(s, 1, strend);
1119             }
1120             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1121                 " at offset %ld (rx_origin now %"IVdf")...\n",
1122                   (long)(s - strbeg),
1123                 (IV)(rx_origin - strbeg)
1124               ));
1125
1126         }
1127     }
1128     else {
1129         DEBUG_OPTIMISE_MORE_r(
1130             PerlIO_printf(Perl_debug_log,
1131                 "  Check-only match: offset min:%"IVdf" max:%"IVdf
1132                 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1133                 " strend:%"IVdf"\n",
1134                 (IV)prog->check_offset_min,
1135                 (IV)prog->check_offset_max,
1136                 (IV)(check_at-strbeg),
1137                 (IV)(rx_origin-strbeg),
1138                 (IV)(rx_origin-check_at),
1139                 (IV)(strend-strbeg)
1140             )
1141         );
1142     }
1143
1144   postprocess_substr_matches:
1145
1146     /* handle the extra constraint of /^.../m if present */
1147
1148     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1149         char *s;
1150
1151         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1152                         "  looking for /^/m anchor"));
1153
1154         /* we have failed the constraint of a \n before rx_origin.
1155          * Find the next \n, if any, even if it's beyond the current
1156          * anchored and/or floating substrings. Whether we should be
1157          * scanning ahead for the next \n or the next substr is debatable.
1158          * On the one hand you'd expect rare substrings to appear less
1159          * often than \n's. On the other hand, searching for \n means
1160          * we're effectively flipping between check_substr and "\n" on each
1161          * iteration as the current "rarest" string candidate, which
1162          * means for example that we'll quickly reject the whole string if
1163          * hasn't got a \n, rather than trying every substr position
1164          * first
1165          */
1166
1167         s = HOP3c(strend, - prog->minlen, strpos);
1168         if (s <= rx_origin ||
1169             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1170         {
1171             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1172                             "  Did not find /%s^%s/m...\n",
1173                             PL_colors[0], PL_colors[1]));
1174             goto fail_finish;
1175         }
1176
1177         /* earliest possible origin is 1 char after the \n.
1178          * (since *rx_origin == '\n', it's safe to ++ here rather than
1179          * HOP(rx_origin, 1)) */
1180         rx_origin++;
1181
1182         if (prog->substrs->check_ix == 0  /* check is anchored */
1183             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1184         {
1185             /* Position contradicts check-string; either because
1186              * check was anchored (and thus has no wiggle room),
1187              * or check was float and rx_origin is above the float range */
1188             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1189                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1190                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1191             goto restart;
1192         }
1193
1194         /* if we get here, the check substr must have been float,
1195          * is in range, and we may or may not have had an anchored
1196          * "other" substr which still contradicts */
1197         assert(prog->substrs->check_ix); /* check is float */
1198
1199         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1200             /* whoops, the anchored "other" substr exists, so we still
1201              * contradict. On the other hand, the float "check" substr
1202              * didn't contradict, so just retry the anchored "other"
1203              * substr */
1204             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1205                 "  Found /%s^%s/m, rescanning for anchored from offset %ld (rx_origin now %"IVdf")...\n",
1206                 PL_colors[0], PL_colors[1],
1207                 (long)(rx_origin - strbeg + prog->anchored_offset),
1208                 (long)(rx_origin - strbeg)
1209             ));
1210             goto do_other_substr;
1211         }
1212
1213         /* success: we don't contradict the found floating substring
1214          * (and there's no anchored substr). */
1215         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1216             "  Found /%s^%s/m with rx_origin %ld...\n",
1217             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1218     }
1219     else {
1220         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1221             "  (multiline anchor test skipped)\n"));
1222     }
1223
1224   success_at_start:
1225
1226
1227     /* if we have a starting character class, then test that extra constraint.
1228      * (trie stclasses are too expensive to use here, we are better off to
1229      * leave it to regmatch itself) */
1230
1231     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1232         const U8* const str = (U8*)STRING(progi->regstclass);
1233
1234         /* XXX this value could be pre-computed */
1235         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1236                     ?  (reginfo->is_utf8_pat
1237                         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1238                         : STR_LEN(progi->regstclass))
1239                     : 1);
1240         char * endpos;
1241         char *s;
1242         /* latest pos that a matching float substr constrains rx start to */
1243         char *rx_max_float = NULL;
1244
1245         /* if the current rx_origin is anchored, either by satisfying an
1246          * anchored substring constraint, or a /^.../m constraint, then we
1247          * can reject the current origin if the start class isn't found
1248          * at the current position. If we have a float-only match, then
1249          * rx_origin is constrained to a range; so look for the start class
1250          * in that range. if neither, then look for the start class in the
1251          * whole rest of the string */
1252
1253         /* XXX DAPM it's not clear what the minlen test is for, and why
1254          * it's not used in the floating case. Nothing in the test suite
1255          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1256          * Here are some old comments, which may or may not be correct:
1257          *
1258          *   minlen == 0 is possible if regstclass is \b or \B,
1259          *   and the fixed substr is ''$.
1260          *   Since minlen is already taken into account, rx_origin+1 is
1261          *   before strend; accidentally, minlen >= 1 guaranties no false
1262          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1263          *   0) below assumes that regstclass does not come from lookahead...
1264          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1265          *   This leaves EXACTF-ish only, which are dealt with in
1266          *   find_byclass().
1267          */
1268
1269         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1270             endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1271         else if (prog->float_substr || prog->float_utf8) {
1272             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1273             endpos= HOP3c(rx_max_float, cl_l, strend);
1274         }
1275         else
1276             endpos= strend;
1277
1278         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1279             "  looking for class: start_shift: %"IVdf" check_at: %"IVdf
1280             " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1281               (IV)start_shift, (IV)(check_at - strbeg),
1282               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1283
1284         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1285                             reginfo);
1286         if (!s) {
1287             if (endpos == strend) {
1288                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1289                                 "  Could not match STCLASS...\n") );
1290                 goto fail;
1291             }
1292             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1293                                "  This position contradicts STCLASS...\n") );
1294             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1295                         && !(prog->intflags & PREGf_IMPLICIT))
1296                 goto fail;
1297
1298             /* Contradict one of substrings */
1299             if (prog->anchored_substr || prog->anchored_utf8) {
1300                 if (prog->substrs->check_ix == 1) { /* check is float */
1301                     /* Have both, check_string is floating */
1302                     assert(rx_origin + start_shift <= check_at);
1303                     if (rx_origin + start_shift != check_at) {
1304                         /* not at latest position float substr could match:
1305                          * Recheck anchored substring, but not floating.
1306                          * The condition above is in bytes rather than
1307                          * chars for efficiency. It's conservative, in
1308                          * that it errs on the side of doing 'goto
1309                          * do_other_substr'. In this case, at worst,
1310                          * an extra anchored search may get done, but in
1311                          * practice the extra fbm_instr() is likely to
1312                          * get skipped anyway. */
1313                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1314                             "  about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
1315                             (long)(other_last - strbeg),
1316                             (IV)(rx_origin - strbeg)
1317                         ));
1318                         goto do_other_substr;
1319                     }
1320                 }
1321             }
1322             else {
1323                 /* float-only */
1324
1325                 if (ml_anch) {
1326                     /* In the presence of ml_anch, we might be able to
1327                      * find another \n without breaking the current float
1328                      * constraint. */
1329
1330                     /* strictly speaking this should be HOP3c(..., 1, ...),
1331                      * but since we goto a block of code that's going to
1332                      * search for the next \n if any, its safe here */
1333                     rx_origin++;
1334                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1335                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1336                               PL_colors[0], PL_colors[1],
1337                               (long)(rx_origin - strbeg)) );
1338                     goto postprocess_substr_matches;
1339                 }
1340
1341                 /* strictly speaking this can never be true; but might
1342                  * be if we ever allow intuit without substrings */
1343                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1344                     goto fail;
1345
1346                 rx_origin = rx_max_float;
1347             }
1348
1349             /* at this point, any matching substrings have been
1350              * contradicted. Start again... */
1351
1352             rx_origin = HOP3c(rx_origin, 1, strend);
1353
1354             /* uses bytes rather than char calculations for efficiency.
1355              * It's conservative: it errs on the side of doing 'goto restart',
1356              * where there is code that does a proper char-based test */
1357             if (rx_origin + start_shift + end_shift > strend) {
1358                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1359                                        "  Could not match STCLASS...\n") );
1360                 goto fail;
1361             }
1362             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1363                 "  about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
1364                 (prog->substrs->check_ix ? "floating" : "anchored"),
1365                 (long)(rx_origin + start_shift - strbeg),
1366                 (IV)(rx_origin - strbeg)
1367             ));
1368             goto restart;
1369         }
1370
1371         /* Success !!! */
1372
1373         if (rx_origin != s) {
1374             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1375                         "  By STCLASS: moving %ld --> %ld\n",
1376                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1377                    );
1378         }
1379         else {
1380             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1381                                   "  Does not contradict STCLASS...\n");
1382                    );
1383         }
1384     }
1385
1386     /* Decide whether using the substrings helped */
1387
1388     if (rx_origin != strpos) {
1389         /* Fixed substring is found far enough so that the match
1390            cannot start at strpos. */
1391
1392         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  try at offset...\n"));
1393         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1394     }
1395     else {
1396         /* The found rx_origin position does not prohibit matching at
1397          * strpos, so calling intuit didn't gain us anything. Decrement
1398          * the BmUSEFUL() count on the check substring, and if we reach
1399          * zero, free it.  */
1400         if (!(prog->intflags & PREGf_NAUGHTY)
1401             && (utf8_target ? (
1402                 prog->check_utf8                /* Could be deleted already */
1403                 && --BmUSEFUL(prog->check_utf8) < 0
1404                 && (prog->check_utf8 == prog->float_utf8)
1405             ) : (
1406                 prog->check_substr              /* Could be deleted already */
1407                 && --BmUSEFUL(prog->check_substr) < 0
1408                 && (prog->check_substr == prog->float_substr)
1409             )))
1410         {
1411             /* If flags & SOMETHING - do not do it many times on the same match */
1412             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  ... Disabling check substring...\n"));
1413             /* XXX Does the destruction order has to change with utf8_target? */
1414             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1415             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1416             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1417             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1418             check = NULL;                       /* abort */
1419             /* XXXX This is a remnant of the old implementation.  It
1420                     looks wasteful, since now INTUIT can use many
1421                     other heuristics. */
1422             prog->extflags &= ~RXf_USE_INTUIT;
1423         }
1424     }
1425
1426     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1427             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1428              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1429
1430     return rx_origin;
1431
1432   fail_finish:                          /* Substring not found */
1433     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1434         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1435   fail:
1436     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1437                           PL_colors[4], PL_colors[5]));
1438     return NULL;
1439 }
1440
1441
1442 #define DECL_TRIE_TYPE(scan) \
1443     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1444                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1445                  trie_utf8l, trie_flu8 }                                            \
1446                     trie_type = ((scan->flags == EXACT)                             \
1447                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1448                                  : (scan->flags == EXACTL)                          \
1449                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1450                                     : (scan->flags == EXACTFA)                      \
1451                                       ? (utf8_target                                \
1452                                          ? trie_utf8_exactfa_fold                   \
1453                                          : trie_latin_utf8_exactfa_fold)            \
1454                                       : (scan->flags == EXACTFLU8                   \
1455                                          ? trie_flu8                                \
1456                                          : (utf8_target                             \
1457                                            ? trie_utf8_fold                         \
1458                                            :   trie_latin_utf8_fold)))
1459
1460 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1461 STMT_START {                                                                        \
1462     STRLEN skiplen;                                                                 \
1463     U8 flags = FOLD_FLAGS_FULL;                                                     \
1464     switch (trie_type) {                                                            \
1465     case trie_flu8:                                                                 \
1466         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1467         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1468             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1469         }                                                                           \
1470         goto do_trie_utf8_fold;                                                     \
1471     case trie_utf8_exactfa_fold:                                                    \
1472         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1473         /* FALLTHROUGH */                                                           \
1474     case trie_utf8_fold:                                                            \
1475       do_trie_utf8_fold:                                                            \
1476         if ( foldlen>0 ) {                                                          \
1477             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1478             foldlen -= len;                                                         \
1479             uscan += len;                                                           \
1480             len=0;                                                                  \
1481         } else {                                                                    \
1482             uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
1483             len = UTF8SKIP(uc);                                                     \
1484             skiplen = UNISKIP( uvc );                                               \
1485             foldlen -= skiplen;                                                     \
1486             uscan = foldbuf + skiplen;                                              \
1487         }                                                                           \
1488         break;                                                                      \
1489     case trie_latin_utf8_exactfa_fold:                                              \
1490         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1491         /* FALLTHROUGH */                                                           \
1492     case trie_latin_utf8_fold:                                                      \
1493         if ( foldlen>0 ) {                                                          \
1494             uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1495             foldlen -= len;                                                         \
1496             uscan += len;                                                           \
1497             len=0;                                                                  \
1498         } else {                                                                    \
1499             len = 1;                                                                \
1500             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1501             skiplen = UNISKIP( uvc );                                               \
1502             foldlen -= skiplen;                                                     \
1503             uscan = foldbuf + skiplen;                                              \
1504         }                                                                           \
1505         break;                                                                      \
1506     case trie_utf8l:                                                                \
1507         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1508         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1509             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1510         }                                                                           \
1511         /* FALLTHROUGH */                                                           \
1512     case trie_utf8:                                                                 \
1513         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1514         break;                                                                      \
1515     case trie_plain:                                                                \
1516         uvc = (UV)*uc;                                                              \
1517         len = 1;                                                                    \
1518     }                                                                               \
1519     if (uvc < 256) {                                                                \
1520         charid = trie->charmap[ uvc ];                                              \
1521     }                                                                               \
1522     else {                                                                          \
1523         charid = 0;                                                                 \
1524         if (widecharmap) {                                                          \
1525             SV** const svpp = hv_fetch(widecharmap,                                 \
1526                         (char*)&uvc, sizeof(UV), 0);                                \
1527             if (svpp)                                                               \
1528                 charid = (U16)SvIV(*svpp);                                          \
1529         }                                                                           \
1530     }                                                                               \
1531 } STMT_END
1532
1533 #define DUMP_EXEC_POS(li,s,doutf8)                          \
1534     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1535                 startpos, doutf8)
1536
1537 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1538 STMT_START {                                              \
1539     while (s <= e) {                                      \
1540         if ( (COND)                                       \
1541              && (ln == 1 || folder(s, pat_string, ln))    \
1542              && (reginfo->intuit || regtry(reginfo, &s)) )\
1543             goto got_it;                                  \
1544         s++;                                              \
1545     }                                                     \
1546 } STMT_END
1547
1548 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1549 STMT_START {                                          \
1550     while (s < strend) {                              \
1551         CODE                                          \
1552         s += UTF8SKIP(s);                             \
1553     }                                                 \
1554 } STMT_END
1555
1556 #define REXEC_FBC_SCAN(CODE)                          \
1557 STMT_START {                                          \
1558     while (s < strend) {                              \
1559         CODE                                          \
1560         s++;                                          \
1561     }                                                 \
1562 } STMT_END
1563
1564 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1565 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1566     if (COND) {                                                \
1567         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1568             goto got_it;                                       \
1569         else                                                   \
1570             tmp = doevery;                                     \
1571     }                                                          \
1572     else                                                       \
1573         tmp = 1;                                               \
1574 )
1575
1576 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1577 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1578     if (COND) {                                                \
1579         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1580             goto got_it;                                       \
1581         else                                                   \
1582             tmp = doevery;                                     \
1583     }                                                          \
1584     else                                                       \
1585         tmp = 1;                                               \
1586 )
1587
1588 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1589     if (utf8_target) {                                         \
1590         REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1591     }                                                          \
1592     else {                                                     \
1593         REXEC_FBC_CLASS_SCAN(COND);                            \
1594     }
1595
1596 /* The three macros below are slightly different versions of the same logic.
1597  *
1598  * The first is for /a and /aa when the target string is UTF-8.  This can only
1599  * match ascii, but it must advance based on UTF-8.   The other two handle the
1600  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1601  * for the boundary (or non-boundary) between a word and non-word character.
1602  * The utf8 and non-utf8 cases have the same logic, but the details must be
1603  * different.  Find the "wordness" of the character just prior to this one, and
1604  * compare it with the wordness of this one.  If they differ, we have a
1605  * boundary.  At the beginning of the string, pretend that the previous
1606  * character was a new-line.
1607  *
1608  * All these macros uncleanly have side-effects with each other and outside
1609  * variables.  So far it's been too much trouble to clean-up
1610  *
1611  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1612  *               a word character or not.
1613  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1614  *               word/non-word
1615  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1616  *
1617  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1618  * are looking for a boundary or for a non-boundary.  If we are looking for a
1619  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1620  * see if this tentative match actually works, and if so, to quit the loop
1621  * here.  And vice-versa if we are looking for a non-boundary.
1622  *
1623  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1624  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1625  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1626  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1627  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1628  * complement.  But in that branch we complement tmp, meaning that at the
1629  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1630  * which means at the top of the loop in the next iteration, it is
1631  * TEST_NON_UTF8(s-1) */
1632 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1633     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1634     tmp = TEST_NON_UTF8(tmp);                                                  \
1635     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1636         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1637             tmp = !tmp;                                                        \
1638             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1639         }                                                                      \
1640         else {                                                                 \
1641             IF_FAIL;                                                           \
1642         }                                                                      \
1643     );                                                                         \
1644
1645 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1646  * TEST_UTF8 is a macro that for the same input code points returns identically
1647  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1648 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1649     if (s == reginfo->strbeg) {                                                \
1650         tmp = '\n';                                                            \
1651     }                                                                          \
1652     else { /* Back-up to the start of the previous character */                \
1653         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1654         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1655                                                        0, UTF8_ALLOW_DEFAULT); \
1656     }                                                                          \
1657     tmp = TEST_UV(tmp);                                                        \
1658     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1659     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1660         if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
1661             tmp = !tmp;                                                        \
1662             IF_SUCCESS;                                                        \
1663         }                                                                      \
1664         else {                                                                 \
1665             IF_FAIL;                                                           \
1666         }                                                                      \
1667     );
1668
1669 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1670  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1671  * macros below */
1672 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1673     if (utf8_target) {                                                         \
1674         UTF8_CODE                                                              \
1675     }                                                                          \
1676     else {  /* Not utf8 */                                                     \
1677         tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1678         tmp = TEST_NON_UTF8(tmp);                                              \
1679         REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1680             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1681                 IF_SUCCESS;                                                    \
1682                 tmp = !tmp;                                                    \
1683             }                                                                  \
1684             else {                                                             \
1685                 IF_FAIL;                                                       \
1686             }                                                                  \
1687         );                                                                     \
1688     }                                                                          \
1689     /* Here, things have been set up by the previous code so that tmp is the   \
1690      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1691      * utf8ness of the target).  We also have to check if this matches against \
1692      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1693      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1694      * string */                                                               \
1695     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1696         IF_SUCCESS;                                                            \
1697     }                                                                          \
1698     else {                                                                     \
1699         IF_FAIL;                                                               \
1700     }
1701
1702 /* This is the macro to use when we want to see if something that looks like it
1703  * could match, actually does, and if so exits the loop */
1704 #define REXEC_FBC_TRYIT                            \
1705     if ((reginfo->intuit || regtry(reginfo, &s)))  \
1706         goto got_it
1707
1708 /* The only difference between the BOUND and NBOUND cases is that
1709  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1710  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1711  * with the other one being empty (PLACEHOLDER is defined as empty).
1712  *
1713  * The TEST_FOO parameters are for operating on different forms of input, but
1714  * all should be ones that return identically for the same underlying code
1715  * points */
1716 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1717     FBC_BOUND_COMMON(                                                          \
1718           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1719           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1720
1721 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
1722     FBC_BOUND_COMMON(                                                          \
1723             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1724             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1725
1726 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1727     FBC_BOUND_COMMON(                                                          \
1728           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1729           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1730
1731 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
1732     FBC_BOUND_COMMON(                                                          \
1733             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1734             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1735
1736 /* Takes a pointer to an inversion list, a pointer to its corresponding
1737  * inversion map, and a code point, and returns the code point's value
1738  * according to the two arrays.  It assumes that all code points have a value.
1739  * This is used as the base macro for macros for particular properties */
1740 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
1741                              invmap[_invlist_search(invlist, cp)]
1742
1743 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1744  * of a code point, returning the value for the first code point in the string.
1745  * And it takes the particular macro name that finds the desired value given a
1746  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
1747 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
1748              (__ASSERT_(pos < strend)                                          \
1749                  /* Note assumes is valid UTF-8 */                             \
1750              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1751
1752 /* Returns the GCB value for the input code point */
1753 #define getGCB_VAL_CP(cp)                                                      \
1754           _generic_GET_BREAK_VAL_CP(                                           \
1755                                     PL_GCB_invlist,                            \
1756                                     Grapheme_Cluster_Break_invmap,             \
1757                                     (cp))
1758
1759 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1760  * bounded by pos and strend */
1761 #define getGCB_VAL_UTF8(pos, strend)                                           \
1762     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1763
1764
1765 /* Returns the SB value for the input code point */
1766 #define getSB_VAL_CP(cp)                                                       \
1767           _generic_GET_BREAK_VAL_CP(                                           \
1768                                     PL_SB_invlist,                             \
1769                                     Sentence_Break_invmap,                     \
1770                                     (cp))
1771
1772 /* Returns the SB value for the first code point in the UTF-8 encoded string
1773  * bounded by pos and strend */
1774 #define getSB_VAL_UTF8(pos, strend)                                            \
1775     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1776
1777 /* Returns the WB value for the input code point */
1778 #define getWB_VAL_CP(cp)                                                       \
1779           _generic_GET_BREAK_VAL_CP(                                           \
1780                                     PL_WB_invlist,                             \
1781                                     Word_Break_invmap,                         \
1782                                     (cp))
1783
1784 /* Returns the WB value for the first code point in the UTF-8 encoded string
1785  * bounded by pos and strend */
1786 #define getWB_VAL_UTF8(pos, strend)                                            \
1787     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1788
1789 /* We know what class REx starts with.  Try to find this position... */
1790 /* if reginfo->intuit, its a dryrun */
1791 /* annoyingly all the vars in this routine have different names from their counterparts
1792    in regmatch. /grrr */
1793 STATIC char *
1794 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1795     const char *strend, regmatch_info *reginfo)
1796 {
1797     dVAR;
1798     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1799     char *pat_string;   /* The pattern's exactish string */
1800     char *pat_end;          /* ptr to end char of pat_string */
1801     re_fold_t folder;   /* Function for computing non-utf8 folds */
1802     const U8 *fold_array;   /* array for folding ords < 256 */
1803     STRLEN ln;
1804     STRLEN lnc;
1805     U8 c1;
1806     U8 c2;
1807     char *e;
1808     I32 tmp = 1;        /* Scratch variable? */
1809     const bool utf8_target = reginfo->is_utf8_target;
1810     UV utf8_fold_flags = 0;
1811     const bool is_utf8_pat = reginfo->is_utf8_pat;
1812     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1813                                    with a result inverts that result, as 0^1 =
1814                                    1 and 1^1 = 0 */
1815     _char_class_number classnum;
1816
1817     RXi_GET_DECL(prog,progi);
1818
1819     PERL_ARGS_ASSERT_FIND_BYCLASS;
1820
1821     /* We know what class it must start with. */
1822     switch (OP(c)) {
1823     case ANYOFL:
1824         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1825         /* FALLTHROUGH */
1826     case ANYOF:
1827         if (utf8_target) {
1828             REXEC_FBC_UTF8_CLASS_SCAN(
1829                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1830         }
1831         else {
1832             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1833         }
1834         break;
1835
1836     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1837         assert(! is_utf8_pat);
1838         /* FALLTHROUGH */
1839     case EXACTFA:
1840         if (is_utf8_pat || utf8_target) {
1841             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1842             goto do_exactf_utf8;
1843         }
1844         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1845         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1846         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1847
1848     case EXACTF:   /* This node only generated for non-utf8 patterns */
1849         assert(! is_utf8_pat);
1850         if (utf8_target) {
1851             utf8_fold_flags = 0;
1852             goto do_exactf_utf8;
1853         }
1854         fold_array = PL_fold;
1855         folder = foldEQ;
1856         goto do_exactf_non_utf8;
1857
1858     case EXACTFL:
1859         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1860         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1861             utf8_fold_flags = FOLDEQ_LOCALE;
1862             goto do_exactf_utf8;
1863         }
1864         fold_array = PL_fold_locale;
1865         folder = foldEQ_locale;
1866         goto do_exactf_non_utf8;
1867
1868     case EXACTFU_SS:
1869         if (is_utf8_pat) {
1870             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1871         }
1872         goto do_exactf_utf8;
1873
1874     case EXACTFLU8:
1875             if (! utf8_target) {    /* All code points in this node require
1876                                        UTF-8 to express.  */
1877                 break;
1878             }
1879             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1880                                              | FOLDEQ_S2_FOLDS_SANE;
1881             goto do_exactf_utf8;
1882
1883     case EXACTFU:
1884         if (is_utf8_pat || utf8_target) {
1885             utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1886             goto do_exactf_utf8;
1887         }
1888
1889         /* Any 'ss' in the pattern should have been replaced by regcomp,
1890          * so we don't have to worry here about this single special case
1891          * in the Latin1 range */
1892         fold_array = PL_fold_latin1;
1893         folder = foldEQ_latin1;
1894
1895         /* FALLTHROUGH */
1896
1897       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1898                            are no glitches with fold-length differences
1899                            between the target string and pattern */
1900
1901         /* The idea in the non-utf8 EXACTF* cases is to first find the
1902          * first character of the EXACTF* node and then, if necessary,
1903          * case-insensitively compare the full text of the node.  c1 is the
1904          * first character.  c2 is its fold.  This logic will not work for
1905          * Unicode semantics and the german sharp ss, which hence should
1906          * not be compiled into a node that gets here. */
1907         pat_string = STRING(c);
1908         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1909
1910         /* We know that we have to match at least 'ln' bytes (which is the
1911          * same as characters, since not utf8).  If we have to match 3
1912          * characters, and there are only 2 availabe, we know without
1913          * trying that it will fail; so don't start a match past the
1914          * required minimum number from the far end */
1915         e = HOP3c(strend, -((SSize_t)ln), s);
1916
1917         if (reginfo->intuit && e < s) {
1918             e = s;                      /* Due to minlen logic of intuit() */
1919         }
1920
1921         c1 = *pat_string;
1922         c2 = fold_array[c1];
1923         if (c1 == c2) { /* If char and fold are the same */
1924             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1925         }
1926         else {
1927             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1928         }
1929         break;
1930
1931       do_exactf_utf8:
1932       {
1933         unsigned expansion;
1934
1935         /* If one of the operands is in utf8, we can't use the simpler folding
1936          * above, due to the fact that many different characters can have the
1937          * same fold, or portion of a fold, or different- length fold */
1938         pat_string = STRING(c);
1939         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1940         pat_end = pat_string + ln;
1941         lnc = is_utf8_pat       /* length to match in characters */
1942                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1943                 : ln;
1944
1945         /* We have 'lnc' characters to match in the pattern, but because of
1946          * multi-character folding, each character in the target can match
1947          * up to 3 characters (Unicode guarantees it will never exceed
1948          * this) if it is utf8-encoded; and up to 2 if not (based on the
1949          * fact that the Latin 1 folds are already determined, and the
1950          * only multi-char fold in that range is the sharp-s folding to
1951          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1952          * string character.  Adjust lnc accordingly, rounding up, so that
1953          * if we need to match at least 4+1/3 chars, that really is 5. */
1954         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1955         lnc = (lnc + expansion - 1) / expansion;
1956
1957         /* As in the non-UTF8 case, if we have to match 3 characters, and
1958          * only 2 are left, it's guaranteed to fail, so don't start a
1959          * match that would require us to go beyond the end of the string
1960          */
1961         e = HOP3c(strend, -((SSize_t)lnc), s);
1962
1963         if (reginfo->intuit && e < s) {
1964             e = s;                      /* Due to minlen logic of intuit() */
1965         }
1966
1967         /* XXX Note that we could recalculate e to stop the loop earlier,
1968          * as the worst case expansion above will rarely be met, and as we
1969          * go along we would usually find that e moves further to the left.
1970          * This would happen only after we reached the point in the loop
1971          * where if there were no expansion we should fail.  Unclear if
1972          * worth the expense */
1973
1974         while (s <= e) {
1975             char *my_strend= (char *)strend;
1976             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1977                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1978                 && (reginfo->intuit || regtry(reginfo, &s)) )
1979             {
1980                 goto got_it;
1981             }
1982             s += (utf8_target) ? UTF8SKIP(s) : 1;
1983         }
1984         break;
1985     }
1986
1987     case BOUNDL:
1988         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1989         if (FLAGS(c) != TRADITIONAL_BOUND) {
1990             if (! IN_UTF8_CTYPE_LOCALE) {
1991                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
1992                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
1993             }
1994             goto do_boundu;
1995         }
1996
1997         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1998         break;
1999
2000     case NBOUNDL:
2001         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2002         if (FLAGS(c) != TRADITIONAL_BOUND) {
2003             if (! IN_UTF8_CTYPE_LOCALE) {
2004                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2005                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2006             }
2007             goto do_nboundu;
2008         }
2009
2010         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2011         break;
2012
2013     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2014                    meaning */
2015         assert(FLAGS(c) == TRADITIONAL_BOUND);
2016
2017         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2018         break;
2019
2020     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2021                    meaning */
2022         assert(FLAGS(c) == TRADITIONAL_BOUND);
2023
2024         FBC_BOUND_A(isWORDCHAR_A);
2025         break;
2026
2027     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2028                    meaning */
2029         assert(FLAGS(c) == TRADITIONAL_BOUND);
2030
2031         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2032         break;
2033
2034     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2035                    meaning */
2036         assert(FLAGS(c) == TRADITIONAL_BOUND);
2037
2038         FBC_NBOUND_A(isWORDCHAR_A);
2039         break;
2040
2041     case NBOUNDU:
2042         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2043             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2044             break;
2045         }
2046
2047       do_nboundu:
2048
2049         to_complement = 1;
2050         /* FALLTHROUGH */
2051
2052     case BOUNDU:
2053       do_boundu:
2054         switch((bound_type) FLAGS(c)) {
2055             case TRADITIONAL_BOUND:
2056                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2057                 break;
2058             case GCB_BOUND:
2059                 if (s == reginfo->strbeg) { /* GCB always matches at begin and
2060                                                end */
2061                     if (to_complement ^ cBOOL(reginfo->intuit
2062                                                       || regtry(reginfo, &s)))
2063                     {
2064                         goto got_it;
2065                     }
2066                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2067                 }
2068
2069                 if (utf8_target) {
2070                     GCB_enum before = getGCB_VAL_UTF8(
2071                                                reghop3((U8*)s, -1,
2072                                                        (U8*)(reginfo->strbeg)),
2073                                                (U8*) reginfo->strend);
2074                     while (s < strend) {
2075                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2076                                                         (U8*) reginfo->strend);
2077                         if (to_complement ^ isGCB(before, after)) {
2078                             if (reginfo->intuit || regtry(reginfo, &s)) {
2079                                 goto got_it;
2080                             }
2081                             before = after;
2082                         }
2083                         s += UTF8SKIP(s);
2084                     }
2085                 }
2086                 else {  /* Not utf8.  Everything is a GCB except between CR and
2087                            LF */
2088                     while (s < strend) {
2089                         if (to_complement ^ (UCHARAT(s - 1) != '\r'
2090                                              || UCHARAT(s) != '\n'))
2091                         {
2092                             if (reginfo->intuit || regtry(reginfo, &s)) {
2093                                 goto got_it;
2094                             }
2095                             s++;
2096                         }
2097                     }
2098                 }
2099
2100                 if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) {
2101                     goto got_it;
2102                 }
2103                 break;
2104
2105             case SB_BOUND:
2106                 if (s == reginfo->strbeg) { /* SB always matches at beginning */
2107                     if (to_complement
2108                                 ^ cBOOL(reginfo->intuit || regtry(reginfo, &s)))
2109                     {
2110                         goto got_it;
2111                     }
2112
2113                     /* Didn't match.  Go try at the next position */
2114                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2115                 }
2116
2117                 if (utf8_target) {
2118                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2119                                                         -1,
2120                                                         (U8*)(reginfo->strbeg)),
2121                                                       (U8*) reginfo->strend);
2122                     while (s < strend) {
2123                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2124                                                          (U8*) reginfo->strend);
2125                         if (to_complement ^ isSB(before,
2126                                                  after,
2127                                                  (U8*) reginfo->strbeg,
2128                                                  (U8*) s,
2129                                                  (U8*) reginfo->strend,
2130                                                  utf8_target))
2131                         {
2132                             if (reginfo->intuit || regtry(reginfo, &s)) {
2133                                 goto got_it;
2134                             }
2135                             before = after;
2136                         }
2137                         s += UTF8SKIP(s);
2138                     }
2139                 }
2140                 else {  /* Not utf8. */
2141                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2142                     while (s < strend) {
2143                         SB_enum after = getSB_VAL_CP((U8) *s);
2144                         if (to_complement ^ isSB(before,
2145                                                  after,
2146                                                  (U8*) reginfo->strbeg,
2147                                                  (U8*) s,
2148                                                  (U8*) reginfo->strend,
2149                                                  utf8_target))
2150                         {
2151                             if (reginfo->intuit || regtry(reginfo, &s)) {
2152                                 goto got_it;
2153                             }
2154                             before = after;
2155                         }
2156                         s++;
2157                     }
2158                 }
2159
2160                 /* Here are at the final position in the target string.  The SB
2161                  * value is always true here, so matches, depending on other
2162                  * constraints */
2163                 if (to_complement ^ cBOOL(reginfo->intuit
2164                                                       || regtry(reginfo, &s)))
2165                 {
2166                     goto got_it;
2167                 }
2168
2169                 break;
2170
2171             case WB_BOUND:
2172                 if (s == reginfo->strbeg) {
2173                     if (to_complement ^ cBOOL(reginfo->intuit
2174                                               || regtry(reginfo, &s)))
2175                     {
2176                         goto got_it;
2177                     }
2178                     s += (utf8_target) ? UTF8SKIP(s) : 1;
2179                 }
2180
2181                 if (utf8_target) {
2182                     /* We are at a boundary between char_sub_0 and char_sub_1.
2183                      * We also keep track of the value for char_sub_-1 as we
2184                      * loop through the line.   Context may be needed to make a
2185                      * determination, and if so, this can save having to
2186                      * recalculate it */
2187                     WB_enum previous = WB_UNKNOWN;
2188                     WB_enum before = getWB_VAL_UTF8(
2189                                               reghop3((U8*)s,
2190                                                       -1,
2191                                                       (U8*)(reginfo->strbeg)),
2192                                               (U8*) reginfo->strend);
2193                     while (s < strend) {
2194                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2195                                                         (U8*) reginfo->strend);
2196                         if (to_complement ^ isWB(previous,
2197                                                  before,
2198                                                  after,
2199                                                  (U8*) reginfo->strbeg,
2200                                                  (U8*) s,
2201                                                  (U8*) reginfo->strend,
2202                                                  utf8_target))
2203                         {
2204                             if (reginfo->intuit || regtry(reginfo, &s)) {
2205                                 goto got_it;
2206                             }
2207                             previous = before;
2208                             before = after;
2209                         }
2210                         s += UTF8SKIP(s);
2211                     }
2212                 }
2213                 else {  /* Not utf8. */
2214                     WB_enum previous = WB_UNKNOWN;
2215                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2216                     while (s < strend) {
2217                         WB_enum after = getWB_VAL_CP((U8) *s);
2218                         if (to_complement ^ isWB(previous,
2219                                                  before,
2220                                                  after,
2221                                                  (U8*) reginfo->strbeg,
2222                                                  (U8*) s,
2223                                                  (U8*) reginfo->strend,
2224                                                  utf8_target))
2225                         {
2226                             if (reginfo->intuit || regtry(reginfo, &s)) {
2227                                 goto got_it;
2228                             }
2229                             previous = before;
2230                             before = after;
2231                         }
2232                         s++;
2233                     }
2234                 }
2235
2236                 if (to_complement ^ cBOOL(reginfo->intuit
2237                                           || regtry(reginfo, &s)))
2238                 {
2239                     goto got_it;
2240                 }
2241
2242                 break;
2243         }
2244         break;
2245
2246     case LNBREAK:
2247         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2248                         is_LNBREAK_latin1_safe(s, strend)
2249         );
2250         break;
2251
2252     /* The argument to all the POSIX node types is the class number to pass to
2253      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2254
2255     case NPOSIXL:
2256         to_complement = 1;
2257         /* FALLTHROUGH */
2258
2259     case POSIXL:
2260         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2261         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2262                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2263         break;
2264
2265     case NPOSIXD:
2266         to_complement = 1;
2267         /* FALLTHROUGH */
2268
2269     case POSIXD:
2270         if (utf8_target) {
2271             goto posix_utf8;
2272         }
2273         goto posixa;
2274
2275     case NPOSIXA:
2276         if (utf8_target) {
2277             /* The complement of something that matches only ASCII matches all
2278              * non-ASCII, plus everything in ASCII that isn't in the class. */
2279             REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
2280                                       || ! _generic_isCC_A(*s, FLAGS(c)));
2281             break;
2282         }
2283
2284         to_complement = 1;
2285         /* FALLTHROUGH */
2286
2287     case POSIXA:
2288       posixa:
2289         /* Don't need to worry about utf8, as it can match only a single
2290          * byte invariant character. */
2291         REXEC_FBC_CLASS_SCAN(
2292                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2293         break;
2294
2295     case NPOSIXU:
2296         to_complement = 1;
2297         /* FALLTHROUGH */
2298
2299     case POSIXU:
2300         if (! utf8_target) {
2301             REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2302                                                                     FLAGS(c))));
2303         }
2304         else {
2305
2306           posix_utf8:
2307             classnum = (_char_class_number) FLAGS(c);
2308             if (classnum < _FIRST_NON_SWASH_CC) {
2309                 while (s < strend) {
2310
2311                     /* We avoid loading in the swash as long as possible, but
2312                      * should we have to, we jump to a separate loop.  This
2313                      * extra 'if' statement is what keeps this code from being
2314                      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2315                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
2316                         goto found_above_latin1;
2317                     }
2318                     if ((UTF8_IS_INVARIANT(*s)
2319                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2320                                                                 classnum)))
2321                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
2322                             && to_complement ^ cBOOL(
2323                                 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
2324                                                                       *(s + 1)),
2325                                               classnum))))
2326                     {
2327                         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2328                             goto got_it;
2329                         else {
2330                             tmp = doevery;
2331                         }
2332                     }
2333                     else {
2334                         tmp = 1;
2335                     }
2336                     s += UTF8SKIP(s);
2337                 }
2338             }
2339             else switch (classnum) {    /* These classes are implemented as
2340                                            macros */
2341                 case _CC_ENUM_SPACE:
2342                     REXEC_FBC_UTF8_CLASS_SCAN(
2343                                         to_complement ^ cBOOL(isSPACE_utf8(s)));
2344                     break;
2345
2346                 case _CC_ENUM_BLANK:
2347                     REXEC_FBC_UTF8_CLASS_SCAN(
2348                                         to_complement ^ cBOOL(isBLANK_utf8(s)));
2349                     break;
2350
2351                 case _CC_ENUM_XDIGIT:
2352                     REXEC_FBC_UTF8_CLASS_SCAN(
2353                                        to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2354                     break;
2355
2356                 case _CC_ENUM_VERTSPACE:
2357                     REXEC_FBC_UTF8_CLASS_SCAN(
2358                                        to_complement ^ cBOOL(isVERTWS_utf8(s)));
2359                     break;
2360
2361                 case _CC_ENUM_CNTRL:
2362                     REXEC_FBC_UTF8_CLASS_SCAN(
2363                                         to_complement ^ cBOOL(isCNTRL_utf8(s)));
2364                     break;
2365
2366                 default:
2367                     Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2368                     NOT_REACHED; /* NOTREACHED */
2369             }
2370         }
2371         break;
2372
2373       found_above_latin1:   /* Here we have to load a swash to get the result
2374                                for the current code point */
2375         if (! PL_utf8_swash_ptrs[classnum]) {
2376             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2377             PL_utf8_swash_ptrs[classnum] =
2378                     _core_swash_init("utf8",
2379                                      "",
2380                                      &PL_sv_undef, 1, 0,
2381                                      PL_XPosix_ptrs[classnum], &flags);
2382         }
2383
2384         /* This is a copy of the loop above for swash classes, though using the
2385          * FBC macro instead of being expanded out.  Since we've loaded the
2386          * swash, we don't have to check for that each time through the loop */
2387         REXEC_FBC_UTF8_CLASS_SCAN(
2388                 to_complement ^ cBOOL(_generic_utf8(
2389                                       classnum,
2390                                       s,
2391                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
2392                                                   (U8 *) s, TRUE))));
2393         break;
2394
2395     case AHOCORASICKC:
2396     case AHOCORASICK:
2397         {
2398             DECL_TRIE_TYPE(c);
2399             /* what trie are we using right now */
2400             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2401             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2402             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2403
2404             const char *last_start = strend - trie->minlen;
2405 #ifdef DEBUGGING
2406             const char *real_start = s;
2407 #endif
2408             STRLEN maxlen = trie->maxlen;
2409             SV *sv_points;
2410             U8 **points; /* map of where we were in the input string
2411                             when reading a given char. For ASCII this
2412                             is unnecessary overhead as the relationship
2413                             is always 1:1, but for Unicode, especially
2414                             case folded Unicode this is not true. */
2415             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2416             U8 *bitmap=NULL;
2417
2418
2419             GET_RE_DEBUG_FLAGS_DECL;
2420
2421             /* We can't just allocate points here. We need to wrap it in
2422              * an SV so it gets freed properly if there is a croak while
2423              * running the match */
2424             ENTER;
2425             SAVETMPS;
2426             sv_points=newSV(maxlen * sizeof(U8 *));
2427             SvCUR_set(sv_points,
2428                 maxlen * sizeof(U8 *));
2429             SvPOK_on(sv_points);
2430             sv_2mortal(sv_points);
2431             points=(U8**)SvPV_nolen(sv_points );
2432             if ( trie_type != trie_utf8_fold
2433                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2434             {
2435                 if (trie->bitmap)
2436                     bitmap=(U8*)trie->bitmap;
2437                 else
2438                     bitmap=(U8*)ANYOF_BITMAP(c);
2439             }
2440             /* this is the Aho-Corasick algorithm modified a touch
2441                to include special handling for long "unknown char" sequences.
2442                The basic idea being that we use AC as long as we are dealing
2443                with a possible matching char, when we encounter an unknown char
2444                (and we have not encountered an accepting state) we scan forward
2445                until we find a legal starting char.
2446                AC matching is basically that of trie matching, except that when
2447                we encounter a failing transition, we fall back to the current
2448                states "fail state", and try the current char again, a process
2449                we repeat until we reach the root state, state 1, or a legal
2450                transition. If we fail on the root state then we can either
2451                terminate if we have reached an accepting state previously, or
2452                restart the entire process from the beginning if we have not.
2453
2454              */
2455             while (s <= last_start) {
2456                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2457                 U8 *uc = (U8*)s;
2458                 U16 charid = 0;
2459                 U32 base = 1;
2460                 U32 state = 1;
2461                 UV uvc = 0;
2462                 STRLEN len = 0;
2463                 STRLEN foldlen = 0;
2464                 U8 *uscan = (U8*)NULL;
2465                 U8 *leftmost = NULL;
2466 #ifdef DEBUGGING
2467                 U32 accepted_word= 0;
2468 #endif
2469                 U32 pointpos = 0;
2470
2471                 while ( state && uc <= (U8*)strend ) {
2472                     int failed=0;
2473                     U32 word = aho->states[ state ].wordnum;
2474
2475                     if( state==1 ) {
2476                         if ( bitmap ) {
2477                             DEBUG_TRIE_EXECUTE_r(
2478                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2479                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2480                                         (char *)uc, utf8_target );
2481                                     PerlIO_printf( Perl_debug_log,
2482                                         " Scanning for legal start char...\n");
2483                                 }
2484                             );
2485                             if (utf8_target) {
2486                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2487                                     uc += UTF8SKIP(uc);
2488                                 }
2489                             } else {
2490                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2491                                     uc++;
2492                                 }
2493                             }
2494                             s= (char *)uc;
2495                         }
2496                         if (uc >(U8*)last_start) break;
2497                     }
2498
2499                     if ( word ) {
2500                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2501                         if (!leftmost || lpos < leftmost) {
2502                             DEBUG_r(accepted_word=word);
2503                             leftmost= lpos;
2504                         }
2505                         if (base==0) break;
2506
2507                     }
2508                     points[pointpos++ % maxlen]= uc;
2509                     if (foldlen || uc < (U8*)strend) {
2510                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2511                                          widecharmap, uc,
2512                                          uscan, len, uvc, charid, foldlen,
2513                                          foldbuf, uniflags);
2514                         DEBUG_TRIE_EXECUTE_r({
2515                             dump_exec_pos( (char *)uc, c, strend,
2516                                         real_start, s, utf8_target);
2517                             PerlIO_printf(Perl_debug_log,
2518                                 " Charid:%3u CP:%4"UVxf" ",
2519                                  charid, uvc);
2520                         });
2521                     }
2522                     else {
2523                         len = 0;
2524                         charid = 0;
2525                     }
2526
2527
2528                     do {
2529 #ifdef DEBUGGING
2530                         word = aho->states[ state ].wordnum;
2531 #endif
2532                         base = aho->states[ state ].trans.base;
2533
2534                         DEBUG_TRIE_EXECUTE_r({
2535                             if (failed)
2536                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2537                                     s,   utf8_target );
2538                             PerlIO_printf( Perl_debug_log,
2539                                 "%sState: %4"UVxf", word=%"UVxf,
2540                                 failed ? " Fail transition to " : "",
2541                                 (UV)state, (UV)word);
2542                         });
2543                         if ( base ) {
2544                             U32 tmp;
2545                             I32 offset;
2546                             if (charid &&
2547                                  ( ((offset = base + charid
2548                                     - 1 - trie->uniquecharcount)) >= 0)
2549                                  && ((U32)offset < trie->lasttrans)
2550                                  && trie->trans[offset].check == state
2551                                  && (tmp=trie->trans[offset].next))
2552                             {
2553                                 DEBUG_TRIE_EXECUTE_r(
2554                                     PerlIO_printf( Perl_debug_log," - legal\n"));
2555                                 state = tmp;
2556                                 break;
2557                             }
2558                             else {
2559                                 DEBUG_TRIE_EXECUTE_r(
2560                                     PerlIO_printf( Perl_debug_log," - fail\n"));
2561                                 failed = 1;
2562                                 state = aho->fail[state];
2563                             }
2564                         }
2565                         else {
2566                             /* we must be accepting here */
2567                             DEBUG_TRIE_EXECUTE_r(
2568                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
2569                             failed = 1;
2570                             break;
2571                         }
2572                     } while(state);
2573                     uc += len;
2574                     if (failed) {
2575                         if (leftmost)
2576                             break;
2577                         if (!state) state = 1;
2578                     }
2579                 }
2580                 if ( aho->states[ state ].wordnum ) {
2581                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2582                     if (!leftmost || lpos < leftmost) {
2583                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2584                         leftmost = lpos;
2585                     }
2586                 }
2587                 if (leftmost) {
2588                     s = (char*)leftmost;
2589                     DEBUG_TRIE_EXECUTE_r({
2590                         PerlIO_printf(
2591                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2592                             (UV)accepted_word, (IV)(s - real_start)
2593                         );
2594                     });
2595                     if (reginfo->intuit || regtry(reginfo, &s)) {
2596                         FREETMPS;
2597                         LEAVE;
2598                         goto got_it;
2599                     }
2600                     s = HOPc(s,1);
2601                     DEBUG_TRIE_EXECUTE_r({
2602                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2603                     });
2604                 } else {
2605                     DEBUG_TRIE_EXECUTE_r(
2606                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2607                     break;
2608                 }
2609             }
2610             FREETMPS;
2611             LEAVE;
2612         }
2613         break;
2614     default:
2615         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2616     }
2617     return 0;
2618   got_it:
2619     return s;
2620 }
2621
2622 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2623  * flags have same meanings as with regexec_flags() */
2624
2625 static void
2626 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2627                             char *strbeg,
2628                             char *strend,
2629                             SV *sv,
2630                             U32 flags,
2631                             bool utf8_target)
2632 {
2633     struct regexp *const prog = ReANY(rx);
2634
2635     if (flags & REXEC_COPY_STR) {
2636 #ifdef PERL_ANY_COW
2637         if (SvCANCOW(sv)) {
2638             if (DEBUG_C_TEST) {
2639                 PerlIO_printf(Perl_debug_log,
2640                               "Copy on write: regexp capture, type %d\n",
2641                               (int) SvTYPE(sv));
2642             }
2643             /* Create a new COW SV to share the match string and store
2644              * in saved_copy, unless the current COW SV in saved_copy
2645              * is valid and suitable for our purpose */
2646             if ((   prog->saved_copy
2647                  && SvIsCOW(prog->saved_copy)
2648                  && SvPOKp(prog->saved_copy)
2649                  && SvIsCOW(sv)
2650                  && SvPOKp(sv)
2651                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
2652             {
2653                 /* just reuse saved_copy SV */
2654                 if (RXp_MATCH_COPIED(prog)) {
2655                     Safefree(prog->subbeg);
2656                     RXp_MATCH_COPIED_off(prog);
2657                 }
2658             }
2659             else {
2660                 /* create new COW SV to share string */
2661                 RX_MATCH_COPY_FREE(rx);
2662                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2663             }
2664             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2665             assert (SvPOKp(prog->saved_copy));
2666             prog->sublen  = strend - strbeg;
2667             prog->suboffset = 0;
2668             prog->subcoffset = 0;
2669         } else
2670 #endif
2671         {
2672             SSize_t min = 0;
2673             SSize_t max = strend - strbeg;
2674             SSize_t sublen;
2675
2676             if (    (flags & REXEC_COPY_SKIP_POST)
2677                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2678                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2679             ) { /* don't copy $' part of string */
2680                 U32 n = 0;
2681                 max = -1;
2682                 /* calculate the right-most part of the string covered
2683                  * by a capture. Due to look-ahead, this may be to
2684                  * the right of $&, so we have to scan all captures */
2685                 while (n <= prog->lastparen) {
2686                     if (prog->offs[n].end > max)
2687                         max = prog->offs[n].end;
2688                     n++;
2689                 }
2690                 if (max == -1)
2691                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2692                             ? prog->offs[0].start
2693                             : 0;
2694                 assert(max >= 0 && max <= strend - strbeg);
2695             }
2696
2697             if (    (flags & REXEC_COPY_SKIP_PRE)
2698                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2699                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2700             ) { /* don't copy $` part of string */
2701                 U32 n = 0;
2702                 min = max;
2703                 /* calculate the left-most part of the string covered
2704                  * by a capture. Due to look-behind, this may be to
2705                  * the left of $&, so we have to scan all captures */
2706                 while (min && n <= prog->lastparen) {
2707                     if (   prog->offs[n].start != -1
2708                         && prog->offs[n].start < min)
2709                     {
2710                         min = prog->offs[n].start;
2711                     }
2712                     n++;
2713                 }
2714                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2715                     && min >  prog->offs[0].end
2716                 )
2717                     min = prog->offs[0].end;
2718
2719             }
2720
2721             assert(min >= 0 && min <= max && min <= strend - strbeg);
2722             sublen = max - min;
2723
2724             if (RX_MATCH_COPIED(rx)) {
2725                 if (sublen > prog->sublen)
2726                     prog->subbeg =
2727                             (char*)saferealloc(prog->subbeg, sublen+1);
2728             }
2729             else
2730                 prog->subbeg = (char*)safemalloc(sublen+1);
2731             Copy(strbeg + min, prog->subbeg, sublen, char);
2732             prog->subbeg[sublen] = '\0';
2733             prog->suboffset = min;
2734             prog->sublen = sublen;
2735             RX_MATCH_COPIED_on(rx);
2736         }
2737         prog->subcoffset = prog->suboffset;
2738         if (prog->suboffset && utf8_target) {
2739             /* Convert byte offset to chars.
2740              * XXX ideally should only compute this if @-/@+
2741              * has been seen, a la PL_sawampersand ??? */
2742
2743             /* If there's a direct correspondence between the
2744              * string which we're matching and the original SV,
2745              * then we can use the utf8 len cache associated with
2746              * the SV. In particular, it means that under //g,
2747              * sv_pos_b2u() will use the previously cached
2748              * position to speed up working out the new length of
2749              * subcoffset, rather than counting from the start of
2750              * the string each time. This stops
2751              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2752              * from going quadratic */
2753             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2754                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2755                                                 SV_GMAGIC|SV_CONST_RETURN);
2756             else
2757                 prog->subcoffset = utf8_length((U8*)strbeg,
2758                                     (U8*)(strbeg+prog->suboffset));
2759         }
2760     }
2761     else {
2762         RX_MATCH_COPY_FREE(rx);
2763         prog->subbeg = strbeg;
2764         prog->suboffset = 0;
2765         prog->subcoffset = 0;
2766         prog->sublen = strend - strbeg;
2767     }
2768 }
2769
2770
2771
2772
2773 /*
2774  - regexec_flags - match a regexp against a string
2775  */
2776 I32
2777 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2778               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2779 /* stringarg: the point in the string at which to begin matching */
2780 /* strend:    pointer to null at end of string */
2781 /* strbeg:    real beginning of string */
2782 /* minend:    end of match must be >= minend bytes after stringarg. */
2783 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2784  *            itself is accessed via the pointers above */
2785 /* data:      May be used for some additional optimizations.
2786               Currently unused. */
2787 /* flags:     For optimizations. See REXEC_* in regexp.h */
2788
2789 {
2790     struct regexp *const prog = ReANY(rx);
2791     char *s;
2792     regnode *c;
2793     char *startpos;
2794     SSize_t minlen;             /* must match at least this many chars */
2795     SSize_t dontbother = 0;     /* how many characters not to try at end */
2796     const bool utf8_target = cBOOL(DO_UTF8(sv));
2797     I32 multiline;
2798     RXi_GET_DECL(prog,progi);
2799     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2800     regmatch_info *const reginfo = &reginfo_buf;
2801     regexp_paren_pair *swap = NULL;
2802     I32 oldsave;
2803     GET_RE_DEBUG_FLAGS_DECL;
2804
2805     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2806     PERL_UNUSED_ARG(data);
2807
2808     /* Be paranoid... */
2809     if (prog == NULL) {
2810         Perl_croak(aTHX_ "NULL regexp parameter");
2811     }
2812
2813     DEBUG_EXECUTE_r(
2814         debug_start_match(rx, utf8_target, stringarg, strend,
2815         "Matching");
2816     );
2817
2818     startpos = stringarg;
2819
2820     if (prog->intflags & PREGf_GPOS_SEEN) {
2821         MAGIC *mg;
2822
2823         /* set reginfo->ganch, the position where \G can match */
2824
2825         reginfo->ganch =
2826             (flags & REXEC_IGNOREPOS)
2827             ? stringarg /* use start pos rather than pos() */
2828             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2829               /* Defined pos(): */
2830             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2831             : strbeg; /* pos() not defined; use start of string */
2832
2833         DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2834             "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2835
2836         /* in the presence of \G, we may need to start looking earlier in
2837          * the string than the suggested start point of stringarg:
2838          * if prog->gofs is set, then that's a known, fixed minimum
2839          * offset, such as
2840          * /..\G/:   gofs = 2
2841          * /ab|c\G/: gofs = 1
2842          * or if the minimum offset isn't known, then we have to go back
2843          * to the start of the string, e.g. /w+\G/
2844          */
2845
2846         if (prog->intflags & PREGf_ANCH_GPOS) {
2847             startpos  = reginfo->ganch - prog->gofs;
2848             if (startpos <
2849                 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2850             {
2851                 DEBUG_r(PerlIO_printf(Perl_debug_log,
2852                         "fail: ganch-gofs before earliest possible start\n"));
2853                 return 0;
2854             }
2855         }
2856         else if (prog->gofs) {
2857             if (startpos - prog->gofs < strbeg)
2858                 startpos = strbeg;
2859             else
2860                 startpos -= prog->gofs;
2861         }
2862         else if (prog->intflags & PREGf_GPOS_FLOAT)
2863             startpos = strbeg;
2864     }
2865
2866     minlen = prog->minlen;
2867     if ((startpos + minlen) > strend || startpos < strbeg) {
2868         DEBUG_r(PerlIO_printf(Perl_debug_log,
2869                     "Regex match can't succeed, so not even tried\n"));
2870         return 0;
2871     }
2872
2873     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2874      * which will call destuctors to reset PL_regmatch_state, free higher
2875      * PL_regmatch_slabs, and clean up regmatch_info_aux and
2876      * regmatch_info_aux_eval */
2877
2878     oldsave = PL_savestack_ix;
2879
2880     s = startpos;
2881
2882     if ((prog->extflags & RXf_USE_INTUIT)
2883         && !(flags & REXEC_CHECKED))
2884     {
2885         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2886                                     flags, NULL);
2887         if (!s)
2888             return 0;
2889
2890         if (prog->extflags & RXf_CHECK_ALL) {
2891             /* we can match based purely on the result of INTUIT.
2892              * Set up captures etc just for $& and $-[0]
2893              * (an intuit-only match wont have $1,$2,..) */
2894             assert(!prog->nparens);
2895
2896             /* s/// doesn't like it if $& is earlier than where we asked it to
2897              * start searching (which can happen on something like /.\G/) */
2898             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2899                     && (s < stringarg))
2900             {
2901                 /* this should only be possible under \G */
2902                 assert(prog->intflags & PREGf_GPOS_SEEN);
2903                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2904                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2905                 goto phooey;
2906             }
2907
2908             /* match via INTUIT shouldn't have any captures.
2909              * Let @-, @+, $^N know */
2910             prog->lastparen = prog->lastcloseparen = 0;
2911             RX_MATCH_UTF8_set(rx, utf8_target);
2912             prog->offs[0].start = s - strbeg;
2913             prog->offs[0].end = utf8_target
2914                 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2915                 : s - strbeg + prog->minlenret;
2916             if ( !(flags & REXEC_NOT_FIRST) )
2917                 S_reg_set_capture_string(aTHX_ rx,
2918                                         strbeg, strend,
2919                                         sv, flags, utf8_target);
2920
2921             return 1;
2922         }
2923     }
2924
2925     multiline = prog->extflags & RXf_PMf_MULTILINE;
2926
2927     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2928         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2929                               "String too short [regexec_flags]...\n"));
2930         goto phooey;
2931     }
2932
2933     /* Check validity of program. */
2934     if (UCHARAT(progi->program) != REG_MAGIC) {
2935         Perl_croak(aTHX_ "corrupted regexp program");
2936     }
2937
2938     RX_MATCH_TAINTED_off(rx);
2939     RX_MATCH_UTF8_set(rx, utf8_target);
2940
2941     reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2942     reginfo->intuit = 0;
2943     reginfo->is_utf8_target = cBOOL(utf8_target);
2944     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2945     reginfo->warned = FALSE;
2946     reginfo->strbeg  = strbeg;
2947     reginfo->sv = sv;
2948     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2949     reginfo->strend = strend;
2950     /* see how far we have to get to not match where we matched before */
2951     reginfo->till = stringarg + minend;
2952
2953     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
2954         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2955            S_cleanup_regmatch_info_aux has executed (registered by
2956            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
2957            magic belonging to this SV.
2958            Not newSVsv, either, as it does not COW.
2959         */
2960         reginfo->sv = newSV(0);
2961         SvSetSV_nosteal(reginfo->sv, sv);
2962         SAVEFREESV(reginfo->sv);
2963     }
2964
2965     /* reserve next 2 or 3 slots in PL_regmatch_state:
2966      * slot N+0: may currently be in use: skip it
2967      * slot N+1: use for regmatch_info_aux struct
2968      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2969      * slot N+3: ready for use by regmatch()
2970      */
2971
2972     {
2973         regmatch_state *old_regmatch_state;
2974         regmatch_slab  *old_regmatch_slab;
2975         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2976
2977         /* on first ever match, allocate first slab */
2978         if (!PL_regmatch_slab) {
2979             Newx(PL_regmatch_slab, 1, regmatch_slab);
2980             PL_regmatch_slab->prev = NULL;
2981             PL_regmatch_slab->next = NULL;
2982             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2983         }
2984
2985         old_regmatch_state = PL_regmatch_state;
2986         old_regmatch_slab  = PL_regmatch_slab;
2987
2988         for (i=0; i <= max; i++) {
2989             if (i == 1)
2990                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2991             else if (i ==2)
2992                 reginfo->info_aux_eval =
2993                 reginfo->info_aux->info_aux_eval =
2994                             &(PL_regmatch_state->u.info_aux_eval);
2995
2996             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2997                 PL_regmatch_state = S_push_slab(aTHX);
2998         }
2999
3000         /* note initial PL_regmatch_state position; at end of match we'll
3001          * pop back to there and free any higher slabs */
3002
3003         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3004         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3005         reginfo->info_aux->poscache = NULL;
3006
3007         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3008
3009         if ((prog->extflags & RXf_EVAL_SEEN))
3010             S_setup_eval_state(aTHX_ reginfo);
3011         else
3012             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3013     }
3014
3015     /* If there is a "must appear" string, look for it. */
3016
3017     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3018         /* We have to be careful. If the previous successful match
3019            was from this regex we don't want a subsequent partially
3020            successful match to clobber the old results.
3021            So when we detect this possibility we add a swap buffer
3022            to the re, and switch the buffer each match. If we fail,
3023            we switch it back; otherwise we leave it swapped.
3024         */
3025         swap = prog->offs;
3026         /* do we need a save destructor here for eval dies? */
3027         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3028         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3029             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3030             PTR2UV(prog),
3031             PTR2UV(swap),
3032             PTR2UV(prog->offs)
3033         ));
3034     }
3035
3036     /* Simplest case: anchored match need be tried only once, or with
3037      * MBOL, only at the beginning of each line.
3038      *
3039      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3040      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3041      * match at the start of the string then it won't match anywhere else
3042      * either; while with /.*.../, if it doesn't match at the beginning,
3043      * the earliest it could match is at the start of the next line */
3044
3045     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3046         char *end;
3047
3048         if (regtry(reginfo, &s))
3049             goto got_it;
3050
3051         if (!(prog->intflags & PREGf_ANCH_MBOL))
3052             goto phooey;
3053
3054         /* didn't match at start, try at other newline positions */
3055
3056         if (minlen)
3057             dontbother = minlen - 1;
3058         end = HOP3c(strend, -dontbother, strbeg) - 1;
3059
3060         /* skip to next newline */
3061
3062         while (s <= end) { /* note it could be possible to match at the end of the string */
3063             /* NB: newlines are the same in unicode as they are in latin */
3064             if (*s++ != '\n')
3065                 continue;
3066             if (prog->check_substr || prog->check_utf8) {
3067             /* note that with PREGf_IMPLICIT, intuit can only fail
3068              * or return the start position, so it's of limited utility.
3069              * Nevertheless, I made the decision that the potential for
3070              * quick fail was still worth it - DAPM */
3071                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3072                 if (!s)
3073                     goto phooey;
3074             }
3075             if (regtry(reginfo, &s))
3076                 goto got_it;
3077         }
3078         goto phooey;
3079     } /* end anchored search */
3080
3081     if (prog->intflags & PREGf_ANCH_GPOS)
3082     {
3083         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3084         assert(prog->intflags & PREGf_GPOS_SEEN);
3085         /* For anchored \G, the only position it can match from is
3086          * (ganch-gofs); we already set startpos to this above; if intuit
3087          * moved us on from there, we can't possibly succeed */
3088         assert(startpos == reginfo->ganch - prog->gofs);
3089         if (s == startpos && regtry(reginfo, &s))
3090             goto got_it;
3091         goto phooey;
3092     }
3093
3094     /* Messy cases:  unanchored match. */
3095     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3096         /* we have /x+whatever/ */
3097         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3098         char ch;
3099 #ifdef DEBUGGING
3100         int did_match = 0;
3101 #endif
3102         if (utf8_target) {
3103             if (! prog->anchored_utf8) {
3104                 to_utf8_substr(prog);
3105             }
3106             ch = SvPVX_const(prog->anchored_utf8)[0];
3107             REXEC_FBC_SCAN(
3108                 if (*s == ch) {
3109                     DEBUG_EXECUTE_r( did_match = 1 );
3110                     if (regtry(reginfo, &s)) goto got_it;
3111                     s += UTF8SKIP(s);
3112                     while (s < strend && *s == ch)
3113                         s += UTF8SKIP(s);
3114                 }
3115             );
3116
3117         }
3118         else {
3119             if (! prog->anchored_substr) {
3120                 if (! to_byte_substr(prog)) {
3121                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3122                 }
3123             }
3124             ch = SvPVX_const(prog->anchored_substr)[0];
3125             REXEC_FBC_SCAN(
3126                 if (*s == ch) {
3127                     DEBUG_EXECUTE_r( did_match = 1 );
3128                     if (regtry(reginfo, &s)) goto got_it;
3129                     s++;
3130                     while (s < strend && *s == ch)
3131                         s++;
3132                 }
3133             );
3134         }
3135         DEBUG_EXECUTE_r(if (!did_match)
3136                 PerlIO_printf(Perl_debug_log,
3137                                   "Did not find anchored character...\n")
3138                );
3139     }
3140     else if (prog->anchored_substr != NULL
3141               || prog->anchored_utf8 != NULL
3142               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3143                   && prog->float_max_offset < strend - s)) {
3144         SV *must;
3145         SSize_t back_max;
3146         SSize_t back_min;
3147         char *last;
3148         char *last1;            /* Last position checked before */
3149 #ifdef DEBUGGING
3150         int did_match = 0;
3151 #endif
3152         if (prog->anchored_substr || prog->anchored_utf8) {
3153             if (utf8_target) {
3154                 if (! prog->anchored_utf8) {
3155                     to_utf8_substr(prog);
3156                 }
3157                 must = prog->anchored_utf8;
3158             }
3159             else {
3160                 if (! prog->anchored_substr) {
3161                     if (! to_byte_substr(prog)) {
3162                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3163                     }
3164                 }
3165                 must = prog->anchored_substr;
3166             }
3167             back_max = back_min = prog->anchored_offset;
3168         } else {
3169             if (utf8_target) {
3170                 if (! prog->float_utf8) {
3171                     to_utf8_substr(prog);
3172                 }
3173                 must = prog->float_utf8;
3174             }
3175             else {
3176                 if (! prog->float_substr) {
3177                     if (! to_byte_substr(prog)) {
3178                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3179                     }
3180                 }
3181                 must = prog->float_substr;
3182             }
3183             back_max = prog->float_max_offset;
3184             back_min = prog->float_min_offset;
3185         }
3186
3187         if (back_min<0) {
3188             last = strend;
3189         } else {
3190             last = HOP3c(strend,        /* Cannot start after this */
3191                   -(SSize_t)(CHR_SVLEN(must)
3192                          - (SvTAIL(must) != 0) + back_min), strbeg);
3193         }
3194         if (s > reginfo->strbeg)
3195             last1 = HOPc(s, -1);
3196         else
3197             last1 = s - 1;      /* bogus */
3198
3199         /* XXXX check_substr already used to find "s", can optimize if
3200            check_substr==must. */
3201         dontbother = 0;
3202         strend = HOPc(strend, -dontbother);
3203         while ( (s <= last) &&
3204                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3205                                   (unsigned char*)strend, must,
3206                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3207             DEBUG_EXECUTE_r( did_match = 1 );
3208             if (HOPc(s, -back_max) > last1) {
3209                 last1 = HOPc(s, -back_min);
3210                 s = HOPc(s, -back_max);
3211             }
3212             else {
3213                 char * const t = (last1 >= reginfo->strbeg)
3214                                     ? HOPc(last1, 1) : last1 + 1;
3215
3216                 last1 = HOPc(s, -back_min);
3217                 s = t;
3218             }
3219             if (utf8_target) {
3220                 while (s <= last1) {
3221                     if (regtry(reginfo, &s))
3222                         goto got_it;
3223                     if (s >= last1) {
3224                         s++; /* to break out of outer loop */
3225                         break;
3226                     }
3227                     s += UTF8SKIP(s);
3228                 }
3229             }
3230             else {
3231                 while (s <= last1) {
3232                     if (regtry(reginfo, &s))
3233                         goto got_it;
3234                     s++;
3235                 }
3236             }
3237         }
3238         DEBUG_EXECUTE_r(if (!did_match) {
3239             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3240                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3241             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
3242                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
3243                                ? "anchored" : "floating"),
3244                 quoted, RE_SV_TAIL(must));
3245         });
3246         goto phooey;
3247     }
3248     else if ( (c = progi->regstclass) ) {
3249         if (minlen) {
3250             const OPCODE op = OP(progi->regstclass);
3251             /* don't bother with what can't match */
3252             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3253                 strend = HOPc(strend, -(minlen - 1));
3254         }
3255         DEBUG_EXECUTE_r({
3256             SV * const prop = sv_newmortal();
3257             regprop(prog, prop, c, reginfo, NULL);
3258             {
3259                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3260                     s,strend-s,60);
3261                 PerlIO_printf(Perl_debug_log,
3262                     "Matching stclass %.*s against %s (%d bytes)\n",
3263                     (int)SvCUR(prop), SvPVX_const(prop),
3264                      quoted, (int)(strend - s));
3265             }
3266         });
3267         if (find_byclass(prog, c, s, strend, reginfo))
3268             goto got_it;
3269         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
3270     }
3271     else {
3272         dontbother = 0;
3273         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3274             /* Trim the end. */
3275             char *last= NULL;
3276             SV* float_real;
3277             STRLEN len;
3278             const char *little;
3279
3280             if (utf8_target) {
3281                 if (! prog->float_utf8) {
3282                     to_utf8_substr(prog);
3283                 }
3284                 float_real = prog->float_utf8;
3285             }
3286             else {
3287                 if (! prog->float_substr) {
3288                     if (! to_byte_substr(prog)) {
3289                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3290                     }
3291                 }
3292                 float_real = prog->float_substr;
3293             }
3294
3295             little = SvPV_const(float_real, len);
3296             if (SvTAIL(float_real)) {
3297                     /* This means that float_real contains an artificial \n on
3298                      * the end due to the presence of something like this:
3299                      * /foo$/ where we can match both "foo" and "foo\n" at the
3300                      * end of the string.  So we have to compare the end of the
3301                      * string first against the float_real without the \n and
3302                      * then against the full float_real with the string.  We
3303                      * have to watch out for cases where the string might be
3304                      * smaller than the float_real or the float_real without
3305                      * the \n. */
3306                     char *checkpos= strend - len;
3307                     DEBUG_OPTIMISE_r(
3308                         PerlIO_printf(Perl_debug_log,
3309                             "%sChecking for float_real.%s\n",
3310                             PL_colors[4], PL_colors[5]));
3311                     if (checkpos + 1 < strbeg) {
3312                         /* can't match, even if we remove the trailing \n
3313                          * string is too short to match */
3314                         DEBUG_EXECUTE_r(
3315                             PerlIO_printf(Perl_debug_log,
3316                                 "%sString shorter than required trailing substring, cannot match.%s\n",
3317                                 PL_colors[4], PL_colors[5]));
3318                         goto phooey;
3319                     } else if (memEQ(checkpos + 1, little, len - 1)) {
3320                         /* can match, the end of the string matches without the
3321                          * "\n" */
3322                         last = checkpos + 1;
3323                     } else if (checkpos < strbeg) {
3324                         /* cant match, string is too short when the "\n" is
3325                          * included */
3326                         DEBUG_EXECUTE_r(
3327                             PerlIO_printf(Perl_debug_log,
3328                                 "%sString does not contain required trailing substring, cannot match.%s\n",
3329                                 PL_colors[4], PL_colors[5]));
3330                         goto phooey;
3331                     } else if (!multiline) {
3332                         /* non multiline match, so compare with the "\n" at the
3333                          * end of the string */
3334                         if (memEQ(checkpos, little, len)) {
3335                             last= checkpos;
3336                         } else {
3337                             DEBUG_EXECUTE_r(
3338                                 PerlIO_printf(Perl_debug_log,
3339                                     "%sString does not contain required trailing substring, cannot match.%s\n",
3340                                     PL_colors[4], PL_colors[5]));
3341                             goto phooey;
3342                         }
3343                     } else {
3344                         /* multiline match, so we have to search for a place
3345                          * where the full string is located */
3346                         goto find_last;
3347                     }
3348             } else {
3349                   find_last:
3350                     if (len)
3351                         last = rninstr(s, strend, little, little + len);
3352                     else
3353                         last = strend;  /* matching "$" */
3354             }
3355             if (!last) {
3356                 /* at one point this block contained a comment which was
3357                  * probably incorrect, which said that this was a "should not
3358                  * happen" case.  Even if it was true when it was written I am
3359                  * pretty sure it is not anymore, so I have removed the comment
3360                  * and replaced it with this one. Yves */
3361                 DEBUG_EXECUTE_r(
3362                     PerlIO_printf(Perl_debug_log,
3363                         "%sString does not contain required substring, cannot match.%s\n",
3364                         PL_colors[4], PL_colors[5]
3365                     ));
3366                 goto phooey;
3367             }
3368             dontbother = strend - last + prog->float_min_offset;
3369         }
3370         if (minlen && (dontbother < minlen))
3371             dontbother = minlen - 1;
3372         strend -= dontbother;              /* this one's always in bytes! */
3373         /* We don't know much -- general case. */
3374         if (utf8_target) {
3375             for (;;) {
3376                 if (regtry(reginfo, &s))
3377                     goto got_it;
3378                 if (s >= strend)
3379                     break;
3380                 s += UTF8SKIP(s);
3381             };
3382         }
3383         else {
3384             do {
3385                 if (regtry(reginfo, &s))
3386                     goto got_it;
3387             } while (s++ < strend);
3388         }
3389     }
3390
3391     /* Failure. */
3392     goto phooey;
3393
3394   got_it:
3395     /* s/// doesn't like it if $& is earlier than where we asked it to
3396      * start searching (which can happen on something like /.\G/) */
3397     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3398             && (prog->offs[0].start < stringarg - strbeg))
3399     {
3400         /* this should only be possible under \G */
3401         assert(prog->intflags & PREGf_GPOS_SEEN);
3402         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3403             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3404         goto phooey;
3405     }
3406
3407     DEBUG_BUFFERS_r(
3408         if (swap)
3409             PerlIO_printf(Perl_debug_log,
3410                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3411                 PTR2UV(prog),
3412                 PTR2UV(swap)
3413             );
3414     );
3415     Safefree(swap);
3416
3417     /* clean up; this will trigger destructors that will free all slabs
3418      * above the current one, and cleanup the regmatch_info_aux
3419      * and regmatch_info_aux_eval sructs */
3420
3421     LEAVE_SCOPE(oldsave);
3422
3423     if (RXp_PAREN_NAMES(prog))
3424         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3425
3426     /* make sure $`, $&, $', and $digit will work later */
3427     if ( !(flags & REXEC_NOT_FIRST) )
3428         S_reg_set_capture_string(aTHX_ rx,
3429                                     strbeg, reginfo->strend,
3430                                     sv, flags, utf8_target);
3431
3432     return 1;
3433
3434   phooey:
3435     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3436                           PL_colors[4], PL_colors[5]));
3437
3438     /* clean up; this will trigger destructors that will free all slabs
3439      * above the current one, and cleanup the regmatch_info_aux
3440      * and regmatch_info_aux_eval sructs */
3441
3442     LEAVE_SCOPE(oldsave);
3443
3444     if (swap) {
3445         /* we failed :-( roll it back */
3446         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3447             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3448             PTR2UV(prog),
3449             PTR2UV(prog->offs),
3450             PTR2UV(swap)
3451         ));
3452         Safefree(prog->offs);
3453         prog->offs = swap;
3454     }
3455     return 0;
3456 }
3457
3458
3459 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3460  * Do inc before dec, in case old and new rex are the same */
3461 #define SET_reg_curpm(Re2)                          \
3462     if (reginfo->info_aux_eval) {                   \
3463         (void)ReREFCNT_inc(Re2);                    \
3464         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
3465         PM_SETRE((PL_reg_curpm), (Re2));            \
3466     }
3467
3468
3469 /*
3470  - regtry - try match at specific point
3471  */
3472 STATIC I32                      /* 0 failure, 1 success */
3473 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3474 {
3475     CHECKPOINT lastcp;
3476     REGEXP *const rx = reginfo->prog;
3477     regexp *const prog = ReANY(rx);
3478     SSize_t result;
3479     RXi_GET_DECL(prog,progi);
3480     GET_RE_DEBUG_FLAGS_DECL;
3481
3482     PERL_ARGS_ASSERT_REGTRY;
3483
3484     reginfo->cutpoint=NULL;
3485
3486     prog->offs[0].start = *startposp - reginfo->strbeg;
3487     prog->lastparen = 0;
3488     prog->lastcloseparen = 0;
3489
3490     /* XXXX What this code is doing here?!!!  There should be no need
3491        to do this again and again, prog->lastparen should take care of
3492        this!  --ilya*/
3493
3494     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3495      * Actually, the code in regcppop() (which Ilya may be meaning by
3496      * prog->lastparen), is not needed at all by the test suite
3497      * (op/regexp, op/pat, op/split), but that code is needed otherwise
3498      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3499      * Meanwhile, this code *is* needed for the
3500      * above-mentioned test suite tests to succeed.  The common theme
3501      * on those tests seems to be returning null fields from matches.
3502      * --jhi updated by dapm */
3503 #if 1
3504     if (prog->nparens) {
3505         regexp_paren_pair *pp = prog->offs;
3506         I32 i;
3507         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3508             ++pp;
3509             pp->start = -1;
3510             pp->end = -1;
3511         }
3512     }
3513 #endif
3514     REGCP_SET(lastcp);
3515     result = regmatch(reginfo, *startposp, progi->program + 1);
3516     if (result != -1) {
3517         prog->offs[0].end = result;
3518         return 1;
3519     }
3520     if (reginfo->cutpoint)
3521         *startposp= reginfo->cutpoint;
3522     REGCP_UNWIND(lastcp);
3523     return 0;
3524 }
3525
3526
3527 #define sayYES goto yes
3528 #define sayNO goto no
3529 #define sayNO_SILENT goto no_silent
3530
3531 /* we dont use STMT_START/END here because it leads to
3532    "unreachable code" warnings, which are bogus, but distracting. */
3533 #define CACHEsayNO \
3534     if (ST.cache_mask) \
3535        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3536     sayNO
3537
3538 /* this is used to determine how far from the left messages like
3539    'failed...' are printed. It should be set such that messages
3540    are inline with the regop output that created them.
3541 */
3542 #define REPORT_CODE_OFF 32
3543
3544
3545 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3546 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3547 #define CHRTEST_NOT_A_CP_1 -999
3548 #define CHRTEST_NOT_A_CP_2 -998
3549
3550 /* grab a new slab and return the first slot in it */
3551
3552 STATIC regmatch_state *
3553 S_push_slab(pTHX)
3554 {
3555 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3556     dMY_CXT;
3557 #endif
3558     regmatch_slab *s = PL_regmatch_slab->next;
3559     if (!s) {
3560         Newx(s, 1, regmatch_slab);
3561         s->prev = PL_regmatch_slab;
3562         s->next = NULL;
3563         PL_regmatch_slab->next = s;
3564     }
3565     PL_regmatch_slab = s;
3566     return SLAB_FIRST(s);
3567 }
3568
3569
3570 /* push a new state then goto it */
3571
3572 #define PUSH_STATE_GOTO(state, node, input) \
3573     pushinput = input; \
3574     scan = node; \
3575     st->resume_state = state; \
3576     goto push_state;
3577
3578 /* push a new state with success backtracking, then goto it */
3579
3580 #define PUSH_YES_STATE_GOTO(state, node, input) \
3581     pushinput = input; \
3582     scan = node; \
3583     st->resume_state = state; \
3584     goto push_yes_state;
3585
3586
3587
3588
3589 /*
3590
3591 regmatch() - main matching routine
3592
3593 This is basically one big switch statement in a loop. We execute an op,
3594 set 'next' to point the next op, and continue. If we come to a point which
3595 we may need to backtrack to on failure such as (A|B|C), we push a
3596 backtrack state onto the backtrack stack. On failure, we pop the top
3597 state, and re-enter the loop at the state indicated. If there are no more
3598 states to pop, we return failure.
3599
3600 Sometimes we also need to backtrack on success; for example /A+/, where
3601 after successfully matching one A, we need to go back and try to
3602 match another one; similarly for lookahead assertions: if the assertion
3603 completes successfully, we backtrack to the state just before the assertion
3604 and then carry on.  In these cases, the pushed state is marked as
3605 'backtrack on success too'. This marking is in fact done by a chain of
3606 pointers, each pointing to the previous 'yes' state. On success, we pop to
3607 the nearest yes state, discarding any intermediate failure-only states.
3608 Sometimes a yes state is pushed just to force some cleanup code to be
3609 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3610 it to free the inner regex.
3611
3612 Note that failure backtracking rewinds the cursor position, while
3613 success backtracking leaves it alone.
3614
3615 A pattern is complete when the END op is executed, while a subpattern
3616 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3617 ops trigger the "pop to last yes state if any, otherwise return true"
3618 behaviour.
3619
3620 A common convention in this function is to use A and B to refer to the two
3621 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3622 the subpattern to be matched possibly multiple times, while B is the entire
3623 rest of the pattern. Variable and state names reflect this convention.
3624
3625 The states in the main switch are the union of ops and failure/success of
3626 substates associated with with that op.  For example, IFMATCH is the op
3627 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3628 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3629 successfully matched A and IFMATCH_A_fail is a state saying that we have
3630 just failed to match A. Resume states always come in pairs. The backtrack
3631 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3632 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3633 on success or failure.
3634
3635 The struct that holds a backtracking state is actually a big union, with
3636 one variant for each major type of op. The variable st points to the
3637 top-most backtrack struct. To make the code clearer, within each
3638 block of code we #define ST to alias the relevant union.
3639
3640 Here's a concrete example of a (vastly oversimplified) IFMATCH
3641 implementation:
3642
3643     switch (state) {
3644     ....
3645
3646 #define ST st->u.ifmatch
3647
3648     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3649         ST.foo = ...; // some state we wish to save
3650         ...
3651         // push a yes backtrack state with a resume value of
3652         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3653         // first node of A:
3654         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3655         // NOTREACHED
3656
3657     case IFMATCH_A: // we have successfully executed A; now continue with B
3658         next = B;
3659         bar = ST.foo; // do something with the preserved value
3660         break;
3661
3662     case IFMATCH_A_fail: // A failed, so the assertion failed
3663         ...;   // do some housekeeping, then ...
3664         sayNO; // propagate the failure
3665
3666 #undef ST
3667
3668     ...
3669     }
3670
3671 For any old-timers reading this who are familiar with the old recursive
3672 approach, the code above is equivalent to:
3673
3674     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3675     {
3676         int foo = ...
3677         ...
3678         if (regmatch(A)) {
3679             next = B;
3680             bar = foo;
3681             break;
3682         }
3683         ...;   // do some housekeeping, then ...
3684         sayNO; // propagate the failure
3685     }
3686
3687 The topmost backtrack state, pointed to by st, is usually free. If you
3688 want to claim it, populate any ST.foo fields in it with values you wish to
3689 save, then do one of
3690
3691         PUSH_STATE_GOTO(resume_state, node, newinput);
3692         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3693
3694 which sets that backtrack state's resume value to 'resume_state', pushes a
3695 new free entry to the top of the backtrack stack, then goes to 'node'.
3696 On backtracking, the free slot is popped, and the saved state becomes the
3697 new free state. An ST.foo field in this new top state can be temporarily
3698 accessed to retrieve values, but once the main loop is re-entered, it
3699 becomes available for reuse.
3700
3701 Note that the depth of the backtrack stack constantly increases during the
3702 left-to-right execution of the pattern, rather than going up and down with
3703 the pattern nesting. For example the stack is at its maximum at Z at the
3704 end of the pattern, rather than at X in the following:
3705
3706     /(((X)+)+)+....(Y)+....Z/
3707
3708 The only exceptions to this are lookahead/behind assertions and the cut,
3709 (?>A), which pop all the backtrack states associated with A before
3710 continuing.
3711
3712 Backtrack state structs are allocated in slabs of about 4K in size.
3713 PL_regmatch_state and st always point to the currently active state,
3714 and PL_regmatch_slab points to the slab currently containing
3715 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3716 allocated, and is never freed until interpreter destruction. When the slab
3717 is full, a new one is allocated and chained to the end. At exit from
3718 regmatch(), slabs allocated since entry are freed.
3719
3720 */
3721
3722
3723 #define DEBUG_STATE_pp(pp)                                  \
3724     DEBUG_STATE_r({                                         \
3725         DUMP_EXEC_POS(locinput, scan, utf8_target);         \
3726         PerlIO_printf(Perl_debug_log,                       \
3727             "    %*s"pp" %s%s%s%s%s\n",                     \
3728             depth*2, "",                                    \
3729             PL_reg_name[st->resume_state],                  \
3730             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3731             ((st==yes_state) ? "Y" : ""),                   \
3732             ((st==mark_state) ? "M" : ""),                  \
3733             ((st==yes_state||st==mark_state) ? "]" : "")    \
3734         );                                                  \
3735     });
3736
3737
3738 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3739
3740 #ifdef DEBUGGING
3741
3742 STATIC void
3743 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3744     const char *start, const char *end, const char *blurb)
3745 {
3746     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3747
3748     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3749
3750     if (!PL_colorset)
3751             reginitcolors();
3752     {
3753         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3754             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3755
3756         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3757             start, end - start, 60);
3758
3759         PerlIO_printf(Perl_debug_log,
3760             "%s%s REx%s %s against %s\n",
3761                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3762
3763         if (utf8_target||utf8_pat)
3764             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3765                 utf8_pat ? "pattern" : "",
3766                 utf8_pat && utf8_target ? " and " : "",
3767                 utf8_target ? "string" : ""
3768             );
3769     }
3770 }
3771
3772 STATIC void
3773 S_dump_exec_pos(pTHX_ const char *locinput,
3774                       const regnode *scan,
3775                       const char *loc_regeol,
3776                       const char *loc_bostr,
3777                       const char *loc_reg_starttry,
3778                       const bool utf8_target)
3779 {
3780     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3781     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3782     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3783     /* The part of the string before starttry has one color
3784        (pref0_len chars), between starttry and current
3785        position another one (pref_len - pref0_len chars),
3786        after the current position the third one.
3787        We assume that pref0_len <= pref_len, otherwise we
3788        decrease pref0_len.  */
3789     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3790         ? (5 + taill) - l : locinput - loc_bostr;
3791     int pref0_len;
3792
3793     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3794
3795     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3796         pref_len++;
3797     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3798     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3799         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3800               ? (5 + taill) - pref_len : loc_regeol - locinput);
3801     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3802         l--;
3803     if (pref0_len < 0)
3804         pref0_len = 0;
3805     if (pref0_len > pref_len)
3806         pref0_len = pref_len;
3807     {
3808         const int is_uni = utf8_target ? 1 : 0;
3809
3810         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3811             (locinput - pref_len),pref0_len, 60, 4, 5);
3812
3813         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3814                     (locinput - pref_len + pref0_len),
3815                     pref_len - pref0_len, 60, 2, 3);
3816
3817         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3818                     locinput, loc_regeol - locinput, 10, 0, 1);
3819
3820         const STRLEN tlen=len0+len1+len2;
3821         PerlIO_printf(Perl_debug_log,
3822                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3823                     (IV)(locinput - loc_bostr),
3824                     len0, s0,
3825                     len1, s1,
3826                     (docolor ? "" : "> <"),
3827                     len2, s2,
3828                     (int)(tlen > 19 ? 0 :  19 - tlen),
3829                     "");
3830     }
3831 }
3832
3833 #endif
3834
3835 /* reg_check_named_buff_matched()
3836  * Checks to see if a named buffer has matched. The data array of
3837  * buffer numbers corresponding to the buffer is expected to reside
3838  * in the regexp->data->data array in the slot stored in the ARG() of
3839  * node involved. Note that this routine doesn't actually care about the
3840  * name, that information is not preserved from compilation to execution.
3841  * Returns the index of the leftmost defined buffer with the given name
3842  * or 0 if non of the buffers matched.
3843  */
3844 STATIC I32
3845 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3846 {
3847     I32 n;
3848     RXi_GET_DECL(rex,rexi);
3849     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3850     I32 *nums=(I32*)SvPVX(sv_dat);
3851
3852     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3853
3854     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3855         if ((I32)rex->lastparen >= nums[n] &&
3856             rex->offs[nums[n]].end != -1)
3857         {
3858             return nums[n];
3859         }
3860     }
3861     return 0;
3862 }
3863
3864
3865 static bool
3866 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3867         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3868 {
3869     /* This function determines if there are one or two characters that match
3870      * the first character of the passed-in EXACTish node <text_node>, and if
3871      * so, returns them in the passed-in pointers.
3872      *
3873      * If it determines that no possible character in the target string can
3874      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3875      * the first character in <text_node> requires UTF-8 to represent, and the
3876      * target string isn't in UTF-8.)
3877      *
3878      * If there are more than two characters that could match the beginning of
3879      * <text_node>, or if more context is required to determine a match or not,
3880      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3881      *
3882      * The motiviation behind this function is to allow the caller to set up
3883      * tight loops for matching.  If <text_node> is of type EXACT, there is
3884      * only one possible character that can match its first character, and so
3885      * the situation is quite simple.  But things get much more complicated if
3886      * folding is involved.  It may be that the first character of an EXACTFish
3887      * node doesn't participate in any possible fold, e.g., punctuation, so it
3888      * can be matched only by itself.  The vast majority of characters that are
3889      * in folds match just two things, their lower and upper-case equivalents.
3890      * But not all are like that; some have multiple possible matches, or match
3891      * sequences of more than one character.  This function sorts all that out.
3892      *
3893      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3894      * loop of trying to match A*, we know we can't exit where the thing
3895      * following it isn't a B.  And something can't be a B unless it is the
3896      * beginning of B.  By putting a quick test for that beginning in a tight
3897      * loop, we can rule out things that can't possibly be B without having to
3898      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3899      * character, we can make a tight loop matching A*, using the outputs of
3900      * this function.
3901      *
3902      * If the target string to match isn't in UTF-8, and there aren't
3903      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3904      * the one or two possible octets (which are characters in this situation)
3905      * that can match.  In all cases, if there is only one character that can
3906      * match, *<c1p> and *<c2p> will be identical.
3907      *
3908      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3909      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3910      * can match the beginning of <text_node>.  They should be declared with at
3911      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3912      * undefined what these contain.)  If one or both of the buffers are
3913      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3914      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3915      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3916      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3917      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3918
3919     const bool utf8_target = reginfo->is_utf8_target;
3920
3921     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
3922     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
3923     bool use_chrtest_void = FALSE;
3924     const bool is_utf8_pat = reginfo->is_utf8_pat;
3925
3926     /* Used when we have both utf8 input and utf8 output, to avoid converting
3927      * to/from code points */
3928     bool utf8_has_been_setup = FALSE;
3929
3930     dVAR;
3931
3932     U8 *pat = (U8*)STRING(text_node);
3933     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3934
3935     if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
3936
3937         /* In an exact node, only one thing can be matched, that first
3938          * character.  If both the pat and the target are UTF-8, we can just
3939          * copy the input to the output, avoiding finding the code point of
3940          * that character */
3941         if (!is_utf8_pat) {
3942             c2 = c1 = *pat;
3943         }
3944         else if (utf8_target) {
3945             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3946             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3947             utf8_has_been_setup = TRUE;
3948         }
3949         else {
3950             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3951         }
3952     }
3953     else { /* an EXACTFish node */
3954         U8 *pat_end = pat + STR_LEN(text_node);
3955
3956         /* An EXACTFL node has at least some characters unfolded, because what
3957          * they match is not known until now.  So, now is the time to fold
3958          * the first few of them, as many as are needed to determine 'c1' and
3959          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
3960          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3961          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
3962          * need to fold as many characters as a single character can fold to,
3963          * so that later we can check if the first ones are such a multi-char
3964          * fold.  But, in such a pattern only locale-problematic characters
3965          * aren't folded, so we can skip this completely if the first character
3966          * in the node isn't one of the tricky ones */
3967         if (OP(text_node) == EXACTFL) {
3968
3969             if (! is_utf8_pat) {
3970                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3971                 {
3972                     folded[0] = folded[1] = 's';
3973                     pat = folded;
3974                     pat_end = folded + 2;
3975                 }
3976             }
3977             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3978                 U8 *s = pat;
3979                 U8 *d = folded;
3980                 int i;
3981
3982                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3983                     if (isASCII(*s)) {
3984                         *(d++) = (U8) toFOLD_LC(*s);
3985                         s++;
3986                     }
3987                     else {
3988                         STRLEN len;
3989                         _to_utf8_fold_flags(s,
3990                                             d,
3991                                             &len,
3992                                             FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3993                         d += len;
3994                         s += UTF8SKIP(s);
3995                     }
3996                 }
3997
3998                 pat = folded;
3999                 pat_end = d;
4000             }
4001         }
4002
4003         if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4004              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4005         {
4006             /* Multi-character folds require more context to sort out.  Also
4007              * PL_utf8_foldclosures used below doesn't handle them, so have to
4008              * be handled outside this routine */
4009             use_chrtest_void = TRUE;
4010         }
4011         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4012             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4013             if (c1 > 255) {
4014                 /* Load the folds hash, if not already done */
4015                 SV** listp;
4016                 if (! PL_utf8_foldclosures) {
4017                     _load_PL_utf8_foldclosures();
4018                 }
4019
4020                 /* The fold closures data structure is a hash with the keys
4021                  * being the UTF-8 of every character that is folded to, like
4022                  * 'k', and the values each an array of all code points that
4023                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
4024                  * Multi-character folds are not included */
4025                 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4026                                         (char *) pat,
4027                                         UTF8SKIP(pat),
4028                                         FALSE))))
4029                 {
4030                     /* Not found in the hash, therefore there are no folds
4031                     * containing it, so there is only a single character that
4032                     * could match */
4033                     c2 = c1;
4034                 }
4035                 else {  /* Does participate in folds */
4036                     AV* list = (AV*) *listp;
4037                     if (av_tindex(list) != 1) {
4038
4039                         /* If there aren't exactly two folds to this, it is
4040                          * outside the scope of this function */
4041                         use_chrtest_void = TRUE;
4042                     }
4043                     else {  /* There are two.  Get them */
4044                         SV** c_p = av_fetch(list, 0, FALSE);
4045                         if (c_p == NULL) {
4046                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4047                         }
4048                         c1 = SvUV(*c_p);
4049
4050                         c_p = av_fetch(list, 1, FALSE);
4051                         if (c_p == NULL) {
4052                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4053                         }
4054                         c2 = SvUV(*c_p);
4055
4056                         /* Folds that cross the 255/256 boundary are forbidden
4057                          * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4058                          * one is ASCIII.  Since the pattern character is above
4059                          * 255, and its only other match is below 256, the only
4060                          * legal match will be to itself.  We have thrown away
4061                          * the original, so have to compute which is the one
4062                          * above 255. */
4063                         if ((c1 < 256) != (c2 < 256)) {
4064                             if ((OP(text_node) == EXACTFL
4065                                  && ! IN_UTF8_CTYPE_LOCALE)
4066                                 || ((OP(text_node) == EXACTFA
4067                                     || OP(text_node) == EXACTFA_NO_TRIE)
4068                                     && (isASCII(c1) || isASCII(c2))))
4069                             {
4070                                 if (c1 < 256) {
4071                                     c1 = c2;
4072                                 }
4073                                 else {
4074                                     c2 = c1;
4075                                 }
4076                             }
4077                         }
4078                     }
4079                 }
4080             }
4081             else /* Here, c1 is <= 255 */
4082                 if (utf8_target
4083                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4084                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4085                     && ((OP(text_node) != EXACTFA
4086                         && OP(text_node) != EXACTFA_NO_TRIE)
4087                         || ! isASCII(c1)))
4088             {
4089                 /* Here, there could be something above Latin1 in the target
4090                  * which folds to this character in the pattern.  All such
4091                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4092                  * than two characters involved in their folds, so are outside
4093                  * the scope of this function */
4094                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4095                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4096                 }
4097                 else {
4098                     use_chrtest_void = TRUE;
4099                 }
4100             }
4101             else { /* Here nothing above Latin1 can fold to the pattern
4102                       character */
4103                 switch (OP(text_node)) {
4104
4105                     case EXACTFL:   /* /l rules */
4106                         c2 = PL_fold_locale[c1];
4107                         break;
4108
4109                     case EXACTF:   /* This node only generated for non-utf8
4110                                     patterns */
4111                         assert(! is_utf8_pat);
4112                         if (! utf8_target) {    /* /d rules */
4113                             c2 = PL_fold[c1];
4114                             break;
4115                         }
4116                         /* FALLTHROUGH */
4117                         /* /u rules for all these.  This happens to work for
4118                         * EXACTFA as nothing in Latin1 folds to ASCII */
4119                     case EXACTFA_NO_TRIE:   /* This node only generated for
4120                                             non-utf8 patterns */
4121                         assert(! is_utf8_pat);
4122                         /* FALLTHROUGH */
4123                     case EXACTFA:
4124                     case EXACTFU_SS:
4125                     case EXACTFU:
4126                         c2 = PL_fold_latin1[c1];
4127                         break;
4128
4129                     default:
4130                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4131                         NOT_REACHED; /* NOTREACHED */
4132                 }
4133             }
4134         }
4135     }
4136
4137     /* Here have figured things out.  Set up the returns */
4138     if (use_chrtest_void) {
4139         *c2p = *c1p = CHRTEST_VOID;
4140     }
4141     else if (utf8_target) {
4142         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4143             uvchr_to_utf8(c1_utf8, c1);
4144             uvchr_to_utf8(c2_utf8, c2);
4145         }
4146
4147         /* Invariants are stored in both the utf8 and byte outputs; Use
4148          * negative numbers otherwise for the byte ones.  Make sure that the
4149          * byte ones are the same iff the utf8 ones are the same */
4150         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4151         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4152                 ? *c2_utf8
4153                 : (c1 == c2)
4154                   ? CHRTEST_NOT_A_CP_1
4155                   : CHRTEST_NOT_A_CP_2;
4156     }
4157     else if (c1 > 255) {
4158        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4159                            can represent */
4160            return FALSE;
4161        }
4162
4163        *c1p = *c2p = c2;    /* c2 is the only representable value */
4164     }
4165     else {  /* c1 is representable; see about c2 */
4166        *c1p = c1;
4167        *c2p = (c2 < 256) ? c2 : c1;
4168     }
4169
4170     return TRUE;
4171 }
4172
4173 /* This creates a single number by combining two, with 'before' being like the
4174  * 10's digit, but this isn't necessarily base 10; it is base however many
4175  * elements of the enum there are */
4176 #define GCBcase(before, after) ((GCB_ENUM_COUNT * before) + after)
4177
4178 STATIC bool
4179 S_isGCB(const GCB_enum before, const GCB_enum after)
4180 {
4181     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4182      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4183
4184     switch (GCBcase(before, after)) {
4185
4186         /*  Break at the start and end of text.
4187             GB1.   sot ÷
4188             GB2.   ÷ eot
4189
4190             Break before and after controls except between CR and LF
4191             GB4.  ( Control | CR | LF )  ÷
4192             GB5.   ÷  ( Control | CR | LF )
4193
4194             Otherwise, break everywhere.
4195             GB10.  Any  ÷  Any */
4196         default:
4197             return TRUE;
4198
4199         /* Do not break between a CR and LF.
4200             GB3.  CR  ×  LF */
4201         case GCBcase(GCB_CR, GCB_LF):
4202             return FALSE;
4203
4204         /* Do not break Hangul syllable sequences.
4205             GB6.  L  ×  ( L | V | LV | LVT ) */
4206         case GCBcase(GCB_L, GCB_L):
4207         case GCBcase(GCB_L, GCB_V):
4208         case GCBcase(GCB_L, GCB_LV):
4209         case GCBcase(GCB_L, GCB_LVT):
4210             return FALSE;
4211
4212         /*  GB7.  ( LV | V )  ×  ( V | T ) */
4213         case GCBcase(GCB_LV, GCB_V):
4214         case GCBcase(GCB_LV, GCB_T):
4215         case GCBcase(GCB_V, GCB_V):
4216         case GCBcase(GCB_V, GCB_T):
4217             return FALSE;
4218
4219         /*  GB8.  ( LVT | T)  ×  T */
4220         case GCBcase(GCB_LVT, GCB_T):
4221         case GCBcase(GCB_T, GCB_T):
4222             return FALSE;
4223
4224         /* Do not break between regional indicator symbols.
4225             GB8a.  Regional_Indicator  ×  Regional_Indicator */
4226         case GCBcase(GCB_Regional_Indicator, GCB_Regional_Indicator):
4227             return FALSE;
4228
4229         /* Do not break before extending characters.
4230             GB9.     ×  Extend */
4231         case GCBcase(GCB_Other, GCB_Extend):
4232         case GCBcase(GCB_Extend, GCB_Extend):
4233         case GCBcase(GCB_L, GCB_Extend):
4234         case GCBcase(GCB_LV, GCB_Extend):
4235         case GCBcase(GCB_LVT, GCB_Extend):
4236         case GCBcase(GCB_Prepend, GCB_Extend):
4237         case GCBcase(GCB_Regional_Indicator, GCB_Extend):
4238         case GCBcase(GCB_SpacingMark, GCB_Extend):
4239         case GCBcase(GCB_T, GCB_Extend):
4240         case GCBcase(GCB_V, GCB_Extend):
4241             return FALSE;
4242
4243         /* Do not break before SpacingMarks, or after Prepend characters.
4244             GB9a.     ×  SpacingMark */
4245         case GCBcase(GCB_Other, GCB_SpacingMark):
4246         case GCBcase(GCB_Extend, GCB_SpacingMark):
4247         case GCBcase(GCB_L, GCB_SpacingMark):
4248         case GCBcase(GCB_LV, GCB_SpacingMark):
4249         case GCBcase(GCB_LVT, GCB_SpacingMark):
4250         case GCBcase(GCB_Prepend, GCB_SpacingMark):
4251         case GCBcase(GCB_Regional_Indicator, GCB_SpacingMark):
4252         case GCBcase(GCB_SpacingMark, GCB_SpacingMark):
4253         case GCBcase(GCB_T, GCB_SpacingMark):
4254         case GCBcase(GCB_V, GCB_SpacingMark):
4255             return FALSE;
4256
4257         /* GB9b.  Prepend  ×   */
4258         case GCBcase(GCB_Prepend, GCB_Other):
4259         case GCBcase(GCB_Prepend, GCB_L):
4260         case GCBcase(GCB_Prepend, GCB_LV):
4261         case GCBcase(GCB_Prepend, GCB_LVT):
4262         case GCBcase(GCB_Prepend, GCB_Prepend):
4263         case GCBcase(GCB_Prepend, GCB_Regional_Indicator):
4264         case GCBcase(GCB_Prepend, GCB_T):
4265         case GCBcase(GCB_Prepend, GCB_V):
4266             return FALSE;
4267     }
4268
4269     NOT_REACHED; /* NOTREACHED */
4270 }
4271
4272 #define SBcase(before, after) ((SB_ENUM_COUNT * before) + after)
4273
4274 STATIC bool
4275 S_isSB(pTHX_ SB_enum before,
4276              SB_enum after,
4277              const U8 * const strbeg,
4278              const U8 * const curpos,
4279              const U8 * const strend,
4280              const bool utf8_target)
4281 {
4282     /* returns a boolean indicating if there is a Sentence Boundary Break
4283      * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4284
4285     U8 * lpos = (U8 *) curpos;
4286     U8 * temp_pos;
4287     SB_enum backup;
4288
4289     PERL_ARGS_ASSERT_ISSB;
4290
4291     /* Break at the start and end of text.
4292         SB1.  sot  ÷
4293         SB2.  ÷  eot */
4294     if (before == SB_EDGE || after == SB_EDGE) {
4295         return TRUE;
4296     }
4297
4298     /* SB 3: Do not break within CRLF. */
4299     if (before == SB_CR && after == SB_LF) {
4300         return FALSE;
4301     }
4302
4303     /* Break after paragraph separators.  (though why CR and LF are considered
4304      * so is beyond me (khw)
4305        SB4.  Sep | CR | LF  ÷ */
4306     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4307         return TRUE;
4308     }
4309
4310     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4311      * (See Section 6.2, Replacing Ignore Rules.)
4312         SB5.  X (Extend | Format)*  →  X */
4313     if (after == SB_Extend || after == SB_Format) {
4314         return FALSE;
4315     }
4316
4317     if (before == SB_Extend || before == SB_Format) {
4318         before = backup_one_SB(strbeg, &lpos, utf8_target);
4319     }
4320
4321     /* Do not break after ambiguous terminators like period, if they are
4322      * immediately followed by a number or lowercase letter, if they are
4323      * between uppercase letters, if the first following letter (optionally
4324      * after certain punctuation) is lowercase, or if they are followed by
4325      * "continuation" punctuation such as comma, colon, or semicolon. For
4326      * example, a period may be an abbreviation or numeric period, and thus may
4327      * not mark the end of a sentence.
4328
4329      * SB6. ATerm  ×  Numeric */
4330     if (before == SB_ATerm && after == SB_Numeric) {
4331         return FALSE;
4332     }
4333
4334     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4335     if (before == SB_ATerm && after == SB_Upper) {
4336         temp_pos = lpos;
4337         backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4338         if (backup == SB_Upper || backup == SB_Lower) {
4339             return FALSE;
4340         }
4341     }
4342
4343     /* SB8a.  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
4344      * SB10.  (STerm | ATerm) Close* Sp*  ×  ( Sp | Sep | CR | LF )      */
4345     backup = before;
4346     temp_pos = lpos;
4347     while (backup == SB_Sp) {
4348         backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4349     }
4350     while (backup == SB_Close) {
4351         backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4352     }
4353     if ((backup == SB_STerm || backup == SB_ATerm)
4354         && (   after == SB_SContinue
4355             || after == SB_STerm
4356             || after == SB_ATerm
4357             || after == SB_Sp
4358             || after == SB_Sep
4359             || after == SB_CR
4360             || after == SB_LF))
4361     {
4362         return FALSE;
4363     }
4364
4365     /* SB8.  ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR | LF |
4366      *                                              STerm | ATerm) )* Lower */
4367     if (backup == SB_ATerm) {
4368         U8 * rpos = (U8 *) curpos;
4369         SB_enum later = after;
4370
4371         while (    later != SB_OLetter
4372                 && later != SB_Upper
4373                 && later != SB_Lower
4374                 && later != SB_Sep
4375                 && later != SB_CR
4376                 && later != SB_LF
4377                 && later != SB_STerm
4378                 && later != SB_ATerm
4379                 && later != SB_EDGE)
4380         {
4381             later = advance_one_SB(&rpos, strend, utf8_target);
4382         }
4383         if (later == SB_Lower) {
4384             return FALSE;
4385         }
4386     }
4387
4388     /* Break after sentence terminators, but include closing punctuation,
4389      * trailing spaces, and a paragraph separator (if present). [See note
4390      * below.]
4391      * SB9.  ( STerm | ATerm ) Close*  ×  ( Close | Sp | Sep | CR | LF ) */
4392     backup = before;
4393     temp_pos = lpos;
4394     while (backup == SB_Close) {
4395         backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4396     }
4397     if ((backup == SB_STerm || backup == SB_ATerm)
4398         && (   after == SB_Close
4399             || after == SB_Sp
4400             || after == SB_Sep
4401             || after == SB_CR
4402             || after == SB_LF))
4403     {
4404         return FALSE;
4405     }
4406
4407
4408     /* SB11.  ( STerm | ATerm ) Close* Sp* ( Sep | CR | LF )?  ÷ */
4409     temp_pos = lpos;
4410     backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4411     if (   backup == SB_Sep
4412         || backup == SB_CR
4413         || backup == SB_LF)
4414     {
4415         lpos = temp_pos;
4416     }
4417     else {
4418         backup = before;
4419     }
4420     while (backup == SB_Sp) {
4421         backup = backup_one_SB(strbeg, &lpos, utf8_target);
4422     }
4423     while (backup == SB_Close) {
4424         backup = backup_one_SB(strbeg, &lpos, utf8_target);
4425     }
4426     if (backup == SB_STerm || backup == SB_ATerm) {
4427         return TRUE;
4428     }
4429
4430     /* Otherwise, do not break.
4431     SB12.  Any  ×  Any */
4432
4433     return FALSE;
4434 }
4435
4436 STATIC SB_enum
4437 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4438 {
4439     SB_enum sb;
4440
4441     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4442
4443     if (*curpos >= strend) {
4444         return SB_EDGE;
4445     }
4446
4447     if (utf8_target) {
4448         do {
4449             *curpos += UTF8SKIP(*curpos);
4450             if (*curpos >= strend) {
4451                 return SB_EDGE;
4452             }
4453             sb = getSB_VAL_UTF8(*curpos, strend);
4454         } while (sb == SB_Extend || sb == SB_Format);
4455     }
4456     else {
4457         do {
4458             (*curpos)++;
4459             if (*curpos >= strend) {
4460                 return SB_EDGE;
4461             }
4462             sb = getSB_VAL_CP(**curpos);
4463         } while (sb == SB_Extend || sb == SB_Format);
4464     }
4465
4466     return sb;
4467 }
4468
4469 STATIC SB_enum
4470 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4471 {
4472     SB_enum sb;
4473
4474     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4475
4476     if (*curpos < strbeg) {
4477         return SB_EDGE;
4478     }
4479
4480     if (utf8_target) {
4481         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4482         if (! prev_char_pos) {
4483             return SB_EDGE;
4484         }
4485
4486         /* Back up over Extend and Format.  curpos is always just to the right
4487          * of the characater whose value we are getting */
4488         do {
4489             U8 * prev_prev_char_pos;
4490             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4491                                                                       strbeg)))
4492             {
4493                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4494                 *curpos = prev_char_pos;
4495                 prev_char_pos = prev_prev_char_pos;
4496             }
4497             else {
4498                 *curpos = (U8 *) strbeg;
4499                 return SB_EDGE;
4500             }
4501         } while (sb == SB_Extend || sb == SB_Format);
4502     }
4503     else {
4504         do {
4505             if (*curpos - 2 < strbeg) {
4506                 *curpos = (U8 *) strbeg;
4507                 return SB_EDGE;
4508             }
4509             (*curpos)--;
4510             sb = getSB_VAL_CP(*(*curpos - 1));
4511         } while (sb == SB_Extend || sb == SB_Format);
4512     }
4513
4514     return sb;
4515 }
4516
4517 #define WBcase(before, after) ((WB_ENUM_COUNT * before) + after)
4518
4519 STATIC bool
4520 S_isWB(pTHX_ WB_enum previous,
4521              WB_enum before,
4522              WB_enum after,
4523              const U8 * const strbeg,
4524              const U8 * const curpos,
4525              const U8 * const strend,
4526              const bool utf8_target)
4527 {
4528     /*  Return a boolean as to if the boundary between 'before' and 'after' is
4529      *  a Unicode word break, using their published algorithm.  Context may be
4530      *  needed to make this determination.  If the value for the character
4531      *  before 'before' is known, it is passed as 'previous'; otherwise that
4532      *  should be set to WB_UNKNOWN.  The other input parameters give the
4533      *  boundaries and current position in the matching of the string.  That
4534      *  is, 'curpos' marks the position where the character whose wb value is
4535      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
4536
4537     U8 * before_pos = (U8 *) curpos;
4538     U8 * after_pos = (U8 *) curpos;
4539
4540     PERL_ARGS_ASSERT_ISWB;
4541
4542     /* WB1 and WB2: Break at the start and end of text. */
4543     if (before == WB_EDGE || after == WB_EDGE) {
4544         return TRUE;
4545     }
4546
4547     /* WB 3: Do not break within CRLF. */
4548     if (before == WB_CR && after == WB_LF) {
4549         return FALSE;
4550     }
4551
4552     /* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR
4553      * and LF) */
4554     if (   before == WB_CR || before == WB_LF || before == WB_Newline
4555         || after ==  WB_CR || after ==  WB_LF || after ==  WB_Newline)
4556     {
4557         return TRUE;
4558     }
4559
4560     /* Ignore Format and Extend characters, except when they appear at the
4561      * beginning of a region of text.
4562      * WB4.  X (Extend | Format)*  →  X. */
4563
4564     if (after == WB_Extend || after == WB_Format) {
4565         return FALSE;
4566     }
4567
4568     if (before == WB_Extend || before == WB_Format) {
4569         before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
4570     }
4571
4572     switch (WBcase(before, after)) {
4573             /* Otherwise, break everywhere (including around ideographs).
4574                 WB14.  Any  ÷  Any */
4575             default:
4576                 return TRUE;
4577
4578             /* Do not break between most letters.
4579                 WB5.  (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */
4580             case WBcase(WB_ALetter, WB_ALetter):
4581             case WBcase(WB_ALetter, WB_Hebrew_Letter):
4582             case WBcase(WB_Hebrew_Letter, WB_ALetter):
4583             case WBcase(WB_Hebrew_Letter, WB_Hebrew_Letter):
4584                 return FALSE;
4585
4586             /* Do not break letters across certain punctuation.
4587                 WB6.  (ALetter | Hebrew_Letter)
4588                         × (MidLetter | MidNumLet | Single_Quote) (ALetter
4589                                                             | Hebrew_Letter) */
4590             case WBcase(WB_ALetter, WB_MidLetter):
4591             case WBcase(WB_ALetter, WB_MidNumLet):
4592             case WBcase(WB_ALetter, WB_Single_Quote):
4593             case WBcase(WB_Hebrew_Letter, WB_MidLetter):
4594             case WBcase(WB_Hebrew_Letter, WB_MidNumLet):
4595             /*case WBcase(WB_Hebrew_Letter, WB_Single_Quote):*/
4596                 after = advance_one_WB(&after_pos, strend, utf8_target);
4597                 return after != WB_ALetter && after != WB_Hebrew_Letter;
4598
4599             /* WB7.  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet |
4600              *                    Single_Quote) ×  (ALetter | Hebrew_Letter) */
4601             case WBcase(WB_MidLetter, WB_ALetter):
4602             case WBcase(WB_MidLetter, WB_Hebrew_Letter):
4603             case WBcase(WB_MidNumLet, WB_ALetter):
4604             case WBcase(WB_MidNumLet, WB_Hebrew_Letter):
4605             case WBcase(WB_Single_Quote, WB_ALetter):
4606             case WBcase(WB_Single_Quote, WB_Hebrew_Letter):
4607                 before
4608                   = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
4609                 return before != WB_ALetter && before != WB_Hebrew_Letter;
4610
4611             /* WB7a.  Hebrew_Letter  ×  Single_Quote */
4612             case WBcase(WB_Hebrew_Letter, WB_Single_Quote):
4613                 return FALSE;
4614
4615             /* WB7b.  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
4616             case WBcase(WB_Hebrew_Letter, WB_Double_Quote):
4617                 return advance_one_WB(&after_pos, strend, utf8_target)
4618                                                         != WB_Hebrew_Letter;
4619
4620             /* WB7c.  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
4621             case WBcase(WB_Double_Quote, WB_Hebrew_Letter):
4622                 return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
4623                                                         != WB_Hebrew_Letter;
4624
4625             /* Do not break within sequences of digits, or digits adjacent to
4626              * letters (“3a”, or “A3”).
4627                 WB8.  Numeric  ×  Numeric */
4628             case WBcase(WB_Numeric, WB_Numeric):
4629                 return FALSE;
4630
4631             /* WB9.  (ALetter | Hebrew_Letter)  ×  Numeric */
4632             case WBcase(WB_ALetter, WB_Numeric):
4633             case WBcase(WB_Hebrew_Letter, WB_Numeric):
4634                 return FALSE;
4635
4636             /* WB10.  Numeric  ×  (ALetter | Hebrew_Letter) */
4637             case WBcase(WB_Numeric, WB_ALetter):
4638             case WBcase(WB_Numeric, WB_Hebrew_Letter):
4639                 return FALSE;
4640
4641             /* Do not break within sequences, such as “3.2” or “3,456.789”.
4642                 WB11.   Numeric (MidNum | MidNumLet | Single_Quote)  ×  Numeric
4643              */
4644             case WBcase(WB_MidNum, WB_Numeric):
4645             case WBcase(WB_MidNumLet, WB_Numeric):
4646             case WBcase(WB_Single_Quote, WB_Numeric):
4647                 return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
4648                                                                != WB_Numeric;
4649
4650             /*  WB12.   Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric
4651              *  */
4652             case WBcase(WB_Numeric, WB_MidNum):
4653             case WBcase(WB_Numeric, WB_MidNumLet):
4654             case WBcase(WB_Numeric, WB_Single_Quote):
4655                 return advance_one_WB(&after_pos, strend, utf8_target)
4656                                                                != WB_Numeric;
4657
4658             /* Do not break between Katakana.
4659                WB13.  Katakana  ×  Katakana */
4660             case WBcase(WB_Katakana, WB_Katakana):
4661                 return FALSE;
4662
4663             /* Do not break from extenders.
4664                WB13a.  (ALetter | Hebrew_Letter | Numeric | Katakana |
4665                                             ExtendNumLet)  ×  ExtendNumLet */
4666             case WBcase(WB_ALetter, WB_ExtendNumLet):
4667             case WBcase(WB_Hebrew_Letter, WB_ExtendNumLet):
4668             case WBcase(WB_Numeric, WB_ExtendNumLet):
4669             case WBcase(WB_Katakana, WB_ExtendNumLet):
4670             case WBcase(WB_ExtendNumLet, WB_ExtendNumLet):
4671                 return FALSE;
4672
4673             /* WB13b.  ExtendNumLet  ×  (ALetter | Hebrew_Letter | Numeric
4674              *                                                 | Katakana) */
4675             case WBcase(WB_ExtendNumLet, WB_ALetter):
4676             case WBcase(WB_ExtendNumLet, WB_Hebrew_Letter):
4677             case WBcase(WB_ExtendNumLet, WB_Numeric):
4678             case WBcase(WB_ExtendNumLet, WB_Katakana):
4679                 return FALSE;
4680
4681             /* Do not break between regional indicator symbols.
4682                WB13c.  Regional_Indicator  ×  Regional_Indicator */
4683             case WBcase(WB_Regional_Indicator, WB_Regional_Indicator):
4684                 return FALSE;
4685
4686     }
4687
4688     NOT_REACHED; /* NOTREACHED */
4689 }
4690
4691 STATIC WB_enum
4692 S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4693 {
4694     WB_enum wb;
4695
4696     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
4697
4698     if (*curpos >= strend) {
4699         return WB_EDGE;
4700     }
4701
4702     if (utf8_target) {
4703
4704         /* Advance over Extend and Format */
4705         do {
4706             *curpos += UTF8SKIP(*curpos);
4707             if (*curpos >= strend) {
4708                 return WB_EDGE;
4709             }
4710             wb = getWB_VAL_UTF8(*curpos, strend);
4711         } while (wb == WB_Extend || wb == WB_Format);
4712     }
4713     else {
4714         do {
4715             (*curpos)++;
4716             if (*curpos >= strend) {
4717                 return WB_EDGE;
4718             }
4719             wb = getWB_VAL_CP(**curpos);
4720         } while (wb == WB_Extend || wb == WB_Format);
4721     }
4722
4723     return wb;
4724 }
4725
4726 STATIC WB_enum
4727 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4728 {
4729     WB_enum wb;
4730
4731     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
4732
4733     /* If we know what the previous character's break value is, don't have
4734         * to look it up */
4735     if (*previous != WB_UNKNOWN) {
4736         wb = *previous;
4737         *previous = WB_UNKNOWN;
4738         /* XXX Note that doesn't change curpos, and maybe should */
4739
4740         /* But we always back up over these two types */
4741         if (wb != WB_Extend && wb != WB_Format) {
4742             return wb;
4743         }
4744     }
4745
4746     if (*curpos < strbeg) {
4747         return WB_EDGE;
4748     }
4749
4750     if (utf8_target) {
4751         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4752         if (! prev_char_pos) {
4753             return WB_EDGE;
4754         }
4755
4756         /* Back up over Extend and Format.  curpos is always just to the right
4757          * of the characater whose value we are getting */
4758         do {
4759             U8 * prev_prev_char_pos;
4760             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
4761                                                    -1,
4762                                                    strbeg)))
4763             {
4764                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4765                 *curpos = prev_char_pos;
4766                 prev_char_pos = prev_prev_char_pos;
4767             }
4768             else {
4769                 *curpos = (U8 *) strbeg;
4770                 return WB_EDGE;
4771             }
4772         } while (wb == WB_Extend || wb == WB_Format);
4773     }
4774     else {
4775         do {
4776             if (*curpos - 2 < strbeg) {
4777                 *curpos = (U8 *) strbeg;
4778                 return WB_EDGE;
4779             }
4780             (*curpos)--;
4781             wb = getWB_VAL_CP(*(*curpos - 1));
4782         } while (wb == WB_Extend || wb == WB_Format);
4783     }
4784
4785     return wb;
4786 }
4787
4788 /* returns -1 on failure, $+[0] on success */
4789 STATIC SSize_t
4790 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
4791 {
4792 #if PERL_VERSION < 9 && !defined(PERL_CORE)
4793     dMY_CXT;
4794 #endif
4795     dVAR;
4796     const bool utf8_target = reginfo->is_utf8_target;
4797     const U32 uniflags = UTF8_ALLOW_DEFAULT;
4798     REGEXP *rex_sv = reginfo->prog;
4799     regexp *rex = ReANY(rex_sv);
4800     RXi_GET_DECL(rex,rexi);
4801     /* the current state. This is a cached copy of PL_regmatch_state */
4802     regmatch_state *st;
4803     /* cache heavy used fields of st in registers */
4804     regnode *scan;
4805     regnode *next;
4806     U32 n = 0;  /* general value; init to avoid compiler warning */
4807     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
4808     char *locinput = startpos;
4809     char *pushinput; /* where to continue after a PUSH */
4810     I32 nextchr;   /* is always set to UCHARAT(locinput) */
4811
4812     bool result = 0;        /* return value of S_regmatch */
4813     int depth = 0;          /* depth of backtrack stack */
4814     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
4815     const U32 max_nochange_depth =
4816         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
4817         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
4818     regmatch_state *yes_state = NULL; /* state to pop to on success of
4819                                                             subpattern */
4820     /* mark_state piggy backs on the yes_state logic so that when we unwind
4821        the stack on success we can update the mark_state as we go */
4822     regmatch_state *mark_state = NULL; /* last mark state we have seen */
4823     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
4824     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
4825     U32 state_num;
4826     bool no_final = 0;      /* prevent failure from backtracking? */
4827     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
4828     char *startpoint = locinput;
4829     SV *popmark = NULL;     /* are we looking for a mark? */
4830     SV *sv_commit = NULL;   /* last mark name seen in failure */
4831     SV *sv_yes_mark = NULL; /* last mark name we have seen
4832                                during a successful match */
4833     U32 lastopen = 0;       /* last open we saw */
4834     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
4835     SV* const oreplsv = GvSVn(PL_replgv);
4836     /* these three flags are set by various ops to signal information to
4837      * the very next op. They have a useful lifetime of exactly one loop
4838      * iteration, and are not preserved or restored by state pushes/pops
4839      */
4840     bool sw = 0;            /* the condition value in (?(cond)a|b) */
4841     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
4842     int logical = 0;        /* the following EVAL is:
4843                                 0: (?{...})
4844                                 1: (?(?{...})X|Y)
4845                                 2: (??{...})
4846                                or the following IFMATCH/UNLESSM is:
4847                                 false: plain (?=foo)
4848                                 true:  used as a condition: (?(?=foo))
4849                             */
4850     PAD* last_pad = NULL;
4851     dMULTICALL;
4852     I32 gimme = G_SCALAR;
4853     CV *caller_cv = NULL;       /* who called us */
4854     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
4855     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
4856     U32 maxopenparen = 0;       /* max '(' index seen so far */
4857     int to_complement;  /* Invert the result? */
4858     _char_class_number classnum;
4859     bool is_utf8_pat = reginfo->is_utf8_pat;
4860     bool match = FALSE;
4861
4862
4863 #ifdef DEBUGGING
4864     GET_RE_DEBUG_FLAGS_DECL;
4865 #endif
4866
4867     /* protect against undef(*^R) */
4868     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
4869
4870     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
4871     multicall_oldcatch = 0;
4872     multicall_cv = NULL;
4873     cx = NULL;
4874     PERL_UNUSED_VAR(multicall_cop);
4875     PERL_UNUSED_VAR(newsp);
4876
4877
4878     PERL_ARGS_ASSERT_REGMATCH;
4879
4880     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
4881             PerlIO_printf(Perl_debug_log,"regmatch start\n");
4882     }));
4883
4884     st = PL_regmatch_state;
4885
4886     /* Note that nextchr is a byte even in UTF */
4887     SET_nextchr;
4888     scan = prog;
4889     while (scan != NULL) {
4890
4891         DEBUG_EXECUTE_r( {
4892             SV * const prop = sv_newmortal();
4893             regnode *rnext=regnext(scan);
4894             DUMP_EXEC_POS( locinput, scan, utf8_target );
4895             regprop(rex, prop, scan, reginfo, NULL);
4896
4897             PerlIO_printf(Perl_debug_log,
4898                     "%3"IVdf":%*s%s(%"IVdf")\n",
4899                     (IV)(scan - rexi->program), depth*2, "",
4900                     SvPVX_const(prop),
4901                     (PL_regkind[OP(scan)] == END || !rnext) ?
4902                         0 : (IV)(rnext - rexi->program));
4903         });
4904
4905         next = scan + NEXT_OFF(scan);
4906         if (next == scan)
4907             next = NULL;
4908         state_num = OP(scan);
4909
4910       reenter_switch:
4911         to_complement = 0;
4912
4913         SET_nextchr;
4914         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
4915
4916         switch (state_num) {
4917         case SBOL: /*  /^../ and /\A../  */
4918             if (locinput == reginfo->strbeg)
4919                 break;
4920             sayNO;
4921
4922         case MBOL: /*  /^../m  */
4923             if (locinput == reginfo->strbeg ||
4924                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
4925             {
4926                 break;
4927             }
4928             sayNO;
4929
4930         case GPOS: /*  \G  */
4931             if (locinput == reginfo->ganch)
4932                 break;
4933             sayNO;
4934
4935         case KEEPS: /*   \K  */
4936             /* update the startpoint */
4937             st->u.keeper.val = rex->offs[0].start;
4938             rex->offs[0].start = locinput - reginfo->strbeg;
4939             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
4940             /* NOTREACHED */
4941             NOT_REACHED; /* NOTREACHED */
4942
4943         case KEEPS_next_fail:
4944             /* rollback the start point change */
4945             rex->offs[0].start = st->u.keeper.val;
4946             sayNO_SILENT;
4947             /* NOTREACHED */
4948             NOT_REACHED; /* NOTREACHED */
4949
4950         case MEOL: /* /..$/m  */
4951             if (!NEXTCHR_IS_EOS && nextchr != '\n')
4952                 sayNO;
4953             break;
4954
4955         case SEOL: /* /..$/  */
4956             if (!NEXTCHR_IS_EOS && nextchr != '\n')
4957                 sayNO;
4958             if (reginfo->strend - locinput > 1)
4959                 sayNO;
4960             break;
4961
4962         case EOS: /*  \z  */
4963             if (!NEXTCHR_IS_EOS)
4964                 sayNO;
4965             break;
4966
4967         case SANY: /*  /./s  */
4968             if (NEXTCHR_IS_EOS)
4969                 sayNO;
4970             goto increment_locinput;
4971
4972         case REG_ANY: /*  /./  */
4973             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
4974                 sayNO;
4975             goto increment_locinput;
4976
4977
4978 #undef  ST
4979 #define ST st->u.trie
4980         case TRIEC: /* (ab|cd) with known charclass */
4981             /* In this case the charclass data is available inline so
4982                we can fail fast without a lot of extra overhead.
4983              */
4984             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
4985                 DEBUG_EXECUTE_r(
4986                     PerlIO_printf(Perl_debug_log,
4987                               "%*s  %sfailed to match trie start class...%s\n",
4988                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4989                 );
4990                 sayNO_SILENT;
4991                 /* NOTREACHED */
4992                 NOT_REACHED; /* NOTREACHED */
4993             }
4994             /* FALLTHROUGH */
4995         case TRIE:  /* (ab|cd)  */
4996             /* the basic plan of execution of the trie is:
4997              * At the beginning, run though all the states, and
4998              * find the longest-matching word. Also remember the position
4999              * of the shortest matching word. For example, this pattern:
5000              *    1  2 3 4    5
5001              *    ab|a|x|abcd|abc
5002              * when matched against the string "abcde", will generate
5003              * accept states for all words except 3, with the longest
5004              * matching word being 4, and the shortest being 2 (with
5005              * the position being after char 1 of the string).
5006              *
5007              * Then for each matching word, in word order (i.e. 1,2,4,5),
5008              * we run the remainder of the pattern; on each try setting
5009              * the current position to the character following the word,
5010              * returning to try the next word on failure.
5011              *
5012              * We avoid having to build a list of words at runtime by
5013              * using a compile-time structure, wordinfo[].prev, which
5014              * gives, for each word, the previous accepting word (if any).
5015              * In the case above it would contain the mappings 1->2, 2->0,
5016              * 3->0, 4->5, 5->1.  We can use this table to generate, from
5017              * the longest word (4 above), a list of all words, by
5018              * following the list of prev pointers; this gives us the
5019              * unordered list 4,5,1,2. Then given the current word we have
5020              * just tried, we can go through the list and find the
5021              * next-biggest word to try (so if we just failed on word 2,
5022              * the next in the list is 4).
5023              *
5024              * Since at runtime we don't record the matching position in
5025              * the string for each word, we have to work that out for
5026              * each word we're about to process. The wordinfo table holds
5027              * the character length of each word; given that we recorded
5028              * at the start: the position of the shortest word and its
5029              * length in chars, we just need to move the pointer the
5030              * difference between the two char lengths. Depending on
5031              * Unicode status and folding, that's cheap or expensive.
5032              *
5033              * This algorithm is optimised for the case where are only a
5034              * small number of accept states, i.e. 0,1, or maybe 2.
5035              * With lots of accepts states, and having to try all of them,
5036              * it becomes quadratic on number of accept states to find all
5037              * the next words.
5038              */
5039
5040             {
5041                 /* what type of TRIE am I? (utf8 makes this contextual) */
5042                 DECL_TRIE_TYPE(scan);
5043
5044                 /* what trie are we using right now */
5045                 reg_trie_data * const trie
5046                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5047                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5048                 U32 state = trie->startstate;
5049
5050                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5051                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5052                     if (utf8_target
5053                         && UTF8_IS_ABOVE_LATIN1(nextchr)
5054                         && scan->flags == EXACTL)
5055                     {
5056                         /* We only output for EXACTL, as we let the folder
5057                          * output this message for EXACTFLU8 to avoid
5058                          * duplication */
5059                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5060                                                                reginfo->strend);
5061                     }
5062                 }
5063                 if (   trie->bitmap
5064                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5065                 {
5066                     if (trie->states[ state ].wordnum) {
5067                          DEBUG_EXECUTE_r(
5068                             PerlIO_printf(Perl_debug_log,
5069                                           "%*s  %smatched empty string...%s\n",
5070                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
5071                         );
5072                         if (!trie->jump)
5073                             break;
5074                     } else {
5075                         DEBUG_EXECUTE_r(
5076                             PerlIO_printf(Perl_debug_log,
5077                                           "%*s  %sfailed to match trie start class...%s\n",
5078                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
5079                         );
5080                         sayNO_SILENT;
5081                    }
5082                 }
5083
5084             {
5085                 U8 *uc = ( U8* )locinput;
5086
5087                 STRLEN len = 0;
5088                 STRLEN foldlen = 0;
5089                 U8 *uscan = (U8*)NULL;
5090                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5091                 U32 charcount = 0; /* how many input chars we have matched */
5092                 U32 accepted = 0; /* have we seen any accepting states? */
5093
5094                 ST.jump = trie->jump;
5095                 ST.me = scan;
5096                 ST.firstpos = NULL;
5097                 ST.longfold = FALSE; /* char longer if folded => it's harder */
5098                 ST.nextword = 0;
5099
5100                 /* fully traverse the TRIE; note the position of the
5101                    shortest accept state and the wordnum of the longest
5102                    accept state */
5103
5104                 while ( state && uc <= (U8*)(reginfo->strend) ) {
5105                     U32 base = trie->states[ state ].trans.base;
5106                     UV uvc = 0;
5107                     U16 charid = 0;
5108                     U16 wordnum;
5109                     wordnum = trie->states[ state ].wordnum;
5110
5111                     if (wordnum) { /* it's an accept state */
5112                         if (!accepted) {
5113                             accepted = 1;
5114                             /* record first match position */
5115                             if (ST.longfold) {
5116                                 ST.firstpos = (U8*)locinput;
5117                                 ST.firstchars = 0;
5118                             }
5119                             else {
5120                                 ST.firstpos = uc;
5121                                 ST.firstchars = charcount;
5122                             }
5123                         }
5124                         if (!ST.nextword || wordnum < ST.nextword)
5125                             ST.nextword = wordnum;
5126                         ST.topword = wordnum;
5127                     }
5128
5129                     DEBUG_TRIE_EXECUTE_r({
5130                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
5131                                 PerlIO_printf( Perl_debug_log,
5132                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
5133                                     2+depth * 2, "", PL_colors[4],
5134                                     (UV)state, (accepted ? 'Y' : 'N'));
5135                     });
5136
5137                     /* read a char and goto next state */
5138                     if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5139                         I32 offset;
5140                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5141                                              uscan, len, uvc, charid, foldlen,
5142                                              foldbuf, uniflags);
5143                         charcount++;
5144                         if (foldlen>0)
5145                             ST.longfold = TRUE;
5146                         if (charid &&
5147                              ( ((offset =
5148                               base + charid - 1 - trie->uniquecharcount)) >= 0)
5149
5150                              && ((U32)offset < trie->lasttrans)
5151                              && trie->trans[offset].check == state)
5152                         {
5153                             state = trie->trans[offset].next;
5154                         }
5155                         else {
5156                             state = 0;
5157                         }
5158                         uc += len;
5159
5160                     }
5161                     else {
5162                         state = 0;
5163                     }
5164                     DEBUG_TRIE_EXECUTE_r(
5165                         PerlIO_printf( Perl_debug_log,
5166                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
5167                             charid, uvc, (UV)state, PL_colors[5] );
5168                     );
5169                 }
5170                 if (!accepted)
5171                    sayNO;
5172
5173                 /* calculate total number of accept states */
5174                 {
5175                     U16 w = ST.topword;
5176                     accepted = 0;
5177                     while (w) {
5178                         w = trie->wordinfo[w].prev;
5179                         accepted++;
5180                     }
5181                     ST.accepted = accepted;
5182                 }
5183
5184                 DEBUG_EXECUTE_r(
5185                     PerlIO_printf( Perl_debug_log,
5186                         "%*s  %sgot %"IVdf" possible matches%s\n",
5187                         REPORT_CODE_OFF + depth * 2, "",
5188                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5189                 );
5190                 goto trie_first_try; /* jump into the fail handler */
5191             }}
5192             /* NOTREACHED */
5193             NOT_REACHED; /* NOTREACHED */
5194
5195         case TRIE_next_fail: /* we failed - try next alternative */
5196         {
5197             U8 *uc;
5198             if ( ST.jump) {
5199                 REGCP_UNWIND(ST.cp);
5200                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5201             }
5202             if (!--ST.accepted) {
5203                 DEBUG_EXECUTE_r({
5204                     PerlIO_printf( Perl_debug_log,
5205                         "%*s  %sTRIE failed...%s\n",
5206                         REPORT_CODE_OFF+depth*2, "",
5207                         PL_colors[4],
5208                         PL_colors[5] );
5209                 });
5210                 sayNO_SILENT;
5211             }
5212             {
5213                 /* Find next-highest word to process.  Note that this code
5214                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
5215                 U16 min = 0;
5216                 U16 word;
5217                 U16 const nextword = ST.nextword;
5218                 reg_trie_wordinfo * const wordinfo
5219                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5220                 for (word=ST.topword; word; word=wordinfo[word].prev) {
5221                     if (word > nextword && (!min || word < min))
5222                         min = word;
5223                 }
5224                 ST.nextword = min;
5225             }
5226
5227           trie_first_try:
5228             if (do_cutgroup) {
5229                 do_cutgroup = 0;
5230                 no_final = 0;
5231             }
5232
5233             if ( ST.jump) {
5234                 ST.lastparen = rex->lastparen;
5235                 ST.lastcloseparen = rex->lastcloseparen;
5236                 REGCP_SET(ST.cp);
5237             }
5238
5239             /* find start char of end of current word */
5240             {
5241                 U32 chars; /* how many chars to skip */
5242                 reg_trie_data * const trie
5243                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5244
5245                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5246                             >=  ST.firstchars);
5247                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5248                             - ST.firstchars;
5249                 uc = ST.firstpos;
5250
5251                 if (ST.longfold) {
5252                     /* the hard option - fold each char in turn and find
5253                      * its folded length (which may be different */
5254                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5255                     STRLEN foldlen;
5256                     STRLEN len;
5257                     UV uvc;
5258                     U8 *uscan;
5259
5260                     while (chars) {
5261                         if (utf8_target) {
5262                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5263                                                     uniflags);
5264                             uc += len;
5265                         }
5266                         else {
5267                             uvc = *uc;
5268                             uc++;
5269                         }
5270                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5271                         uscan = foldbuf;
5272                         while (foldlen) {
5273                             if (!--chars)
5274                                 break;
5275                             uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5276                                             uniflags);
5277                             uscan += len;
5278                             foldlen -= len;
5279                         }
5280                     }
5281                 }
5282                 else {
5283                     if (utf8_target)
5284                         while (chars--)
5285                             uc += UTF8SKIP(uc);
5286                     else
5287                         uc += chars;
5288                 }
5289             }
5290
5291             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5292                             ? ST.jump[ST.nextword]
5293                             : NEXT_OFF(ST.me));
5294
5295             DEBUG_EXECUTE_r({
5296                 PerlIO_printf( Perl_debug_log,
5297                     "%*s  %sTRIE matched word #%d, continuing%s\n",
5298                     REPORT_CODE_OFF+depth*2, "",
5299                     PL_colors[4],
5300                     ST.nextword,
5301                     PL_colors[5]
5302                     );
5303             });
5304
5305             if (ST.accepted > 1 || has_cutgroup) {
5306                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
5307                 /* NOTREACHED */
5308                 NOT_REACHED; /* NOTREACHED */
5309             }
5310             /* only one choice left - just continue */
5311             DEBUG_EXECUTE_r({
5312                 AV *const trie_words
5313                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
5314                 SV ** const tmp = trie_words
5315                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
5316                 SV *sv= tmp ? sv_newmortal() : NULL;
5317
5318                 PerlIO_printf( Perl_debug_log,
5319                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
5320                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
5321                     ST.nextword,
5322                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5323                             PL_colors[0], PL_colors[1],
5324                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
5325                         )
5326                     : "not compiled under -Dr",
5327                     PL_colors[5] );
5328             });
5329
5330             locinput = (char*)uc;
5331             continue; /* execute rest of RE */
5332             /* NOTREACHED */
5333         }
5334 #undef  ST
5335
5336         case EXACTL:             /*  /abc/l       */
5337             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5338
5339             /* Complete checking would involve going through every character
5340              * matched by the string to see if any is above latin1.  But the
5341              * comparision otherwise might very well be a fast assembly
5342              * language routine, and I (khw) don't think slowing things down
5343              * just to check for this warning is worth it.  So this just checks
5344              * the first character */
5345             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5346                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5347             }
5348             /* FALLTHROUGH */
5349         case EXACT: {            /*  /abc/        */
5350             char *s = STRING(scan);
5351             ln = STR_LEN(scan);
5352             if (utf8_target != is_utf8_pat) {
5353                 /* The target and the pattern have differing utf8ness. */
5354                 char *l = locinput;
5355                 const char * const e = s + ln;
5356
5357                 if (utf8_target) {
5358                     /* The target is utf8, the pattern is not utf8.
5359                      * Above-Latin1 code points can't match the pattern;
5360                      * invariants match exactly, and the other Latin1 ones need
5361                      * to be downgraded to a single byte in order to do the
5362                      * comparison.  (If we could be confident that the target
5363                      * is not malformed, this could be refactored to have fewer
5364                      * tests by just assuming that if the first bytes match, it
5365                      * is an invariant, but there are tests in the test suite
5366                      * dealing with (??{...}) which violate this) */
5367                     while (s < e) {
5368                         if (l >= reginfo->strend
5369                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5370                         {
5371                             sayNO;
5372                         }
5373                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
5374                             if (*l != *s) {
5375                                 sayNO;
5376                             }
5377                             l++;
5378                         }
5379                         else {
5380                             if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5381                             {
5382                                 sayNO;
5383                             }
5384                             l += 2;
5385                         }
5386                         s++;
5387                     }
5388                 }
5389                 else {
5390                     /* The target is not utf8, the pattern is utf8. */
5391                     while (s < e) {
5392                         if (l >= reginfo->strend
5393                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
5394                         {
5395                             sayNO;
5396                         }
5397                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
5398                             if (*s != *l) {
5399                                 sayNO;
5400                             }
5401                             s++;
5402                         }
5403                         else {
5404                             if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5405                             {
5406                                 sayNO;
5407                             }
5408                             s += 2;
5409                         }
5410                         l++;
5411                     }
5412                 }
5413                 locinput = l;
5414             }
5415             else {
5416                 /* The target and the pattern have the same utf8ness. */
5417                 /* Inline the first character, for speed. */
5418                 if (reginfo->strend - locinput < ln
5419                     || UCHARAT(s) != nextchr
5420                     || (ln > 1 && memNE(s, locinput, ln)))
5421                 {
5422                     sayNO;
5423                 }
5424                 locinput += ln;
5425             }
5426             break;
5427             }
5428
5429         case EXACTFL: {          /*  /abc/il      */
5430             re_fold_t folder;
5431             const U8 * fold_array;
5432             const char * s;
5433             U32 fold_utf8_flags;
5434
5435             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5436             folder = foldEQ_locale;
5437             fold_array = PL_fold_locale;
5438             fold_utf8_flags = FOLDEQ_LOCALE;
5439             goto do_exactf;
5440
5441         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
5442                                       is effectively /u; hence to match, target
5443                                       must be UTF-8. */
5444             if (! utf8_target) {
5445                 sayNO;
5446             }
5447             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5448                                              | FOLDEQ_S1_FOLDS_SANE;
5449             folder = foldEQ_latin1;
5450             fold_array = PL_fold_latin1;
5451             goto do_exactf;
5452
5453         case EXACTFU_SS:         /*  /\x{df}/iu   */
5454         case EXACTFU:            /*  /abc/iu      */
5455             folder = foldEQ_latin1;
5456             fold_array = PL_fold_latin1;
5457             fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
5458             goto do_exactf;
5459
5460         case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
5461                                    patterns */
5462             assert(! is_utf8_pat);
5463             /* FALLTHROUGH */
5464         case EXACTFA:            /*  /abc/iaa     */
5465             folder = foldEQ_latin1;
5466             fold_array = PL_fold_latin1;
5467             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5468             goto do_exactf;
5469
5470         case EXACTF:             /*  /abc/i    This node only generated for
5471                                                non-utf8 patterns */
5472             assert(! is_utf8_pat);
5473             folder = foldEQ;
5474             fold_array = PL_fold;
5475             fold_utf8_flags = 0;
5476
5477           do_exactf:
5478             s = STRING(scan);
5479             ln = STR_LEN(scan);
5480
5481             if (utf8_target
5482                 || is_utf8_pat
5483                 || state_num == EXACTFU_SS
5484                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
5485             {
5486               /* Either target or the pattern are utf8, or has the issue where
5487                * the fold lengths may differ. */
5488                 const char * const l = locinput;
5489                 char *e = reginfo->strend;
5490
5491                 if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
5492                                         l, &e, 0,  utf8_target, fold_utf8_flags))
5493                 {
5494                     sayNO;
5495                 }
5496                 locinput = e;
5497                 break;
5498             }
5499
5500             /* Neither the target nor the pattern are utf8 */
5501             if (UCHARAT(s) != nextchr
5502                 && !NEXTCHR_IS_EOS
5503                 && UCHARAT(s) != fold_array[nextchr])
5504             {
5505                 sayNO;
5506             }
5507             if (reginfo->strend - locinput < ln)
5508                 sayNO;
5509             if (ln > 1 && ! folder(s, locinput, ln))
5510                 sayNO;
5511             locinput += ln;
5512             break;
5513         }
5514
5515         case NBOUNDL: /*  /\B/l  */
5516             to_complement = 1;
5517             /* FALLTHROUGH */
5518
5519         case BOUNDL:  /*  /\b/l  */
5520             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5521
5522             if (FLAGS(scan) != TRADITIONAL_BOUND) {
5523                 if (! IN_UTF8_CTYPE_LOCALE) {
5524                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
5525                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
5526                 }
5527                 goto boundu;
5528             }
5529
5530             if (utf8_target) {
5531                 if (locinput == reginfo->strbeg)
5532                     ln = isWORDCHAR_LC('\n');
5533                 else {
5534                     ln = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
5535                                                         (U8*)(reginfo->strbeg)));
5536                 }
5537                 n = (NEXTCHR_IS_EOS)
5538                     ? isWORDCHAR_LC('\n')
5539                     : isWORDCHAR_LC_utf8((U8*)locinput);
5540             }
5541             else { /* Here the string isn't utf8 */
5542                 ln = (locinput == reginfo->strbeg)
5543                      ? isWORDCHAR_LC('\n')
5544                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
5545                 n = (NEXTCHR_IS_EOS)
5546                     ? isWORDCHAR_LC('\n')
5547                     : isWORDCHAR_LC(nextchr);
5548             }
5549             if (to_complement ^ (ln == n)) {
5550                 sayNO;
5551             }
5552             break;
5553
5554         case NBOUND:  /*  /\B/   */
5555             to_complement = 1;
5556             /* FALLTHROUGH */
5557
5558         case BOUND:   /*  /\b/   */
5559             if (utf8_target) {
5560                 goto bound_utf8;
5561             }
5562             goto bound_ascii_match_only;
5563
5564         case NBOUNDA: /*  /\B/a  */
5565             to_complement = 1;
5566             /* FALLTHROUGH */
5567
5568         case BOUNDA:  /*  /\b/a  */
5569
5570           bound_ascii_match_only:
5571             /* Here the string isn't utf8, or is utf8 and only ascii characters
5572              * are to match \w.  In the latter case looking at the byte just
5573              * prior to the current one may be just the final byte of a
5574              * multi-byte character.  This is ok.  There are two cases:
5575              * 1) it is a single byte character, and then the test is doing
5576              *    just what it's supposed to.
5577              * 2) it is a multi-byte character, in which case the final byte is
5578              *    never mistakable for ASCII, and so the test will say it is
5579              *    not a word character, which is the correct answer. */
5580             ln = (locinput == reginfo->strbeg)
5581                  ? isWORDCHAR_A('\n')
5582                  : isWORDCHAR_A(UCHARAT(locinput - 1));
5583             n = (NEXTCHR_IS_EOS)
5584                 ? isWORDCHAR_A('\n')
5585                 : isWORDCHAR_A(nextchr);
5586             if (to_complement ^ (ln == n)) {
5587                 sayNO;
5588             }
5589             break;
5590
5591         case NBOUNDU: /*  /\B/u  */
5592             to_complement = 1;
5593             /* FALLTHROUGH */
5594
5595         case BOUNDU:  /*  /\b/u  */
5596
5597           boundu:
5598             if (utf8_target) {
5599
5600               bound_utf8:
5601                 switch((bound_type) FLAGS(scan)) {
5602                     case TRADITIONAL_BOUND:
5603                         ln = (locinput == reginfo->strbeg)
5604                              ? 0 /* isWORDCHAR_L1('\n') */
5605                              : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
5606                                                                 (U8*)(reginfo->strbeg)));
5607                         n = (NEXTCHR_IS_EOS)
5608                             ? 0 /* isWORDCHAR_L1('\n') */
5609                             : isWORDCHAR_utf8((U8*)locinput);
5610                         match = cBOOL(ln != n);
5611                         break;
5612                     case GCB_BOUND:
5613                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5614                             match = TRUE; /* GCB always matches at begin and
5615                                              end */
5616                         }
5617                         else {
5618                             /* Find the gcb values of previous and current
5619                              * chars, then see if is a break point */
5620                             match = isGCB(getGCB_VAL_UTF8(
5621                                                 reghop3((U8*)locinput,
5622                                                         -1,
5623                                                         (U8*)(reginfo->strbeg)),
5624                                                 (U8*) reginfo->strend),
5625                                           getGCB_VAL_UTF8((U8*) locinput,
5626                                                         (U8*) reginfo->strend));
5627                         }
5628                         break;
5629
5630                     case SB_BOUND: /* Always matches at begin and end */
5631                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5632                             match = TRUE;
5633                         }
5634                         else {
5635                             match = isSB(getSB_VAL_UTF8(
5636                                                 reghop3((U8*)locinput,
5637                                                         -1,
5638                                                         (U8*)(reginfo->strbeg)),
5639                                                 (U8*) reginfo->strend),
5640                                           getSB_VAL_UTF8((U8*) locinput,
5641                                                         (U8*) reginfo->strend),
5642                                           (U8*) reginfo->strbeg,
5643                                           (U8*) locinput,
5644                                           (U8*) reginfo->strend,
5645                                           utf8_target);
5646                         }
5647                         break;
5648
5649                     case WB_BOUND:
5650                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5651                             match = TRUE;
5652                         }
5653                         else {
5654                             match = isWB(WB_UNKNOWN,
5655                                          getWB_VAL_UTF8(
5656                                                 reghop3((U8*)locinput,
5657                                                         -1,
5658                                                         (U8*)(reginfo->strbeg)),
5659                                                 (U8*) reginfo->strend),
5660                                           getWB_VAL_UTF8((U8*) locinput,
5661                                                         (U8*) reginfo->strend),
5662                                           (U8*) reginfo->strbeg,
5663                                           (U8*) locinput,
5664                                           (U8*) reginfo->strend,
5665                                           utf8_target);
5666                         }
5667                         break;
5668                 }
5669             }
5670             else {  /* Not utf8 target */
5671                 switch((bound_type) FLAGS(scan)) {
5672                     case TRADITIONAL_BOUND:
5673                         ln = (locinput == reginfo->strbeg)
5674                             ? 0 /* isWORDCHAR_L1('\n') */
5675                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
5676                         n = (NEXTCHR_IS_EOS)
5677                             ? 0 /* isWORDCHAR_L1('\n') */
5678                             : isWORDCHAR_L1(nextchr);
5679                         match = cBOOL(ln != n);
5680                         break;
5681
5682                     case GCB_BOUND:
5683                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5684                             match = TRUE; /* GCB always matches at begin and
5685                                              end */
5686                         }
5687                         else {  /* Only CR-LF combo isn't a GCB in 0-255
5688                                    range */
5689                             match =    UCHARAT(locinput - 1) != '\r'
5690                                     || UCHARAT(locinput) != '\n';
5691                         }
5692                         break;
5693
5694                     case SB_BOUND: /* Always matches at begin and end */
5695                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5696                             match = TRUE;
5697                         }
5698                         else {
5699                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
5700                                          getSB_VAL_CP(UCHARAT(locinput)),
5701                                          (U8*) reginfo->strbeg,
5702                                          (U8*) locinput,
5703                                          (U8*) reginfo->strend,
5704                                          utf8_target);
5705                         }
5706                         break;
5707
5708                     case WB_BOUND:
5709                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5710                             match = TRUE;
5711                         }
5712                         else {
5713                             match = isWB(WB_UNKNOWN,
5714                                          getWB_VAL_CP(UCHARAT(locinput -1)),
5715                                          getWB_VAL_CP(UCHARAT(locinput)),
5716                                          (U8*) reginfo->strbeg,
5717                                          (U8*) locinput,
5718                                          (U8*) reginfo->strend,
5719                                          utf8_target);
5720                         }
5721                         break;
5722                 }
5723             }
5724
5725             if (to_complement ^ ! match) {
5726                 sayNO;
5727             }
5728             break;
5729
5730         case ANYOFL:  /*  /[abc]/l      */
5731             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5732             /* FALLTHROUGH */
5733         case ANYOF:  /*   /[abc]/       */
5734             if (NEXTCHR_IS_EOS)
5735                 sayNO;
5736             if (utf8_target) {
5737                 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
5738                                                                    utf8_target))
5739                     sayNO;
5740                 locinput += UTF8SKIP(locinput);
5741             }
5742             else {
5743                 if (!REGINCLASS(rex, scan, (U8*)locinput))
5744                     sayNO;
5745                 locinput++;
5746             }
5747             break;
5748
5749         /* The argument (FLAGS) to all the POSIX node types is the class number
5750          * */
5751
5752         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
5753             to_complement = 1;
5754             /* FALLTHROUGH */
5755
5756         case POSIXL:    /* \w or [:punct:] etc. under /l */
5757             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5758             if (NEXTCHR_IS_EOS)
5759                 sayNO;
5760
5761             /* Use isFOO_lc() for characters within Latin1.  (Note that
5762              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
5763              * wouldn't be invariant) */
5764             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
5765                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
5766                     sayNO;
5767                 }
5768             }
5769             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
5770                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
5771                                            (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
5772                                                             *(locinput + 1))))))
5773                 {
5774                     sayNO;
5775                 }
5776             }
5777             else { /* Here, must be an above Latin-1 code point */
5778                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5779                 goto utf8_posix_above_latin1;
5780             }
5781
5782             /* Here, must be utf8 */
5783             locinput += UTF8SKIP(locinput);
5784             break;
5785
5786         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
5787             to_complement = 1;
5788             /* FALLTHROUGH */
5789
5790         case POSIXD:    /* \w or [:punct:] etc. under /d */
5791             if (utf8_target) {
5792                 goto utf8_posix;
5793             }
5794             goto posixa;
5795
5796         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
5797
5798             if (NEXTCHR_IS_EOS) {
5799                 sayNO;
5800             }
5801
5802             /* All UTF-8 variants match */
5803             if (! UTF8_IS_INVARIANT(nextchr)) {
5804                 goto increment_locinput;
5805             }
5806
5807             to_complement = 1;
5808             /* FALLTHROUGH */
5809
5810         case POSIXA:    /* \w or [:punct:] etc. under /a */
5811
5812           posixa:
5813             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
5814              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
5815              * character is a single byte */
5816
5817             if (NEXTCHR_IS_EOS
5818                 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
5819                                                             FLAGS(scan)))))
5820             {
5821                 sayNO;
5822             }
5823
5824             /* Here we are either not in utf8, or we matched a utf8-invariant,
5825              * so the next char is the next byte */
5826             locinput++;
5827             break;
5828
5829         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
5830             to_complement = 1;
5831             /* FALLTHROUGH */
5832
5833         case POSIXU:    /* \w or [:punct:] etc. under /u */
5834           utf8_posix:
5835             if (NEXTCHR_IS_EOS) {
5836                 sayNO;
5837             }
5838
5839             /* Use _generic_isCC() for characters within Latin1.  (Note that
5840              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
5841              * wouldn't be invariant) */
5842             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
5843                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
5844                                                            FLAGS(scan)))))
5845                 {
5846                     sayNO;
5847                 }
5848                 locinput++;
5849             }
5850             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
5851                 if (! (to_complement
5852                        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
5853                                                                *(locinput + 1)),
5854                                              FLAGS(scan)))))
5855                 {
5856                     sayNO;
5857                 }
5858                 locinput += 2;
5859             }
5860             else {  /* Handle above Latin-1 code points */
5861               utf8_posix_above_latin1:
5862                 classnum = (_char_class_number) FLAGS(scan);
5863                 if (classnum < _FIRST_NON_SWASH_CC) {
5864
5865                     /* Here, uses a swash to find such code points.  Load if if
5866                      * not done already */
5867                     if (! PL_utf8_swash_ptrs[classnum]) {
5868                         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
5869                         PL_utf8_swash_ptrs[classnum]
5870                                 = _core_swash_init("utf8",
5871                                         "",
5872                                         &PL_sv_undef, 1, 0,
5873                                         PL_XPosix_ptrs[classnum], &flags);
5874                     }
5875                     if (! (to_complement
5876                            ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
5877                                                (U8 *) locinput, TRUE))))
5878                     {
5879                         sayNO;
5880                     }
5881                 }
5882                 else {  /* Here, uses macros to find above Latin-1 code points */
5883                     switch (classnum) {
5884                         case _CC_ENUM_SPACE:
5885                             if (! (to_complement
5886                                         ^ cBOOL(is_XPERLSPACE_high(locinput))))
5887                             {
5888                                 sayNO;
5889                             }
5890                             break;
5891                         case _CC_ENUM_BLANK:
5892                             if (! (to_complement
5893                                             ^ cBOOL(is_HORIZWS_high(locinput))))
5894                             {
5895                                 sayNO;
5896                             }
5897                             break;
5898                         case _CC_ENUM_XDIGIT:
5899                             if (! (to_complement
5900                                             ^ cBOOL(is_XDIGIT_high(locinput))))
5901                             {
5902                                 sayNO;
5903                             }
5904                             break;
5905                         case _CC_ENUM_VERTSPACE:
5906                             if (! (to_complement
5907                                             ^ cBOOL(is_VERTWS_high(locinput))))
5908                             {
5909                                 sayNO;
5910                             }
5911                             break;
5912                         default:    /* The rest, e.g. [:cntrl:], can't match
5913                                        above Latin1 */
5914                             if (! to_complement) {
5915                                 sayNO;
5916                             }
5917                             break;
5918                     }
5919                 }
5920                 locinput += UTF8SKIP(locinput);
5921             }
5922             break;
5923
5924         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
5925                        a Unicode extended Grapheme Cluster */
5926             if (NEXTCHR_IS_EOS)
5927                 sayNO;
5928             if  (! utf8_target) {
5929
5930                 /* Match either CR LF  or '.', as all the other possibilities
5931                  * require utf8 */
5932                 locinput++;         /* Match the . or CR */
5933                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
5934                                        match the LF */
5935                     && locinput < reginfo->strend
5936                     && UCHARAT(locinput) == '\n')
5937                 {
5938                     locinput++;
5939                 }
5940             }
5941             else {
5942
5943                 /* Get the gcb type for the current character */
5944                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
5945                                                        (U8*) reginfo->strend);
5946
5947                 /* Then scan through the input until we get to the first
5948                  * character whose type is supposed to be a gcb with the
5949                  * current character.  (There is always a break at the
5950                  * end-of-input) */
5951                 locinput += UTF8SKIP(locinput);
5952                 while (locinput < reginfo->strend) {
5953                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
5954                                                          (U8*) reginfo->strend);
5955                     if (isGCB(prev_gcb, cur_gcb)) {
5956                         break;
5957                     }
5958
5959                     prev_gcb = cur_gcb;
5960                     locinput += UTF8SKIP(locinput);
5961                 }
5962
5963
5964             }
5965             break;
5966
5967         case NREFFL:  /*  /\g{name}/il  */
5968         {   /* The capture buffer cases.  The ones beginning with N for the
5969                named buffers just convert to the equivalent numbered and
5970                pretend they were called as the corresponding numbered buffer
5971                op.  */
5972             /* don't initialize these in the declaration, it makes C++
5973                unhappy */
5974             const char *s;
5975             char type;
5976             re_fold_t folder;
5977             const U8 *fold_array;
5978             UV utf8_fold_flags;
5979
5980             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5981             folder = foldEQ_locale;
5982             fold_array = PL_fold_locale;
5983             type = REFFL;
5984             utf8_fold_flags = FOLDEQ_LOCALE;
5985             goto do_nref;
5986
5987         case NREFFA:  /*  /\g{name}/iaa  */
5988             folder = foldEQ_latin1;
5989             fold_array = PL_fold_latin1;
5990             type = REFFA;
5991             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5992             goto do_nref;
5993
5994         case NREFFU:  /*  /\g{name}/iu  */
5995             folder = foldEQ_latin1;
5996             fold_array = PL_fold_latin1;
5997             type = REFFU;
5998             utf8_fold_flags = 0;
5999             goto do_nref;
6000
6001         case NREFF:  /*  /\g{name}/i  */
6002             folder = foldEQ;
6003             fold_array = PL_fold;
6004             type = REFF;
6005             utf8_fold_flags = 0;
6006             goto do_nref;
6007
6008         case NREF:  /*  /\g{name}/   */
6009             type = REF;
6010             folder = NULL;
6011             fold_array = NULL;
6012             utf8_fold_flags = 0;
6013           do_nref:
6014
6015             /* For the named back references, find the corresponding buffer
6016              * number */
6017             n = reg_check_named_buff_matched(rex,scan);
6018
6019             if ( ! n ) {
6020                 sayNO;
6021             }
6022             goto do_nref_ref_common;
6023
6024         case REFFL:  /*  /\1/il  */
6025             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6026             folder = foldEQ_locale;
6027             fold_array = PL_fold_locale;
6028             utf8_fold_flags = FOLDEQ_LOCALE;
6029             goto do_ref;
6030
6031         case REFFA:  /*  /\1/iaa  */
6032             folder = foldEQ_latin1;
6033             fold_array = PL_fold_latin1;
6034             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6035             goto do_ref;
6036
6037         case REFFU:  /*  /\1/iu  */
6038             folder = foldEQ_latin1;
6039             fold_array = PL_fold_latin1;
6040             utf8_fold_flags = 0;
6041             goto do_ref;
6042
6043         case REFF:  /*  /\1/i  */
6044             folder = foldEQ;
6045             fold_array = PL_fold;
6046             utf8_fold_flags = 0;
6047             goto do_ref;
6048
6049         case REF:  /*  /\1/    */
6050             folder = NULL;
6051             fold_array = NULL;
6052             utf8_fold_flags = 0;
6053
6054           do_ref:
6055             type = OP(scan);
6056             n = ARG(scan);  /* which paren pair */
6057
6058           do_nref_ref_common:
6059             ln = rex->offs[n].start;
6060             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6061             if (rex->lastparen < n || ln == -1)
6062                 sayNO;                  /* Do not match unless seen CLOSEn. */
6063             if (ln == rex->offs[n].end)
6064                 break;
6065
6066             s = reginfo->strbeg + ln;
6067             if (type != REF     /* REF can do byte comparison */
6068                 && (utf8_target || type == REFFU || type == REFFL))
6069             {
6070                 char * limit = reginfo->strend;
6071
6072                 /* This call case insensitively compares the entire buffer
6073                     * at s, with the current input starting at locinput, but
6074                     * not going off the end given by reginfo->strend, and
6075                     * returns in <limit> upon success, how much of the
6076                     * current input was matched */
6077                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
6078                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
6079                 {
6080                     sayNO;
6081                 }
6082                 locinput = limit;
6083                 break;
6084             }
6085
6086             /* Not utf8:  Inline the first character, for speed. */
6087             if (!NEXTCHR_IS_EOS &&
6088                 UCHARAT(s) != nextchr &&
6089                 (type == REF ||
6090                  UCHARAT(s) != fold_array[nextchr]))
6091                 sayNO;
6092             ln = rex->offs[n].end - ln;
6093             if (locinput + ln > reginfo->strend)
6094                 sayNO;
6095             if (ln > 1 && (type == REF
6096                            ? memNE(s, locinput, ln)
6097                            : ! folder(s, locinput, ln)))
6098                 sayNO;
6099             locinput += ln;
6100             break;
6101         }
6102
6103         case NOTHING: /* null op; e.g. the 'nothing' following
6104                        * the '*' in m{(a+|b)*}' */
6105             break;
6106         case TAIL: /* placeholder while compiling (A|B|C) */
6107             break;
6108
6109 #undef  ST
6110 #define ST st->u.eval
6111         {
6112             SV *ret;
6113             REGEXP *re_sv;
6114             regexp *re;
6115             regexp_internal *rei;
6116             regnode *startpoint;
6117
6118         case GOSTART: /*  (?R)  */
6119         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
6120             if (cur_eval && cur_eval->locinput==locinput) {
6121                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
6122                     Perl_croak(aTHX_ "Infinite recursion in regex");
6123                 if ( ++nochange_depth > max_nochange_depth )
6124                     Perl_croak(aTHX_
6125                         "Pattern subroutine nesting without pos change"
6126                         " exceeded limit in regex");
6127             } else {
6128                 nochange_depth = 0;
6129             }
6130             re_sv = rex_sv;
6131             re = rex;
6132             rei = rexi;
6133             if (OP(scan)==GOSUB) {
6134                 startpoint = scan + ARG2L(scan);
6135                 ST.close_paren = ARG(scan);
6136             } else {
6137                 startpoint = rei->program+1;
6138                 ST.close_paren = 0;
6139             }
6140
6141             /* Save all the positions seen so far. */
6142             ST.cp = regcppush(rex, 0, maxopenparen);
6143             REGCP_SET(ST.lastcp);
6144
6145             /* and then jump to the code we share with EVAL */
6146             goto eval_recurse_doit;
6147             /* NOTREACHED */
6148
6149         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
6150             if (cur_eval && cur_eval->locinput==locinput) {
6151                 if ( ++nochange_depth > max_nochange_depth )
6152                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6153             } else {
6154                 nochange_depth = 0;
6155             }
6156             {
6157                 /* execute the code in the {...} */
6158
6159                 dSP;
6160                 IV before;
6161                 OP * const oop = PL_op;
6162                 COP * const ocurcop = PL_curcop;
6163                 OP *nop;
6164                 CV *newcv;
6165
6166                 /* save *all* paren positions */
6167                 regcppush(rex, 0, maxopenparen);
6168                 REGCP_SET(runops_cp);
6169
6170                 if (!caller_cv)
6171                     caller_cv = find_runcv(NULL);
6172
6173                 n = ARG(scan);
6174
6175                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
6176                     newcv = (ReANY(
6177                                                 (REGEXP*)(rexi->data->data[n])
6178                                             ))->qr_anoncv
6179                                         ;
6180                     nop = (OP*)rexi->data->data[n+1];
6181                 }
6182                 else if (rexi->data->what[n] == 'l') { /* literal code */
6183                     newcv = caller_cv;
6184                     nop = (OP*)rexi->data->data[n];
6185                     assert(CvDEPTH(newcv));
6186                 }
6187                 else {
6188                     /* literal with own CV */
6189                     assert(rexi->data->what[n] == 'L');
6190                     newcv = rex->qr_anoncv;
6191                     nop = (OP*)rexi->data->data[n];
6192                 }
6193
6194                 /* normally if we're about to execute code from the same
6195                  * CV that we used previously, we just use the existing
6196                  * CX stack entry. However, its possible that in the
6197                  * meantime we may have backtracked, popped from the save
6198                  * stack, and undone the SAVECOMPPAD(s) associated with
6199                  * PUSH_MULTICALL; in which case PL_comppad no longer
6200                  * points to newcv's pad. */
6201                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6202                 {
6203                     U8 flags = (CXp_SUB_RE |
6204                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
6205                     if (last_pushed_cv) {
6206                         CHANGE_MULTICALL_FLAGS(newcv, flags);
6207                     }
6208                     else {
6209                         PUSH_MULTICALL_FLAGS(newcv, flags);
6210                     }
6211                     last_pushed_cv = newcv;
6212                 }
6213                 else {
6214                     /* these assignments are just to silence compiler
6215                      * warnings */
6216                     multicall_cop = NULL;
6217                     newsp = NULL;
6218                 }
6219                 last_pad = PL_comppad;
6220
6221                 /* the initial nextstate you would normally execute
6222                  * at the start of an eval (which would cause error
6223                  * messages to come from the eval), may be optimised
6224                  * away from the execution path in the regex code blocks;
6225                  * so manually set PL_curcop to it initially */
6226                 {
6227                     OP *o = cUNOPx(nop)->op_first;
6228                     assert(o->op_type == OP_NULL);
6229                     if (o->op_targ == OP_SCOPE) {
6230                         o = cUNOPo->op_first;
6231                     }
6232                     else {
6233                         assert(o->op_targ == OP_LEAVE);
6234                         o = cUNOPo->op_first;
6235                         assert(o->op_type == OP_ENTER);
6236                         o = OpSIBLING(o);
6237                     }
6238
6239                     if (o->op_type != OP_STUB) {
6240                         assert(    o->op_type == OP_NEXTSTATE
6241                                 || o->op_type == OP_DBSTATE
6242                                 || (o->op_type == OP_NULL
6243                                     &&  (  o->op_targ == OP_NEXTSTATE
6244                                         || o->op_targ == OP_DBSTATE
6245                                         )
6246                                     )
6247                         );
6248                         PL_curcop = (COP*)o;
6249                     }
6250                 }
6251                 nop = nop->op_next;
6252
6253                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
6254                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
6255
6256                 rex->offs[0].end = locinput - reginfo->strbeg;
6257                 if (reginfo->info_aux_eval->pos_magic)
6258                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6259                                   reginfo->sv, reginfo->strbeg,
6260                                   locinput - reginfo->strbeg);
6261
6262                 if (sv_yes_mark) {
6263                     SV *sv_mrk = get_sv("REGMARK", 1);
6264                     sv_setsv(sv_mrk, sv_yes_mark);
6265                 }
6266
6267                 /* we don't use MULTICALL here as we want to call the
6268                  * first op of the block of interest, rather than the
6269                  * first op of the sub */
6270                 before = (IV)(SP-PL_stack_base);
6271                 PL_op = nop;
6272                 CALLRUNOPS(aTHX);                       /* Scalar context. */
6273                 SPAGAIN;
6274                 if ((IV)(SP-PL_stack_base) == before)
6275                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
6276                 else {
6277                     ret = POPs;
6278                     PUTBACK;
6279                 }
6280
6281                 /* before restoring everything, evaluate the returned
6282                  * value, so that 'uninit' warnings don't use the wrong
6283                  * PL_op or pad. Also need to process any magic vars
6284                  * (e.g. $1) *before* parentheses are restored */
6285
6286                 PL_op = NULL;
6287
6288                 re_sv = NULL;
6289                 if (logical == 0)        /*   (?{})/   */
6290                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6291                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
6292                     sw = cBOOL(SvTRUE(ret));
6293                     logical = 0;
6294                 }
6295                 else {                   /*  /(??{})  */
6296                     /*  if its overloaded, let the regex compiler handle
6297                      *  it; otherwise extract regex, or stringify  */
6298                     if (SvGMAGICAL(ret))
6299                         ret = sv_mortalcopy(ret);
6300                     if (!SvAMAGIC(ret)) {
6301                         SV *sv = ret;
6302                         if (SvROK(sv))
6303                             sv = SvRV(sv);
6304                         if (SvTYPE(sv) == SVt_REGEXP)
6305                             re_sv = (REGEXP*) sv;
6306                         else if (SvSMAGICAL(ret)) {
6307                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
6308                             if (mg)
6309                                 re_sv = (REGEXP *) mg->mg_obj;
6310                         }
6311
6312                         /* force any undef warnings here */
6313                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
6314                             ret = sv_mortalcopy(ret);
6315                             (void) SvPV_force_nolen(ret);
6316                         }
6317                     }
6318
6319                 }
6320
6321                 /* *** Note that at this point we don't restore
6322                  * PL_comppad, (or pop the CxSUB) on the assumption it may
6323                  * be used again soon. This is safe as long as nothing
6324                  * in the regexp code uses the pad ! */
6325                 PL_op = oop;
6326                 PL_curcop = ocurcop;
6327                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
6328                 PL_curpm = PL_reg_curpm;
6329
6330                 if (logical != 2)
6331                     break;
6332             }
6333
6334                 /* only /(??{})/  from now on */
6335                 logical = 0;
6336                 {
6337                     /* extract RE object from returned value; compiling if
6338                      * necessary */
6339
6340                     if (re_sv) {
6341                         re_sv = reg_temp_copy(NULL, re_sv);
6342                     }
6343                     else {
6344                         U32 pm_flags = 0;
6345
6346                         if (SvUTF8(ret) && IN_BYTES) {
6347                             /* In use 'bytes': make a copy of the octet
6348                              * sequence, but without the flag on */
6349                             STRLEN len;
6350                             const char *const p = SvPV(ret, len);
6351                             ret = newSVpvn_flags(p, len, SVs_TEMP);
6352                         }
6353                         if (rex->intflags & PREGf_USE_RE_EVAL)
6354                             pm_flags |= PMf_USE_RE_EVAL;
6355
6356                         /* if we got here, it should be an engine which
6357                          * supports compiling code blocks and stuff */
6358                         assert(rex->engine && rex->engine->op_comp);
6359                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
6360                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
6361                                     rex->engine, NULL, NULL,
6362                                     /* copy /msixn etc to inner pattern */
6363                                     ARG2L(scan),
6364                                     pm_flags);
6365
6366                         if (!(SvFLAGS(ret)
6367                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
6368                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
6369                             /* This isn't a first class regexp. Instead, it's
6370                                caching a regexp onto an existing, Perl visible
6371                                scalar.  */
6372                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
6373                         }
6374                     }
6375                     SAVEFREESV(re_sv);
6376                     re = ReANY(re_sv);
6377                 }
6378                 RXp_MATCH_COPIED_off(re);
6379                 re->subbeg = rex->subbeg;
6380                 re->sublen = rex->sublen;
6381                 re->suboffset = rex->suboffset;
6382                 re->subcoffset = rex->subcoffset;
6383                 re->lastparen = 0;
6384                 re->lastcloseparen = 0;
6385                 rei = RXi_GET(re);
6386                 DEBUG_EXECUTE_r(
6387                     debug_start_match(re_sv, utf8_target, locinput,
6388                                     reginfo->strend, "Matching embedded");
6389                 );
6390                 startpoint = rei->program + 1;
6391                 ST.close_paren = 0; /* only used for GOSUB */
6392                 /* Save all the seen positions so far. */
6393                 ST.cp = regcppush(rex, 0, maxopenparen);
6394                 REGCP_SET(ST.lastcp);
6395                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
6396                 maxopenparen = 0;
6397                 /* run the pattern returned from (??{...}) */
6398
6399               eval_recurse_doit: /* Share code with GOSUB below this line
6400                             * At this point we expect the stack context to be
6401                             * set up correctly */
6402
6403                 /* invalidate the S-L poscache. We're now executing a
6404                  * different set of WHILEM ops (and their associated
6405                  * indexes) against the same string, so the bits in the
6406                  * cache are meaningless. Setting maxiter to zero forces
6407                  * the cache to be invalidated and zeroed before reuse.
6408                  * XXX This is too dramatic a measure. Ideally we should
6409                  * save the old cache and restore when running the outer
6410                  * pattern again */
6411                 reginfo->poscache_maxiter = 0;
6412
6413                 /* the new regexp might have a different is_utf8_pat than we do */
6414                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
6415
6416                 ST.prev_rex = rex_sv;
6417                 ST.prev_curlyx = cur_curlyx;
6418                 rex_sv = re_sv;
6419                 SET_reg_curpm(rex_sv);
6420                 rex = re;
6421                 rexi = rei;
6422                 cur_curlyx = NULL;
6423                 ST.B = next;
6424                 ST.prev_eval = cur_eval;
6425                 cur_eval = st;
6426                 /* now continue from first node in postoned RE */
6427                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
6428                 /* NOTREACHED */
6429                 NOT_REACHED; /* NOTREACHED */
6430         }
6431
6432         case EVAL_AB: /* cleanup after a successful (??{A})B */
6433             /* note: this is called twice; first after popping B, then A */
6434             rex_sv = ST.prev_rex;
6435             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6436             SET_reg_curpm(rex_sv);
6437             rex = ReANY(rex_sv);
6438             rexi = RXi_GET(rex);
6439             {
6440                 /* preserve $^R across LEAVE's. See Bug 121070. */
6441                 SV *save_sv= GvSV(PL_replgv);
6442                 SvREFCNT_inc(save_sv);
6443                 regcpblow(ST.cp); /* LEAVE in disguise */
6444                 sv_setsv(GvSV(PL_replgv), save_sv);
6445                 SvREFCNT_dec(save_sv);
6446             }
6447             cur_eval = ST.prev_eval;
6448             cur_curlyx = ST.prev_curlyx;
6449
6450             /* Invalidate cache. See "invalidate" comment above. */
6451             reginfo->poscache_maxiter = 0;
6452             if ( nochange_depth )
6453                 nochange_depth--;
6454             sayYES;
6455
6456
6457         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
6458             /* note: this is called twice; first after popping B, then A */
6459             rex_sv = ST.prev_rex;
6460             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6461             SET_reg_curpm(rex_sv);
6462             rex = ReANY(rex_sv);
6463             rexi = RXi_GET(rex);
6464
6465             REGCP_UNWIND(ST.lastcp);
6466             regcppop(rex, &maxopenparen);
6467             cur_eval = ST.prev_eval;
6468             cur_curlyx = ST.prev_curlyx;
6469             /* Invalidate cache. See "invalidate" comment above. */
6470             reginfo->poscache_maxiter = 0;
6471             if ( nochange_depth )
6472                 nochange_depth--;
6473             sayNO_SILENT;
6474 #undef ST
6475
6476         case OPEN: /*  (  */
6477             n = ARG(scan);  /* which paren pair */
6478             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
6479             if (n > maxopenparen)
6480                 maxopenparen = n;
6481             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
6482                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
6483                 PTR2UV(rex),
6484                 PTR2UV(rex->offs),
6485                 (UV)n,
6486                 (IV)rex->offs[n].start_tmp,
6487                 (UV)maxopenparen
6488             ));
6489             lastopen = n;
6490             break;
6491
6492 /* XXX really need to log other places start/end are set too */
6493 #define CLOSE_CAPTURE \
6494     rex->offs[n].start = rex->offs[n].start_tmp; \
6495     rex->offs[n].end = locinput - reginfo->strbeg; \
6496     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
6497         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
6498         PTR2UV(rex), \
6499         PTR2UV(rex->offs), \
6500         (UV)n, \
6501         (IV)rex->offs[n].start, \
6502         (IV)rex->offs[n].end \
6503     ))
6504
6505         case CLOSE:  /*  )  */
6506             n = ARG(scan);  /* which paren pair */
6507             CLOSE_CAPTURE;
6508             if (n > rex->lastparen)
6509                 rex->lastparen = n;
6510             rex->lastcloseparen = n;
6511             if (cur_eval && cur_eval->u.eval.close_paren == n) {
6512                 goto fake_end;
6513             }
6514             break;
6515
6516         case ACCEPT:  /*  (*ACCEPT)  */
6517             if (ARG(scan)){
6518                 regnode *cursor;
6519                 for (cursor=scan;
6520                      cursor && OP(cursor)!=END;
6521                      cursor=regnext(cursor))
6522                 {
6523                     if ( OP(cursor)==CLOSE ){
6524                         n = ARG(cursor);
6525                         if ( n <= lastopen ) {
6526                             CLOSE_CAPTURE;
6527                             if (n > rex->lastparen)
6528                                 rex->lastparen = n;
6529                             rex->lastcloseparen = n;
6530                             if ( n == ARG(scan) || (cur_eval &&
6531                                 cur_eval->u.eval.close_paren == n))
6532                                 break;
6533                         }
6534                     }
6535                 }
6536             }
6537             goto fake_end;
6538             /* NOTREACHED */
6539
6540         case GROUPP:  /*  (?(1))  */
6541             n = ARG(scan);  /* which paren pair */
6542             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
6543             break;
6544
6545         case NGROUPP:  /*  (?(<name>))  */
6546             /* reg_check_named_buff_matched returns 0 for no match */
6547             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
6548             break;
6549
6550         case INSUBP:   /*  (?(R))  */
6551             n = ARG(scan);
6552             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
6553             break;
6554
6555         case DEFINEP:  /*  (?(DEFINE))  */
6556             sw = 0;
6557             break;
6558
6559         case IFTHEN:   /*  (?(cond)A|B)  */
6560             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6561             if (sw)
6562                 next = NEXTOPER(NEXTOPER(scan));
6563             else {
6564                 next = scan + ARG(scan);
6565                 if (OP(next) == IFTHEN) /* Fake one. */
6566                     next = NEXTOPER(NEXTOPER(next));
6567             }
6568             break;
6569
6570         case LOGICAL:  /* modifier for EVAL and IFMATCH */
6571             logical = scan->flags;
6572             break;
6573
6574 /*******************************************************************
6575
6576 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
6577 pattern, where A and B are subpatterns. (For simple A, CURLYM or
6578 STAR/PLUS/CURLY/CURLYN are used instead.)
6579
6580 A*B is compiled as <CURLYX><A><WHILEM><B>
6581
6582 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
6583 state, which contains the current count, initialised to -1. It also sets
6584 cur_curlyx to point to this state, with any previous value saved in the
6585 state block.
6586
6587 CURLYX then jumps straight to the WHILEM op, rather than executing A,
6588 since the pattern may possibly match zero times (i.e. it's a while {} loop
6589 rather than a do {} while loop).
6590
6591 Each entry to WHILEM represents a successful match of A. The count in the
6592 CURLYX block is incremented, another WHILEM state is pushed, and execution
6593 passes to A or B depending on greediness and the current count.
6594
6595 For example, if matching against the string a1a2a3b (where the aN are
6596 substrings that match /A/), then the match progresses as follows: (the
6597 pushed states are interspersed with the bits of strings matched so far):
6598
6599     <CURLYX cnt=-1>
6600     <CURLYX cnt=0><WHILEM>
6601     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
6602     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
6603     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
6604     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
6605
6606 (Contrast this with something like CURLYM, which maintains only a single
6607 backtrack state:
6608
6609     <CURLYM cnt=0> a1
6610     a1 <CURLYM cnt=1> a2
6611     a1 a2 <CURLYM cnt=2> a3
6612     a1 a2 a3 <CURLYM cnt=3> b
6613 )
6614
6615 Each WHILEM state block marks a point to backtrack to upon partial failure
6616 of A or B, and also contains some minor state data related to that
6617 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
6618 overall state, such as the count, and pointers to the A and B ops.
6619
6620 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
6621 must always point to the *current* CURLYX block, the rules are:
6622
6623 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
6624 and set cur_curlyx to point the new block.
6625
6626 When popping the CURLYX block after a successful or unsuccessful match,
6627 restore the previous cur_curlyx.
6628
6629 When WHILEM is about to execute B, save the current cur_curlyx, and set it
6630 to the outer one saved in the CURLYX block.
6631
6632 When popping the WHILEM block after a successful or unsuccessful B match,
6633 restore the previous cur_curlyx.
6634
6635 Here's an example for the pattern (AI* BI)*BO
6636 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
6637
6638 cur_
6639 curlyx backtrack stack
6640 ------ ---------------
6641 NULL
6642 CO     <CO prev=NULL> <WO>
6643 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
6644 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
6645 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
6646
6647 At this point the pattern succeeds, and we work back down the stack to
6648 clean up, restoring as we go:
6649
6650 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
6651 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
6652 CO     <CO prev=NULL> <WO>
6653 NULL
6654
6655 *******************************************************************/
6656
6657 #define ST st->u.curlyx
6658
6659         case CURLYX:    /* start of /A*B/  (for complex A) */
6660         {
6661             /* No need to save/restore up to this paren */
6662             I32 parenfloor = scan->flags;
6663
6664             assert(next); /* keep Coverity happy */
6665             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
6666                 next += ARG(next);
6667
6668             /* XXXX Probably it is better to teach regpush to support
6669                parenfloor > maxopenparen ... */
6670             if (parenfloor > (I32)rex->lastparen)
6671                 parenfloor = rex->lastparen; /* Pessimization... */
6672
6673             ST.prev_curlyx= cur_curlyx;
6674             cur_curlyx = st;
6675             ST.cp = PL_savestack_ix;
6676
6677             /* these fields contain the state of the current curly.
6678              * they are accessed by subsequent WHILEMs */
6679             ST.parenfloor = parenfloor;
6680             ST.me = scan;
6681             ST.B = next;
6682             ST.minmod = minmod;
6683             minmod = 0;
6684             ST.count = -1;      /* this will be updated by WHILEM */
6685             ST.lastloc = NULL;  /* this will be updated by WHILEM */
6686
6687             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
6688             /* NOTREACHED */
6689             NOT_REACHED; /* NOTREACHED */
6690         }
6691
6692         case CURLYX_end: /* just finished matching all of A*B */
6693             cur_curlyx = ST.prev_curlyx;
6694             sayYES;
6695             /* NOTREACHED */
6696             NOT_REACHED; /* NOTREACHED */
6697
6698         case CURLYX_end_fail: /* just failed to match all of A*B */
6699             regcpblow(ST.cp);
6700             cur_curlyx = ST.prev_curlyx;
6701             sayNO;
6702             /* NOTREACHED */
6703             NOT_REACHED; /* NOTREACHED */
6704
6705
6706 #undef ST
6707 #define ST st->u.whilem
6708
6709         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
6710         {
6711             /* see the discussion above about CURLYX/WHILEM */
6712             I32 n;
6713             int min, max;
6714             regnode *A;
6715
6716             assert(cur_curlyx); /* keep Coverity happy */
6717
6718             min = ARG1(cur_curlyx->u.curlyx.me);
6719             max = ARG2(cur_curlyx->u.curlyx.me);
6720             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
6721             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
6722             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
6723             ST.cache_offset = 0;
6724             ST.cache_mask = 0;
6725
6726
6727             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6728                   "%*s  whilem: matched %ld out of %d..%d\n",
6729                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
6730             );
6731
6732             /* First just match a string of min A's. */
6733
6734             if (n < min) {
6735                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6736                                     maxopenparen);
6737                 cur_curlyx->u.curlyx.lastloc = locinput;
6738                 REGCP_SET(ST.lastcp);
6739
6740                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
6741                 /* NOTREACHED */
6742                 NOT_REACHED; /* NOTREACHED */
6743             }
6744
6745             /* If degenerate A matches "", assume A done. */
6746
6747             if (locinput == cur_curlyx->u.curlyx.lastloc) {
6748                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6749                    "%*s  whilem: empty match detected, trying continuation...\n",
6750                    REPORT_CODE_OFF+depth*2, "")
6751                 );
6752                 goto do_whilem_B_max;
6753             }
6754
6755             /* super-linear cache processing.
6756              *
6757              * The idea here is that for certain types of CURLYX/WHILEM -
6758              * principally those whose upper bound is infinity (and
6759              * excluding regexes that have things like \1 and other very
6760              * non-regular expresssiony things), then if a pattern like
6761              * /....A*.../ fails and we backtrack to the WHILEM, then we
6762              * make a note that this particular WHILEM op was at string
6763              * position 47 (say) when the rest of pattern failed. Then, if
6764              * we ever find ourselves back at that WHILEM, and at string
6765              * position 47 again, we can just fail immediately rather than
6766              * running the rest of the pattern again.
6767              *
6768              * This is very handy when patterns start to go
6769              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
6770              * with a combinatorial explosion of backtracking.
6771              *
6772              * The cache is implemented as a bit array, with one bit per
6773              * string byte position per WHILEM op (up to 16) - so its
6774              * between 0.25 and 2x the string size.
6775              *
6776              * To avoid allocating a poscache buffer every time, we do an
6777              * initially countdown; only after we have  executed a WHILEM
6778              * op (string-length x #WHILEMs) times do we allocate the
6779              * cache.
6780              *
6781              * The top 4 bits of scan->flags byte say how many different
6782              * relevant CURLLYX/WHILEM op pairs there are, while the
6783              * bottom 4-bits is the identifying index number of this
6784              * WHILEM.
6785              */
6786
6787             if (scan->flags) {
6788
6789                 if (!reginfo->poscache_maxiter) {
6790                     /* start the countdown: Postpone detection until we
6791                      * know the match is not *that* much linear. */
6792                     reginfo->poscache_maxiter
6793                         =    (reginfo->strend - reginfo->strbeg + 1)
6794                            * (scan->flags>>4);
6795                     /* possible overflow for long strings and many CURLYX's */
6796                     if (reginfo->poscache_maxiter < 0)
6797                         reginfo->poscache_maxiter = I32_MAX;
6798                     reginfo->poscache_iter = reginfo->poscache_maxiter;
6799                 }
6800
6801                 if (reginfo->poscache_iter-- == 0) {
6802                     /* initialise cache */
6803                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
6804                     regmatch_info_aux *const aux = reginfo->info_aux;
6805                     if (aux->poscache) {
6806                         if ((SSize_t)reginfo->poscache_size < size) {
6807                             Renew(aux->poscache, size, char);
6808                             reginfo->poscache_size = size;
6809                         }
6810                         Zero(aux->poscache, size, char);
6811                     }
6812                     else {
6813                         reginfo->poscache_size = size;
6814                         Newxz(aux->poscache, size, char);
6815                     }
6816                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6817       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
6818                               PL_colors[4], PL_colors[5])
6819                     );
6820                 }
6821
6822                 if (reginfo->poscache_iter < 0) {
6823                     /* have we already failed at this position? */
6824                     SSize_t offset, mask;
6825
6826                     reginfo->poscache_iter = -1; /* stop eventual underflow */
6827                     offset  = (scan->flags & 0xf) - 1
6828                                 +   (locinput - reginfo->strbeg)
6829                                   * (scan->flags>>4);
6830                     mask    = 1 << (offset % 8);
6831                     offset /= 8;
6832                     if (reginfo->info_aux->poscache[offset] & mask) {
6833                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6834                             "%*s  whilem: (cache) already tried at this position...\n",
6835                             REPORT_CODE_OFF+depth*2, "")
6836                         );
6837                         sayNO; /* cache records failure */
6838                     }
6839                     ST.cache_offset = offset;
6840                     ST.cache_mask   = mask;
6841                 }
6842             }
6843
6844             /* Prefer B over A for minimal matching. */
6845
6846             if (cur_curlyx->u.curlyx.minmod) {
6847                 ST.save_curlyx = cur_curlyx;
6848                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6849                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
6850                             maxopenparen);
6851                 REGCP_SET(ST.lastcp);
6852                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
6853                                     locinput);
6854                 /* NOTREACHED */
6855                 NOT_REACHED; /* NOTREACHED */
6856             }
6857
6858             /* Prefer A over B for maximal matching. */
6859
6860             if (n < max) { /* More greed allowed? */
6861                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6862                             maxopenparen);
6863                 cur_curlyx->u.curlyx.lastloc = locinput;
6864                 REGCP_SET(ST.lastcp);
6865                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
6866                 /* NOTREACHED */
6867                 NOT_REACHED; /* NOTREACHED */
6868             }
6869             goto do_whilem_B_max;
6870         }
6871         /* NOTREACHED */
6872         NOT_REACHED; /* NOTREACHED */
6873
6874         case WHILEM_B_min: /* just matched B in a minimal match */
6875         case WHILEM_B_max: /* just matched B in a maximal match */
6876             cur_curlyx = ST.save_curlyx;
6877             sayYES;
6878             /* NOTREACHED */
6879             NOT_REACHED; /* NOTREACHED */
6880
6881         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
6882             cur_curlyx = ST.save_curlyx;
6883             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6884             cur_curlyx->u.curlyx.count--;
6885             CACHEsayNO;
6886             /* NOTREACHED */
6887             NOT_REACHED; /* NOTREACHED */
6888
6889         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
6890             /* FALLTHROUGH */
6891         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
6892             REGCP_UNWIND(ST.lastcp);
6893             regcppop(rex, &maxopenparen);
6894             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6895             cur_curlyx->u.curlyx.count--;
6896             CACHEsayNO;
6897             /* NOTREACHED */
6898             NOT_REACHED; /* NOTREACHED */
6899
6900         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
6901             REGCP_UNWIND(ST.lastcp);
6902             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
6903             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6904                 "%*s  whilem: failed, trying continuation...\n",
6905                 REPORT_CODE_OFF+depth*2, "")
6906             );
6907           do_whilem_B_max:
6908             if (cur_curlyx->u.curlyx.count >= REG_INFTY
6909                 && ckWARN(WARN_REGEXP)
6910                 && !reginfo->warned)
6911             {
6912                 reginfo->warned = TRUE;
6913                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6914                      "Complex regular subexpression recursion limit (%d) "
6915                      "exceeded",
6916                      REG_INFTY - 1);
6917             }
6918
6919             /* now try B */
6920             ST.save_curlyx = cur_curlyx;
6921             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6922             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
6923                                 locinput);
6924             /* NOTREACHED */
6925             NOT_REACHED; /* NOTREACHED */
6926
6927         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
6928             cur_curlyx = ST.save_curlyx;
6929             REGCP_UNWIND(ST.lastcp);
6930             regcppop(rex, &maxopenparen);
6931
6932             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
6933                 /* Maximum greed exceeded */
6934                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
6935                     && ckWARN(WARN_REGEXP)
6936                     && !reginfo->warned)
6937                 {
6938                     reginfo->warned     = TRUE;
6939                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6940                         "Complex regular subexpression recursion "
6941                         "limit (%d) exceeded",
6942                         REG_INFTY - 1);
6943                 }
6944                 cur_curlyx->u.curlyx.count--;
6945                 CACHEsayNO;
6946             }
6947
6948             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6949                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6950             );
6951             /* Try grabbing another A and see if it helps. */
6952             cur_curlyx->u.curlyx.lastloc = locinput;
6953             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6954                             maxopenparen);
6955             REGCP_SET(ST.lastcp);
6956             PUSH_STATE_GOTO(WHILEM_A_min,
6957                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6958                 locinput);
6959             /* NOTREACHED */
6960             NOT_REACHED; /* NOTREACHED */
6961
6962 #undef  ST
6963 #define ST st->u.branch
6964
6965         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
6966             next = scan + ARG(scan);
6967             if (next == scan)
6968                 next = NULL;
6969             scan = NEXTOPER(scan);
6970             /* FALLTHROUGH */
6971
6972         case BRANCH:        /*  /(...|A|...)/ */
6973             scan = NEXTOPER(scan); /* scan now points to inner node */
6974             ST.lastparen = rex->lastparen;
6975             ST.lastcloseparen = rex->lastcloseparen;
6976             ST.next_branch = next;
6977             REGCP_SET(ST.cp);
6978
6979             /* Now go into the branch */
6980             if (has_cutgroup) {
6981                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
6982             } else {
6983                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
6984             }
6985             /* NOTREACHED */
6986             NOT_REACHED; /* NOTREACHED */
6987
6988         case CUTGROUP:  /*  /(*THEN)/  */
6989             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
6990                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6991             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
6992             /* NOTREACHED */
6993             NOT_REACHED; /* NOTREACHED */
6994
6995         case CUTGROUP_next_fail:
6996             do_cutgroup = 1;
6997             no_final = 1;
6998             if (st->u.mark.mark_name)
6999                 sv_commit = st->u.mark.mark_name;
7000             sayNO;
7001             /* NOTREACHED */
7002             NOT_REACHED; /* NOTREACHED */
7003
7004         case BRANCH_next:
7005             sayYES;
7006             /* NOTREACHED */
7007             NOT_REACHED; /* NOTREACHED */
7008
7009         case BRANCH_next_fail: /* that branch failed; try the next, if any */
7010             if (do_cutgroup) {
7011                 do_cutgroup = 0;
7012                 no_final = 0;
7013             }
7014             REGCP_UNWIND(ST.cp);
7015             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7016             scan = ST.next_branch;
7017             /* no more branches? */
7018             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7019                 DEBUG_EXECUTE_r({
7020                     PerlIO_printf( Perl_debug_log,
7021                         "%*s  %sBRANCH failed...%s\n",
7022                         REPORT_CODE_OFF+depth*2, "",
7023                         PL_colors[4],
7024                         PL_colors[5] );
7025                 });
7026                 sayNO_SILENT;
7027             }
7028             continue; /* execute next BRANCH[J] op */
7029             /* NOTREACHED */
7030
7031         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
7032             minmod = 1;
7033             break;
7034
7035 #undef  ST
7036 #define ST st->u.curlym
7037
7038         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
7039
7040             /* This is an optimisation of CURLYX that enables us to push
7041              * only a single backtracking state, no matter how many matches
7042              * there are in {m,n}. It relies on the pattern being constant
7043              * length, with no parens to influence future backrefs
7044              */
7045
7046             ST.me = scan;
7047             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7048
7049             ST.lastparen      = rex->lastparen;
7050             ST.lastcloseparen = rex->lastcloseparen;
7051
7052             /* if paren positive, emulate an OPEN/CLOSE around A */
7053             if (ST.me->flags) {
7054                 U32 paren = ST.me->flags;
7055                 if (paren > maxopenparen)
7056                     maxopenparen = paren;
7057                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
7058             }
7059             ST.A = scan;
7060             ST.B = next;
7061             ST.alen = 0;
7062             ST.count = 0;
7063             ST.minmod = minmod;
7064             minmod = 0;
7065             ST.c1 = CHRTEST_UNINIT;
7066             REGCP_SET(ST.cp);
7067
7068             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7069                 goto curlym_do_B;
7070
7071           curlym_do_A: /* execute the A in /A{m,n}B/  */
7072             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7073             /* NOTREACHED */
7074             NOT_REACHED; /* NOTREACHED */
7075
7076         case CURLYM_A: /* we've just matched an A */
7077             ST.count++;
7078             /* after first match, determine A's length: u.curlym.alen */
7079             if (ST.count == 1) {
7080                 if (reginfo->is_utf8_target) {
7081                     char *s = st->locinput;
7082                     while (s < locinput) {
7083                         ST.alen++;
7084                         s += UTF8SKIP(s);
7085                     }
7086                 }
7087                 else {
7088                     ST.alen = locinput - st->locinput;
7089                 }
7090                 if (ST.alen == 0)
7091                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7092             }
7093             DEBUG_EXECUTE_r(
7094                 PerlIO_printf(Perl_debug_log,
7095                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
7096                           (int)(REPORT_CODE_OFF+(depth*2)), "",
7097                           (IV) ST.count, (IV)ST.alen)
7098             );
7099
7100             if (cur_eval && cur_eval->u.eval.close_paren &&
7101                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
7102                 goto fake_end;
7103
7104             {
7105                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7106                 if ( max == REG_INFTY || ST.count < max )
7107                     goto curlym_do_A; /* try to match another A */
7108             }
7109             goto curlym_do_B; /* try to match B */
7110
7111         case CURLYM_A_fail: /* just failed to match an A */
7112             REGCP_UNWIND(ST.cp);
7113
7114             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7115                 || (cur_eval && cur_eval->u.eval.close_paren &&
7116                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
7117                 sayNO;
7118
7119           curlym_do_B: /* execute the B in /A{m,n}B/  */
7120             if (ST.c1 == CHRTEST_UNINIT) {
7121                 /* calculate c1 and c2 for possible match of 1st char
7122                  * following curly */
7123                 ST.c1 = ST.c2 = CHRTEST_VOID;
7124                 assert(ST.B);
7125                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7126                     regnode *text_node = ST.B;
7127                     if (! HAS_TEXT(text_node))
7128                         FIND_NEXT_IMPT(text_node);
7129                     /* this used to be
7130
7131                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7132
7133                         But the former is redundant in light of the latter.
7134
7135                         if this changes back then the macro for
7136                         IS_TEXT and friends need to change.
7137                      */
7138                     if (PL_regkind[OP(text_node)] == EXACT) {
7139                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7140                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7141                            reginfo))
7142                         {
7143                             sayNO;
7144                         }
7145                     }
7146                 }
7147             }
7148
7149             DEBUG_EXECUTE_r(
7150                 PerlIO_printf(Perl_debug_log,
7151                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
7152                     (int)(REPORT_CODE_OFF+(depth*2)),
7153                     "", (IV)ST.count)
7154                 );
7155             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
7156                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7157                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7158                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7159                     {
7160                         /* simulate B failing */
7161                         DEBUG_OPTIMISE_r(
7162                             PerlIO_printf(Perl_debug_log,
7163                                 "%*s  CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
7164                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
7165                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7166                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7167                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7168                         );
7169                         state_num = CURLYM_B_fail;
7170                         goto reenter_switch;
7171                     }
7172                 }
7173                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
7174                     /* simulate B failing */
7175                     DEBUG_OPTIMISE_r(
7176                         PerlIO_printf(Perl_debug_log,
7177                             "%*s  CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
7178                             (int)(REPORT_CODE_OFF+(depth*2)),"",
7179                             (int) nextchr, ST.c1, ST.c2)
7180                     );
7181                     state_num = CURLYM_B_fail;
7182                     goto reenter_switch;
7183                 }
7184             }
7185
7186             if (ST.me->flags) {
7187                 /* emulate CLOSE: mark current A as captured */
7188                 I32 paren = ST.me->flags;
7189                 if (ST.count) {
7190                     rex->offs[paren].start
7191                         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7192                     rex->offs[paren].end = locinput - reginfo->strbeg;
7193                     if ((U32)paren > rex->lastparen)
7194                         rex->lastparen = paren;
7195                     rex->lastcloseparen = paren;
7196                 }
7197                 else
7198                     rex->offs[paren].end = -1;
7199                 if (cur_eval && cur_eval->u.eval.close_paren &&
7200                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
7201                 {
7202                     if (ST.count)
7203                         goto fake_end;
7204                     else
7205                         sayNO;
7206                 }
7207             }
7208
7209             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
7210             /* NOTREACHED */
7211             NOT_REACHED; /* NOTREACHED */
7212
7213         case CURLYM_B_fail: /* just failed to match a B */
7214             REGCP_UNWIND(ST.cp);
7215             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7216             if (ST.minmod) {
7217                 I32 max = ARG2(ST.me);
7218                 if (max != REG_INFTY && ST.count == max)
7219                     sayNO;
7220                 goto curlym_do_A; /* try to match a further A */
7221             }
7222             /* backtrack one A */
7223             if (ST.count == ARG1(ST.me) /* min */)
7224                 sayNO;
7225             ST.count--;
7226             SET_locinput(HOPc(locinput, -ST.alen));
7227             goto curlym_do_B; /* try to match B */
7228
7229 #undef ST
7230 #define ST st->u.curly
7231
7232 #define CURLY_SETPAREN(paren, success) \
7233     if (paren) { \
7234         if (success) { \
7235             rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7236             rex->offs[paren].end = locinput - reginfo->strbeg; \
7237             if (paren > rex->lastparen) \
7238                 rex->lastparen = paren; \
7239             rex->lastcloseparen = paren; \
7240         } \
7241         else { \
7242             rex->offs[paren].end = -1; \
7243             rex->lastparen      = ST.lastparen; \
7244             rex->lastcloseparen = ST.lastcloseparen; \
7245         } \
7246     }
7247
7248         case STAR:              /*  /A*B/ where A is width 1 char */
7249             ST.paren = 0;
7250             ST.min = 0;
7251             ST.max = REG_INFTY;
7252             scan = NEXTOPER(scan);
7253             goto repeat;
7254
7255         case PLUS:              /*  /A+B/ where A is width 1 char */
7256             ST.paren = 0;
7257             ST.min = 1;
7258             ST.max = REG_INFTY;
7259             scan = NEXTOPER(scan);
7260             goto repeat;
7261
7262         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
7263             ST.paren = scan->flags;     /* Which paren to set */
7264             ST.lastparen      = rex->lastparen;
7265             ST.lastcloseparen = rex->lastcloseparen;
7266             if (ST.paren > maxopenparen)
7267                 maxopenparen = ST.paren;
7268             ST.min = ARG1(scan);  /* min to match */
7269             ST.max = ARG2(scan);  /* max to match */
7270             if (cur_eval && cur_eval->u.eval.close_paren &&
7271                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
7272                 ST.min=1;
7273                 ST.max=1;
7274             }
7275             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7276             goto repeat;
7277
7278         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
7279             ST.paren = 0;
7280             ST.min = ARG1(scan);  /* min to match */
7281             ST.max = ARG2(scan);  /* max to match */
7282             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7283           repeat:
7284             /*
7285             * Lookahead to avoid useless match attempts
7286             * when we know what character comes next.
7287             *
7288             * Used to only do .*x and .*?x, but now it allows
7289             * for )'s, ('s and (?{ ... })'s to be in the way
7290             * of the quantifier and the EXACT-like node.  -- japhy
7291             */
7292
7293             assert(ST.min <= ST.max);
7294             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7295                 ST.c1 = ST.c2 = CHRTEST_VOID;
7296             }
7297             else {
7298                 regnode *text_node = next;
7299
7300                 if (! HAS_TEXT(text_node))
7301                     FIND_NEXT_IMPT(text_node);
7302
7303                 if (! HAS_TEXT(text_node))
7304                     ST.c1 = ST.c2 = CHRTEST_VOID;
7305                 else {
7306                     if ( PL_regkind[OP(text_node)] != EXACT ) {
7307                         ST.c1 = ST.c2 = CHRTEST_VOID;
7308                     }
7309                     else {
7310
7311                     /*  Currently we only get here when
7312
7313                         PL_rekind[OP(text_node)] == EXACT
7314
7315                         if this changes back then the macro for IS_TEXT and
7316                         friends need to change. */
7317                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7318                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7319                            reginfo))
7320                         {
7321                             sayNO;
7322                         }
7323                     }
7324                 }
7325             }
7326
7327             ST.A = scan;
7328             ST.B = next;
7329             if (minmod) {
7330                 char *li = locinput;
7331                 minmod = 0;
7332                 if (ST.min &&
7333                         regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
7334                             < ST.min)
7335                     sayNO;
7336                 SET_locinput(li);
7337                 ST.count = ST.min;
7338                 REGCP_SET(ST.cp);
7339                 if (ST.c1 == CHRTEST_VOID)
7340                     goto curly_try_B_min;
7341
7342                 ST.oldloc = locinput;
7343
7344                 /* set ST.maxpos to the furthest point along the
7345                  * string that could possibly match */
7346                 if  (ST.max == REG_INFTY) {
7347                     ST.maxpos = reginfo->strend - 1;
7348                     if (utf8_target)
7349                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
7350                             ST.maxpos--;
7351                 }
7352                 else if (utf8_target) {
7353                     int m = ST.max - ST.min;
7354                     for (ST.maxpos = locinput;
7355                          m >0 && ST.maxpos < reginfo->strend; m--)
7356                         ST.maxpos += UTF8SKIP(ST.maxpos);
7357                 }
7358                 else {
7359                     ST.maxpos = locinput + ST.max - ST.min;
7360                     if (ST.maxpos >= reginfo->strend)
7361                         ST.maxpos = reginfo->strend - 1;
7362                 }
7363                 goto curly_try_B_min_known;
7364
7365             }
7366             else {
7367                 /* avoid taking address of locinput, so it can remain
7368                  * a register var */
7369                 char *li = locinput;
7370                 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
7371                 if (ST.count < ST.min)
7372                     sayNO;
7373                 SET_locinput(li);
7374                 if ((ST.count > ST.min)
7375                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
7376                 {
7377                     /* A{m,n} must come at the end of the string, there's
7378                      * no point in backing off ... */
7379                     ST.min = ST.count;
7380                     /* ...except that $ and \Z can match before *and* after
7381                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
7382                        We may back off by one in this case. */
7383                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
7384                         ST.min--;
7385                 }
7386                 REGCP_SET(ST.cp);
7387                 goto curly_try_B_max;
7388             }
7389             /* NOTREACHED */
7390             NOT_REACHED; /* NOTREACHED */
7391
7392         case CURLY_B_min_known_fail:
7393             /* failed to find B in a non-greedy match where c1,c2 valid */
7394
7395             REGCP_UNWIND(ST.cp);
7396             if (ST.paren) {
7397                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7398             }
7399             /* Couldn't or didn't -- move forward. */
7400             ST.oldloc = locinput;
7401             if (utf8_target)
7402                 locinput += UTF8SKIP(locinput);
7403             else
7404                 locinput++;
7405             ST.count++;
7406           curly_try_B_min_known:
7407              /* find the next place where 'B' could work, then call B */
7408             {
7409                 int n;
7410                 if (utf8_target) {
7411                     n = (ST.oldloc == locinput) ? 0 : 1;
7412                     if (ST.c1 == ST.c2) {
7413                         /* set n to utf8_distance(oldloc, locinput) */
7414                         while (locinput <= ST.maxpos
7415                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
7416                         {
7417                             locinput += UTF8SKIP(locinput);
7418                             n++;
7419                         }
7420                     }
7421                     else {
7422                         /* set n to utf8_distance(oldloc, locinput) */
7423                         while (locinput <= ST.maxpos
7424                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7425                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7426                         {
7427                             locinput += UTF8SKIP(locinput);
7428                             n++;
7429                         }
7430                     }
7431                 }
7432                 else {  /* Not utf8_target */
7433                     if (ST.c1 == ST.c2) {
7434                         while (locinput <= ST.maxpos &&
7435                                UCHARAT(locinput) != ST.c1)
7436                             locinput++;
7437                     }
7438                     else {
7439                         while (locinput <= ST.maxpos
7440                                && UCHARAT(locinput) != ST.c1
7441                                && UCHARAT(locinput) != ST.c2)
7442                             locinput++;
7443                     }
7444                     n = locinput - ST.oldloc;
7445                 }
7446                 if (locinput > ST.maxpos)
7447                     sayNO;
7448                 if (n) {
7449                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
7450                      * at b; check that everything between oldloc and
7451                      * locinput matches */
7452                     char *li = ST.oldloc;
7453                     ST.count += n;
7454                     if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
7455                         sayNO;
7456                     assert(n == REG_INFTY || locinput == li);
7457                 }
7458                 CURLY_SETPAREN(ST.paren, ST.count);
7459                 if (cur_eval && cur_eval->u.eval.close_paren &&
7460                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
7461                     goto fake_end;
7462                 }
7463                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
7464             }
7465             /* NOTREACHED */
7466             NOT_REACHED; /* NOTREACHED */
7467
7468         case CURLY_B_min_fail:
7469             /* failed to find B in a non-greedy match where c1,c2 invalid */
7470
7471             REGCP_UNWIND(ST.cp);
7472             if (ST.paren) {
7473                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7474             }
7475             /* failed -- move forward one */
7476             {
7477                 char *li = locinput;
7478                 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
7479                     sayNO;
7480                 }
7481                 locinput = li;
7482             }
7483             {
7484                 ST.count++;
7485                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
7486                         ST.count > 0)) /* count overflow ? */
7487                 {
7488                   curly_try_B_min:
7489                     CURLY_SETPAREN(ST.paren, ST.count);
7490                     if (cur_eval && cur_eval->u.eval.close_paren &&
7491                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
7492                         goto fake_end;
7493                     }
7494                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
7495                 }
7496             }
7497             sayNO;
7498             /* NOTREACHED */
7499             NOT_REACHED; /* NOTREACHED */
7500
7501           curly_try_B_max:
7502             /* a successful greedy match: now try to match B */
7503             if (cur_eval && cur_eval->u.eval.close_paren &&
7504                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
7505                 goto fake_end;
7506             }
7507             {
7508                 bool could_match = locinput < reginfo->strend;
7509
7510                 /* If it could work, try it. */
7511                 if (ST.c1 != CHRTEST_VOID && could_match) {
7512                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
7513                     {
7514                         could_match = memEQ(locinput,
7515                                             ST.c1_utf8,
7516                                             UTF8SKIP(locinput))
7517                                     || memEQ(locinput,
7518                                              ST.c2_utf8,
7519                                              UTF8SKIP(locinput));
7520                     }
7521                     else {
7522                         could_match = UCHARAT(locinput) == ST.c1
7523                                       || UCHARAT(locinput) == ST.c2;
7524                     }
7525                 }
7526                 if (ST.c1 == CHRTEST_VOID || could_match) {
7527                     CURLY_SETPAREN(ST.paren, ST.count);
7528                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
7529                     /* NOTREACHED */
7530                     NOT_REACHED; /* NOTREACHED */
7531                 }
7532             }
7533             /* FALLTHROUGH */
7534
7535         case CURLY_B_max_fail:
7536             /* failed to find B in a greedy match */
7537
7538             REGCP_UNWIND(ST.cp);
7539             if (ST.paren) {
7540                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7541             }
7542             /*  back up. */
7543             if (--ST.count < ST.min)
7544                 sayNO;
7545             locinput = HOPc(locinput, -1);
7546             goto curly_try_B_max;
7547
7548 #undef ST
7549
7550         case END: /*  last op of main pattern  */
7551           fake_end:
7552             if (cur_eval) {
7553                 /* we've just finished A in /(??{A})B/; now continue with B */
7554
7555                 st->u.eval.prev_rex = rex_sv;           /* inner */
7556
7557                 /* Save *all* the positions. */
7558                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
7559                 rex_sv = cur_eval->u.eval.prev_rex;
7560                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7561                 SET_reg_curpm(rex_sv);
7562                 rex = ReANY(rex_sv);
7563                 rexi = RXi_GET(rex);
7564                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
7565
7566                 REGCP_SET(st->u.eval.lastcp);
7567
7568                 /* Restore parens of the outer rex without popping the
7569                  * savestack */
7570                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
7571                                         &maxopenparen);
7572
7573                 st->u.eval.prev_eval = cur_eval;
7574                 cur_eval = cur_eval->u.eval.prev_eval;
7575                 DEBUG_EXECUTE_r(
7576                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
7577                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
7578                 if ( nochange_depth )
7579                     nochange_depth--;
7580
7581                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
7582                                     locinput); /* match B */
7583             }
7584
7585             if (locinput < reginfo->till) {
7586                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7587                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
7588                                       PL_colors[4],
7589                                       (long)(locinput - startpos),
7590                                       (long)(reginfo->till - startpos),
7591                                       PL_colors[5]));
7592
7593                 sayNO_SILENT;           /* Cannot match: too short. */
7594             }
7595             sayYES;                     /* Success! */
7596
7597         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
7598             DEBUG_EXECUTE_r(
7599             PerlIO_printf(Perl_debug_log,
7600                 "%*s  %ssubpattern success...%s\n",
7601                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
7602             sayYES;                     /* Success! */
7603
7604 #undef  ST
7605 #define ST st->u.ifmatch
7606
7607         {
7608             char *newstart;
7609
7610         case SUSPEND:   /* (?>A) */
7611             ST.wanted = 1;
7612             newstart = locinput;
7613             goto do_ifmatch;
7614
7615         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
7616             ST.wanted = 0;
7617             goto ifmatch_trivial_fail_test;
7618
7619         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
7620             ST.wanted = 1;
7621           ifmatch_trivial_fail_test:
7622             if (scan->flags) {
7623                 char * const s = HOPBACKc(locinput, scan->flags);
7624                 if (!s) {
7625                     /* trivial fail */
7626                     if (logical) {
7627                         logical = 0;
7628                         sw = 1 - cBOOL(ST.wanted);
7629                     }
7630                     else if (ST.wanted)
7631                         sayNO;
7632                     next = scan + ARG(scan);
7633                     if (next == scan)
7634                         next = NULL;
7635                     break;
7636                 }
7637                 newstart = s;
7638             }
7639             else
7640                 newstart = locinput;
7641
7642           do_ifmatch:
7643             ST.me = scan;
7644             ST.logical = logical;
7645             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
7646
7647             /* execute body of (?...A) */
7648             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
7649             /* NOTREACHED */
7650             NOT_REACHED; /* NOTREACHED */
7651         }
7652
7653         case IFMATCH_A_fail: /* body of (?...A) failed */
7654             ST.wanted = !ST.wanted;
7655             /* FALLTHROUGH */
7656
7657         case IFMATCH_A: /* body of (?...A) succeeded */
7658             if (ST.logical) {
7659                 sw = cBOOL(ST.wanted);
7660             }
7661             else if (!ST.wanted)
7662                 sayNO;
7663
7664             if (OP(ST.me) != SUSPEND) {
7665                 /* restore old position except for (?>...) */
7666                 locinput = st->locinput;
7667             }
7668             scan = ST.me + ARG(ST.me);
7669             if (scan == ST.me)
7670                 scan = NULL;
7671             continue; /* execute B */
7672
7673 #undef ST
7674
7675         case LONGJMP: /*  alternative with many branches compiles to
7676                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
7677             next = scan + ARG(scan);
7678             if (next == scan)
7679                 next = NULL;
7680             break;
7681
7682         case COMMIT:  /*  (*COMMIT)  */
7683             reginfo->cutpoint = reginfo->strend;
7684             /* FALLTHROUGH */
7685
7686         case PRUNE:   /*  (*PRUNE)   */
7687             if (!scan->flags)
7688                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7689             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
7690             /* NOTREACHED */
7691             NOT_REACHED; /* NOTREACHED */
7692
7693         case COMMIT_next_fail:
7694             no_final = 1;
7695             /* FALLTHROUGH */
7696
7697         case OPFAIL:   /* (*FAIL)  */
7698             sayNO;
7699             /* NOTREACHED */
7700             NOT_REACHED; /* NOTREACHED */
7701
7702 #define ST st->u.mark
7703         case MARKPOINT: /*  (*MARK:foo)  */
7704             ST.prev_mark = mark_state;
7705             ST.mark_name = sv_commit = sv_yes_mark
7706                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7707             mark_state = st;
7708             ST.mark_loc = locinput;
7709             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
7710             /* NOTREACHED */
7711             NOT_REACHED; /* NOTREACHED */
7712
7713         case MARKPOINT_next:
7714             mark_state = ST.prev_mark;
7715             sayYES;
7716             /* NOTREACHED */
7717             NOT_REACHED; /* NOTREACHED */
7718
7719         case MARKPOINT_next_fail:
7720             if (popmark && sv_eq(ST.mark_name,popmark))
7721             {
7722                 if (ST.mark_loc > startpoint)
7723                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
7724                 popmark = NULL; /* we found our mark */
7725                 sv_commit = ST.mark_name;
7726
7727                 DEBUG_EXECUTE_r({
7728                         PerlIO_printf(Perl_debug_log,
7729                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
7730                             REPORT_CODE_OFF+depth*2, "",
7731                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
7732                 });
7733             }
7734             mark_state = ST.prev_mark;
7735             sv_yes_mark = mark_state ?
7736                 mark_state->u.mark.mark_name : NULL;
7737             sayNO;
7738             /* NOTREACHED */
7739             NOT_REACHED; /* NOTREACHED */
7740
7741         case SKIP:  /*  (*SKIP)  */
7742             if (scan->flags) {
7743                 /* (*SKIP) : if we fail we cut here*/
7744                 ST.mark_name = NULL;
7745                 ST.mark_loc = locinput;
7746                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
7747             } else {
7748                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
7749                    otherwise do nothing.  Meaning we need to scan
7750                  */
7751                 regmatch_state *cur = mark_state;
7752                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7753
7754                 while (cur) {
7755                     if ( sv_eq( cur->u.mark.mark_name,
7756                                 find ) )
7757                     {
7758                         ST.mark_name = find;
7759                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
7760                     }
7761                     cur = cur->u.mark.prev_mark;
7762                 }
7763             }
7764             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
7765             break;
7766
7767         case SKIP_next_fail:
7768             if (ST.mark_name) {
7769                 /* (*CUT:NAME) - Set up to search for the name as we
7770                    collapse the stack*/
7771                 popmark = ST.mark_name;
7772             } else {
7773                 /* (*CUT) - No name, we cut here.*/
7774                 if (ST.mark_loc > startpoint)
7775                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
7776                 /* but we set sv_commit to latest mark_name if there
7777                    is one so they can test to see how things lead to this
7778                    cut */
7779                 if (mark_state)
7780                     sv_commit=mark_state->u.mark.mark_name;
7781             }
7782             no_final = 1;
7783             sayNO;
7784             /* NOTREACHED */
7785             NOT_REACHED; /* NOTREACHED */
7786 #undef ST
7787
7788         case LNBREAK: /* \R */
7789             if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
7790                 locinput += n;
7791             } else
7792                 sayNO;
7793             break;
7794
7795         default:
7796             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
7797                           PTR2UV(scan), OP(scan));
7798             Perl_croak(aTHX_ "regexp memory corruption");
7799
7800         /* this is a point to jump to in order to increment
7801          * locinput by one character */
7802           increment_locinput:
7803             assert(!NEXTCHR_IS_EOS);
7804             if (utf8_target) {
7805                 locinput += PL_utf8skip[nextchr];
7806                 /* locinput is allowed to go 1 char off the end, but not 2+ */
7807                 if (locinput > reginfo->strend)
7808                     sayNO;
7809             }
7810             else
7811                 locinput++;
7812             break;
7813
7814         } /* end switch */
7815
7816         /* switch break jumps here */
7817         scan = next; /* prepare to execute the next op and ... */
7818         continue;    /* ... jump back to the top, reusing st */
7819         /* NOTREACHED */
7820
7821       push_yes_state:
7822         /* push a state that backtracks on success */
7823         st->u.yes.prev_yes_state = yes_state;
7824         yes_state = st;
7825         /* FALLTHROUGH */
7826       push_state:
7827         /* push a new regex state, then continue at scan  */
7828         {
7829             regmatch_state *newst;
7830
7831             DEBUG_STACK_r({
7832                 regmatch_state *cur = st;
7833                 regmatch_state *curyes = yes_state;
7834                 int curd = depth;
7835                 regmatch_slab *slab = PL_regmatch_slab;
7836                 for (;curd > -1;cur--,curd--) {
7837                     if (cur < SLAB_FIRST(slab)) {
7838                         slab = slab->prev;
7839                         cur = SLAB_LAST(slab);
7840                     }
7841                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
7842                         REPORT_CODE_OFF + 2 + depth * 2,"",
7843                         curd, PL_reg_name[cur->resume_state],
7844                         (curyes == cur) ? "yes" : ""
7845                     );
7846                     if (curyes == cur)
7847                         curyes = cur->u.yes.prev_yes_state;
7848                 }
7849             } else
7850                 DEBUG_STATE_pp("push")
7851             );
7852             depth++;
7853             st->locinput = locinput;
7854             newst = st+1;
7855             if (newst >  SLAB_LAST(PL_regmatch_slab))
7856                 newst = S_push_slab(aTHX);
7857             PL_regmatch_state = newst;
7858
7859             locinput = pushinput;
7860             st = newst;
7861             continue;
7862             /* NOTREACHED */
7863         }
7864     }
7865
7866     /*
7867     * We get here only if there's trouble -- normally "case END" is
7868     * the terminating point.
7869     */
7870     Perl_croak(aTHX_ "corrupted regexp pointers");
7871     /* NOTREACHED */
7872     sayNO;
7873     NOT_REACHED; /* NOTREACHED */
7874
7875   yes:
7876     if (yes_state) {
7877         /* we have successfully completed a subexpression, but we must now
7878          * pop to the state marked by yes_state and continue from there */
7879         assert(st != yes_state);
7880 #ifdef DEBUGGING
7881         while (st != yes_state) {
7882             st--;
7883             if (st < SLAB_FIRST(PL_regmatch_slab)) {
7884                 PL_regmatch_slab = PL_regmatch_slab->prev;
7885                 st = SLAB_LAST(PL_regmatch_slab);
7886             }
7887             DEBUG_STATE_r({
7888                 if (no_final) {
7889                     DEBUG_STATE_pp("pop (no final)");
7890                 } else {
7891                     DEBUG_STATE_pp("pop (yes)");
7892                 }
7893             });
7894             depth--;
7895         }
7896 #else
7897         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
7898             || yes_state > SLAB_LAST(PL_regmatch_slab))
7899         {
7900             /* not in this slab, pop slab */
7901             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
7902             PL_regmatch_slab = PL_regmatch_slab->prev;
7903             st = SLAB_LAST(PL_regmatch_slab);
7904         }
7905         depth -= (st - yes_state);
7906 #endif
7907         st = yes_state;
7908         yes_state = st->u.yes.prev_yes_state;
7909         PL_regmatch_state = st;
7910
7911         if (no_final)
7912             locinput= st->locinput;
7913         state_num = st->resume_state + no_final;
7914         goto reenter_switch;
7915     }
7916
7917     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
7918                           PL_colors[4], PL_colors[5]));
7919
7920     if (reginfo->info_aux_eval) {
7921         /* each successfully executed (?{...}) block does the equivalent of
7922          *   local $^R = do {...}
7923          * When popping the save stack, all these locals would be undone;
7924          * bypass this by setting the outermost saved $^R to the latest
7925          * value */
7926         /* I dont know if this is needed or works properly now.
7927          * see code related to PL_replgv elsewhere in this file.
7928          * Yves
7929          */
7930         if (oreplsv != GvSV(PL_replgv))
7931             sv_setsv(oreplsv, GvSV(PL_replgv));
7932     }
7933     result = 1;
7934     goto final_exit;
7935
7936   no:
7937     DEBUG_EXECUTE_r(
7938         PerlIO_printf(Perl_debug_log,
7939             "%*s  %sfailed...%s\n",
7940             REPORT_CODE_OFF+depth*2, "",
7941             PL_colors[4], PL_colors[5])
7942         );
7943
7944   no_silent:
7945     if (no_final) {
7946         if (yes_state) {
7947             goto yes;
7948         } else {
7949             goto final_exit;
7950         }
7951     }
7952     if (depth) {
7953         /* there's a previous state to backtrack to */
7954         st--;
7955         if (st < SLAB_FIRST(PL_regmatch_slab)) {
7956             PL_regmatch_slab = PL_regmatch_slab->prev;
7957             st = SLAB_LAST(PL_regmatch_slab);
7958         }
7959         PL_regmatch_state = st;
7960         locinput= st->locinput;
7961
7962         DEBUG_STATE_pp("pop");
7963         depth--;
7964         if (yes_state == st)
7965             yes_state = st->u.yes.prev_yes_state;
7966
7967         state_num = st->resume_state + 1; /* failure = success + 1 */
7968         goto reenter_switch;
7969     }
7970     result = 0;
7971
7972   final_exit:
7973     if (rex->intflags & PREGf_VERBARG_SEEN) {
7974         SV *sv_err = get_sv("REGERROR", 1);
7975         SV *sv_mrk = get_sv("REGMARK", 1);
7976         if (result) {
7977             sv_commit = &PL_sv_no;
7978             if (!sv_yes_mark)
7979                 sv_yes_mark = &PL_sv_yes;
7980         } else {
7981             if (!sv_commit)
7982                 sv_commit = &PL_sv_yes;
7983             sv_yes_mark = &PL_sv_no;
7984         }
7985         assert(sv_err);
7986         assert(sv_mrk);
7987         sv_setsv(sv_err, sv_commit);
7988         sv_setsv(sv_mrk, sv_yes_mark);
7989     }
7990
7991
7992     if (last_pushed_cv) {
7993         dSP;
7994         POP_MULTICALL;
7995         PERL_UNUSED_VAR(SP);
7996     }
7997
7998     assert(!result ||  locinput - reginfo->strbeg >= 0);
7999     return result ?  locinput - reginfo->strbeg : -1;
8000 }
8001
8002 /*
8003  - regrepeat - repeatedly match something simple, report how many
8004  *
8005  * What 'simple' means is a node which can be the operand of a quantifier like
8006  * '+', or {1,3}
8007  *
8008  * startposp - pointer a pointer to the start position.  This is updated
8009  *             to point to the byte following the highest successful
8010  *             match.
8011  * p         - the regnode to be repeatedly matched against.
8012  * reginfo   - struct holding match state, such as strend
8013  * max       - maximum number of things to match.
8014  * depth     - (for debugging) backtracking depth.
8015  */
8016 STATIC I32
8017 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8018             regmatch_info *const reginfo, I32 max, int depth)
8019 {
8020     char *scan;     /* Pointer to current position in target string */
8021     I32 c;
8022     char *loceol = reginfo->strend;   /* local version */
8023     I32 hardcount = 0;  /* How many matches so far */
8024     bool utf8_target = reginfo->is_utf8_target;
8025     unsigned int to_complement = 0;  /* Invert the result? */
8026     UV utf8_flags;
8027     _char_class_number classnum;
8028 #ifndef DEBUGGING
8029     PERL_UNUSED_ARG(depth);
8030 #endif
8031
8032     PERL_ARGS_ASSERT_REGREPEAT;
8033
8034     scan = *startposp;
8035     if (max == REG_INFTY)
8036         max = I32_MAX;
8037     else if (! utf8_target && loceol - scan > max)
8038         loceol = scan + max;
8039
8040     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8041      * to the maximum of how far we should go in it (leaving it set to the real
8042      * end, if the maximum permissible would take us beyond that).  This allows
8043      * us to make the loop exit condition that we haven't gone past <loceol> to
8044      * also mean that we haven't exceeded the max permissible count, saving a
8045      * test each time through the loop.  But it assumes that the OP matches a
8046      * single byte, which is true for most of the OPs below when applied to a
8047      * non-UTF-8 target.  Those relatively few OPs that don't have this
8048      * characteristic will have to compensate.
8049      *
8050      * There is no adjustment for UTF-8 targets, as the number of bytes per
8051      * character varies.  OPs will have to test both that the count is less
8052      * than the max permissible (using <hardcount> to keep track), and that we
8053      * are still within the bounds of the string (using <loceol>.  A few OPs
8054      * match a single byte no matter what the encoding.  They can omit the max
8055      * test if, for the UTF-8 case, they do the adjustment that was skipped
8056      * above.
8057      *
8058      * Thus, the code above sets things up for the common case; and exceptional
8059      * cases need extra work; the common case is to make sure <scan> doesn't
8060      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8061      * count doesn't exceed the maximum permissible */
8062
8063     switch (OP(p)) {
8064     case REG_ANY:
8065         if (utf8_target) {
8066             while (scan < loceol && hardcount < max && *scan != '\n') {
8067                 scan += UTF8SKIP(scan);
8068                 hardcount++;
8069             }
8070         } else {
8071             while (scan < loceol && *scan != '\n')
8072                 scan++;
8073         }
8074         break;
8075     case SANY:
8076         if (utf8_target) {
8077             while (scan < loceol && hardcount < max) {
8078                 scan += UTF8SKIP(scan);
8079                 hardcount++;
8080             }
8081         }
8082         else
8083             scan = loceol;
8084         break;
8085     case EXACTL:
8086         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8087         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8088             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8089         }
8090         /* FALLTHROUGH */
8091     case EXACT:
8092         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8093
8094         c = (U8)*STRING(p);
8095
8096         /* Can use a simple loop if the pattern char to match on is invariant
8097          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
8098          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8099          * true iff it doesn't matter if the argument is in UTF-8 or not */
8100         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
8101             if (utf8_target && loceol - scan > max) {
8102                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8103                  * since here, to match at all, 1 char == 1 byte */
8104                 loceol = scan + max;
8105             }
8106             while (scan < loceol && UCHARAT(scan) == c) {
8107                 scan++;
8108             }
8109         }
8110         else if (reginfo->is_utf8_pat) {
8111             if (utf8_target) {
8112                 STRLEN scan_char_len;
8113
8114                 /* When both target and pattern are UTF-8, we have to do
8115                  * string EQ */
8116                 while (hardcount < max
8117                        && scan < loceol
8118                        && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
8119                        && memEQ(scan, STRING(p), scan_char_len))
8120                 {
8121                     scan += scan_char_len;
8122                     hardcount++;
8123                 }
8124             }
8125             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
8126
8127                 /* Target isn't utf8; convert the character in the UTF-8
8128                  * pattern to non-UTF8, and do a simple loop */
8129                 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
8130                 while (scan < loceol && UCHARAT(scan) == c) {
8131                     scan++;
8132                 }
8133             } /* else pattern char is above Latin1, can't possibly match the
8134                  non-UTF-8 target */
8135         }
8136         else {
8137
8138             /* Here, the string must be utf8; pattern isn't, and <c> is
8139              * different in utf8 than not, so can't compare them directly.
8140              * Outside the loop, find the two utf8 bytes that represent c, and
8141              * then look for those in sequence in the utf8 string */
8142             U8 high = UTF8_TWO_BYTE_HI(c);
8143             U8 low = UTF8_TWO_BYTE_LO(c);
8144
8145             while (hardcount < max
8146                     && scan + 1 < loceol
8147                     && UCHARAT(scan) == high
8148                     && UCHARAT(scan + 1) == low)
8149             {
8150                 scan += 2;
8151                 hardcount++;
8152             }
8153         }
8154         break;
8155
8156     case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
8157         assert(! reginfo->is_utf8_pat);
8158         /* FALLTHROUGH */
8159     case EXACTFA:
8160         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8161         goto do_exactf;
8162
8163     case EXACTFL:
8164         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8165         utf8_flags = FOLDEQ_LOCALE;
8166         goto do_exactf;
8167
8168     case EXACTF:   /* This node only generated for non-utf8 patterns */
8169         assert(! reginfo->is_utf8_pat);
8170         utf8_flags = 0;
8171         goto do_exactf;
8172
8173     case EXACTFLU8:
8174         if (! utf8_target) {
8175             break;
8176         }
8177         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8178                                     | FOLDEQ_S2_FOLDS_SANE;
8179         goto do_exactf;
8180
8181     case EXACTFU_SS:
8182     case EXACTFU:
8183         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
8184
8185       do_exactf: {
8186         int c1, c2;
8187         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
8188
8189         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8190
8191         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
8192                                         reginfo))
8193         {
8194             if (c1 == CHRTEST_VOID) {
8195                 /* Use full Unicode fold matching */
8196                 char *tmpeol = reginfo->strend;
8197                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
8198                 while (hardcount < max
8199                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8200                                              STRING(p), NULL, pat_len,
8201                                              reginfo->is_utf8_pat, utf8_flags))
8202                 {
8203                     scan = tmpeol;
8204                     tmpeol = reginfo->strend;
8205                     hardcount++;
8206                 }
8207             }
8208             else if (utf8_target) {
8209                 if (c1 == c2) {
8210                     while (scan < loceol
8211                            && hardcount < max
8212                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8213                     {
8214                         scan += UTF8SKIP(scan);
8215                         hardcount++;
8216                     }
8217                 }
8218                 else {
8219                     while (scan < loceol
8220                            && hardcount < max
8221                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8222                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8223                     {
8224                         scan += UTF8SKIP(scan);
8225                         hardcount++;
8226                     }
8227                 }
8228             }
8229             else if (c1 == c2) {
8230                 while (scan < loceol && UCHARAT(scan) == c1) {
8231                     scan++;
8232                 }
8233             }
8234             else {
8235                 while (scan < loceol &&
8236                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8237                 {
8238                     scan++;
8239                 }
8240             }
8241         }
8242         break;
8243     }
8244     case ANYOFL:
8245         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8246         /* FALLTHROUGH */
8247     case ANYOF:
8248         if (utf8_target) {
8249             while (hardcount < max
8250                    && scan < loceol
8251                    && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
8252             {
8253                 scan += UTF8SKIP(scan);
8254                 hardcount++;
8255             }
8256         } else {
8257             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
8258                 scan++;
8259         }
8260         break;
8261
8262     /* The argument (FLAGS) to all the POSIX node types is the class number */
8263
8264     case NPOSIXL:
8265         to_complement = 1;
8266         /* FALLTHROUGH */
8267
8268     case POSIXL:
8269         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8270         if (! utf8_target) {
8271             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8272                                                                    *scan)))
8273             {
8274                 scan++;
8275             }
8276         } else {
8277             while (hardcount < max && scan < loceol
8278                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8279                                                                   (U8 *) scan)))
8280             {
8281                 scan += UTF8SKIP(scan);
8282                 hardcount++;
8283             }
8284         }
8285         break;
8286
8287     case POSIXD:
8288         if (utf8_target) {
8289             goto utf8_posix;
8290         }
8291         /* FALLTHROUGH */
8292
8293     case POSIXA:
8294         if (utf8_target && loceol - scan > max) {
8295
8296             /* We didn't adjust <loceol> at the beginning of this routine
8297              * because is UTF-8, but it is actually ok to do so, since here, to
8298              * match, 1 char == 1 byte. */
8299             loceol = scan + max;
8300         }
8301         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
8302             scan++;
8303         }
8304         break;
8305
8306     case NPOSIXD:
8307         if (utf8_target) {
8308             to_complement = 1;
8309             goto utf8_posix;
8310         }
8311         /* FALLTHROUGH */
8312
8313     case NPOSIXA:
8314         if (! utf8_target) {
8315             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
8316                 scan++;
8317             }
8318         }
8319         else {
8320
8321             /* The complement of something that matches only ASCII matches all
8322              * non-ASCII, plus everything in ASCII that isn't in the class. */
8323             while (hardcount < max && scan < loceol
8324                    && (! isASCII_utf8(scan)
8325                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
8326             {
8327                 scan += UTF8SKIP(scan);
8328                 hardcount++;
8329             }
8330         }
8331         break;
8332
8333     case NPOSIXU:
8334         to_complement = 1;
8335         /* FALLTHROUGH */
8336
8337     case POSIXU:
8338         if (! utf8_target) {
8339             while (scan < loceol && to_complement
8340                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
8341             {
8342                 scan++;
8343             }
8344         }
8345         else {
8346           utf8_posix:
8347             classnum = (_char_class_number) FLAGS(p);
8348             if (classnum < _FIRST_NON_SWASH_CC) {
8349
8350                 /* Here, a swash is needed for above-Latin1 code points.
8351                  * Process as many Latin1 code points using the built-in rules.
8352                  * Go to another loop to finish processing upon encountering
8353                  * the first Latin1 code point.  We could do that in this loop
8354                  * as well, but the other way saves having to test if the swash
8355                  * has been loaded every time through the loop: extra space to
8356                  * save a test. */
8357                 while (hardcount < max && scan < loceol) {
8358                     if (UTF8_IS_INVARIANT(*scan)) {
8359                         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
8360                                                                    classnum))))
8361                         {
8362                             break;
8363                         }
8364                         scan++;
8365                     }
8366                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
8367                         if (! (to_complement
8368                               ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
8369                                                                      *(scan + 1)),
8370                                                     classnum))))
8371                         {
8372                             break;
8373                         }
8374                         scan += 2;
8375                     }
8376                     else {
8377                         goto found_above_latin1;
8378                     }
8379
8380                     hardcount++;
8381                 }
8382             }
8383             else {
8384                 /* For these character classes, the knowledge of how to handle
8385                  * every code point is compiled in to Perl via a macro.  This
8386                  * code is written for making the loops as tight as possible.
8387                  * It could be refactored to save space instead */
8388                 switch (classnum) {
8389                     case _CC_ENUM_SPACE:
8390                         while (hardcount < max
8391                                && scan < loceol
8392                                && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
8393                         {
8394                             scan += UTF8SKIP(scan);
8395                             hardcount++;
8396                         }
8397                         break;
8398                     case _CC_ENUM_BLANK:
8399                         while (hardcount < max
8400                                && scan < loceol
8401                                && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
8402                         {
8403                             scan += UTF8SKIP(scan);
8404                             hardcount++;
8405                         }
8406                         break;
8407                     case _CC_ENUM_XDIGIT:
8408                         while (hardcount < max
8409                                && scan < loceol
8410                                && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
8411                         {
8412                             scan += UTF8SKIP(scan);
8413                             hardcount++;
8414                         }
8415                         break;
8416                     case _CC_ENUM_VERTSPACE:
8417                         while (hardcount < max
8418                                && scan < loceol
8419                                && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
8420                         {
8421                             scan += UTF8SKIP(scan);
8422                             hardcount++;
8423                         }
8424                         break;
8425                     case _CC_ENUM_CNTRL:
8426                         while (hardcount < max
8427                                && scan < loceol
8428                                && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
8429                         {
8430                             scan += UTF8SKIP(scan);
8431                             hardcount++;
8432                         }
8433                         break;
8434                     default:
8435                         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
8436                 }
8437             }
8438         }
8439         break;
8440
8441       found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
8442
8443         /* Load the swash if not already present */
8444         if (! PL_utf8_swash_ptrs[classnum]) {
8445             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
8446             PL_utf8_swash_ptrs[classnum] = _core_swash_init(
8447                                         "utf8",
8448                                         "",
8449                                         &PL_sv_undef, 1, 0,
8450                                         PL_XPosix_ptrs[classnum], &flags);
8451         }
8452
8453         while (hardcount < max && scan < loceol
8454                && to_complement ^ cBOOL(_generic_utf8(
8455                                        classnum,
8456                                        scan,
8457                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
8458                                                    (U8 *) scan,
8459                                                    TRUE))))
8460         {
8461             scan += UTF8SKIP(scan);
8462             hardcount++;
8463         }
8464         break;
8465
8466     case LNBREAK:
8467         if (utf8_target) {
8468             while (hardcount < max && scan < loceol &&
8469                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
8470                 scan += c;
8471                 hardcount++;
8472             }
8473         } else {
8474             /* LNBREAK can match one or two latin chars, which is ok, but we
8475              * have to use hardcount in this situation, and throw away the
8476              * adjustment to <loceol> done before the switch statement */
8477             loceol = reginfo->strend;
8478             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
8479                 scan+=c;
8480                 hardcount++;
8481             }
8482         }
8483         break;
8484
8485     case BOUNDL:
8486     case NBOUNDL:
8487         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8488         /* FALLTHROUGH */
8489     case BOUND:
8490     case BOUNDA:
8491     case BOUNDU:
8492     case EOS:
8493     case GPOS:
8494     case KEEPS:
8495     case NBOUND:
8496     case NBOUNDA:
8497     case NBOUNDU:
8498     case OPFAIL:
8499     case SBOL:
8500     case SEOL:
8501         /* These are all 0 width, so match right here or not at all. */
8502         break;
8503
8504     default:
8505         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
8506         /* NOTREACHED */
8507         NOT_REACHED; /* NOTREACHED */
8508
8509     }
8510
8511     if (hardcount)
8512         c = hardcount;
8513     else
8514         c = scan - *startposp;
8515     *startposp = scan;
8516
8517     DEBUG_r({
8518         GET_RE_DEBUG_FLAGS_DECL;
8519         DEBUG_EXECUTE_r({
8520             SV * const prop = sv_newmortal();
8521             regprop(prog, prop, p, reginfo, NULL);
8522             PerlIO_printf(Perl_debug_log,
8523                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
8524                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
8525         });
8526     });
8527
8528     return(c);
8529 }
8530
8531
8532 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
8533 /*
8534 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
8535 create a copy so that changes the caller makes won't change the shared one.
8536 If <altsvp> is non-null, will return NULL in it, for back-compat.
8537  */
8538 SV *
8539 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
8540 {
8541     PERL_ARGS_ASSERT_REGCLASS_SWASH;
8542
8543     if (altsvp) {
8544         *altsvp = NULL;
8545     }
8546
8547     return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
8548 }
8549
8550 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
8551
8552 /*
8553  - reginclass - determine if a character falls into a character class
8554
8555   n is the ANYOF-type regnode
8556   p is the target string
8557   p_end points to one byte beyond the end of the target string
8558   utf8_target tells whether p is in UTF-8.
8559
8560   Returns true if matched; false otherwise.
8561
8562   Note that this can be a synthetic start class, a combination of various
8563   nodes, so things you think might be mutually exclusive, such as locale,
8564   aren't.  It can match both locale and non-locale
8565
8566  */
8567
8568 STATIC bool
8569 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
8570 {
8571     dVAR;
8572     const char flags = ANYOF_FLAGS(n);
8573     bool match = FALSE;
8574     UV c = *p;
8575
8576     PERL_ARGS_ASSERT_REGINCLASS;
8577
8578     /* If c is not already the code point, get it.  Note that
8579      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
8580     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
8581         STRLEN c_len = 0;
8582         c = utf8n_to_uvchr(p, p_end - p, &c_len,
8583                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
8584                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
8585                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
8586                  * UTF8_ALLOW_FFFF */
8587         if (c_len == (STRLEN)-1)
8588             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
8589         if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
8590             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
8591         }
8592     }
8593
8594     /* If this character is potentially in the bitmap, check it */
8595     if (c < NUM_ANYOF_CODE_POINTS) {
8596         if (ANYOF_BITMAP_TEST(n, c))
8597             match = TRUE;
8598         else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII)
8599                   && ! utf8_target
8600                   && ! isASCII(c))
8601         {
8602             match = TRUE;
8603         }
8604         else if (flags & ANYOF_LOCALE_FLAGS) {
8605             if ((flags & ANYOF_LOC_FOLD)
8606                 && c < 256
8607                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
8608             {
8609                 match = TRUE;
8610             }
8611             else if (ANYOF_POSIXL_TEST_ANY_SET(n)
8612                      && c < 256
8613             ) {
8614
8615                 /* The data structure is arranged so bits 0, 2, 4, ... are set
8616                  * if the class includes the Posix character class given by
8617                  * bit/2; and 1, 3, 5, ... are set if the class includes the
8618                  * complemented Posix class given by int(bit/2).  So we loop
8619                  * through the bits, each time changing whether we complement
8620                  * the result or not.  Suppose for the sake of illustration
8621                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
8622                  * is set, it means there is a match for this ANYOF node if the
8623                  * character is in the class given by the expression (0 / 2 = 0
8624                  * = \w).  If it is in that class, isFOO_lc() will return 1,
8625                  * and since 'to_complement' is 0, the result will stay TRUE,
8626                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
8627                  * bit 1 is 1.  That means there is a match if the character
8628                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
8629                  * but will on bit 1.  On the second iteration 'to_complement'
8630                  * will be 1, so the exclusive or will reverse things, so we
8631                  * are testing for \W.  On the third iteration, 'to_complement'
8632                  * will be 0, and we would be testing for \s; the fourth
8633                  * iteration would test for \S, etc.
8634                  *
8635                  * Note that this code assumes that all the classes are closed
8636                  * under folding.  For example, if a character matches \w, then
8637                  * its fold does too; and vice versa.  This should be true for
8638                  * any well-behaved locale for all the currently defined Posix
8639                  * classes, except for :lower: and :upper:, which are handled
8640                  * by the pseudo-class :cased: which matches if either of the
8641                  * other two does.  To get rid of this assumption, an outer
8642                  * loop could be used below to iterate over both the source
8643                  * character, and its fold (if different) */
8644
8645                 int count = 0;
8646                 int to_complement = 0;
8647
8648                 while (count < ANYOF_MAX) {
8649                     if (ANYOF_POSIXL_TEST(n, count)
8650                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
8651                     {
8652                         match = TRUE;
8653                         break;
8654                     }
8655                     count++;
8656                     to_complement ^= 1;
8657                 }
8658             }
8659         }
8660     }
8661
8662
8663     /* If the bitmap didn't (or couldn't) match, and something outside the
8664      * bitmap could match, try that. */
8665     if (!match) {
8666         if (c >= NUM_ANYOF_CODE_POINTS
8667             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
8668         {
8669             match = TRUE;       /* Everything above the bitmap matches */
8670         }
8671         else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)
8672                   || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES))
8673                   || ((flags & ANYOF_LOC_FOLD)
8674                        && IN_UTF8_CTYPE_LOCALE
8675                        && ARG(n) != ANYOF_ONLY_HAS_BITMAP))
8676         {
8677             SV* only_utf8_locale = NULL;
8678             SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
8679                                                        &only_utf8_locale, NULL);
8680             if (sw) {
8681                 U8 utf8_buffer[2];
8682                 U8 * utf8_p;
8683                 if (utf8_target) {
8684                     utf8_p = (U8 *) p;
8685                 } else { /* Convert to utf8 */
8686                     utf8_p = utf8_buffer;
8687                     append_utf8_from_native_byte(*p, &utf8_p);
8688                     utf8_p = utf8_buffer;
8689                 }
8690
8691                 if (swash_fetch(sw, utf8_p, TRUE)) {
8692                     match = TRUE;
8693                 }
8694             }
8695             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
8696                 match = _invlist_contains_cp(only_utf8_locale, c);
8697             }
8698         }
8699
8700         if (UNICODE_IS_SUPER(c)
8701             && (flags & ANYOF_WARN_SUPER)
8702             && ckWARN_d(WARN_NON_UNICODE))
8703         {
8704             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
8705                 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
8706         }
8707     }
8708
8709 #if ANYOF_INVERT != 1
8710     /* Depending on compiler optimization cBOOL takes time, so if don't have to
8711      * use it, don't */
8712 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
8713 #endif
8714
8715     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
8716     return (flags & ANYOF_INVERT) ^ match;
8717 }
8718
8719 STATIC U8 *
8720 S_reghop3(U8 *s, SSize_t off, const U8* lim)
8721 {
8722     /* return the position 'off' UTF-8 characters away from 's', forward if
8723      * 'off' >= 0, backwards if negative.  But don't go outside of position
8724      * 'lim', which better be < s  if off < 0 */
8725
8726     PERL_ARGS_ASSERT_REGHOP3;
8727
8728     if (off >= 0) {
8729         while (off-- && s < lim) {
8730             /* XXX could check well-formedness here */
8731             s += UTF8SKIP(s);
8732         }
8733     }
8734     else {
8735         while (off++ && s > lim) {
8736             s--;
8737             if (UTF8_IS_CONTINUED(*s)) {
8738                 while (s > lim && UTF8_IS_CONTINUATION(*s))
8739                     s--;
8740             }
8741             /* XXX could check well-formedness here */
8742         }
8743     }
8744     return s;
8745 }
8746
8747 STATIC U8 *
8748 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
8749 {
8750     PERL_ARGS_ASSERT_REGHOP4;
8751
8752     if (off >= 0) {
8753         while (off-- && s < rlim) {
8754             /* XXX could check well-formedness here */
8755             s += UTF8SKIP(s);
8756         }
8757     }
8758     else {
8759         while (off++ && s > llim) {
8760             s--;
8761             if (UTF8_IS_CONTINUED(*s)) {
8762                 while (s > llim && UTF8_IS_CONTINUATION(*s))
8763                     s--;
8764             }
8765             /* XXX could check well-formedness here */
8766         }
8767     }
8768     return s;
8769 }
8770
8771 /* like reghop3, but returns NULL on overrun, rather than returning last
8772  * char pos */
8773
8774 STATIC U8 *
8775 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
8776 {
8777     PERL_ARGS_ASSERT_REGHOPMAYBE3;
8778
8779     if (off >= 0) {
8780         while (off-- && s < lim) {
8781             /* XXX could check well-formedness here */
8782             s += UTF8SKIP(s);
8783         }
8784         if (off >= 0)
8785             return NULL;
8786     }
8787     else {
8788         while (off++ && s > lim) {
8789             s--;
8790             if (UTF8_IS_CONTINUED(*s)) {
8791                 while (s > lim && UTF8_IS_CONTINUATION(*s))
8792                     s--;
8793             }
8794             /* XXX could check well-formedness here */
8795         }
8796         if (off <= 0)
8797             return NULL;
8798     }
8799     return s;
8800 }
8801
8802
8803 /* when executing a regex that may have (?{}), extra stuff needs setting
8804    up that will be visible to the called code, even before the current
8805    match has finished. In particular:
8806
8807    * $_ is localised to the SV currently being matched;
8808    * pos($_) is created if necessary, ready to be updated on each call-out
8809      to code;
8810    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
8811      isn't set until the current pattern is successfully finished), so that
8812      $1 etc of the match-so-far can be seen;
8813    * save the old values of subbeg etc of the current regex, and  set then
8814      to the current string (again, this is normally only done at the end
8815      of execution)
8816 */
8817
8818 static void
8819 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
8820 {
8821     MAGIC *mg;
8822     regexp *const rex = ReANY(reginfo->prog);
8823     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8824
8825     eval_state->rex = rex;
8826
8827     if (reginfo->sv) {
8828         /* Make $_ available to executed code. */
8829         if (reginfo->sv != DEFSV) {
8830             SAVE_DEFSV;
8831             DEFSV_set(reginfo->sv);
8832         }
8833
8834         if (!(mg = mg_find_mglob(reginfo->sv))) {
8835             /* prepare for quick setting of pos */
8836             mg = sv_magicext_mglob(reginfo->sv);
8837             mg->mg_len = -1;
8838         }
8839         eval_state->pos_magic = mg;
8840         eval_state->pos       = mg->mg_len;
8841         eval_state->pos_flags = mg->mg_flags;
8842     }
8843     else
8844         eval_state->pos_magic = NULL;
8845
8846     if (!PL_reg_curpm) {
8847         /* PL_reg_curpm is a fake PMOP that we can attach the current
8848          * regex to and point PL_curpm at, so that $1 et al are visible
8849          * within a /(?{})/. It's just allocated once per interpreter the
8850          * first time its needed */
8851         Newxz(PL_reg_curpm, 1, PMOP);
8852 #ifdef USE_ITHREADS
8853         {
8854             SV* const repointer = &PL_sv_undef;
8855             /* this regexp is also owned by the new PL_reg_curpm, which
8856                will try to free it.  */
8857             av_push(PL_regex_padav, repointer);
8858             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
8859             PL_regex_pad = AvARRAY(PL_regex_padav);
8860         }
8861 #endif
8862     }
8863     SET_reg_curpm(reginfo->prog);
8864     eval_state->curpm = PL_curpm;
8865     PL_curpm = PL_reg_curpm;
8866     if (RXp_MATCH_COPIED(rex)) {
8867         /*  Here is a serious problem: we cannot rewrite subbeg,
8868             since it may be needed if this match fails.  Thus
8869             $` inside (?{}) could fail... */
8870         eval_state->subbeg     = rex->subbeg;
8871         eval_state->sublen     = rex->sublen;
8872         eval_state->suboffset  = rex->suboffset;
8873         eval_state->subcoffset = rex->subcoffset;
8874 #ifdef PERL_ANY_COW
8875         eval_state->saved_copy = rex->saved_copy;
8876 #endif
8877         RXp_MATCH_COPIED_off(rex);
8878     }
8879     else
8880         eval_state->subbeg = NULL;
8881     rex->subbeg = (char *)reginfo->strbeg;
8882     rex->suboffset = 0;
8883     rex->subcoffset = 0;
8884     rex->sublen = reginfo->strend - reginfo->strbeg;
8885 }
8886
8887
8888 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
8889
8890 static void
8891 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
8892 {
8893     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
8894     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
8895     regmatch_slab *s;
8896
8897     Safefree(aux->poscache);
8898
8899     if (eval_state) {
8900
8901         /* undo the effects of S_setup_eval_state() */
8902
8903         if (eval_state->subbeg) {
8904             regexp * const rex = eval_state->rex;
8905             rex->subbeg     = eval_state->subbeg;
8906             rex->sublen     = eval_state->sublen;
8907             rex->suboffset  = eval_state->suboffset;
8908             rex->subcoffset = eval_state->subcoffset;
8909 #ifdef PERL_ANY_COW
8910             rex->saved_copy = eval_state->saved_copy;
8911 #endif
8912             RXp_MATCH_COPIED_on(rex);
8913         }
8914         if (eval_state->pos_magic)
8915         {
8916             eval_state->pos_magic->mg_len = eval_state->pos;
8917             eval_state->pos_magic->mg_flags =
8918                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8919                | (eval_state->pos_flags & MGf_BYTES);
8920         }
8921
8922         PL_curpm = eval_state->curpm;
8923     }
8924
8925     PL_regmatch_state = aux->old_regmatch_state;
8926     PL_regmatch_slab  = aux->old_regmatch_slab;
8927
8928     /* free all slabs above current one - this must be the last action
8929      * of this function, as aux and eval_state are allocated within
8930      * slabs and may be freed here */
8931
8932     s = PL_regmatch_slab->next;
8933     if (s) {
8934         PL_regmatch_slab->next = NULL;
8935         while (s) {
8936             regmatch_slab * const osl = s;
8937             s = s->next;
8938             Safefree(osl);
8939         }
8940     }
8941 }
8942
8943
8944 STATIC void
8945 S_to_utf8_substr(pTHX_ regexp *prog)
8946 {
8947     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8948      * on the converted value */
8949
8950     int i = 1;
8951
8952     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8953
8954     do {
8955         if (prog->substrs->data[i].substr
8956             && !prog->substrs->data[i].utf8_substr) {
8957             SV* const sv = newSVsv(prog->substrs->data[i].substr);
8958             prog->substrs->data[i].utf8_substr = sv;
8959             sv_utf8_upgrade(sv);
8960             if (SvVALID(prog->substrs->data[i].substr)) {
8961                 if (SvTAIL(prog->substrs->data[i].substr)) {
8962                     /* Trim the trailing \n that fbm_compile added last
8963                        time.  */
8964                     SvCUR_set(sv, SvCUR(sv) - 1);
8965                     /* Whilst this makes the SV technically "invalid" (as its
8966                        buffer is no longer followed by "\0") when fbm_compile()
8967                        adds the "\n" back, a "\0" is restored.  */
8968                     fbm_compile(sv, FBMcf_TAIL);
8969                 } else
8970                     fbm_compile(sv, 0);
8971             }
8972             if (prog->substrs->data[i].substr == prog->check_substr)
8973                 prog->check_utf8 = sv;
8974         }
8975     } while (i--);
8976 }
8977
8978 STATIC bool
8979 S_to_byte_substr(pTHX_ regexp *prog)
8980 {
8981     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
8982      * on the converted value; returns FALSE if can't be converted. */
8983
8984     int i = 1;
8985
8986     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
8987
8988     do {
8989         if (prog->substrs->data[i].utf8_substr
8990             && !prog->substrs->data[i].substr) {
8991             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
8992             if (! sv_utf8_downgrade(sv, TRUE)) {
8993                 return FALSE;
8994             }
8995             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
8996                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
8997                     /* Trim the trailing \n that fbm_compile added last
8998                         time.  */
8999                     SvCUR_set(sv, SvCUR(sv) - 1);
9000                     fbm_compile(sv, FBMcf_TAIL);
9001                 } else
9002                     fbm_compile(sv, 0);
9003             }
9004             prog->substrs->data[i].substr = sv;
9005             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9006                 prog->check_substr = sv;
9007         }
9008     } while (i--);
9009
9010     return TRUE;
9011 }
9012
9013 /*
9014  * ex: set ts=8 sts=4 sw=4 et:
9015  */