src/5017005/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 /*
  46  * pregcomp and pregexec -- regsub and regerror are not used in perl
  47  *
  48  *      Copyright (c) 1986 by University of Toronto.
  49  *      Written by Henry Spencer.  Not derived from licensed software.
  50  *
  51  *      Permission is granted to anyone to use this software for any
  52  *      purpose on any computer system, and to redistribute it freely,
  53  *      subject to the following restrictions:
  54  *
  55  *      1. The author is not responsible for the consequences of use of
  56  *              this software, no matter how awful, even if they arise
  57  *              from defects in it.
  58  *
  59  *      2. The origin of this software must not be misrepresented, either
  60  *              by explicit claim or by omission.
  61  *
  62  *      3. Altered versions must be plainly marked as such, and must not
  63  *              be misrepresented as being the original software.
  64  *
  65  ****    Alterations to Henry's code are...
  66  ****
  67  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  68  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  69  ****    by Larry Wall and others
  70  ****
  71  ****    You may distribute under the terms of either the GNU General Public
  72  ****    License or the Artistic License, as specified in the README file.
  73  *
  74  * Beware that some of this code is subtly aware of the way operator
  75  * precedence is structured in regular expressions.  Serious changes in
  76  * regular-expression syntax might require a total rethink.
  77  */
  78 #include "EXTERN.h"
  79 #define PERL_IN_REGEXEC_C
  80 #include "perl.h"
  81 #include "re_defs.h"
  82
  83 #ifdef PERL_IN_XSUB_RE
  84 #  include "re_comp.h"
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "inline_invlist.c"
  90 #include "unicode_constants.h"
  91
  92 #define RF_tainted      1       /* tainted information used? e.g. locale */
  93 #define RF_warned       2               /* warned about big count? */
  94
  95 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  96
  97 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  98
  99 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105 /* Valid for non-utf8 strings: avoids the reginclass
 106  * call if there are no complications: i.e., if everything matchable is
 107  * straight forward in the bitmap */
 108 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 109                                               : ANYOF_BITMAP_TEST(p,*(c)))
 110
 111 /*
 112  * Forwards.
 113  */
 114
 115 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 116 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 117
 118 #define HOPc(pos,off) \
 119         (char *)(PL_reg_match_utf8 \
 120             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 121             : (U8*)(pos + off))
 122 #define HOPBACKc(pos, off) \
 123         (char*)(PL_reg_match_utf8\
 124             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 125             : (pos - off >= PL_bostr)           \
 126                 ? (U8*)pos - off                \
 127                 : NULL)
 128
 129 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 130 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 131
 132
 133 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 134 #define NEXTCHR_IS_EOS (nextchr < 0)
 135
 136 #define SET_nextchr \
 137     nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
 138
 139 #define SET_locinput(p) \
 140     locinput = (p);  \
 141     SET_nextchr
 142
 143
 144 /* these are unrolled below in the CCC_TRY_XXX defined */
 145 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 146     if (!CAT2(PL_utf8_,class)) { \
 147         bool ok; \
 148         ENTER; save_re_context(); \
 149         ok=CAT2(is_utf8_,class)((const U8*)str); \
 150         PERL_UNUSED_VAR(ok); \
 151         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 152 /* Doesn't do an assert to verify that is correct */
 153 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 154     if (!CAT2(PL_utf8_,class)) { \
 155         bool throw_away PERL_UNUSED_DECL; \
 156         ENTER; save_re_context(); \
 157         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 158         LEAVE; } } STMT_END
 159
 160 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 161 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 162 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 163
 164 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 165         /* No asserts are done for some of these, in case called on a   */  \
 166         /* Unicode version in which they map to nothing */                  \
 167         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
 168         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 169
 170 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 171
 172 /* The actual code for CCC_TRY, which uses several variables from the routine
 173  * it's callable from.  It is designed to be the bulk of a case statement.
 174  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 175  *      nextchr matches the class.
 176  * UTF8_TEST is the whole test string to use for utf8 targets
 177  * LOAD is what to use to test, and if not present to load in the swash for the
 178  *      class
 179  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 180  *      UTF8_TEST test.
 181  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 182  * utf8 and a variant, load the swash if necessary and test using the utf8
 183  * test.  Advance to the next character if test is ok, otherwise fail; If not
 184  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 185  * fails, or advance to the next character */
 186
 187 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 188     if (NEXTCHR_IS_EOS) {                                              \
 189         sayNO;                                                                \
 190     }                                                                         \
 191     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 192         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 193         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 194             sayNO;                                                            \
 195         }                                                                     \
 196     }                                                                         \
 197     else if (POS_OR_NEG (FUNC(nextchr))) {                                    \
 198             sayNO;                                                            \
 199     }                                                                         \
 200     goto increment_locinput;
 201
 202 /* Handle the non-locale cases for a character class and its complement.  It
 203  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 204  * This is because that code fails when the test succeeds, so we want to have
 205  * the test fail so that the code succeeds.  The swash is stored in a
 206  * predictable PL_ place */
 207 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 208                            CLASS, STR)                                        \
 209     case NAME:                                                                \
 210         _CCC_TRY_CODE( !, FUNC,                                               \
 211                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 212                                             (U8*)locinput, TRUE)),            \
 213                           CLASS, STR)                                         \
 214     case NNAME:                                                               \
 215         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 216                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 217                                             (U8*)locinput, TRUE)),            \
 218                           CLASS, STR)                                         \
 219
 220 /* Generate the case statements for both locale and non-locale character
 221  * classes in regmatch for classes that don't have special unicode semantics.
 222  * Locales don't use an immediate swash, but an intermediary special locale
 223  * function that is called on the pointer to the current place in the input
 224  * string.  That function will resolve to needing the same swash.  One might
 225  * think that because we don't know what the locale will match, we shouldn't
 226  * check with the swash loading function that it loaded properly; ie, that we
 227  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 228  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 229  * irrelevant here */
 230 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 231                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 232                 NAMEA, NNAMEA, FUNCA,                                         \
 233                 CLASS, STR)                                                   \
 234     case NAMEL:                                                               \
 235         PL_reg_flags |= RF_tainted;                                           \
 236         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 237     case NNAMEL:                                                              \
 238         PL_reg_flags |= RF_tainted;                                           \
 239         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 240                        CLASS, STR)                                            \
 241     case NAMEA:                                                               \
 242         if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 243             sayNO;                                                            \
 244         }                                                                     \
 245         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 246         locinput++;                                        \
 247         break;                                                                \
 248     case NNAMEA:                                                              \
 249         if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 250             sayNO;                                                            \
 251         }                                                                     \
 252         goto increment_locinput;                                              \
 253     /* Generate the non-locale cases */                                       \
 254     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 255
 256 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 257  * statements to handle separate Unicode semantics nodes */
 258 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 259                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 260                   NAMEU, NNAMEU, FUNCU,                                        \
 261                   NAMEA, NNAMEA, FUNCA,                                        \
 262                   CLASS, STR)                                                  \
 263     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 264             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 265             NAMEA, NNAMEA, FUNCA,                                              \
 266             CLASS, STR)                                                        \
 267     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 268
 269 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 270
 271 /* for use after a quantifier and before an EXACT-like node -- japhy */
 272 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 273  *
 274  * NOTE that *nothing* that affects backtracking should be in here, specifically
 275  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 276  * node that is in between two EXACT like nodes when ascertaining what the required
 277  * "follow" character is. This should probably be moved to regex compile time
 278  * although it may be done at run time beause of the REF possibility - more
 279  * investigation required. -- demerphq
 280 */
 281 #define JUMPABLE(rn) (      \
 282     OP(rn) == OPEN ||       \
 283     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 284     OP(rn) == EVAL ||   \
 285     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 286     OP(rn) == PLUS || OP(rn) == MINMOD || \
 287     OP(rn) == KEEPS || \
 288     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 289 )
 290 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 291
 292 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 293
 294 #if 0
 295 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 296    we don't need this definition. */
 297 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 298 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 299 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 300
 301 #else
 302 /* ... so we use this as its faster. */
 303 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 304 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 305 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 306 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 307
 308 #endif
 309
 310 /*
 311   Search for mandatory following text node; for lookahead, the text must
 312   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 313 */
 314 #define FIND_NEXT_IMPT(rn) STMT_START { \
 315     while (JUMPABLE(rn)) { \
 316         const OPCODE type = OP(rn); \
 317         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 318             rn = NEXTOPER(NEXTOPER(rn)); \
 319         else if (type == PLUS) \
 320             rn = NEXTOPER(rn); \
 321         else if (type == IFMATCH) \
 322             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 323         else rn += NEXT_OFF(rn); \
 324     } \
 325 } STMT_END
 326
 327
 328 static void restore_pos(pTHX_ void *arg);
 329
 330 #define REGCP_PAREN_ELEMS 3
 331 #define REGCP_OTHER_ELEMS 3
 332 #define REGCP_FRAME_ELEMS 1
 333 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 334  * are needed for the regexp context stack bookkeeping. */
 335
 336 STATIC CHECKPOINT
 337 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 338 {
 339     dVAR;
 340     const int retval = PL_savestack_ix;
 341     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 342     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 343     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 344     I32 p;
 345     GET_RE_DEBUG_FLAGS_DECL;
 346
 347     PERL_ARGS_ASSERT_REGCPPUSH;
 348
 349     if (paren_elems_to_push < 0)
 350         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 351                    paren_elems_to_push);
 352
 353     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 354         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 355                    " out of range (%lu-%ld)",
 356                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 357
 358     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 359
 360     DEBUG_BUFFERS_r(
 361         if ((int)PL_regsize > (int)parenfloor)
 362             PerlIO_printf(Perl_debug_log,
 363                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 364                 PTR2UV(rex),
 365                 PTR2UV(rex->offs)
 366             );
 367     );
 368     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 369 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 370         SSPUSHINT(rex->offs[p].end);
 371         SSPUSHINT(rex->offs[p].start);
 372         SSPUSHINT(rex->offs[p].start_tmp);
 373         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 374             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 375             (UV)p,
 376             (IV)rex->offs[p].start,
 377             (IV)rex->offs[p].start_tmp,
 378             (IV)rex->offs[p].end
 379         ));
 380     }
 381 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 382     SSPUSHINT(PL_regsize);
 383     SSPUSHINT(rex->lastparen);
 384     SSPUSHINT(rex->lastcloseparen);
 385     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 386
 387     return retval;
 388 }
 389
 390 /* These are needed since we do not localize EVAL nodes: */
 391 #define REGCP_SET(cp)                                           \
 392     DEBUG_STATE_r(                                              \
 393             PerlIO_printf(Perl_debug_log,                       \
 394                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 395                 (IV)PL_savestack_ix));                          \
 396     cp = PL_savestack_ix
 397
 398 #define REGCP_UNWIND(cp)                                        \
 399     DEBUG_STATE_r(                                              \
 400         if (cp != PL_savestack_ix)                              \
 401             PerlIO_printf(Perl_debug_log,                       \
 402                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 403                 (IV)(cp), (IV)PL_savestack_ix));                \
 404     regcpblow(cp)
 405
 406 #define UNWIND_PAREN(lp, lcp)               \
 407     for (n = rex->lastparen; n > lp; n--)   \
 408         rex->offs[n].end = -1;              \
 409     rex->lastparen = n;                     \
 410     rex->lastcloseparen = lcp;
 411
 412
 413 STATIC void
 414 S_regcppop(pTHX_ regexp *rex)
 415 {
 416     dVAR;
 417     UV i;
 418     U32 paren;
 419     GET_RE_DEBUG_FLAGS_DECL;
 420
 421     PERL_ARGS_ASSERT_REGCPPOP;
 422
 423     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 424     i = SSPOPUV;
 425     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 426     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 427     rex->lastcloseparen = SSPOPINT;
 428     rex->lastparen = SSPOPINT;
 429     PL_regsize = SSPOPINT;
 430
 431     i -= REGCP_OTHER_ELEMS;
 432     /* Now restore the parentheses context. */
 433     DEBUG_BUFFERS_r(
 434         if (i || rex->lastparen + 1 <= rex->nparens)
 435             PerlIO_printf(Perl_debug_log,
 436                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 437                 PTR2UV(rex),
 438                 PTR2UV(rex->offs)
 439             );
 440     );
 441     paren = PL_regsize;
 442     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 443         I32 tmps;
 444         rex->offs[paren].start_tmp = SSPOPINT;
 445         rex->offs[paren].start = SSPOPINT;
 446         tmps = SSPOPINT;
 447         if (paren <= rex->lastparen)
 448             rex->offs[paren].end = tmps;
 449         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 450             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 451             (UV)paren,
 452             (IV)rex->offs[paren].start,
 453             (IV)rex->offs[paren].start_tmp,
 454             (IV)rex->offs[paren].end,
 455             (paren > rex->lastparen ? "(skipped)" : ""));
 456         );
 457         paren--;
 458     }
 459 #if 1
 460     /* It would seem that the similar code in regtry()
 461      * already takes care of this, and in fact it is in
 462      * a better location to since this code can #if 0-ed out
 463      * but the code in regtry() is needed or otherwise tests
 464      * requiring null fields (pat.t#187 and split.t#{13,14}
 465      * (as of patchlevel 7877)  will fail.  Then again,
 466      * this code seems to be necessary or otherwise
 467      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 468      * --jhi updated by dapm */
 469     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 470         if (i > PL_regsize)
 471             rex->offs[i].start = -1;
 472         rex->offs[i].end = -1;
 473         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 474             "    \\%"UVuf": %s   ..-1 undeffing\n",
 475             (UV)i,
 476             (i > PL_regsize) ? "-1" : "  "
 477         ));
 478     }
 479 #endif
 480 }
 481
 482 /* restore the parens and associated vars at savestack position ix,
 483  * but without popping the stack */
 484
 485 STATIC void
 486 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 487 {
 488     I32 tmpix = PL_savestack_ix;
 489     PL_savestack_ix = ix;
 490     regcppop(rex);
 491     PL_savestack_ix = tmpix;
 492 }
 493
 494 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 495
 496 /*
 497  * pregexec and friends
 498  */
 499
 500 #ifndef PERL_IN_XSUB_RE
 501 /*
 502  - pregexec - match a regexp against a string
 503  */
 504 I32
 505 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 506          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 507 /* stringarg: the point in the string at which to begin matching */
 508 /* strend:    pointer to null at end of string */
 509 /* strbeg:    real beginning of string */
 510 /* minend:    end of match must be >= minend bytes after stringarg. */
 511 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 512  *            itself is accessed via the pointers above */
 513 /* nosave:    For optimizations. */
 514 {
 515     PERL_ARGS_ASSERT_PREGEXEC;
 516
 517     return
 518         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 519                       nosave ? 0 : REXEC_COPY_STR);
 520 }
 521 #endif
 522
 523 /*
 524  * Need to implement the following flags for reg_anch:
 525  *
 526  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 527  * USE_INTUIT_ML
 528  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 529  * INTUIT_AUTORITATIVE_ML
 530  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 531  * INTUIT_ONCE_ML
 532  *
 533  * Another flag for this function: SECOND_TIME (so that float substrs
 534  * with giant delta may be not rechecked).
 535  */
 536
 537 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 538
 539 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 540    Otherwise, only SvCUR(sv) is used to get strbeg. */
 541
 542 /* XXXX We assume that strpos is strbeg unless sv. */
 543
 544 /* XXXX Some places assume that there is a fixed substring.
 545         An update may be needed if optimizer marks as "INTUITable"
 546         RExen without fixed substrings.  Similarly, it is assumed that
 547         lengths of all the strings are no more than minlen, thus they
 548         cannot come from lookahead.
 549         (Or minlen should take into account lookahead.)
 550   NOTE: Some of this comment is not correct. minlen does now take account
 551   of lookahead/behind. Further research is required. -- demerphq
 552
 553 */
 554
 555 /* A failure to find a constant substring means that there is no need to make
 556    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 557    finding a substring too deep into the string means that less calls to
 558    regtry() should be needed.
 559
 560    REx compiler's optimizer found 4 possible hints:
 561         a) Anchored substring;
 562         b) Fixed substring;
 563         c) Whether we are anchored (beginning-of-line or \G);
 564         d) First node (of those at offset 0) which may distinguish positions;
 565    We use a)b)d) and multiline-part of c), and try to find a position in the
 566    string which does not contradict any of them.
 567  */
 568
 569 /* Most of decisions we do here should have been done at compile time.
 570    The nodes of the REx which we used for the search should have been
 571    deleted from the finite automaton. */
 572
 573 char *
 574 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 575                      char *strend, const U32 flags, re_scream_pos_data *data)
 576 {
 577     dVAR;
 578     struct regexp *const prog = (struct regexp *)SvANY(rx);
 579     I32 start_shift = 0;
 580     /* Should be nonnegative! */
 581     I32 end_shift   = 0;
 582     char *s;
 583     SV *check;
 584     char *strbeg;
 585     char *t;
 586     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 587     I32 ml_anch;
 588     char *other_last = NULL;    /* other substr checked before this */
 589     char *check_at = NULL;              /* check substr found at this pos */
 590     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 591     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 592     RXi_GET_DECL(prog,progi);
 593 #ifdef DEBUGGING
 594     const char * const i_strpos = strpos;
 595 #endif
 596     GET_RE_DEBUG_FLAGS_DECL;
 597
 598     PERL_ARGS_ASSERT_RE_INTUIT_START;
 599     PERL_UNUSED_ARG(flags);
 600     PERL_UNUSED_ARG(data);
 601
 602     RX_MATCH_UTF8_set(rx,utf8_target);
 603
 604     if (RX_UTF8(rx)) {
 605         PL_reg_flags |= RF_utf8;
 606     }
 607     DEBUG_EXECUTE_r(
 608         debug_start_match(rx, utf8_target, strpos, strend,
 609             sv ? "Guessing start of match in sv for"
 610                : "Guessing start of match in string for");
 611               );
 612
 613     /* CHR_DIST() would be more correct here but it makes things slow. */
 614     if (prog->minlen > strend - strpos) {
 615         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 616                               "String too short... [re_intuit_start]\n"));
 617         goto fail;
 618     }
 619
 620     /* XXX we need to pass strbeg as a separate arg: the following is
 621      * guesswork and can be wrong... */
 622     if (sv && SvPOK(sv)) {
 623         char * p   = SvPVX(sv);
 624         STRLEN cur = SvCUR(sv);
 625         if (p <= strpos && strpos < p + cur) {
 626             strbeg = p;
 627             assert(p <= strend && strend <= p + cur);
 628         }
 629         else
 630             strbeg = strend - cur;
 631     }
 632     else
 633         strbeg = strpos;
 634
 635     PL_regeol = strend;
 636     if (utf8_target) {
 637         if (!prog->check_utf8 && prog->check_substr)
 638             to_utf8_substr(prog);
 639         check = prog->check_utf8;
 640     } else {
 641         if (!prog->check_substr && prog->check_utf8) {
 642             if (! to_byte_substr(prog)) {
 643                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 644                                         non_utf8_target_but_utf8_required));
 645                 goto fail;
 646             }
 647         }
 648         check = prog->check_substr;
 649     }
 650     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 651         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 652                      || ( (prog->extflags & RXf_ANCH_BOL)
 653                           && !multiline ) );    /* Check after \n? */
 654
 655         if (!ml_anch) {
 656           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 657                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 658                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 659                && sv && !SvROK(sv)
 660                && (strpos != strbeg)) {
 661               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 662               goto fail;
 663           }
 664           if (prog->check_offset_min == prog->check_offset_max &&
 665               !(prog->extflags & RXf_CANY_SEEN)) {
 666             /* Substring at constant offset from beg-of-str... */
 667             I32 slen;
 668
 669             s = HOP3c(strpos, prog->check_offset_min, strend);
 670
 671             if (SvTAIL(check)) {
 672                 slen = SvCUR(check);    /* >= 1 */
 673
 674                 if ( strend - s > slen || strend - s < slen - 1
 675                      || (strend - s == slen && strend[-1] != '\n')) {
 676                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 677                     goto fail_finish;
 678                 }
 679                 /* Now should match s[0..slen-2] */
 680                 slen--;
 681                 if (slen && (*SvPVX_const(check) != *s
 682                              || (slen > 1
 683                                  && memNE(SvPVX_const(check), s, slen)))) {
 684                   report_neq:
 685                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 686                     goto fail_finish;
 687                 }
 688             }
 689             else if (*SvPVX_const(check) != *s
 690                      || ((slen = SvCUR(check)) > 1
 691                          && memNE(SvPVX_const(check), s, slen)))
 692                 goto report_neq;
 693             check_at = s;
 694             goto success_at_start;
 695           }
 696         }
 697         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 698         s = strpos;
 699         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 700         end_shift = prog->check_end_shift;
 701
 702         if (!ml_anch) {
 703             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 704                                          - (SvTAIL(check) != 0);
 705             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 706
 707             if (end_shift < eshift)
 708                 end_shift = eshift;
 709         }
 710     }
 711     else {                              /* Can match at random position */
 712         ml_anch = 0;
 713         s = strpos;
 714         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 715         end_shift = prog->check_end_shift;
 716
 717         /* end shift should be non negative here */
 718     }
 719
 720 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 721     if (end_shift < 0)
 722         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 723                    (IV)end_shift, RX_PRECOMP(prog));
 724 #endif
 725
 726   restart:
 727     /* Find a possible match in the region s..strend by looking for
 728        the "check" substring in the region corrected by start/end_shift. */
 729
 730     {
 731         I32 srch_start_shift = start_shift;
 732         I32 srch_end_shift = end_shift;
 733         U8* start_point;
 734         U8* end_point;
 735         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 736             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 737             srch_start_shift = strbeg - s;
 738         }
 739     DEBUG_OPTIMISE_MORE_r({
 740         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 741             (IV)prog->check_offset_min,
 742             (IV)srch_start_shift,
 743             (IV)srch_end_shift,
 744             (IV)prog->check_end_shift);
 745     });
 746
 747         if (prog->extflags & RXf_CANY_SEEN) {
 748             start_point= (U8*)(s + srch_start_shift);
 749             end_point= (U8*)(strend - srch_end_shift);
 750         } else {
 751             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 752             end_point= HOP3(strend, -srch_end_shift, strbeg);
 753         }
 754         DEBUG_OPTIMISE_MORE_r({
 755             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 756                 (int)(end_point - start_point),
 757                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 758                 start_point);
 759         });
 760
 761         s = fbm_instr( start_point, end_point,
 762                       check, multiline ? FBMrf_MULTILINE : 0);
 763     }
 764     /* Update the count-of-usability, remove useless subpatterns,
 765         unshift s.  */
 766
 767     DEBUG_EXECUTE_r({
 768         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 769             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 770         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 771                           (s ? "Found" : "Did not find"),
 772             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 773                 ? "anchored" : "floating"),
 774             quoted,
 775             RE_SV_TAIL(check),
 776             (s ? " at offset " : "...\n") );
 777     });
 778
 779     if (!s)
 780         goto fail_finish;
 781     /* Finish the diagnostic message */
 782     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 783
 784     /* XXX dmq: first branch is for positive lookbehind...
 785        Our check string is offset from the beginning of the pattern.
 786        So we need to do any stclass tests offset forward from that
 787        point. I think. :-(
 788      */
 789
 790
 791
 792     check_at=s;
 793
 794
 795     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 796        Start with the other substr.
 797        XXXX no SCREAM optimization yet - and a very coarse implementation
 798        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 799                 *always* match.  Probably should be marked during compile...
 800        Probably it is right to do no SCREAM here...
 801      */
 802
 803     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 804                 : (prog->float_substr && prog->anchored_substr))
 805     {
 806         /* Take into account the "other" substring. */
 807         /* XXXX May be hopelessly wrong for UTF... */
 808         if (!other_last)
 809             other_last = strpos;
 810         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 811           do_other_anchored:
 812             {
 813                 char * const last = HOP3c(s, -start_shift, strbeg);
 814                 char *last1, *last2;
 815                 char * const saved_s = s;
 816                 SV* must;
 817
 818                 t = s - prog->check_offset_max;
 819                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 820                     && (!utf8_target
 821                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 822                             && t > strpos)))
 823                     NOOP;
 824                 else
 825                     t = strpos;
 826                 t = HOP3c(t, prog->anchored_offset, strend);
 827                 if (t < other_last)     /* These positions already checked */
 828                     t = other_last;
 829                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 830                 if (last < last1)
 831                     last1 = last;
 832                 /* XXXX It is not documented what units *_offsets are in.
 833                    We assume bytes, but this is clearly wrong.
 834                    Meaning this code needs to be carefully reviewed for errors.
 835                    dmq.
 836                   */
 837
 838                 /* On end-of-str: see comment below. */
 839                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 840                 if (must == &PL_sv_undef) {
 841                     s = (char*)NULL;
 842                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 843                 }
 844                 else
 845                     s = fbm_instr(
 846                         (unsigned char*)t,
 847                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 848                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 849                         must,
 850                         multiline ? FBMrf_MULTILINE : 0
 851                     );
 852                 DEBUG_EXECUTE_r({
 853                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 854                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 855                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 856                         (s ? "Found" : "Contradicts"),
 857                         quoted, RE_SV_TAIL(must));
 858                 });
 859
 860
 861                 if (!s) {
 862                     if (last1 >= last2) {
 863                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 864                                                 ", giving up...\n"));
 865                         goto fail_finish;
 866                     }
 867                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 868                         ", trying floating at offset %ld...\n",
 869                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 870                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 871                     s = HOP3c(last, 1, strend);
 872                     goto restart;
 873                 }
 874                 else {
 875                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 876                           (long)(s - i_strpos)));
 877                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 878                     other_last = HOP3c(s, 1, strend);
 879                     s = saved_s;
 880                     if (t == strpos)
 881                         goto try_at_start;
 882                     goto try_at_offset;
 883                 }
 884             }
 885         }
 886         else {          /* Take into account the floating substring. */
 887             char *last, *last1;
 888             char * const saved_s = s;
 889             SV* must;
 890
 891             t = HOP3c(s, -start_shift, strbeg);
 892             last1 = last =
 893                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 894             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 895                 last = HOP3c(t, prog->float_max_offset, strend);
 896             s = HOP3c(t, prog->float_min_offset, strend);
 897             if (s < other_last)
 898                 s = other_last;
 899  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 900             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 901             /* fbm_instr() takes into account exact value of end-of-str
 902                if the check is SvTAIL(ed).  Since false positives are OK,
 903                and end-of-str is not later than strend we are OK. */
 904             if (must == &PL_sv_undef) {
 905                 s = (char*)NULL;
 906                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 907             }
 908             else
 909                 s = fbm_instr((unsigned char*)s,
 910                               (unsigned char*)last + SvCUR(must)
 911                                   - (SvTAIL(must)!=0),
 912                               must, multiline ? FBMrf_MULTILINE : 0);
 913             DEBUG_EXECUTE_r({
 914                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 915                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 916                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 917                     (s ? "Found" : "Contradicts"),
 918                     quoted, RE_SV_TAIL(must));
 919             });
 920             if (!s) {
 921                 if (last1 == last) {
 922                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 923                                             ", giving up...\n"));
 924                     goto fail_finish;
 925                 }
 926                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 927                     ", trying anchored starting at offset %ld...\n",
 928                     (long)(saved_s + 1 - i_strpos)));
 929                 other_last = last;
 930                 s = HOP3c(t, 1, strend);
 931                 goto restart;
 932             }
 933             else {
 934                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 935                       (long)(s - i_strpos)));
 936                 other_last = s; /* Fix this later. --Hugo */
 937                 s = saved_s;
 938                 if (t == strpos)
 939                     goto try_at_start;
 940                 goto try_at_offset;
 941             }
 942         }
 943     }
 944
 945
 946     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 947
 948     DEBUG_OPTIMISE_MORE_r(
 949         PerlIO_printf(Perl_debug_log,
 950             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 951             (IV)prog->check_offset_min,
 952             (IV)prog->check_offset_max,
 953             (IV)(s-strpos),
 954             (IV)(t-strpos),
 955             (IV)(t-s),
 956             (IV)(strend-strpos)
 957         )
 958     );
 959
 960     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 961         && (!utf8_target
 962             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 963                  && t > strpos)))
 964     {
 965         /* Fixed substring is found far enough so that the match
 966            cannot start at strpos. */
 967       try_at_offset:
 968         if (ml_anch && t[-1] != '\n') {
 969             /* Eventually fbm_*() should handle this, but often
 970                anchored_offset is not 0, so this check will not be wasted. */
 971             /* XXXX In the code below we prefer to look for "^" even in
 972                presence of anchored substrings.  And we search even
 973                beyond the found float position.  These pessimizations
 974                are historical artefacts only.  */
 975           find_anchor:
 976             while (t < strend - prog->minlen) {
 977                 if (*t == '\n') {
 978                     if (t < check_at - prog->check_offset_min) {
 979                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 980                             /* Since we moved from the found position,
 981                                we definitely contradict the found anchored
 982                                substr.  Due to the above check we do not
 983                                contradict "check" substr.
 984                                Thus we can arrive here only if check substr
 985                                is float.  Redo checking for "other"=="fixed".
 986                              */
 987                             strpos = t + 1;
 988                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 989                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 990                             goto do_other_anchored;
 991                         }
 992                         /* We don't contradict the found floating substring. */
 993                         /* XXXX Why not check for STCLASS? */
 994                         s = t + 1;
 995                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 996                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 997                         goto set_useful;
 998                     }
 999                     /* Position contradicts check-string */
1000                     /* XXXX probably better to look for check-string
1001                        than for "\n", so one should lower the limit for t? */
1002                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1003                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1004                     other_last = strpos = s = t + 1;
1005                     goto restart;
1006                 }
1007                 t++;
1008             }
1009             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1010                         PL_colors[0], PL_colors[1]));
1011             goto fail_finish;
1012         }
1013         else {
1014             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1015                         PL_colors[0], PL_colors[1]));
1016         }
1017         s = t;
1018       set_useful:
1019         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1020     }
1021     else {
1022         /* The found string does not prohibit matching at strpos,
1023            - no optimization of calling REx engine can be performed,
1024            unless it was an MBOL and we are not after MBOL,
1025            or a future STCLASS check will fail this. */
1026       try_at_start:
1027         /* Even in this situation we may use MBOL flag if strpos is offset
1028            wrt the start of the string. */
1029         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1030             && (strpos != strbeg) && strpos[-1] != '\n'
1031             /* May be due to an implicit anchor of m{.*foo}  */
1032             && !(prog->intflags & PREGf_IMPLICIT))
1033         {
1034             t = strpos;
1035             goto find_anchor;
1036         }
1037         DEBUG_EXECUTE_r( if (ml_anch)
1038             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1039                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1040         );
1041       success_at_start:
1042         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1043             && (utf8_target ? (
1044                 prog->check_utf8                /* Could be deleted already */
1045                 && --BmUSEFUL(prog->check_utf8) < 0
1046                 && (prog->check_utf8 == prog->float_utf8)
1047             ) : (
1048                 prog->check_substr              /* Could be deleted already */
1049                 && --BmUSEFUL(prog->check_substr) < 0
1050                 && (prog->check_substr == prog->float_substr)
1051             )))
1052         {
1053             /* If flags & SOMETHING - do not do it many times on the same match */
1054             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1055             /* XXX Does the destruction order has to change with utf8_target? */
1056             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1057             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1058             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1059             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1060             check = NULL;                       /* abort */
1061             s = strpos;
1062             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1063                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1064             if (prog->intflags & PREGf_IMPLICIT)
1065                 prog->extflags &= ~RXf_ANCH_MBOL;
1066             /* XXXX This is a remnant of the old implementation.  It
1067                     looks wasteful, since now INTUIT can use many
1068                     other heuristics. */
1069             prog->extflags &= ~RXf_USE_INTUIT;
1070             /* XXXX What other flags might need to be cleared in this branch? */
1071         }
1072         else
1073             s = strpos;
1074     }
1075
1076     /* Last resort... */
1077     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1078     /* trie stclasses are too expensive to use here, we are better off to
1079        leave it to regmatch itself */
1080     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1081         /* minlen == 0 is possible if regstclass is \b or \B,
1082            and the fixed substr is ''$.
1083            Since minlen is already taken into account, s+1 is before strend;
1084            accidentally, minlen >= 1 guaranties no false positives at s + 1
1085            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1086            regstclass does not come from lookahead...  */
1087         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1088            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1089         const U8* const str = (U8*)STRING(progi->regstclass);
1090         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1091                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1092                     : 1);
1093         char * endpos;
1094         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1095             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1096         else if (prog->float_substr || prog->float_utf8)
1097             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1098         else
1099             endpos= strend;
1100
1101         if (checked_upto < s)
1102            checked_upto = s;
1103         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1104                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1105
1106         t = s;
1107         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1108         if (s) {
1109             checked_upto = s;
1110         } else {
1111 #ifdef DEBUGGING
1112             const char *what = NULL;
1113 #endif
1114             if (endpos == strend) {
1115                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1116                                 "Could not match STCLASS...\n") );
1117                 goto fail;
1118             }
1119             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1120                                    "This position contradicts STCLASS...\n") );
1121             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1122                 goto fail;
1123             checked_upto = HOPBACKc(endpos, start_shift);
1124             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1125                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1126             /* Contradict one of substrings */
1127             if (prog->anchored_substr || prog->anchored_utf8) {
1128                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1129                     DEBUG_EXECUTE_r( what = "anchored" );
1130                   hop_and_restart:
1131                     s = HOP3c(t, 1, strend);
1132                     if (s + start_shift + end_shift > strend) {
1133                         /* XXXX Should be taken into account earlier? */
1134                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1135                                                "Could not match STCLASS...\n") );
1136                         goto fail;
1137                     }
1138                     if (!check)
1139                         goto giveup;
1140                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1141                                 "Looking for %s substr starting at offset %ld...\n",
1142                                  what, (long)(s + start_shift - i_strpos)) );
1143                     goto restart;
1144                 }
1145                 /* Have both, check_string is floating */
1146                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1147                     goto retry_floating_check;
1148                 /* Recheck anchored substring, but not floating... */
1149                 s = check_at;
1150                 if (!check)
1151                     goto giveup;
1152                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1153                           "Looking for anchored substr starting at offset %ld...\n",
1154                           (long)(other_last - i_strpos)) );
1155                 goto do_other_anchored;
1156             }
1157             /* Another way we could have checked stclass at the
1158                current position only: */
1159             if (ml_anch) {
1160                 s = t = t + 1;
1161                 if (!check)
1162                     goto giveup;
1163                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1164                           "Looking for /%s^%s/m starting at offset %ld...\n",
1165                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1166                 goto try_at_offset;
1167             }
1168             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1169                 goto fail;
1170             /* Check is floating substring. */
1171           retry_floating_check:
1172             t = check_at - start_shift;
1173             DEBUG_EXECUTE_r( what = "floating" );
1174             goto hop_and_restart;
1175         }
1176         if (t != s) {
1177             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1178                         "By STCLASS: moving %ld --> %ld\n",
1179                                   (long)(t - i_strpos), (long)(s - i_strpos))
1180                    );
1181         }
1182         else {
1183             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1184                                   "Does not contradict STCLASS...\n");
1185                    );
1186         }
1187     }
1188   giveup:
1189     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1190                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1191                           PL_colors[5], (long)(s - i_strpos)) );
1192     return s;
1193
1194   fail_finish:                          /* Substring not found */
1195     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1196         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1197   fail:
1198     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1199                           PL_colors[4], PL_colors[5]));
1200     return NULL;
1201 }
1202
1203 #define DECL_TRIE_TYPE(scan) \
1204     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1205                     trie_type = ((scan->flags == EXACT) \
1206                               ? (utf8_target ? trie_utf8 : trie_plain) \
1207                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1208
1209 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1210 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1211     STRLEN skiplen;                                                                 \
1212     switch (trie_type) {                                                            \
1213     case trie_utf8_fold:                                                            \
1214         if ( foldlen>0 ) {                                                          \
1215             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1216             foldlen -= len;                                                         \
1217             uscan += len;                                                           \
1218             len=0;                                                                  \
1219         } else {                                                                    \
1220             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1221             len = UTF8SKIP(uc);                                                     \
1222             skiplen = UNISKIP( uvc );                                               \
1223             foldlen -= skiplen;                                                     \
1224             uscan = foldbuf + skiplen;                                              \
1225         }                                                                           \
1226         break;                                                                      \
1227     case trie_latin_utf8_fold:                                                      \
1228         if ( foldlen>0 ) {                                                          \
1229             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1230             foldlen -= len;                                                         \
1231             uscan += len;                                                           \
1232             len=0;                                                                  \
1233         } else {                                                                    \
1234             len = 1;                                                                \
1235             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1236             skiplen = UNISKIP( uvc );                                               \
1237             foldlen -= skiplen;                                                     \
1238             uscan = foldbuf + skiplen;                                              \
1239         }                                                                           \
1240         break;                                                                      \
1241     case trie_utf8:                                                                 \
1242         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1243         break;                                                                      \
1244     case trie_plain:                                                                \
1245         uvc = (UV)*uc;                                                              \
1246         len = 1;                                                                    \
1247     }                                                                               \
1248     if (uvc < 256) {                                                                \
1249         charid = trie->charmap[ uvc ];                                              \
1250     }                                                                               \
1251     else {                                                                          \
1252         charid = 0;                                                                 \
1253         if (widecharmap) {                                                          \
1254             SV** const svpp = hv_fetch(widecharmap,                                 \
1255                         (char*)&uvc, sizeof(UV), 0);                                \
1256             if (svpp)                                                               \
1257                 charid = (U16)SvIV(*svpp);                                          \
1258         }                                                                           \
1259     }                                                                               \
1260 } STMT_END
1261
1262 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1263 STMT_START {                                              \
1264     while (s <= e) {                                      \
1265         if ( (CoNd)                                       \
1266              && (ln == 1 || folder(s, pat_string, ln))    \
1267              && (!reginfo || regtry(reginfo, &s)) )       \
1268             goto got_it;                                  \
1269         s++;                                              \
1270     }                                                     \
1271 } STMT_END
1272
1273 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1274 STMT_START {                                          \
1275     while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
1276         CoDe                                          \
1277         s += uskip;                                   \
1278     }                                                 \
1279 } STMT_END
1280
1281 #define REXEC_FBC_SCAN(CoDe)                          \
1282 STMT_START {                                          \
1283     while (s < strend) {                              \
1284         CoDe                                          \
1285         s++;                                          \
1286     }                                                 \
1287 } STMT_END
1288
1289 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1290 REXEC_FBC_UTF8_SCAN(                                  \
1291     if (CoNd) {                                       \
1292         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1293             goto got_it;                              \
1294         else                                          \
1295             tmp = doevery;                            \
1296     }                                                 \
1297     else                                              \
1298         tmp = 1;                                      \
1299 )
1300
1301 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1302 REXEC_FBC_SCAN(                                       \
1303     if (CoNd) {                                       \
1304         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1305             goto got_it;                              \
1306         else                                          \
1307             tmp = doevery;                            \
1308     }                                                 \
1309     else                                              \
1310         tmp = 1;                                      \
1311 )
1312
1313 #define REXEC_FBC_TRYIT               \
1314 if ((!reginfo || regtry(reginfo, &s))) \
1315     goto got_it
1316
1317 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1318     if (utf8_target) {                                             \
1319         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1320     }                                                          \
1321     else {                                                     \
1322         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1323     }
1324
1325 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1326     if (utf8_target) {                                             \
1327         UtFpReLoAd;                                            \
1328         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1329     }                                                          \
1330     else {                                                     \
1331         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1332     }
1333
1334 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1335     PL_reg_flags |= RF_tainted;                                \
1336     if (utf8_target) {                                             \
1337         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1338     }                                                          \
1339     else {                                                     \
1340         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1341     }
1342
1343 #define DUMP_EXEC_POS(li,s,doutf8) \
1344     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1345
1346
1347 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1348         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1349         tmp = TEST_NON_UTF8(tmp);                                              \
1350         REXEC_FBC_UTF8_SCAN(                                                   \
1351             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1352                 tmp = !tmp;                                                    \
1353                 IF_SUCCESS;                                                    \
1354             }                                                                  \
1355             else {                                                             \
1356                 IF_FAIL;                                                       \
1357             }                                                                  \
1358         );                                                                     \
1359
1360 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1361         if (s == PL_bostr) {                                                   \
1362             tmp = '\n';                                                        \
1363         }                                                                      \
1364         else {                                                                 \
1365             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1366             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1367         }                                                                      \
1368         tmp = TeSt1_UtF8;                                                      \
1369         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1370         REXEC_FBC_UTF8_SCAN(                                                   \
1371             if (tmp == ! (TeSt2_UtF8)) { \
1372                 tmp = !tmp;                                                    \
1373                 IF_SUCCESS;                                                    \
1374             }                                                                  \
1375             else {                                                             \
1376                 IF_FAIL;                                                       \
1377             }                                                                  \
1378         );                                                                     \
1379
1380 /* The only difference between the BOUND and NBOUND cases is that
1381  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1382  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1383  * with the other one being empty */
1384 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1385     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1386
1387 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1388     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1389
1390 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1391     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1392
1393 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1394     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1395
1396
1397 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1398  * be passed in completely with the variable name being tested, which isn't
1399  * such a clean interface, but this is easier to read than it was before.  We
1400  * are looking for the boundary (or non-boundary between a word and non-word
1401  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1402  * must be different.  Find the "wordness" of the character just prior to this
1403  * one, and compare it with the wordness of this one.  If they differ, we have
1404  * a boundary.  At the beginning of the string, pretend that the previous
1405  * character was a new-line */
1406 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1407     if (utf8_target) {                                                         \
1408                 UTF8_CODE \
1409     }                                                                          \
1410     else {  /* Not utf8 */                                                     \
1411         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1412         tmp = TEST_NON_UTF8(tmp);                                              \
1413         REXEC_FBC_SCAN(                                                        \
1414             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1415                 tmp = !tmp;                                                    \
1416                 IF_SUCCESS;                                                    \
1417             }                                                                  \
1418             else {                                                             \
1419                 IF_FAIL;                                                       \
1420             }                                                                  \
1421         );                                                                     \
1422     }                                                                          \
1423     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1424         goto got_it;
1425
1426 /* We know what class REx starts with.  Try to find this position... */
1427 /* if reginfo is NULL, its a dryrun */
1428 /* annoyingly all the vars in this routine have different names from their counterparts
1429    in regmatch. /grrr */
1430
1431 STATIC char *
1432 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1433     const char *strend, regmatch_info *reginfo)
1434 {
1435         dVAR;
1436         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1437         char *pat_string;   /* The pattern's exactish string */
1438         char *pat_end;      /* ptr to end char of pat_string */
1439         re_fold_t folder;       /* Function for computing non-utf8 folds */
1440         const U8 *fold_array;   /* array for folding ords < 256 */
1441         STRLEN ln;
1442         STRLEN lnc;
1443         STRLEN uskip;
1444         U8 c1;
1445         U8 c2;
1446         char *e;
1447         I32 tmp = 1;    /* Scratch variable? */
1448         const bool utf8_target = PL_reg_match_utf8;
1449         UV utf8_fold_flags = 0;
1450         RXi_GET_DECL(prog,progi);
1451
1452         PERL_ARGS_ASSERT_FIND_BYCLASS;
1453
1454         /* We know what class it must start with. */
1455         switch (OP(c)) {
1456         case ANYOF:
1457             if (utf8_target) {
1458                 STRLEN inclasslen = strend - s;
1459                 REXEC_FBC_UTF8_CLASS_SCAN(
1460                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1461             }
1462             else {
1463                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1464             }
1465             break;
1466         case CANY:
1467             REXEC_FBC_SCAN(
1468                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1469                     goto got_it;
1470                 else
1471                     tmp = doevery;
1472             );
1473             break;
1474
1475         case EXACTFA:
1476             if (UTF_PATTERN || utf8_target) {
1477                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1478                 goto do_exactf_utf8;
1479             }
1480             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1481             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1482             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1483
1484         case EXACTF:
1485             if (utf8_target) {
1486
1487                 /* regcomp.c already folded this if pattern is in UTF-8 */
1488                 utf8_fold_flags = 0;
1489                 goto do_exactf_utf8;
1490             }
1491             fold_array = PL_fold;
1492             folder = foldEQ;
1493             goto do_exactf_non_utf8;
1494
1495         case EXACTFL:
1496             if (UTF_PATTERN || utf8_target) {
1497                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1498                 goto do_exactf_utf8;
1499             }
1500             fold_array = PL_fold_locale;
1501             folder = foldEQ_locale;
1502             goto do_exactf_non_utf8;
1503
1504         case EXACTFU_SS:
1505             if (UTF_PATTERN) {
1506                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1507             }
1508             goto do_exactf_utf8;
1509
1510         case EXACTFU_TRICKYFOLD:
1511         case EXACTFU:
1512             if (UTF_PATTERN || utf8_target) {
1513                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1514                 goto do_exactf_utf8;
1515             }
1516
1517             /* Any 'ss' in the pattern should have been replaced by regcomp,
1518              * so we don't have to worry here about this single special case
1519              * in the Latin1 range */
1520             fold_array = PL_fold_latin1;
1521             folder = foldEQ_latin1;
1522
1523             /* FALL THROUGH */
1524
1525         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1526                                are no glitches with fold-length differences
1527                                between the target string and pattern */
1528
1529             /* The idea in the non-utf8 EXACTF* cases is to first find the
1530              * first character of the EXACTF* node and then, if necessary,
1531              * case-insensitively compare the full text of the node.  c1 is the
1532              * first character.  c2 is its fold.  This logic will not work for
1533              * Unicode semantics and the german sharp ss, which hence should
1534              * not be compiled into a node that gets here. */
1535             pat_string = STRING(c);
1536             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1537
1538             /* We know that we have to match at least 'ln' bytes (which is the
1539              * same as characters, since not utf8).  If we have to match 3
1540              * characters, and there are only 2 availabe, we know without
1541              * trying that it will fail; so don't start a match past the
1542              * required minimum number from the far end */
1543             e = HOP3c(strend, -((I32)ln), s);
1544
1545             if (!reginfo && e < s) {
1546                 e = s;                  /* Due to minlen logic of intuit() */
1547             }
1548
1549             c1 = *pat_string;
1550             c2 = fold_array[c1];
1551             if (c1 == c2) { /* If char and fold are the same */
1552                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1553             }
1554             else {
1555                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1556             }
1557             break;
1558
1559         do_exactf_utf8:
1560         {
1561             unsigned expansion;
1562
1563
1564             /* If one of the operands is in utf8, we can't use the simpler
1565              * folding above, due to the fact that many different characters
1566              * can have the same fold, or portion of a fold, or different-
1567              * length fold */
1568             pat_string = STRING(c);
1569             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1570             pat_end = pat_string + ln;
1571             lnc = (UTF_PATTERN) /* length to match in characters */
1572                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1573                     : ln;
1574
1575             /* We have 'lnc' characters to match in the pattern, but because of
1576              * multi-character folding, each character in the target can match
1577              * up to 3 characters (Unicode guarantees it will never exceed
1578              * this) if it is utf8-encoded; and up to 2 if not (based on the
1579              * fact that the Latin 1 folds are already determined, and the
1580              * only multi-char fold in that range is the sharp-s folding to
1581              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1582              * string character.  Adjust lnc accordingly, rounding up, so that
1583              * if we need to match at least 4+1/3 chars, that really is 5. */
1584             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1585             lnc = (lnc + expansion - 1) / expansion;
1586
1587             /* As in the non-UTF8 case, if we have to match 3 characters, and
1588              * only 2 are left, it's guaranteed to fail, so don't start a
1589              * match that would require us to go beyond the end of the string
1590              */
1591             e = HOP3c(strend, -((I32)lnc), s);
1592
1593             if (!reginfo && e < s) {
1594                 e = s;                  /* Due to minlen logic of intuit() */
1595             }
1596
1597             /* XXX Note that we could recalculate e to stop the loop earlier,
1598              * as the worst case expansion above will rarely be met, and as we
1599              * go along we would usually find that e moves further to the left.
1600              * This would happen only after we reached the point in the loop
1601              * where if there were no expansion we should fail.  Unclear if
1602              * worth the expense */
1603
1604             while (s <= e) {
1605                 char *my_strend= (char *)strend;
1606                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1607                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1608                     && (!reginfo || regtry(reginfo, &s)) )
1609                 {
1610                     goto got_it;
1611                 }
1612                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1613             }
1614             break;
1615         }
1616         case BOUNDL:
1617             PL_reg_flags |= RF_tainted;
1618             FBC_BOUND(isALNUM_LC,
1619                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1620                       isALNUM_LC_utf8((U8*)s));
1621             break;
1622         case NBOUNDL:
1623             PL_reg_flags |= RF_tainted;
1624             FBC_NBOUND(isALNUM_LC,
1625                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1626                        isALNUM_LC_utf8((U8*)s));
1627             break;
1628         case BOUND:
1629             FBC_BOUND(isWORDCHAR,
1630                       isALNUM_uni(tmp),
1631                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1632             break;
1633         case BOUNDA:
1634             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1635                              isWORDCHAR_A(tmp),
1636                              isWORDCHAR_A((U8*)s));
1637             break;
1638         case NBOUND:
1639             FBC_NBOUND(isWORDCHAR,
1640                        isALNUM_uni(tmp),
1641                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1642             break;
1643         case NBOUNDA:
1644             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1645                               isWORDCHAR_A(tmp),
1646                               isWORDCHAR_A((U8*)s));
1647             break;
1648         case BOUNDU:
1649             FBC_BOUND(isWORDCHAR_L1,
1650                       isALNUM_uni(tmp),
1651                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1652             break;
1653         case NBOUNDU:
1654             FBC_NBOUND(isWORDCHAR_L1,
1655                        isALNUM_uni(tmp),
1656                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1657             break;
1658         case ALNUML:
1659             REXEC_FBC_CSCAN_TAINT(
1660                 isALNUM_LC_utf8((U8*)s),
1661                 isALNUM_LC(*s)
1662             );
1663             break;
1664         case ALNUMU:
1665             REXEC_FBC_CSCAN_PRELOAD(
1666                 LOAD_UTF8_CHARCLASS_ALNUM(),
1667                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1668                 isWORDCHAR_L1((U8) *s)
1669             );
1670             break;
1671         case ALNUM:
1672             REXEC_FBC_CSCAN_PRELOAD(
1673                 LOAD_UTF8_CHARCLASS_ALNUM(),
1674                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1675                 isWORDCHAR((U8) *s)
1676             );
1677             break;
1678         case ALNUMA:
1679             /* Don't need to worry about utf8, as it can match only a single
1680              * byte invariant character */
1681             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1682             break;
1683         case NALNUMU:
1684             REXEC_FBC_CSCAN_PRELOAD(
1685                 LOAD_UTF8_CHARCLASS_ALNUM(),
1686                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1687                 ! isWORDCHAR_L1((U8) *s)
1688             );
1689             break;
1690         case NALNUM:
1691             REXEC_FBC_CSCAN_PRELOAD(
1692                 LOAD_UTF8_CHARCLASS_ALNUM(),
1693                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1694                 ! isALNUM(*s)
1695             );
1696             break;
1697         case NALNUMA:
1698             REXEC_FBC_CSCAN(
1699                 !isWORDCHAR_A(*s),
1700                 !isWORDCHAR_A(*s)
1701             );
1702             break;
1703         case NALNUML:
1704             REXEC_FBC_CSCAN_TAINT(
1705                 !isALNUM_LC_utf8((U8*)s),
1706                 !isALNUM_LC(*s)
1707             );
1708             break;
1709         case SPACEU:
1710             REXEC_FBC_CSCAN_PRELOAD(
1711                 LOAD_UTF8_CHARCLASS_SPACE(),
1712                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1713                 isSPACE_L1((U8) *s)
1714             );
1715             break;
1716         case SPACE:
1717             REXEC_FBC_CSCAN_PRELOAD(
1718                 LOAD_UTF8_CHARCLASS_SPACE(),
1719                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1720                 isSPACE((U8) *s)
1721             );
1722             break;
1723         case SPACEA:
1724             /* Don't need to worry about utf8, as it can match only a single
1725              * byte invariant character */
1726             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1727             break;
1728         case SPACEL:
1729             REXEC_FBC_CSCAN_TAINT(
1730                 isSPACE_LC_utf8((U8*)s),
1731                 isSPACE_LC(*s)
1732             );
1733             break;
1734         case NSPACEU:
1735             REXEC_FBC_CSCAN_PRELOAD(
1736                 LOAD_UTF8_CHARCLASS_SPACE(),
1737                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1738                 ! isSPACE_L1((U8) *s)
1739             );
1740             break;
1741         case NSPACE:
1742             REXEC_FBC_CSCAN_PRELOAD(
1743                 LOAD_UTF8_CHARCLASS_SPACE(),
1744                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1745                 ! isSPACE((U8) *s)
1746             );
1747             break;
1748         case NSPACEA:
1749             REXEC_FBC_CSCAN(
1750                 !isSPACE_A(*s),
1751                 !isSPACE_A(*s)
1752             );
1753             break;
1754         case NSPACEL:
1755             REXEC_FBC_CSCAN_TAINT(
1756                 !isSPACE_LC_utf8((U8*)s),
1757                 !isSPACE_LC(*s)
1758             );
1759             break;
1760         case DIGIT:
1761             REXEC_FBC_CSCAN_PRELOAD(
1762                 LOAD_UTF8_CHARCLASS_DIGIT(),
1763                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1764                 isDIGIT(*s)
1765             );
1766             break;
1767         case DIGITA:
1768             /* Don't need to worry about utf8, as it can match only a single
1769              * byte invariant character */
1770             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1771             break;
1772         case DIGITL:
1773             REXEC_FBC_CSCAN_TAINT(
1774                 isDIGIT_LC_utf8((U8*)s),
1775                 isDIGIT_LC(*s)
1776             );
1777             break;
1778         case NDIGIT:
1779             REXEC_FBC_CSCAN_PRELOAD(
1780                 LOAD_UTF8_CHARCLASS_DIGIT(),
1781                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1782                 !isDIGIT(*s)
1783             );
1784             break;
1785         case NDIGITA:
1786             REXEC_FBC_CSCAN(
1787                 !isDIGIT_A(*s),
1788                 !isDIGIT_A(*s)
1789             );
1790             break;
1791         case NDIGITL:
1792             REXEC_FBC_CSCAN_TAINT(
1793                 !isDIGIT_LC_utf8((U8*)s),
1794                 !isDIGIT_LC(*s)
1795             );
1796             break;
1797         case LNBREAK:
1798             REXEC_FBC_CSCAN(
1799                 is_LNBREAK_utf8_safe(s, strend),
1800                 is_LNBREAK_latin1_safe(s, strend)
1801             );
1802             break;
1803         case VERTWS:
1804             REXEC_FBC_CSCAN(
1805                 is_VERTWS_utf8_safe(s, strend),
1806                 is_VERTWS_latin1_safe(s, strend)
1807             );
1808             break;
1809         case NVERTWS:
1810             REXEC_FBC_CSCAN(
1811                 !is_VERTWS_utf8_safe(s, strend),
1812                 !is_VERTWS_latin1_safe(s, strend)
1813             );
1814             break;
1815         case HORIZWS:
1816             REXEC_FBC_CSCAN(
1817                 is_HORIZWS_utf8_safe(s, strend),
1818                 is_HORIZWS_latin1_safe(s, strend)
1819             );
1820             break;
1821         case NHORIZWS:
1822             REXEC_FBC_CSCAN(
1823                 !is_HORIZWS_utf8_safe(s, strend),
1824                 !is_HORIZWS_latin1_safe(s, strend)
1825             );
1826             break;
1827         case POSIXA:
1828             /* Don't need to worry about utf8, as it can match only a single
1829             * byte invariant character.  The flag in this node type is the
1830             * class number to pass to _generic_isCC() to build a mask for
1831             * searching in PL_charclass[] */
1832             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1833             break;
1834         case NPOSIXA:
1835             REXEC_FBC_CSCAN(
1836                 !_generic_isCC_A(*s, FLAGS(c)),
1837                 !_generic_isCC_A(*s, FLAGS(c))
1838             );
1839             break;
1840
1841         case AHOCORASICKC:
1842         case AHOCORASICK:
1843             {
1844                 DECL_TRIE_TYPE(c);
1845                 /* what trie are we using right now */
1846                 reg_ac_data *aho
1847                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1848                 reg_trie_data *trie
1849                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1850                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1851
1852                 const char *last_start = strend - trie->minlen;
1853 #ifdef DEBUGGING
1854                 const char *real_start = s;
1855 #endif
1856                 STRLEN maxlen = trie->maxlen;
1857                 SV *sv_points;
1858                 U8 **points; /* map of where we were in the input string
1859                                 when reading a given char. For ASCII this
1860                                 is unnecessary overhead as the relationship
1861                                 is always 1:1, but for Unicode, especially
1862                                 case folded Unicode this is not true. */
1863                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1864                 U8 *bitmap=NULL;
1865
1866
1867                 GET_RE_DEBUG_FLAGS_DECL;
1868
1869                 /* We can't just allocate points here. We need to wrap it in
1870                  * an SV so it gets freed properly if there is a croak while
1871                  * running the match */
1872                 ENTER;
1873                 SAVETMPS;
1874                 sv_points=newSV(maxlen * sizeof(U8 *));
1875                 SvCUR_set(sv_points,
1876                     maxlen * sizeof(U8 *));
1877                 SvPOK_on(sv_points);
1878                 sv_2mortal(sv_points);
1879                 points=(U8**)SvPV_nolen(sv_points );
1880                 if ( trie_type != trie_utf8_fold
1881                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1882                 {
1883                     if (trie->bitmap)
1884                         bitmap=(U8*)trie->bitmap;
1885                     else
1886                         bitmap=(U8*)ANYOF_BITMAP(c);
1887                 }
1888                 /* this is the Aho-Corasick algorithm modified a touch
1889                    to include special handling for long "unknown char"
1890                    sequences. The basic idea being that we use AC as long
1891                    as we are dealing with a possible matching char, when
1892                    we encounter an unknown char (and we have not encountered
1893                    an accepting state) we scan forward until we find a legal
1894                    starting char.
1895                    AC matching is basically that of trie matching, except
1896                    that when we encounter a failing transition, we fall back
1897                    to the current states "fail state", and try the current char
1898                    again, a process we repeat until we reach the root state,
1899                    state 1, or a legal transition. If we fail on the root state
1900                    then we can either terminate if we have reached an accepting
1901                    state previously, or restart the entire process from the beginning
1902                    if we have not.
1903
1904                  */
1905                 while (s <= last_start) {
1906                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1907                     U8 *uc = (U8*)s;
1908                     U16 charid = 0;
1909                     U32 base = 1;
1910                     U32 state = 1;
1911                     UV uvc = 0;
1912                     STRLEN len = 0;
1913                     STRLEN foldlen = 0;
1914                     U8 *uscan = (U8*)NULL;
1915                     U8 *leftmost = NULL;
1916 #ifdef DEBUGGING
1917                     U32 accepted_word= 0;
1918 #endif
1919                     U32 pointpos = 0;
1920
1921                     while ( state && uc <= (U8*)strend ) {
1922                         int failed=0;
1923                         U32 word = aho->states[ state ].wordnum;
1924
1925                         if( state==1 ) {
1926                             if ( bitmap ) {
1927                                 DEBUG_TRIE_EXECUTE_r(
1928                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1929                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1930                                             (char *)uc, utf8_target );
1931                                         PerlIO_printf( Perl_debug_log,
1932                                             " Scanning for legal start char...\n");
1933                                     }
1934                                 );
1935                                 if (utf8_target) {
1936                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1937                                         uc += UTF8SKIP(uc);
1938                                     }
1939                                 } else {
1940                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1941                                         uc++;
1942                                     }
1943                                 }
1944                                 s= (char *)uc;
1945                             }
1946                             if (uc >(U8*)last_start) break;
1947                         }
1948
1949                         if ( word ) {
1950                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1951                             if (!leftmost || lpos < leftmost) {
1952                                 DEBUG_r(accepted_word=word);
1953                                 leftmost= lpos;
1954                             }
1955                             if (base==0) break;
1956
1957                         }
1958                         points[pointpos++ % maxlen]= uc;
1959                         if (foldlen || uc < (U8*)strend) {
1960                             REXEC_TRIE_READ_CHAR(trie_type, trie,
1961                                              widecharmap, uc,
1962                                              uscan, len, uvc, charid, foldlen,
1963                                              foldbuf, uniflags);
1964                             DEBUG_TRIE_EXECUTE_r({
1965                                 dump_exec_pos( (char *)uc, c, strend,
1966                                             real_start, s, utf8_target);
1967                                 PerlIO_printf(Perl_debug_log,
1968                                     " Charid:%3u CP:%4"UVxf" ",
1969                                      charid, uvc);
1970                             });
1971                         }
1972                         else {
1973                             len = 0;
1974                             charid = 0;
1975                         }
1976
1977
1978                         do {
1979 #ifdef DEBUGGING
1980                             word = aho->states[ state ].wordnum;
1981 #endif
1982                             base = aho->states[ state ].trans.base;
1983
1984                             DEBUG_TRIE_EXECUTE_r({
1985                                 if (failed)
1986                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1987                                         s,   utf8_target );
1988                                 PerlIO_printf( Perl_debug_log,
1989                                     "%sState: %4"UVxf", word=%"UVxf,
1990                                     failed ? " Fail transition to " : "",
1991                                     (UV)state, (UV)word);
1992                             });
1993                             if ( base ) {
1994                                 U32 tmp;
1995                                 I32 offset;
1996                                 if (charid &&
1997                                      ( ((offset = base + charid
1998                                         - 1 - trie->uniquecharcount)) >= 0)
1999                                      && ((U32)offset < trie->lasttrans)
2000                                      && trie->trans[offset].check == state
2001                                      && (tmp=trie->trans[offset].next))
2002                                 {
2003                                     DEBUG_TRIE_EXECUTE_r(
2004                                         PerlIO_printf( Perl_debug_log," - legal\n"));
2005                                     state = tmp;
2006                                     break;
2007                                 }
2008                                 else {
2009                                     DEBUG_TRIE_EXECUTE_r(
2010                                         PerlIO_printf( Perl_debug_log," - fail\n"));
2011                                     failed = 1;
2012                                     state = aho->fail[state];
2013                                 }
2014                             }
2015                             else {
2016                                 /* we must be accepting here */
2017                                 DEBUG_TRIE_EXECUTE_r(
2018                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
2019                                 failed = 1;
2020                                 break;
2021                             }
2022                         } while(state);
2023                         uc += len;
2024                         if (failed) {
2025                             if (leftmost)
2026                                 break;
2027                             if (!state) state = 1;
2028                         }
2029                     }
2030                     if ( aho->states[ state ].wordnum ) {
2031                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2032                         if (!leftmost || lpos < leftmost) {
2033                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2034                             leftmost = lpos;
2035                         }
2036                     }
2037                     if (leftmost) {
2038                         s = (char*)leftmost;
2039                         DEBUG_TRIE_EXECUTE_r({
2040                             PerlIO_printf(
2041                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2042                                 (UV)accepted_word, (IV)(s - real_start)
2043                             );
2044                         });
2045                         if (!reginfo || regtry(reginfo, &s)) {
2046                             FREETMPS;
2047                             LEAVE;
2048                             goto got_it;
2049                         }
2050                         s = HOPc(s,1);
2051                         DEBUG_TRIE_EXECUTE_r({
2052                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2053                         });
2054                     } else {
2055                         DEBUG_TRIE_EXECUTE_r(
2056                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2057                         break;
2058                     }
2059                 }
2060                 FREETMPS;
2061                 LEAVE;
2062             }
2063             break;
2064         default:
2065             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2066             break;
2067         }
2068         return 0;
2069       got_it:
2070         return s;
2071 }
2072
2073
2074 /*
2075  - regexec_flags - match a regexp against a string
2076  */
2077 I32
2078 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2079               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2080 /* stringarg: the point in the string at which to begin matching */
2081 /* strend:    pointer to null at end of string */
2082 /* strbeg:    real beginning of string */
2083 /* minend:    end of match must be >= minend bytes after stringarg. */
2084 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2085  *            itself is accessed via the pointers above */
2086 /* data:      May be used for some additional optimizations.
2087               Currently its only used, with a U32 cast, for transmitting
2088               the ganch offset when doing a /g match. This will change */
2089 /* nosave:    For optimizations. */
2090
2091 {
2092     dVAR;
2093     struct regexp *const prog = (struct regexp *)SvANY(rx);
2094     /*register*/ char *s;
2095     regnode *c;
2096     /*register*/ char *startpos = stringarg;
2097     I32 minlen;         /* must match at least this many chars */
2098     I32 dontbother = 0; /* how many characters not to try at end */
2099     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2100     I32 scream_pos = -1;                /* Internal iterator of scream. */
2101     char *scream_olds = NULL;
2102     const bool utf8_target = cBOOL(DO_UTF8(sv));
2103     I32 multiline;
2104     RXi_GET_DECL(prog,progi);
2105     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2106     regexp_paren_pair *swap = NULL;
2107     GET_RE_DEBUG_FLAGS_DECL;
2108
2109     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2110     PERL_UNUSED_ARG(data);
2111
2112     /* Be paranoid... */
2113     if (prog == NULL || startpos == NULL) {
2114         Perl_croak(aTHX_ "NULL regexp parameter");
2115         return 0;
2116     }
2117
2118     multiline = prog->extflags & RXf_PMf_MULTILINE;
2119     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2120
2121     RX_MATCH_UTF8_set(rx, utf8_target);
2122     DEBUG_EXECUTE_r(
2123         debug_start_match(rx, utf8_target, startpos, strend,
2124         "Matching");
2125     );
2126
2127     minlen = prog->minlen;
2128
2129     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2130         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2131                               "String too short [regexec_flags]...\n"));
2132         goto phooey;
2133     }
2134
2135
2136     /* Check validity of program. */
2137     if (UCHARAT(progi->program) != REG_MAGIC) {
2138         Perl_croak(aTHX_ "corrupted regexp program");
2139     }
2140
2141     PL_reg_flags = 0;
2142     PL_reg_state.re_state_eval_setup_done = FALSE;
2143     PL_reg_maxiter = 0;
2144
2145     if (RX_UTF8(rx))
2146         PL_reg_flags |= RF_utf8;
2147
2148     /* Mark beginning of line for ^ and lookbehind. */
2149     reginfo.bol = startpos; /* XXX not used ??? */
2150     PL_bostr  = strbeg;
2151     reginfo.sv = sv;
2152
2153     /* Mark end of line for $ (and such) */
2154     PL_regeol = strend;
2155
2156     /* see how far we have to get to not match where we matched before */
2157     reginfo.till = startpos+minend;
2158
2159     /* If there is a "must appear" string, look for it. */
2160     s = startpos;
2161
2162     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2163         MAGIC *mg;
2164         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2165             reginfo.ganch = startpos + prog->gofs;
2166             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2167               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2168         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2169                   && SvMAGIC(sv)
2170                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2171                   && mg->mg_len >= 0) {
2172             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2173             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2174                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2175
2176             if (prog->extflags & RXf_ANCH_GPOS) {
2177                 if (s > reginfo.ganch)
2178                     goto phooey;
2179                 s = reginfo.ganch - prog->gofs;
2180                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2181                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2182                 if (s < strbeg)
2183                     goto phooey;
2184             }
2185         }
2186         else if (data) {
2187             reginfo.ganch = strbeg + PTR2UV(data);
2188             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2189                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2190
2191         } else {                                /* pos() not defined */
2192             reginfo.ganch = strbeg;
2193             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2194                  "GPOS: reginfo.ganch = strbeg\n"));
2195         }
2196     }
2197     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2198         /* We have to be careful. If the previous successful match
2199            was from this regex we don't want a subsequent partially
2200            successful match to clobber the old results.
2201            So when we detect this possibility we add a swap buffer
2202            to the re, and switch the buffer each match. If we fail
2203            we switch it back, otherwise we leave it swapped.
2204         */
2205         swap = prog->offs;
2206         /* do we need a save destructor here for eval dies? */
2207         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2208         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2209             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2210             PTR2UV(prog),
2211             PTR2UV(swap),
2212             PTR2UV(prog->offs)
2213         ));
2214     }
2215     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2216         re_scream_pos_data d;
2217
2218         d.scream_olds = &scream_olds;
2219         d.scream_pos = &scream_pos;
2220         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2221         if (!s) {
2222             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2223             goto phooey;        /* not present */
2224         }
2225     }
2226
2227
2228
2229     /* Simplest case:  anchored match need be tried only once. */
2230     /*  [unless only anchor is BOL and multiline is set] */
2231     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2232         if (s == startpos && regtry(&reginfo, &startpos))
2233             goto got_it;
2234         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2235                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2236         {
2237             char *end;
2238
2239             if (minlen)
2240                 dontbother = minlen - 1;
2241             end = HOP3c(strend, -dontbother, strbeg) - 1;
2242             /* for multiline we only have to try after newlines */
2243             if (prog->check_substr || prog->check_utf8) {
2244                 /* because of the goto we can not easily reuse the macros for bifurcating the
2245                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2246                 if (utf8_target) {
2247                     if (s == startpos)
2248                         goto after_try_utf8;
2249                     while (1) {
2250                         if (regtry(&reginfo, &s)) {
2251                             goto got_it;
2252                         }
2253                       after_try_utf8:
2254                         if (s > end) {
2255                             goto phooey;
2256                         }
2257                         if (prog->extflags & RXf_USE_INTUIT) {
2258                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2259                             if (!s) {
2260                                 goto phooey;
2261                             }
2262                         }
2263                         else {
2264                             s += UTF8SKIP(s);
2265                         }
2266                     }
2267                 } /* end search for check string in unicode */
2268                 else {
2269                     if (s == startpos) {
2270                         goto after_try_latin;
2271                     }
2272                     while (1) {
2273                         if (regtry(&reginfo, &s)) {
2274                             goto got_it;
2275                         }
2276                       after_try_latin:
2277                         if (s > end) {
2278                             goto phooey;
2279                         }
2280                         if (prog->extflags & RXf_USE_INTUIT) {
2281                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2282                             if (!s) {
2283                                 goto phooey;
2284                             }
2285                         }
2286                         else {
2287                             s++;
2288                         }
2289                     }
2290                 } /* end search for check string in latin*/
2291             } /* end search for check string */
2292             else { /* search for newline */
2293                 if (s > startpos) {
2294                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2295                     s--;
2296                 }
2297                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2298                 while (s <= end) { /* note it could be possible to match at the end of the string */
2299                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2300                         if (regtry(&reginfo, &s))
2301                             goto got_it;
2302                     }
2303                 }
2304             } /* end search for newline */
2305         } /* end anchored/multiline check string search */
2306         goto phooey;
2307     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2308     {
2309         /* the warning about reginfo.ganch being used without initialization
2310            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2311            and we only enter this block when the same bit is set. */
2312         char *tmp_s = reginfo.ganch - prog->gofs;
2313
2314         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2315             goto got_it;
2316         goto phooey;
2317     }
2318
2319     /* Messy cases:  unanchored match. */
2320     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2321         /* we have /x+whatever/ */
2322         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2323         char ch;
2324 #ifdef DEBUGGING
2325         int did_match = 0;
2326 #endif
2327         if (utf8_target) {
2328             if (! prog->anchored_utf8) {
2329                 to_utf8_substr(prog);
2330             }
2331             ch = SvPVX_const(prog->anchored_utf8)[0];
2332             REXEC_FBC_SCAN(
2333                 if (*s == ch) {
2334                     DEBUG_EXECUTE_r( did_match = 1 );
2335                     if (regtry(&reginfo, &s)) goto got_it;
2336                     s += UTF8SKIP(s);
2337                     while (s < strend && *s == ch)
2338                         s += UTF8SKIP(s);
2339                 }
2340             );
2341
2342         }
2343         else {
2344             if (! prog->anchored_substr) {
2345                 if (! to_byte_substr(prog)) {
2346                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2347                                             non_utf8_target_but_utf8_required));
2348                     goto phooey;
2349                 }
2350             }
2351             ch = SvPVX_const(prog->anchored_substr)[0];
2352             REXEC_FBC_SCAN(
2353                 if (*s == ch) {
2354                     DEBUG_EXECUTE_r( did_match = 1 );
2355                     if (regtry(&reginfo, &s)) goto got_it;
2356                     s++;
2357                     while (s < strend && *s == ch)
2358                         s++;
2359                 }
2360             );
2361         }
2362         DEBUG_EXECUTE_r(if (!did_match)
2363                 PerlIO_printf(Perl_debug_log,
2364                                   "Did not find anchored character...\n")
2365                );
2366     }
2367     else if (prog->anchored_substr != NULL
2368               || prog->anchored_utf8 != NULL
2369               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2370                   && prog->float_max_offset < strend - s)) {
2371         SV *must;
2372         I32 back_max;
2373         I32 back_min;
2374         char *last;
2375         char *last1;            /* Last position checked before */
2376 #ifdef DEBUGGING
2377         int did_match = 0;
2378 #endif
2379         if (prog->anchored_substr || prog->anchored_utf8) {
2380             if (utf8_target) {
2381                 if (! prog->anchored_utf8) {
2382                     to_utf8_substr(prog);
2383                 }
2384                 must = prog->anchored_utf8;
2385             }
2386             else {
2387                 if (! prog->anchored_substr) {
2388                     if (! to_byte_substr(prog)) {
2389                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2390                                             non_utf8_target_but_utf8_required));
2391                         goto phooey;
2392                     }
2393                 }
2394                 must = prog->anchored_substr;
2395             }
2396             back_max = back_min = prog->anchored_offset;
2397         } else {
2398             if (utf8_target) {
2399                 if (! prog->float_utf8) {
2400                     to_utf8_substr(prog);
2401                 }
2402                 must = prog->float_utf8;
2403             }
2404             else {
2405                 if (! prog->float_substr) {
2406                     if (! to_byte_substr(prog)) {
2407                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2408                                             non_utf8_target_but_utf8_required));
2409                         goto phooey;
2410                     }
2411                 }
2412                 must = prog->float_substr;
2413             }
2414             back_max = prog->float_max_offset;
2415             back_min = prog->float_min_offset;
2416         }
2417
2418         if (back_min<0) {
2419             last = strend;
2420         } else {
2421             last = HOP3c(strend,        /* Cannot start after this */
2422                   -(I32)(CHR_SVLEN(must)
2423                          - (SvTAIL(must) != 0) + back_min), strbeg);
2424         }
2425         if (s > PL_bostr)
2426             last1 = HOPc(s, -1);
2427         else
2428             last1 = s - 1;      /* bogus */
2429
2430         /* XXXX check_substr already used to find "s", can optimize if
2431            check_substr==must. */
2432         scream_pos = -1;
2433         dontbother = end_shift;
2434         strend = HOPc(strend, -dontbother);
2435         while ( (s <= last) &&
2436                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2437                                   (unsigned char*)strend, must,
2438                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2439             DEBUG_EXECUTE_r( did_match = 1 );
2440             if (HOPc(s, -back_max) > last1) {
2441                 last1 = HOPc(s, -back_min);
2442                 s = HOPc(s, -back_max);
2443             }
2444             else {
2445                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2446
2447                 last1 = HOPc(s, -back_min);
2448                 s = t;
2449             }
2450             if (utf8_target) {
2451                 while (s <= last1) {
2452                     if (regtry(&reginfo, &s))
2453                         goto got_it;
2454                     if (s >= last1) {
2455                         s++; /* to break out of outer loop */
2456                         break;
2457                     }
2458                     s += UTF8SKIP(s);
2459                 }
2460             }
2461             else {
2462                 while (s <= last1) {
2463                     if (regtry(&reginfo, &s))
2464                         goto got_it;
2465                     s++;
2466                 }
2467             }
2468         }
2469         DEBUG_EXECUTE_r(if (!did_match) {
2470             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2471                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2472             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2473                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2474                                ? "anchored" : "floating"),
2475                 quoted, RE_SV_TAIL(must));
2476         });
2477         goto phooey;
2478     }
2479     else if ( (c = progi->regstclass) ) {
2480         if (minlen) {
2481             const OPCODE op = OP(progi->regstclass);
2482             /* don't bother with what can't match */
2483             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2484                 strend = HOPc(strend, -(minlen - 1));
2485         }
2486         DEBUG_EXECUTE_r({
2487             SV * const prop = sv_newmortal();
2488             regprop(prog, prop, c);
2489             {
2490                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2491                     s,strend-s,60);
2492                 PerlIO_printf(Perl_debug_log,
2493                     "Matching stclass %.*s against %s (%d bytes)\n",
2494                     (int)SvCUR(prop), SvPVX_const(prop),
2495                      quoted, (int)(strend - s));
2496             }
2497         });
2498         if (find_byclass(prog, c, s, strend, &reginfo))
2499             goto got_it;
2500         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2501     }
2502     else {
2503         dontbother = 0;
2504         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2505             /* Trim the end. */
2506             char *last= NULL;
2507             SV* float_real;
2508             STRLEN len;
2509             const char *little;
2510
2511             if (utf8_target) {
2512                 if (! prog->float_utf8) {
2513                     to_utf8_substr(prog);
2514                 }
2515                 float_real = prog->float_utf8;
2516             }
2517             else {
2518                 if (! prog->float_substr) {
2519                     if (! to_byte_substr(prog)) {
2520                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2521                                             non_utf8_target_but_utf8_required));
2522                         goto phooey;
2523                     }
2524                 }
2525                 float_real = prog->float_substr;
2526             }
2527
2528             little = SvPV_const(float_real, len);
2529             if (SvTAIL(float_real)) {
2530                     /* This means that float_real contains an artificial \n on
2531                      * the end due to the presence of something like this:
2532                      * /foo$/ where we can match both "foo" and "foo\n" at the
2533                      * end of the string.  So we have to compare the end of the
2534                      * string first against the float_real without the \n and
2535                      * then against the full float_real with the string.  We
2536                      * have to watch out for cases where the string might be
2537                      * smaller than the float_real or the float_real without
2538                      * the \n. */
2539                     char *checkpos= strend - len;
2540                     DEBUG_OPTIMISE_r(
2541                         PerlIO_printf(Perl_debug_log,
2542                             "%sChecking for float_real.%s\n",
2543                             PL_colors[4], PL_colors[5]));
2544                     if (checkpos + 1 < strbeg) {
2545                         /* can't match, even if we remove the trailing \n
2546                          * string is too short to match */
2547                         DEBUG_EXECUTE_r(
2548                             PerlIO_printf(Perl_debug_log,
2549                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2550                                 PL_colors[4], PL_colors[5]));
2551                         goto phooey;
2552                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2553                         /* can match, the end of the string matches without the
2554                          * "\n" */
2555                         last = checkpos + 1;
2556                     } else if (checkpos < strbeg) {
2557                         /* cant match, string is too short when the "\n" is
2558                          * included */
2559                         DEBUG_EXECUTE_r(
2560                             PerlIO_printf(Perl_debug_log,
2561                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2562                                 PL_colors[4], PL_colors[5]));
2563                         goto phooey;
2564                     } else if (!multiline) {
2565                         /* non multiline match, so compare with the "\n" at the
2566                          * end of the string */
2567                         if (memEQ(checkpos, little, len)) {
2568                             last= checkpos;
2569                         } else {
2570                             DEBUG_EXECUTE_r(
2571                                 PerlIO_printf(Perl_debug_log,
2572                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2573                                     PL_colors[4], PL_colors[5]));
2574                             goto phooey;
2575                         }
2576                     } else {
2577                         /* multiline match, so we have to search for a place
2578                          * where the full string is located */
2579                         goto find_last;
2580                     }
2581             } else {
2582                   find_last:
2583                     if (len)
2584                         last = rninstr(s, strend, little, little + len);
2585                     else
2586                         last = strend;  /* matching "$" */
2587             }
2588             if (!last) {
2589                 /* at one point this block contained a comment which was
2590                  * probably incorrect, which said that this was a "should not
2591                  * happen" case.  Even if it was true when it was written I am
2592                  * pretty sure it is not anymore, so I have removed the comment
2593                  * and replaced it with this one. Yves */
2594                 DEBUG_EXECUTE_r(
2595                     PerlIO_printf(Perl_debug_log,
2596                         "String does not contain required substring, cannot match.\n"
2597                     ));
2598                 goto phooey;
2599             }
2600             dontbother = strend - last + prog->float_min_offset;
2601         }
2602         if (minlen && (dontbother < minlen))
2603             dontbother = minlen - 1;
2604         strend -= dontbother;              /* this one's always in bytes! */
2605         /* We don't know much -- general case. */
2606         if (utf8_target) {
2607             for (;;) {
2608                 if (regtry(&reginfo, &s))
2609                     goto got_it;
2610                 if (s >= strend)
2611                     break;
2612                 s += UTF8SKIP(s);
2613             };
2614         }
2615         else {
2616             do {
2617                 if (regtry(&reginfo, &s))
2618                     goto got_it;
2619             } while (s++ < strend);
2620         }
2621     }
2622
2623     /* Failure. */
2624     goto phooey;
2625
2626 got_it:
2627     DEBUG_BUFFERS_r(
2628         if (swap)
2629             PerlIO_printf(Perl_debug_log,
2630                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2631                 PTR2UV(prog),
2632                 PTR2UV(swap)
2633             );
2634     );
2635     Safefree(swap);
2636     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2637
2638     if (PL_reg_state.re_state_eval_setup_done)
2639         restore_pos(aTHX_ prog);
2640     if (RXp_PAREN_NAMES(prog))
2641         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2642
2643     /* make sure $`, $&, $', and $digit will work later */
2644     if ( !(flags & REXEC_NOT_FIRST) ) {
2645         if (flags & REXEC_COPY_STR) {
2646 #ifdef PERL_OLD_COPY_ON_WRITE
2647             if ((SvIsCOW(sv)
2648                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2649                 if (DEBUG_C_TEST) {
2650                     PerlIO_printf(Perl_debug_log,
2651                                   "Copy on write: regexp capture, type %d\n",
2652                                   (int) SvTYPE(sv));
2653                 }
2654                 RX_MATCH_COPY_FREE(rx);
2655                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2656                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2657                 assert (SvPOKp(prog->saved_copy));
2658                 prog->sublen  = PL_regeol - strbeg;
2659                 prog->suboffset = 0;
2660                 prog->subcoffset = 0;
2661             } else
2662 #endif
2663             {
2664                 I32 min = 0;
2665                 I32 max = PL_regeol - strbeg;
2666                 I32 sublen;
2667
2668                 if (    (flags & REXEC_COPY_SKIP_POST)
2669                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2670                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2671                 ) { /* don't copy $' part of string */
2672                     U32 n = 0;
2673                     max = -1;
2674                     /* calculate the right-most part of the string covered
2675                      * by a capture. Due to look-ahead, this may be to
2676                      * the right of $&, so we have to scan all captures */
2677                     while (n <= prog->lastparen) {
2678                         if (prog->offs[n].end > max)
2679                             max = prog->offs[n].end;
2680                         n++;
2681                     }
2682                     if (max == -1)
2683                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2684                                 ? prog->offs[0].start
2685                                 : 0;
2686                     assert(max >= 0 && max <= PL_regeol - strbeg);
2687                 }
2688
2689                 if (    (flags & REXEC_COPY_SKIP_PRE)
2690                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2691                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2692                 ) { /* don't copy $` part of string */
2693                     U32 n = 0;
2694                     min = max;
2695                     /* calculate the left-most part of the string covered
2696                      * by a capture. Due to look-behind, this may be to
2697                      * the left of $&, so we have to scan all captures */
2698                     while (min && n <= prog->lastparen) {
2699                         if (   prog->offs[n].start != -1
2700                             && prog->offs[n].start < min)
2701                         {
2702                             min = prog->offs[n].start;
2703                         }
2704                         n++;
2705                     }
2706                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2707                         && min >  prog->offs[0].end
2708                     )
2709                         min = prog->offs[0].end;
2710
2711                 }
2712
2713                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2714                 sublen = max - min;
2715
2716                 if (RX_MATCH_COPIED(rx)) {
2717                     if (sublen > prog->sublen)
2718                         prog->subbeg =
2719                                 (char*)saferealloc(prog->subbeg, sublen+1);
2720                 }
2721                 else
2722                     prog->subbeg = (char*)safemalloc(sublen+1);
2723                 Copy(strbeg + min, prog->subbeg, sublen, char);
2724                 prog->subbeg[sublen] = '\0';
2725                 prog->suboffset = min;
2726                 prog->sublen = sublen;
2727                 RX_MATCH_COPIED_on(rx);
2728             }
2729             prog->subcoffset = prog->suboffset;
2730             if (prog->suboffset && utf8_target) {
2731                 /* Convert byte offset to chars.
2732                  * XXX ideally should only compute this if @-/@+
2733                  * has been seen, a la PL_sawampersand ??? */
2734
2735                 /* If there's a direct correspondence between the
2736                  * string which we're matching and the original SV,
2737                  * then we can use the utf8 len cache associated with
2738                  * the SV. In particular, it means that under //g,
2739                  * sv_pos_b2u() will use the previously cached
2740                  * position to speed up working out the new length of
2741                  * subcoffset, rather than counting from the start of
2742                  * the string each time. This stops
2743                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2744                  * from going quadratic */
2745                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2746                     sv_pos_b2u(sv, &(prog->subcoffset));
2747                 else
2748                     prog->subcoffset = utf8_length((U8*)strbeg,
2749                                         (U8*)(strbeg+prog->suboffset));
2750             }
2751         }
2752         else {
2753             RX_MATCH_COPY_FREE(rx);
2754             prog->subbeg = strbeg;
2755             prog->suboffset = 0;
2756             prog->subcoffset = 0;
2757             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2758         }
2759     }
2760
2761     return 1;
2762
2763 phooey:
2764     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2765                           PL_colors[4], PL_colors[5]));
2766     if (PL_reg_state.re_state_eval_setup_done)
2767         restore_pos(aTHX_ prog);
2768     if (swap) {
2769         /* we failed :-( roll it back */
2770         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2771             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2772             PTR2UV(prog),
2773             PTR2UV(prog->offs),
2774             PTR2UV(swap)
2775         ));
2776         Safefree(prog->offs);
2777         prog->offs = swap;
2778     }
2779     return 0;
2780 }
2781
2782
2783 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2784  * Do inc before dec, in case old and new rex are the same */
2785 #define SET_reg_curpm(Re2) \
2786     if (PL_reg_state.re_state_eval_setup_done) {    \
2787         (void)ReREFCNT_inc(Re2);                    \
2788         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2789         PM_SETRE((PL_reg_curpm), (Re2));            \
2790     }
2791
2792
2793 /*
2794  - regtry - try match at specific point
2795  */
2796 STATIC I32                      /* 0 failure, 1 success */
2797 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2798 {
2799     dVAR;
2800     CHECKPOINT lastcp;
2801     REGEXP *const rx = reginfo->prog;
2802     regexp *const prog = (struct regexp *)SvANY(rx);
2803     I32 result;
2804     RXi_GET_DECL(prog,progi);
2805     GET_RE_DEBUG_FLAGS_DECL;
2806
2807     PERL_ARGS_ASSERT_REGTRY;
2808
2809     reginfo->cutpoint=NULL;
2810
2811     if ((prog->extflags & RXf_EVAL_SEEN)
2812         && !PL_reg_state.re_state_eval_setup_done)
2813     {
2814         MAGIC *mg;
2815
2816         PL_reg_state.re_state_eval_setup_done = TRUE;
2817         if (reginfo->sv) {
2818             /* Make $_ available to executed code. */
2819             if (reginfo->sv != DEFSV) {
2820                 SAVE_DEFSV;
2821                 DEFSV_set(reginfo->sv);
2822             }
2823
2824             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2825                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2826                 /* prepare for quick setting of pos */
2827 #ifdef PERL_OLD_COPY_ON_WRITE
2828                 if (SvIsCOW(reginfo->sv))
2829                     sv_force_normal_flags(reginfo->sv, 0);
2830 #endif
2831                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2832                                  &PL_vtbl_mglob, NULL, 0);
2833                 mg->mg_len = -1;
2834             }
2835             PL_reg_magic    = mg;
2836             PL_reg_oldpos   = mg->mg_len;
2837             SAVEDESTRUCTOR_X(restore_pos, prog);
2838         }
2839         if (!PL_reg_curpm) {
2840             Newxz(PL_reg_curpm, 1, PMOP);
2841 #ifdef USE_ITHREADS
2842             {
2843                 SV* const repointer = &PL_sv_undef;
2844                 /* this regexp is also owned by the new PL_reg_curpm, which
2845                    will try to free it.  */
2846                 av_push(PL_regex_padav, repointer);
2847                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2848                 PL_regex_pad = AvARRAY(PL_regex_padav);
2849             }
2850 #endif
2851         }
2852         SET_reg_curpm(rx);
2853         PL_reg_oldcurpm = PL_curpm;
2854         PL_curpm = PL_reg_curpm;
2855         if (RXp_MATCH_COPIED(prog)) {
2856             /*  Here is a serious problem: we cannot rewrite subbeg,
2857                 since it may be needed if this match fails.  Thus
2858                 $` inside (?{}) could fail... */
2859             PL_reg_oldsaved = prog->subbeg;
2860             PL_reg_oldsavedlen = prog->sublen;
2861             PL_reg_oldsavedoffset = prog->suboffset;
2862             PL_reg_oldsavedcoffset = prog->suboffset;
2863 #ifdef PERL_OLD_COPY_ON_WRITE
2864             PL_nrs = prog->saved_copy;
2865 #endif
2866             RXp_MATCH_COPIED_off(prog);
2867         }
2868         else
2869             PL_reg_oldsaved = NULL;
2870         prog->subbeg = PL_bostr;
2871         prog->suboffset = 0;
2872         prog->subcoffset = 0;
2873         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2874     }
2875 #ifdef DEBUGGING
2876     PL_reg_starttry = *startposp;
2877 #endif
2878     prog->offs[0].start = *startposp - PL_bostr;
2879     prog->lastparen = 0;
2880     prog->lastcloseparen = 0;
2881     PL_regsize = 0;
2882
2883     /* XXXX What this code is doing here?!!!  There should be no need
2884        to do this again and again, prog->lastparen should take care of
2885        this!  --ilya*/
2886
2887     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2888      * Actually, the code in regcppop() (which Ilya may be meaning by
2889      * prog->lastparen), is not needed at all by the test suite
2890      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2891      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2892      * Meanwhile, this code *is* needed for the
2893      * above-mentioned test suite tests to succeed.  The common theme
2894      * on those tests seems to be returning null fields from matches.
2895      * --jhi updated by dapm */
2896 #if 1
2897     if (prog->nparens) {
2898         regexp_paren_pair *pp = prog->offs;
2899         I32 i;
2900         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2901             ++pp;
2902             pp->start = -1;
2903             pp->end = -1;
2904         }
2905     }
2906 #endif
2907     REGCP_SET(lastcp);
2908     result = regmatch(reginfo, *startposp, progi->program + 1);
2909     if (result != -1) {
2910         prog->offs[0].end = result;
2911         return 1;
2912     }
2913     if (reginfo->cutpoint)
2914         *startposp= reginfo->cutpoint;
2915     REGCP_UNWIND(lastcp);
2916     return 0;
2917 }
2918
2919
2920 #define sayYES goto yes
2921 #define sayNO goto no
2922 #define sayNO_SILENT goto no_silent
2923
2924 /* we dont use STMT_START/END here because it leads to
2925    "unreachable code" warnings, which are bogus, but distracting. */
2926 #define CACHEsayNO \
2927     if (ST.cache_mask) \
2928        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2929     sayNO
2930
2931 /* this is used to determine how far from the left messages like
2932    'failed...' are printed. It should be set such that messages
2933    are inline with the regop output that created them.
2934 */
2935 #define REPORT_CODE_OFF 32
2936
2937
2938 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2939 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2940 #define CHRTEST_NOT_A_CP_1 -999
2941 #define CHRTEST_NOT_A_CP_2 -998
2942
2943 #define SLAB_FIRST(s) (&(s)->states[0])
2944 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2945
2946 /* grab a new slab and return the first slot in it */
2947
2948 STATIC regmatch_state *
2949 S_push_slab(pTHX)
2950 {
2951 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2952     dMY_CXT;
2953 #endif
2954     regmatch_slab *s = PL_regmatch_slab->next;
2955     if (!s) {
2956         Newx(s, 1, regmatch_slab);
2957         s->prev = PL_regmatch_slab;
2958         s->next = NULL;
2959         PL_regmatch_slab->next = s;
2960     }
2961     PL_regmatch_slab = s;
2962     return SLAB_FIRST(s);
2963 }
2964
2965
2966 /* push a new state then goto it */
2967
2968 #define PUSH_STATE_GOTO(state, node, input) \
2969     pushinput = input; \
2970     scan = node; \
2971     st->resume_state = state; \
2972     goto push_state;
2973
2974 /* push a new state with success backtracking, then goto it */
2975
2976 #define PUSH_YES_STATE_GOTO(state, node, input) \
2977     pushinput = input; \
2978     scan = node; \
2979     st->resume_state = state; \
2980     goto push_yes_state;
2981
2982
2983
2984
2985 /*
2986
2987 regmatch() - main matching routine
2988
2989 This is basically one big switch statement in a loop. We execute an op,
2990 set 'next' to point the next op, and continue. If we come to a point which
2991 we may need to backtrack to on failure such as (A|B|C), we push a
2992 backtrack state onto the backtrack stack. On failure, we pop the top
2993 state, and re-enter the loop at the state indicated. If there are no more
2994 states to pop, we return failure.
2995
2996 Sometimes we also need to backtrack on success; for example /A+/, where
2997 after successfully matching one A, we need to go back and try to
2998 match another one; similarly for lookahead assertions: if the assertion
2999 completes successfully, we backtrack to the state just before the assertion
3000 and then carry on.  In these cases, the pushed state is marked as
3001 'backtrack on success too'. This marking is in fact done by a chain of
3002 pointers, each pointing to the previous 'yes' state. On success, we pop to
3003 the nearest yes state, discarding any intermediate failure-only states.
3004 Sometimes a yes state is pushed just to force some cleanup code to be
3005 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3006 it to free the inner regex.
3007
3008 Note that failure backtracking rewinds the cursor position, while
3009 success backtracking leaves it alone.
3010
3011 A pattern is complete when the END op is executed, while a subpattern
3012 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3013 ops trigger the "pop to last yes state if any, otherwise return true"
3014 behaviour.
3015
3016 A common convention in this function is to use A and B to refer to the two
3017 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3018 the subpattern to be matched possibly multiple times, while B is the entire
3019 rest of the pattern. Variable and state names reflect this convention.
3020
3021 The states in the main switch are the union of ops and failure/success of
3022 substates associated with with that op.  For example, IFMATCH is the op
3023 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3024 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3025 successfully matched A and IFMATCH_A_fail is a state saying that we have
3026 just failed to match A. Resume states always come in pairs. The backtrack
3027 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3028 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3029 on success or failure.
3030
3031 The struct that holds a backtracking state is actually a big union, with
3032 one variant for each major type of op. The variable st points to the
3033 top-most backtrack struct. To make the code clearer, within each
3034 block of code we #define ST to alias the relevant union.
3035
3036 Here's a concrete example of a (vastly oversimplified) IFMATCH
3037 implementation:
3038
3039     switch (state) {
3040     ....
3041
3042 #define ST st->u.ifmatch
3043
3044     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3045         ST.foo = ...; // some state we wish to save
3046         ...
3047         // push a yes backtrack state with a resume value of
3048         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3049         // first node of A:
3050         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3051         // NOTREACHED
3052
3053     case IFMATCH_A: // we have successfully executed A; now continue with B
3054         next = B;
3055         bar = ST.foo; // do something with the preserved value
3056         break;
3057
3058     case IFMATCH_A_fail: // A failed, so the assertion failed
3059         ...;   // do some housekeeping, then ...
3060         sayNO; // propagate the failure
3061
3062 #undef ST
3063
3064     ...
3065     }
3066
3067 For any old-timers reading this who are familiar with the old recursive
3068 approach, the code above is equivalent to:
3069
3070     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3071     {
3072         int foo = ...
3073         ...
3074         if (regmatch(A)) {
3075             next = B;
3076             bar = foo;
3077             break;
3078         }
3079         ...;   // do some housekeeping, then ...
3080         sayNO; // propagate the failure
3081     }
3082
3083 The topmost backtrack state, pointed to by st, is usually free. If you
3084 want to claim it, populate any ST.foo fields in it with values you wish to
3085 save, then do one of
3086
3087         PUSH_STATE_GOTO(resume_state, node, newinput);
3088         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3089
3090 which sets that backtrack state's resume value to 'resume_state', pushes a
3091 new free entry to the top of the backtrack stack, then goes to 'node'.
3092 On backtracking, the free slot is popped, and the saved state becomes the
3093 new free state. An ST.foo field in this new top state can be temporarily
3094 accessed to retrieve values, but once the main loop is re-entered, it
3095 becomes available for reuse.
3096
3097 Note that the depth of the backtrack stack constantly increases during the
3098 left-to-right execution of the pattern, rather than going up and down with
3099 the pattern nesting. For example the stack is at its maximum at Z at the
3100 end of the pattern, rather than at X in the following:
3101
3102     /(((X)+)+)+....(Y)+....Z/
3103
3104 The only exceptions to this are lookahead/behind assertions and the cut,
3105 (?>A), which pop all the backtrack states associated with A before
3106 continuing.
3107
3108 Backtrack state structs are allocated in slabs of about 4K in size.
3109 PL_regmatch_state and st always point to the currently active state,
3110 and PL_regmatch_slab points to the slab currently containing
3111 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3112 allocated, and is never freed until interpreter destruction. When the slab
3113 is full, a new one is allocated and chained to the end. At exit from
3114 regmatch(), slabs allocated since entry are freed.
3115
3116 */
3117
3118
3119 #define DEBUG_STATE_pp(pp)                                  \
3120     DEBUG_STATE_r({                                         \
3121         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3122         PerlIO_printf(Perl_debug_log,                       \
3123             "    %*s"pp" %s%s%s%s%s\n",                     \
3124             depth*2, "",                                    \
3125             PL_reg_name[st->resume_state],                     \
3126             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3127             ((st==yes_state) ? "Y" : ""),                   \
3128             ((st==mark_state) ? "M" : ""),                  \
3129             ((st==yes_state||st==mark_state) ? "]" : "")    \
3130         );                                                  \
3131     });
3132
3133
3134 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3135
3136 #ifdef DEBUGGING
3137
3138 STATIC void
3139 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3140     const char *start, const char *end, const char *blurb)
3141 {
3142     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3143
3144     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3145
3146     if (!PL_colorset)
3147             reginitcolors();
3148     {
3149         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3150             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3151
3152         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3153             start, end - start, 60);
3154
3155         PerlIO_printf(Perl_debug_log,
3156             "%s%s REx%s %s against %s\n",
3157                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3158
3159         if (utf8_target||utf8_pat)
3160             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3161                 utf8_pat ? "pattern" : "",
3162                 utf8_pat && utf8_target ? " and " : "",
3163                 utf8_target ? "string" : ""
3164             );
3165     }
3166 }
3167
3168 STATIC void
3169 S_dump_exec_pos(pTHX_ const char *locinput,
3170                       const regnode *scan,
3171                       const char *loc_regeol,
3172                       const char *loc_bostr,
3173                       const char *loc_reg_starttry,
3174                       const bool utf8_target)
3175 {
3176     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3177     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3178     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3179     /* The part of the string before starttry has one color
3180        (pref0_len chars), between starttry and current
3181        position another one (pref_len - pref0_len chars),
3182        after the current position the third one.
3183        We assume that pref0_len <= pref_len, otherwise we
3184        decrease pref0_len.  */
3185     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3186         ? (5 + taill) - l : locinput - loc_bostr;
3187     int pref0_len;
3188
3189     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3190
3191     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3192         pref_len++;
3193     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3194     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3195         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3196               ? (5 + taill) - pref_len : loc_regeol - locinput);
3197     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3198         l--;
3199     if (pref0_len < 0)
3200         pref0_len = 0;
3201     if (pref0_len > pref_len)
3202         pref0_len = pref_len;
3203     {
3204         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3205
3206         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3207             (locinput - pref_len),pref0_len, 60, 4, 5);
3208
3209         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3210                     (locinput - pref_len + pref0_len),
3211                     pref_len - pref0_len, 60, 2, 3);
3212
3213         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3214                     locinput, loc_regeol - locinput, 10, 0, 1);
3215
3216         const STRLEN tlen=len0+len1+len2;
3217         PerlIO_printf(Perl_debug_log,
3218                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3219                     (IV)(locinput - loc_bostr),
3220                     len0, s0,
3221                     len1, s1,
3222                     (docolor ? "" : "> <"),
3223                     len2, s2,
3224                     (int)(tlen > 19 ? 0 :  19 - tlen),
3225                     "");
3226     }
3227 }
3228
3229 #endif
3230
3231 /* reg_check_named_buff_matched()
3232  * Checks to see if a named buffer has matched. The data array of
3233  * buffer numbers corresponding to the buffer is expected to reside
3234  * in the regexp->data->data array in the slot stored in the ARG() of
3235  * node involved. Note that this routine doesn't actually care about the
3236  * name, that information is not preserved from compilation to execution.
3237  * Returns the index of the leftmost defined buffer with the given name
3238  * or 0 if non of the buffers matched.
3239  */
3240 STATIC I32
3241 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3242 {
3243     I32 n;
3244     RXi_GET_DECL(rex,rexi);
3245     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3246     I32 *nums=(I32*)SvPVX(sv_dat);
3247
3248     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3249
3250     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3251         if ((I32)rex->lastparen >= nums[n] &&
3252             rex->offs[nums[n]].end != -1)
3253         {
3254             return nums[n];
3255         }
3256     }
3257     return 0;
3258 }
3259
3260
3261 /* free all slabs above current one  - called during LEAVE_SCOPE */
3262
3263 STATIC void
3264 S_clear_backtrack_stack(pTHX_ void *p)
3265 {
3266     regmatch_slab *s = PL_regmatch_slab->next;
3267     PERL_UNUSED_ARG(p);
3268
3269     if (!s)
3270         return;
3271     PL_regmatch_slab->next = NULL;
3272     while (s) {
3273         regmatch_slab * const osl = s;
3274         s = s->next;
3275         Safefree(osl);
3276     }
3277 }
3278 static bool
3279 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3280 {
3281     /* This function determines if there are one or two characters that match
3282      * the first character of the passed-in EXACTish node <text_node>, and if
3283      * so, returns them in the passed-in pointers.
3284      *
3285      * If it determines that no possible character in the target string can
3286      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3287      * the first character in <text_node> requires UTF-8 to represent, and the
3288      * target string isn't in UTF-8.)
3289      *
3290      * If there are more than two characters that could match the beginning of
3291      * <text_node>, or if more context is required to determine a match or not,
3292      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3293      *
3294      * The motiviation behind this function is to allow the caller to set up
3295      * tight loops for matching.  If <text_node> is of type EXACT, there is
3296      * only one possible character that can match its first character, and so
3297      * the situation is quite simple.  But things get much more complicated if
3298      * folding is involved.  It may be that the first character of an EXACTFish
3299      * node doesn't participate in any possible fold, e.g., punctuation, so it
3300      * can be matched only by itself.  The vast majority of characters that are
3301      * in folds match just two things, their lower and upper-case equivalents.
3302      * But not all are like that; some have multiple possible matches, or match
3303      * sequences of more than one character.  This function sorts all that out.
3304      *
3305      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3306      * loop of trying to match A*, we know we can't exit where the thing
3307      * following it isn't a B.  And something can't be a B unless it is the
3308      * beginning of B.  By putting a quick test for that beginning in a tight
3309      * loop, we can rule out things that can't possibly be B without having to
3310      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3311      * character, we can make a tight loop matching A*, using the outputs of
3312      * this function.
3313      *
3314      * If the target string to match isn't in UTF-8, and there aren't
3315      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3316      * the one or two possible octets (which are characters in this situation)
3317      * that can match.  In all cases, if there is only one character that can
3318      * match, *<c1p> and *<c2p> will be identical.
3319      *
3320      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3321      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3322      * can match the beginning of <text_node>.  They should be declared with at
3323      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3324      * undefined what these contain.)  If one or both of the buffers are
3325      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3326      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3327      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3328      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3329      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3330
3331     const bool utf8_target = PL_reg_match_utf8;
3332
3333     UV c1, c2;
3334     bool use_chrtest_void = FALSE;
3335
3336     /* Used when we have both utf8 input and utf8 output, to avoid converting
3337      * to/from code points */
3338     bool utf8_has_been_setup = FALSE;
3339
3340     dVAR;
3341
3342     U8 *pat = (U8*)STRING(text_node);
3343
3344     if (OP(text_node) == EXACT) {
3345
3346         /* In an exact node, only one thing can be matched, that first
3347          * character.  If both the pat and the target are UTF-8, we can just
3348          * copy the input to the output, avoiding finding the code point of
3349          * that character */
3350         if (! UTF_PATTERN) {
3351             c2 = c1 = *pat;
3352         }
3353         else if (utf8_target) {
3354             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3355             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3356             utf8_has_been_setup = TRUE;
3357         }
3358         else {
3359             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3360         }
3361     }
3362     else /* an EXACTFish node */
3363          if ((UTF_PATTERN
3364                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3365                                                     pat + STR_LEN(text_node)))
3366              || (! UTF_PATTERN
3367                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3368                                                     pat + STR_LEN(text_node))))
3369     {
3370         /* Multi-character folds require more context to sort out.  Also
3371          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3372          * handled outside this routine */
3373         use_chrtest_void = TRUE;
3374     }
3375     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3376         c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3377         if (c1 > 256) {
3378             /* Load the folds hash, if not already done */
3379             SV** listp;
3380             if (! PL_utf8_foldclosures) {
3381                 if (! PL_utf8_tofold) {
3382                     U8 dummy[UTF8_MAXBYTES+1];
3383
3384                     /* Force loading this by folding an above-Latin1 char */
3385                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3386                     assert(PL_utf8_tofold); /* Verify that worked */
3387                 }
3388                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3389             }
3390
3391             /* The fold closures data structure is a hash with the keys being
3392              * the UTF-8 of every character that is folded to, like 'k', and
3393              * the values each an array of all code points that fold to its
3394              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3395              * not included */
3396             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3397                                      (char *) pat,
3398                                      UTF8SKIP(pat),
3399                                      FALSE))))
3400             {
3401                 /* Not found in the hash, therefore there are no folds
3402                  * containing it, so there is only a single character that
3403                  * could match */
3404                 c2 = c1;
3405             }
3406             else {  /* Does participate in folds */
3407                 AV* list = (AV*) *listp;
3408                 if (av_len(list) != 1) {
3409
3410                     /* If there aren't exactly two folds to this, it is outside
3411                      * the scope of this function */
3412                     use_chrtest_void = TRUE;
3413                 }
3414                 else {  /* There are two.  Get them */
3415                     SV** c_p = av_fetch(list, 0, FALSE);
3416                     if (c_p == NULL) {
3417                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3418                     }
3419                     c1 = SvUV(*c_p);
3420
3421                     c_p = av_fetch(list, 1, FALSE);
3422                     if (c_p == NULL) {
3423                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3424                     }
3425                     c2 = SvUV(*c_p);
3426
3427                     /* Folds that cross the 255/256 boundary are forbidden if
3428                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3429                      * pattern character is above 256, and its only other match
3430                      * is below 256, the only legal match will be to itself.
3431                      * We have thrown away the original, so have to compute
3432                      * which is the one above 255 */
3433                     if ((c1 < 256) != (c2 < 256)) {
3434                         if (OP(text_node) == EXACTFL
3435                             || (OP(text_node) == EXACTFA
3436                                 && (isASCII(c1) || isASCII(c2))))
3437                         {
3438                             if (c1 < 256) {
3439                                 c1 = c2;
3440                             }
3441                             else {
3442                                 c2 = c1;
3443                             }
3444                         }
3445                     }
3446                 }
3447             }
3448         }
3449         else /* Here, c1 is < 255 */
3450              if (utf8_target
3451                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3452                  && OP(text_node) != EXACTFL
3453                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3454         {
3455             /* Here, there could be something above Latin1 in the target which
3456              * folds to this character in the pattern.  All such cases except
3457              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3458              * involved in their folds, so are outside the scope of this
3459              * function */
3460             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3461                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3462             }
3463             else {
3464                 use_chrtest_void = TRUE;
3465             }
3466         }
3467         else { /* Here nothing above Latin1 can fold to the pattern character */
3468             switch (OP(text_node)) {
3469
3470                 case EXACTFL:   /* /l rules */
3471                     c2 = PL_fold_locale[c1];
3472                     break;
3473
3474                 case EXACTF:
3475                     if (! utf8_target) {    /* /d rules */
3476                         c2 = PL_fold[c1];
3477                         break;
3478                     }
3479                     /* FALLTHROUGH */
3480                     /* /u rules for all these.  This happens to work for
3481                      * EXACTFA as nothing in Latin1 folds to ASCII */
3482                 case EXACTFA:
3483                 case EXACTFU_TRICKYFOLD:
3484                 case EXACTFU_SS:
3485                 case EXACTFU:
3486                     c2 = PL_fold_latin1[c1];
3487                     break;
3488
3489                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3490             }
3491         }
3492     }
3493
3494     /* Here have figured things out.  Set up the returns */
3495     if (use_chrtest_void) {
3496         *c2p = *c1p = CHRTEST_VOID;
3497     }
3498     else if (utf8_target) {
3499         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3500             uvchr_to_utf8(c1_utf8, c1);
3501             uvchr_to_utf8(c2_utf8, c2);
3502         }
3503
3504         /* Invariants are stored in both the utf8 and byte outputs; Use
3505          * negative numbers otherwise for the byte ones.  Make sure that the
3506          * byte ones are the same iff the utf8 ones are the same */
3507         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3508         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3509                 ? *c2_utf8
3510                 : (c1 == c2)
3511                   ? CHRTEST_NOT_A_CP_1
3512                   : CHRTEST_NOT_A_CP_2;
3513     }
3514     else if (c1 > 255) {
3515        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3516                            can represent */
3517            return FALSE;
3518        }
3519
3520        *c1p = *c2p = c2;    /* c2 is the only representable value */
3521     }
3522     else {  /* c1 is representable; see about c2 */
3523        *c1p = c1;
3524        *c2p = (c2 < 256) ? c2 : c1;
3525     }
3526
3527     return TRUE;
3528 }
3529
3530 /* returns -1 on failure, $+[0] on success */
3531 STATIC I32
3532 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3533 {
3534 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3535     dMY_CXT;
3536 #endif
3537     dVAR;
3538     const bool utf8_target = PL_reg_match_utf8;
3539     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3540     REGEXP *rex_sv = reginfo->prog;
3541     regexp *rex = (struct regexp *)SvANY(rex_sv);
3542     RXi_GET_DECL(rex,rexi);
3543     I32 oldsave;
3544     /* the current state. This is a cached copy of PL_regmatch_state */
3545     regmatch_state *st;
3546     /* cache heavy used fields of st in registers */
3547     regnode *scan;
3548     regnode *next;
3549     U32 n = 0;  /* general value; init to avoid compiler warning */
3550     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3551     char *locinput = startpos;
3552     char *pushinput; /* where to continue after a PUSH */
3553     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3554
3555     bool result = 0;        /* return value of S_regmatch */
3556     int depth = 0;          /* depth of backtrack stack */
3557     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3558     const U32 max_nochange_depth =
3559         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3560         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3561     regmatch_state *yes_state = NULL; /* state to pop to on success of
3562                                                             subpattern */
3563     /* mark_state piggy backs on the yes_state logic so that when we unwind
3564        the stack on success we can update the mark_state as we go */
3565     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3566     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3567     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3568     U32 state_num;
3569     bool no_final = 0;      /* prevent failure from backtracking? */
3570     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3571     char *startpoint = locinput;
3572     SV *popmark = NULL;     /* are we looking for a mark? */
3573     SV *sv_commit = NULL;   /* last mark name seen in failure */
3574     SV *sv_yes_mark = NULL; /* last mark name we have seen
3575                                during a successful match */
3576     U32 lastopen = 0;       /* last open we saw */
3577     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3578     SV* const oreplsv = GvSV(PL_replgv);
3579     /* these three flags are set by various ops to signal information to
3580      * the very next op. They have a useful lifetime of exactly one loop
3581      * iteration, and are not preserved or restored by state pushes/pops
3582      */
3583     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3584     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3585     int logical = 0;        /* the following EVAL is:
3586                                 0: (?{...})
3587                                 1: (?(?{...})X|Y)
3588                                 2: (??{...})
3589                                or the following IFMATCH/UNLESSM is:
3590                                 false: plain (?=foo)
3591                                 true:  used as a condition: (?(?=foo))
3592                             */
3593     PAD* last_pad = NULL;
3594     dMULTICALL;
3595     I32 gimme = G_SCALAR;
3596     CV *caller_cv = NULL;       /* who called us */
3597     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3598     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3599
3600 #ifdef DEBUGGING
3601     GET_RE_DEBUG_FLAGS_DECL;
3602 #endif
3603
3604     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3605     multicall_oldcatch = 0;
3606     multicall_cv = NULL;
3607     cx = NULL;
3608     PERL_UNUSED_VAR(multicall_cop);
3609     PERL_UNUSED_VAR(newsp);
3610
3611
3612     PERL_ARGS_ASSERT_REGMATCH;
3613
3614     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3615             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3616     }));
3617     /* on first ever call to regmatch, allocate first slab */
3618     if (!PL_regmatch_slab) {
3619         Newx(PL_regmatch_slab, 1, regmatch_slab);
3620         PL_regmatch_slab->prev = NULL;
3621         PL_regmatch_slab->next = NULL;
3622         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3623     }
3624
3625     oldsave = PL_savestack_ix;
3626     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3627     SAVEVPTR(PL_regmatch_slab);
3628     SAVEVPTR(PL_regmatch_state);
3629
3630     /* grab next free state slot */
3631     st = ++PL_regmatch_state;
3632     if (st >  SLAB_LAST(PL_regmatch_slab))
3633         st = PL_regmatch_state = S_push_slab(aTHX);
3634
3635     /* Note that nextchr is a byte even in UTF */
3636     SET_nextchr;
3637     scan = prog;
3638     while (scan != NULL) {
3639
3640         DEBUG_EXECUTE_r( {
3641             SV * const prop = sv_newmortal();
3642             regnode *rnext=regnext(scan);
3643             DUMP_EXEC_POS( locinput, scan, utf8_target );
3644             regprop(rex, prop, scan);
3645
3646             PerlIO_printf(Perl_debug_log,
3647                     "%3"IVdf":%*s%s(%"IVdf")\n",
3648                     (IV)(scan - rexi->program), depth*2, "",
3649                     SvPVX_const(prop),
3650                     (PL_regkind[OP(scan)] == END || !rnext) ?
3651                         0 : (IV)(rnext - rexi->program));
3652         });
3653
3654         next = scan + NEXT_OFF(scan);
3655         if (next == scan)
3656             next = NULL;
3657         state_num = OP(scan);
3658
3659         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3660       reenter_switch:
3661
3662         SET_nextchr;
3663         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3664
3665         switch (state_num) {
3666         case BOL: /*  /^../  */
3667             if (locinput == PL_bostr)
3668             {
3669                 /* reginfo->till = reginfo->bol; */
3670                 break;
3671             }
3672             sayNO;
3673
3674         case MBOL: /*  /^../m  */
3675             if (locinput == PL_bostr ||
3676                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3677             {
3678                 break;
3679             }
3680             sayNO;
3681
3682         case SBOL: /*  /^../s  */
3683             if (locinput == PL_bostr)
3684                 break;
3685             sayNO;
3686
3687         case GPOS: /*  \G  */
3688             if (locinput == reginfo->ganch)
3689                 break;
3690             sayNO;
3691
3692         case KEEPS: /*   \K  */
3693             /* update the startpoint */
3694             st->u.keeper.val = rex->offs[0].start;
3695             rex->offs[0].start = locinput - PL_bostr;
3696             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3697             /*NOT-REACHED*/
3698         case KEEPS_next_fail:
3699             /* rollback the start point change */
3700             rex->offs[0].start = st->u.keeper.val;
3701             sayNO_SILENT;
3702             /*NOT-REACHED*/
3703
3704         case EOL: /* /..$/  */
3705                 goto seol;
3706
3707         case MEOL: /* /..$/m  */
3708             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3709                 sayNO;
3710             break;
3711
3712         case SEOL: /* /..$/s  */
3713           seol:
3714             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3715                 sayNO;
3716             if (PL_regeol - locinput > 1)
3717                 sayNO;
3718             break;
3719
3720         case EOS: /*  \z  */
3721             if (!NEXTCHR_IS_EOS)
3722                 sayNO;
3723             break;
3724
3725         case SANY: /*  /./s  */
3726             if (NEXTCHR_IS_EOS)
3727                 sayNO;
3728             goto increment_locinput;
3729
3730         case CANY: /*  \C  */
3731             if (NEXTCHR_IS_EOS)
3732                 sayNO;
3733             locinput++;
3734             break;
3735
3736         case REG_ANY: /*  /./  */
3737             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3738                 sayNO;
3739             goto increment_locinput;
3740
3741
3742 #undef  ST
3743 #define ST st->u.trie
3744         case TRIEC: /* (ab|cd) with known charclass */
3745             /* In this case the charclass data is available inline so
3746                we can fail fast without a lot of extra overhead.
3747              */
3748             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3749                 DEBUG_EXECUTE_r(
3750                     PerlIO_printf(Perl_debug_log,
3751                               "%*s  %sfailed to match trie start class...%s\n",
3752                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3753                 );
3754                 sayNO_SILENT;
3755                 assert(0); /* NOTREACHED */
3756             }
3757             /* FALL THROUGH */
3758         case TRIE:  /* (ab|cd)  */
3759             /* the basic plan of execution of the trie is:
3760              * At the beginning, run though all the states, and
3761              * find the longest-matching word. Also remember the position
3762              * of the shortest matching word. For example, this pattern:
3763              *    1  2 3 4    5
3764              *    ab|a|x|abcd|abc
3765              * when matched against the string "abcde", will generate
3766              * accept states for all words except 3, with the longest
3767              * matching word being 4, and the shortest being 2 (with
3768              * the position being after char 1 of the string).
3769              *
3770              * Then for each matching word, in word order (i.e. 1,2,4,5),
3771              * we run the remainder of the pattern; on each try setting
3772              * the current position to the character following the word,
3773              * returning to try the next word on failure.
3774              *
3775              * We avoid having to build a list of words at runtime by
3776              * using a compile-time structure, wordinfo[].prev, which
3777              * gives, for each word, the previous accepting word (if any).
3778              * In the case above it would contain the mappings 1->2, 2->0,
3779              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3780              * the longest word (4 above), a list of all words, by
3781              * following the list of prev pointers; this gives us the
3782              * unordered list 4,5,1,2. Then given the current word we have
3783              * just tried, we can go through the list and find the
3784              * next-biggest word to try (so if we just failed on word 2,
3785              * the next in the list is 4).
3786              *
3787              * Since at runtime we don't record the matching position in
3788              * the string for each word, we have to work that out for
3789              * each word we're about to process. The wordinfo table holds
3790              * the character length of each word; given that we recorded
3791              * at the start: the position of the shortest word and its
3792              * length in chars, we just need to move the pointer the
3793              * difference between the two char lengths. Depending on
3794              * Unicode status and folding, that's cheap or expensive.
3795              *
3796              * This algorithm is optimised for the case where are only a
3797              * small number of accept states, i.e. 0,1, or maybe 2.
3798              * With lots of accepts states, and having to try all of them,
3799              * it becomes quadratic on number of accept states to find all
3800              * the next words.
3801              */
3802
3803             {
3804                 /* what type of TRIE am I? (utf8 makes this contextual) */
3805                 DECL_TRIE_TYPE(scan);
3806
3807                 /* what trie are we using right now */
3808                 reg_trie_data * const trie
3809                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3810                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3811                 U32 state = trie->startstate;
3812
3813                 if (   trie->bitmap
3814                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3815                 {
3816                     if (trie->states[ state ].wordnum) {
3817                          DEBUG_EXECUTE_r(
3818                             PerlIO_printf(Perl_debug_log,
3819                                           "%*s  %smatched empty string...%s\n",
3820                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3821                         );
3822                         if (!trie->jump)
3823                             break;
3824                     } else {
3825                         DEBUG_EXECUTE_r(
3826                             PerlIO_printf(Perl_debug_log,
3827                                           "%*s  %sfailed to match trie start class...%s\n",
3828                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3829                         );
3830                         sayNO_SILENT;
3831                    }
3832                 }
3833
3834             {
3835                 U8 *uc = ( U8* )locinput;
3836
3837                 STRLEN len = 0;
3838                 STRLEN foldlen = 0;
3839                 U8 *uscan = (U8*)NULL;
3840                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3841                 U32 charcount = 0; /* how many input chars we have matched */
3842                 U32 accepted = 0; /* have we seen any accepting states? */
3843
3844                 ST.jump = trie->jump;
3845                 ST.me = scan;
3846                 ST.firstpos = NULL;
3847                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3848                 ST.nextword = 0;
3849
3850                 /* fully traverse the TRIE; note the position of the
3851                    shortest accept state and the wordnum of the longest
3852                    accept state */
3853
3854                 while ( state && uc <= (U8*)PL_regeol ) {
3855                     U32 base = trie->states[ state ].trans.base;
3856                     UV uvc = 0;
3857                     U16 charid = 0;
3858                     U16 wordnum;
3859                     wordnum = trie->states[ state ].wordnum;
3860
3861                     if (wordnum) { /* it's an accept state */
3862                         if (!accepted) {
3863                             accepted = 1;
3864                             /* record first match position */
3865                             if (ST.longfold) {
3866                                 ST.firstpos = (U8*)locinput;
3867                                 ST.firstchars = 0;
3868                             }
3869                             else {
3870                                 ST.firstpos = uc;
3871                                 ST.firstchars = charcount;
3872                             }
3873                         }
3874                         if (!ST.nextword || wordnum < ST.nextword)
3875                             ST.nextword = wordnum;
3876                         ST.topword = wordnum;
3877                     }
3878
3879                     DEBUG_TRIE_EXECUTE_r({
3880                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3881                                 PerlIO_printf( Perl_debug_log,
3882                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3883                                     2+depth * 2, "", PL_colors[4],
3884                                     (UV)state, (accepted ? 'Y' : 'N'));
3885                     });
3886
3887                     /* read a char and goto next state */
3888                     if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3889                         I32 offset;
3890                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3891                                              uscan, len, uvc, charid, foldlen,
3892                                              foldbuf, uniflags);
3893                         charcount++;
3894                         if (foldlen>0)
3895                             ST.longfold = TRUE;
3896                         if (charid &&
3897                              ( ((offset =
3898                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3899
3900                              && ((U32)offset < trie->lasttrans)
3901                              && trie->trans[offset].check == state)
3902                         {
3903                             state = trie->trans[offset].next;
3904                         }
3905                         else {
3906                             state = 0;
3907                         }
3908                         uc += len;
3909
3910                     }
3911                     else {
3912                         state = 0;
3913                     }
3914                     DEBUG_TRIE_EXECUTE_r(
3915                         PerlIO_printf( Perl_debug_log,
3916                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3917                             charid, uvc, (UV)state, PL_colors[5] );
3918                     );
3919                 }
3920                 if (!accepted)
3921                    sayNO;
3922
3923                 /* calculate total number of accept states */
3924                 {
3925                     U16 w = ST.topword;
3926                     accepted = 0;
3927                     while (w) {
3928                         w = trie->wordinfo[w].prev;
3929                         accepted++;
3930                     }
3931                     ST.accepted = accepted;
3932                 }
3933
3934                 DEBUG_EXECUTE_r(
3935                     PerlIO_printf( Perl_debug_log,
3936                         "%*s  %sgot %"IVdf" possible matches%s\n",
3937                         REPORT_CODE_OFF + depth * 2, "",
3938                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3939                 );
3940                 goto trie_first_try; /* jump into the fail handler */
3941             }}
3942             assert(0); /* NOTREACHED */
3943
3944         case TRIE_next_fail: /* we failed - try next alternative */
3945         {
3946             U8 *uc;
3947             if ( ST.jump) {
3948                 REGCP_UNWIND(ST.cp);
3949                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3950             }
3951             if (!--ST.accepted) {
3952                 DEBUG_EXECUTE_r({
3953                     PerlIO_printf( Perl_debug_log,
3954                         "%*s  %sTRIE failed...%s\n",
3955                         REPORT_CODE_OFF+depth*2, "",
3956                         PL_colors[4],
3957                         PL_colors[5] );
3958                 });
3959                 sayNO_SILENT;
3960             }
3961             {
3962                 /* Find next-highest word to process.  Note that this code
3963                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3964                 U16 min = 0;
3965                 U16 word;
3966                 U16 const nextword = ST.nextword;
3967                 reg_trie_wordinfo * const wordinfo
3968                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3969                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3970                     if (word > nextword && (!min || word < min))
3971                         min = word;
3972                 }
3973                 ST.nextword = min;
3974             }
3975
3976           trie_first_try:
3977             if (do_cutgroup) {
3978                 do_cutgroup = 0;
3979                 no_final = 0;
3980             }
3981
3982             if ( ST.jump) {
3983                 ST.lastparen = rex->lastparen;
3984                 ST.lastcloseparen = rex->lastcloseparen;
3985                 REGCP_SET(ST.cp);
3986             }
3987
3988             /* find start char of end of current word */
3989             {
3990                 U32 chars; /* how many chars to skip */
3991                 reg_trie_data * const trie
3992                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3993
3994                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3995                             >=  ST.firstchars);
3996                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3997                             - ST.firstchars;
3998                 uc = ST.firstpos;
3999
4000                 if (ST.longfold) {
4001                     /* the hard option - fold each char in turn and find
4002                      * its folded length (which may be different */
4003                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4004                     STRLEN foldlen;
4005                     STRLEN len;
4006                     UV uvc;
4007                     U8 *uscan;
4008
4009                     while (chars) {
4010                         if (utf8_target) {
4011                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4012                                                     uniflags);
4013                             uc += len;
4014                         }
4015                         else {
4016                             uvc = *uc;
4017                             uc++;
4018                         }
4019                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4020                         uscan = foldbuf;
4021                         while (foldlen) {
4022                             if (!--chars)
4023                                 break;
4024                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4025                                             uniflags);
4026                             uscan += len;
4027                             foldlen -= len;
4028                         }
4029                     }
4030                 }
4031                 else {
4032                     if (utf8_target)
4033                         while (chars--)
4034                             uc += UTF8SKIP(uc);
4035                     else
4036                         uc += chars;
4037                 }
4038             }
4039
4040             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4041                             ? ST.jump[ST.nextword]
4042                             : NEXT_OFF(ST.me));
4043
4044             DEBUG_EXECUTE_r({
4045                 PerlIO_printf( Perl_debug_log,
4046                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4047                     REPORT_CODE_OFF+depth*2, "",
4048                     PL_colors[4],
4049                     ST.nextword,
4050                     PL_colors[5]
4051                     );
4052             });
4053
4054             if (ST.accepted > 1 || has_cutgroup) {
4055                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4056                 assert(0); /* NOTREACHED */
4057             }
4058             /* only one choice left - just continue */
4059             DEBUG_EXECUTE_r({
4060                 AV *const trie_words
4061                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4062                 SV ** const tmp = av_fetch( trie_words,
4063                     ST.nextword-1, 0 );
4064                 SV *sv= tmp ? sv_newmortal() : NULL;
4065
4066                 PerlIO_printf( Perl_debug_log,
4067                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4068                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4069                     ST.nextword,
4070                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4071                             PL_colors[0], PL_colors[1],
4072                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4073                         )
4074                     : "not compiled under -Dr",
4075                     PL_colors[5] );
4076             });
4077
4078             locinput = (char*)uc;
4079             continue; /* execute rest of RE */
4080             assert(0); /* NOTREACHED */
4081         }
4082 #undef  ST
4083
4084         case EXACT: {            /*  /abc/        */
4085             char *s = STRING(scan);
4086             ln = STR_LEN(scan);
4087             if (utf8_target != UTF_PATTERN) {
4088                 /* The target and the pattern have differing utf8ness. */
4089                 char *l = locinput;
4090                 const char * const e = s + ln;
4091
4092                 if (utf8_target) {
4093                     /* The target is utf8, the pattern is not utf8.
4094                      * Above-Latin1 code points can't match the pattern;
4095                      * invariants match exactly, and the other Latin1 ones need
4096                      * to be downgraded to a single byte in order to do the
4097                      * comparison.  (If we could be confident that the target
4098                      * is not malformed, this could be refactored to have fewer
4099                      * tests by just assuming that if the first bytes match, it
4100                      * is an invariant, but there are tests in the test suite
4101                      * dealing with (??{...}) which violate this) */
4102                     while (s < e) {
4103                         if (l >= PL_regeol)
4104                              sayNO;
4105                         if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4106                             sayNO;
4107                         }
4108                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4109                             if (*l != *s) {
4110                                 sayNO;
4111                             }
4112                             l++;
4113                         }
4114                         else {
4115                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4116                                 sayNO;
4117                             }
4118                             l += 2;
4119                         }
4120                         s++;
4121                     }
4122                 }
4123                 else {
4124                     /* The target is not utf8, the pattern is utf8. */
4125                     while (s < e) {
4126                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4127                         {
4128                             sayNO;
4129                         }
4130                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4131                             if (*s != *l) {
4132                                 sayNO;
4133                             }
4134                             s++;
4135                         }
4136                         else {
4137                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4138                                 sayNO;
4139                             }
4140                             s += 2;
4141                         }
4142                         l++;
4143                     }
4144                 }
4145                 locinput = l;
4146                 break;
4147             }
4148             /* The target and the pattern have the same utf8ness. */
4149             /* Inline the first character, for speed. */
4150             if (UCHARAT(s) != nextchr)
4151                 sayNO;
4152             if (PL_regeol - locinput < ln)
4153                 sayNO;
4154             if (ln > 1 && memNE(s, locinput, ln))
4155                 sayNO;
4156             locinput += ln;
4157             break;
4158             }
4159
4160         case EXACTFL: {          /*  /abc/il      */
4161             re_fold_t folder;
4162             const U8 * fold_array;
4163             const char * s;
4164             U32 fold_utf8_flags;
4165
4166             PL_reg_flags |= RF_tainted;
4167             folder = foldEQ_locale;
4168             fold_array = PL_fold_locale;
4169             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4170             goto do_exactf;
4171
4172         case EXACTFU_SS:         /*  /\x{df}/iu   */
4173         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4174         case EXACTFU:            /*  /abc/iu      */
4175             folder = foldEQ_latin1;
4176             fold_array = PL_fold_latin1;
4177             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4178             goto do_exactf;
4179
4180         case EXACTFA:            /*  /abc/iaa     */
4181             folder = foldEQ_latin1;
4182             fold_array = PL_fold_latin1;
4183             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4184             goto do_exactf;
4185
4186         case EXACTF:             /*  /abc/i       */
4187             folder = foldEQ;
4188             fold_array = PL_fold;
4189             fold_utf8_flags = 0;
4190
4191           do_exactf:
4192             s = STRING(scan);
4193             ln = STR_LEN(scan);
4194
4195             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4196               /* Either target or the pattern are utf8, or has the issue where
4197                * the fold lengths may differ. */
4198                 const char * const l = locinput;
4199                 char *e = PL_regeol;
4200
4201                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
4202                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4203                 {
4204                     sayNO;
4205                 }
4206                 locinput = e;
4207                 break;
4208             }
4209
4210             /* Neither the target nor the pattern are utf8 */
4211             if (UCHARAT(s) != nextchr
4212                 && !NEXTCHR_IS_EOS
4213                 && UCHARAT(s) != fold_array[nextchr])
4214             {
4215                 sayNO;
4216             }
4217             if (PL_regeol - locinput < ln)
4218                 sayNO;
4219             if (ln > 1 && ! folder(s, locinput, ln))
4220                 sayNO;
4221             locinput += ln;
4222             break;
4223         }
4224
4225         /* XXX Could improve efficiency by separating these all out using a
4226          * macro or in-line function.  At that point regcomp.c would no longer
4227          * have to set the FLAGS fields of these */
4228         case BOUNDL:  /*  /\b/l  */
4229         case NBOUNDL: /*  /\B/l  */
4230             PL_reg_flags |= RF_tainted;
4231             /* FALL THROUGH */
4232         case BOUND:   /*  /\b/   */
4233         case BOUNDU:  /*  /\b/u  */
4234         case BOUNDA:  /*  /\b/a  */
4235         case NBOUND:  /*  /\B/   */
4236         case NBOUNDU: /*  /\B/u  */
4237         case NBOUNDA: /*  /\B/a  */
4238             /* was last char in word? */
4239             if (utf8_target
4240                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4241                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4242             {
4243                 if (locinput == PL_bostr)
4244                     ln = '\n';
4245                 else {
4246                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4247
4248                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4249                 }
4250                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4251                     ln = isALNUM_uni(ln);
4252                     if (NEXTCHR_IS_EOS)
4253                         n = 0;
4254                     else {
4255                         LOAD_UTF8_CHARCLASS_ALNUM();
4256                         n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4257                                                                 utf8_target);
4258                     }
4259                 }
4260                 else {
4261                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4262                     n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4263                 }
4264             }
4265             else {
4266
4267                 /* Here the string isn't utf8, or is utf8 and only ascii
4268                  * characters are to match \w.  In the latter case looking at
4269                  * the byte just prior to the current one may be just the final
4270                  * byte of a multi-byte character.  This is ok.  There are two
4271                  * cases:
4272                  * 1) it is a single byte character, and then the test is doing
4273                  *      just what it's supposed to.
4274                  * 2) it is a multi-byte character, in which case the final
4275                  *      byte is never mistakable for ASCII, and so the test
4276                  *      will say it is not a word character, which is the
4277                  *      correct answer. */
4278                 ln = (locinput != PL_bostr) ?
4279                     UCHARAT(locinput - 1) : '\n';
4280                 switch (FLAGS(scan)) {
4281                     case REGEX_UNICODE_CHARSET:
4282                         ln = isWORDCHAR_L1(ln);
4283                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4284                         break;
4285                     case REGEX_LOCALE_CHARSET:
4286                         ln = isALNUM_LC(ln);
4287                         n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4288                         break;
4289                     case REGEX_DEPENDS_CHARSET:
4290                         ln = isALNUM(ln);
4291                         n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4292                         break;
4293                     case REGEX_ASCII_RESTRICTED_CHARSET:
4294                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4295                         ln = isWORDCHAR_A(ln);
4296                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4297                         break;
4298                     default:
4299                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4300                         break;
4301                 }
4302             }
4303             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4304              * regcomp.sym */
4305             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4306                     sayNO;
4307             break;
4308
4309         case ANYOF:  /*  /[abc]/       */
4310             if (NEXTCHR_IS_EOS)
4311                 sayNO;
4312             if (utf8_target) {
4313                 STRLEN inclasslen = PL_regeol - locinput;
4314                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
4315                     sayNO;
4316                 locinput += inclasslen;
4317                 break;
4318             }
4319             else {
4320                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4321                     sayNO;
4322                 locinput++;
4323                 break;
4324             }
4325             break;
4326
4327         /* Special char classes: \d, \w etc.
4328          * The defines start on line 166 or so */
4329         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
4330                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4331                   ALNUMU, NALNUMU, isWORDCHAR_L1,
4332                   ALNUMA, NALNUMA, isWORDCHAR_A,
4333                   alnum, "a");
4334
4335         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
4336                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4337                   SPACEU, NSPACEU, isSPACE_L1,
4338                   SPACEA, NSPACEA, isSPACE_A,
4339                   space, " ");
4340
4341         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
4342                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4343                 DIGITA, NDIGITA, isDIGIT_A,
4344                 digit, "0");
4345
4346         case POSIXA: /* /[[:ascii:]]/ etc */
4347             if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4348                 sayNO;
4349             }
4350             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4351             locinput++;
4352             break;
4353
4354         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
4355             if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4356                 sayNO;
4357             }
4358             goto increment_locinput;
4359
4360         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4361                        a Unicode extended Grapheme Cluster */
4362             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4363               extended Grapheme Cluster is:
4364
4365                CR LF
4366                | Prepend* Begin Extend*
4367                | .
4368
4369                Begin is:           ( Special_Begin | ! Control )
4370                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4371                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4372                Control is:         [ GCB_Control  CR  LF ]
4373                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4374
4375                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4376                we can rewrite
4377
4378                    Begin is ( Regular_Begin + Special Begin )
4379
4380                It turns out that 98.4% of all Unicode code points match
4381                Regular_Begin.  Doing it this way eliminates a table match in
4382                the previous implementation for almost all Unicode code points.
4383
4384                There is a subtlety with Prepend* which showed up in testing.
4385                Note that the Begin, and only the Begin is required in:
4386                 | Prepend* Begin Extend*
4387                Also, Begin contains '! Control'.  A Prepend must be a
4388                '!  Control', which means it must also be a Begin.  What it
4389                comes down to is that if we match Prepend* and then find no
4390                suitable Begin afterwards, that if we backtrack the last
4391                Prepend, that one will be a suitable Begin.
4392             */
4393
4394             if (NEXTCHR_IS_EOS)
4395                 sayNO;
4396             if  (! utf8_target) {
4397
4398                 /* Match either CR LF  or '.', as all the other possibilities
4399                  * require utf8 */
4400                 locinput++;         /* Match the . or CR */
4401                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4402                                        match the LF */
4403                     && locinput < PL_regeol
4404                     && UCHARAT(locinput) == '\n') locinput++;
4405             }
4406             else {
4407
4408                 /* Utf8: See if is ( CR LF ); already know that locinput <
4409                  * PL_regeol, so locinput+1 is in bounds */
4410                 if ( nextchr == '\r' && locinput+1 < PL_regeol
4411                         && UCHARAT(locinput + 1) == '\n')
4412                 {
4413                     locinput += 2;
4414                 }
4415                 else {
4416                     STRLEN len;
4417
4418                     /* In case have to backtrack to beginning, then match '.' */
4419                     char *starting = locinput;
4420
4421                     /* In case have to backtrack the last prepend */
4422                     char *previous_prepend = 0;
4423
4424                     LOAD_UTF8_CHARCLASS_GCB();
4425
4426                     /* Match (prepend)*   */
4427                     while (locinput < PL_regeol
4428                            && (len = is_GCB_Prepend_utf8(locinput)))
4429                     {
4430                         previous_prepend = locinput;
4431                         locinput += len;
4432                     }
4433
4434                     /* As noted above, if we matched a prepend character, but
4435                      * the next thing won't match, back off the last prepend we
4436                      * matched, as it is guaranteed to match the begin */
4437                     if (previous_prepend
4438                         && (locinput >=  PL_regeol
4439                             || (! swash_fetch(PL_utf8_X_regular_begin,
4440                                              (U8*)locinput, utf8_target)
4441                                  && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4442                         )
4443                     {
4444                         locinput = previous_prepend;
4445                     }
4446
4447                     /* Note that here we know PL_regeol > locinput, as we
4448                      * tested that upon input to this switch case, and if we
4449                      * moved locinput forward, we tested the result just above
4450                      * and it either passed, or we backed off so that it will
4451                      * now pass */
4452                     if (swash_fetch(PL_utf8_X_regular_begin,
4453                                     (U8*)locinput, utf8_target)) {
4454                         locinput += UTF8SKIP(locinput);
4455                     }
4456                     else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4457
4458                         /* Here did not match the required 'Begin' in the
4459                          * second term.  So just match the very first
4460                          * character, the '.' of the final term of the regex */
4461                         locinput = starting + UTF8SKIP(starting);
4462                         goto exit_utf8;
4463                     } else {
4464
4465                         /* Here is a special begin.  It can be composed of
4466                          * several individual characters.  One possibility is
4467                          * RI+ */
4468                         if ((len = is_GCB_RI_utf8(locinput))) {
4469                             locinput += len;
4470                             while (locinput < PL_regeol
4471                                    && (len = is_GCB_RI_utf8(locinput)))
4472                             {
4473                                 locinput += len;
4474                             }
4475                         } else if ((len = is_GCB_T_utf8(locinput))) {
4476                             /* Another possibility is T+ */
4477                             locinput += len;
4478                             while (locinput < PL_regeol
4479                                 && (len = is_GCB_T_utf8(locinput)))
4480                             {
4481                                 locinput += len;
4482                             }
4483                         } else {
4484
4485                             /* Here, neither RI+ nor T+; must be some other
4486                              * Hangul.  That means it is one of the others: L,
4487                              * LV, LVT or V, and matches:
4488                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4489
4490                             /* Match L*           */
4491                             while (locinput < PL_regeol
4492                                    && (len = is_GCB_L_utf8(locinput)))
4493                             {
4494                                 locinput += len;
4495                             }
4496
4497                             /* Here, have exhausted L*.  If the next character
4498                              * is not an LV, LVT nor V, it means we had to have
4499                              * at least one L, so matches L+ in the original
4500                              * equation, we have a complete hangul syllable.
4501                              * Are done. */
4502
4503                             if (locinput < PL_regeol
4504                                 && is_GCB_LV_LVT_V_utf8(locinput))
4505                             {
4506
4507                                 /* Otherwise keep going.  Must be LV, LVT or V.
4508                                  * See if LVT */
4509                                 if (is_utf8_X_LVT((U8*)locinput)) {
4510                                     locinput += UTF8SKIP(locinput);
4511                                 } else {
4512
4513                                     /* Must be  V or LV.  Take it, then match
4514                                      * V*     */
4515                                     locinput += UTF8SKIP(locinput);
4516                                     while (locinput < PL_regeol
4517                                            && (len = is_GCB_V_utf8(locinput)))
4518                                     {
4519                                         locinput += len;
4520                                     }
4521                                 }
4522
4523                                 /* And any of LV, LVT, or V can be followed
4524                                  * by T*            */
4525                                 while (locinput < PL_regeol
4526                                        && (len = is_GCB_T_utf8(locinput)))
4527                                 {
4528                                     locinput += len;
4529                                 }
4530                             }
4531                         }
4532                     }
4533
4534                     /* Match any extender */
4535                     while (locinput < PL_regeol
4536                             && swash_fetch(PL_utf8_X_extend,
4537                                             (U8*)locinput, utf8_target))
4538                     {
4539                         locinput += UTF8SKIP(locinput);
4540                     }
4541                 }
4542             exit_utf8:
4543                 if (locinput > PL_regeol) sayNO;
4544             }
4545             break;
4546
4547         case NREFFL:  /*  /\g{name}/il  */
4548         {   /* The capture buffer cases.  The ones beginning with N for the
4549                named buffers just convert to the equivalent numbered and
4550                pretend they were called as the corresponding numbered buffer
4551                op.  */
4552             /* don't initialize these in the declaration, it makes C++
4553                unhappy */
4554             char *s;
4555             char type;
4556             re_fold_t folder;
4557             const U8 *fold_array;
4558             UV utf8_fold_flags;
4559
4560             PL_reg_flags |= RF_tainted;
4561             folder = foldEQ_locale;
4562             fold_array = PL_fold_locale;
4563             type = REFFL;
4564             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4565             goto do_nref;
4566
4567         case NREFFA:  /*  /\g{name}/iaa  */
4568             folder = foldEQ_latin1;
4569             fold_array = PL_fold_latin1;
4570             type = REFFA;
4571             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4572             goto do_nref;
4573
4574         case NREFFU:  /*  /\g{name}/iu  */
4575             folder = foldEQ_latin1;
4576             fold_array = PL_fold_latin1;
4577             type = REFFU;
4578             utf8_fold_flags = 0;
4579             goto do_nref;
4580
4581         case NREFF:  /*  /\g{name}/i  */
4582             folder = foldEQ;
4583             fold_array = PL_fold;
4584             type = REFF;
4585             utf8_fold_flags = 0;
4586             goto do_nref;
4587
4588         case NREF:  /*  /\g{name}/   */
4589             type = REF;
4590             folder = NULL;
4591             fold_array = NULL;
4592             utf8_fold_flags = 0;
4593           do_nref:
4594
4595             /* For the named back references, find the corresponding buffer
4596              * number */
4597             n = reg_check_named_buff_matched(rex,scan);
4598
4599             if ( ! n ) {
4600                 sayNO;
4601             }
4602             goto do_nref_ref_common;
4603
4604         case REFFL:  /*  /\1/il  */
4605             PL_reg_flags |= RF_tainted;
4606             folder = foldEQ_locale;
4607             fold_array = PL_fold_locale;
4608             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4609             goto do_ref;
4610
4611         case REFFA:  /*  /\1/iaa  */
4612             folder = foldEQ_latin1;
4613             fold_array = PL_fold_latin1;
4614             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4615             goto do_ref;
4616
4617         case REFFU:  /*  /\1/iu  */
4618             folder = foldEQ_latin1;
4619             fold_array = PL_fold_latin1;
4620             utf8_fold_flags = 0;
4621             goto do_ref;
4622
4623         case REFF:  /*  /\1/i  */
4624             folder = foldEQ;
4625             fold_array = PL_fold;
4626             utf8_fold_flags = 0;
4627             goto do_ref;
4628
4629         case REF:  /*  /\1/    */
4630             folder = NULL;
4631             fold_array = NULL;
4632             utf8_fold_flags = 0;
4633
4634           do_ref:
4635             type = OP(scan);
4636             n = ARG(scan);  /* which paren pair */
4637
4638           do_nref_ref_common:
4639             ln = rex->offs[n].start;
4640             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4641             if (rex->lastparen < n || ln == -1)
4642                 sayNO;                  /* Do not match unless seen CLOSEn. */
4643             if (ln == rex->offs[n].end)
4644                 break;
4645
4646             s = PL_bostr + ln;
4647             if (type != REF     /* REF can do byte comparison */
4648                 && (utf8_target || type == REFFU))
4649             { /* XXX handle REFFL better */
4650                 char * limit = PL_regeol;
4651
4652                 /* This call case insensitively compares the entire buffer
4653                     * at s, with the current input starting at locinput, but
4654                     * not going off the end given by PL_regeol, and returns in
4655                     * <limit> upon success, how much of the current input was
4656                     * matched */
4657                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4658                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4659                 {
4660                     sayNO;
4661                 }
4662                 locinput = limit;
4663                 break;
4664             }
4665
4666             /* Not utf8:  Inline the first character, for speed. */
4667             if (!NEXTCHR_IS_EOS &&
4668                 UCHARAT(s) != nextchr &&
4669                 (type == REF ||
4670                  UCHARAT(s) != fold_array[nextchr]))
4671                 sayNO;
4672             ln = rex->offs[n].end - ln;
4673             if (locinput + ln > PL_regeol)
4674                 sayNO;
4675             if (ln > 1 && (type == REF
4676                            ? memNE(s, locinput, ln)
4677                            : ! folder(s, locinput, ln)))
4678                 sayNO;
4679             locinput += ln;
4680             break;
4681         }
4682
4683         case NOTHING: /* null op; e.g. the 'nothing' following
4684                        * the '*' in m{(a+|b)*}' */
4685             break;
4686         case TAIL: /* placeholder while compiling (A|B|C) */
4687             break;
4688
4689         case BACK: /* ??? doesn't appear to be used ??? */
4690             break;
4691
4692 #undef  ST
4693 #define ST st->u.eval
4694         {
4695             SV *ret;
4696             REGEXP *re_sv;
4697             regexp *re;
4698             regexp_internal *rei;
4699             regnode *startpoint;
4700
4701         case GOSTART: /*  (?R)  */
4702         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4703             if (cur_eval && cur_eval->locinput==locinput) {
4704                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4705                     Perl_croak(aTHX_ "Infinite recursion in regex");
4706                 if ( ++nochange_depth > max_nochange_depth )
4707                     Perl_croak(aTHX_
4708                         "Pattern subroutine nesting without pos change"
4709                         " exceeded limit in regex");
4710             } else {
4711                 nochange_depth = 0;
4712             }
4713             re_sv = rex_sv;
4714             re = rex;
4715             rei = rexi;
4716             if (OP(scan)==GOSUB) {
4717                 startpoint = scan + ARG2L(scan);
4718                 ST.close_paren = ARG(scan);
4719             } else {
4720                 startpoint = rei->program+1;
4721                 ST.close_paren = 0;
4722             }
4723             goto eval_recurse_doit;
4724             assert(0); /* NOTREACHED */
4725
4726         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4727             if (cur_eval && cur_eval->locinput==locinput) {
4728                 if ( ++nochange_depth > max_nochange_depth )
4729                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4730             } else {
4731                 nochange_depth = 0;
4732             }
4733             {
4734                 /* execute the code in the {...} */
4735
4736                 dSP;
4737                 SV ** before;
4738                 OP * const oop = PL_op;
4739                 COP * const ocurcop = PL_curcop;
4740                 OP *nop;
4741                 char *saved_regeol = PL_regeol;
4742                 struct re_save_state saved_state;
4743                 CV *newcv;
4744
4745                 /* save *all* paren positions */
4746                 regcppush(rex, 0);
4747                 REGCP_SET(runops_cp);
4748
4749                 /* To not corrupt the existing regex state while executing the
4750                  * eval we would normally put it on the save stack, like with
4751                  * save_re_context. However, re-evals have a weird scoping so we
4752                  * can't just add ENTER/LEAVE here. With that, things like
4753                  *
4754                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4755                  *
4756                  * would break, as they expect the localisation to be unwound
4757                  * only when the re-engine backtracks through the bit that
4758                  * localised it.
4759                  *
4760                  * What we do instead is just saving the state in a local c
4761                  * variable.
4762                  */
4763                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4764
4765                 PL_reg_state.re_reparsing = FALSE;
4766
4767                 if (!caller_cv)
4768                     caller_cv = find_runcv(NULL);
4769
4770                 n = ARG(scan);
4771
4772                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4773                     newcv = ((struct regexp *)SvANY(
4774                                                 (REGEXP*)(rexi->data->data[n])
4775                                             ))->qr_anoncv
4776                                         ;
4777                     nop = (OP*)rexi->data->data[n+1];
4778                 }
4779                 else if (rexi->data->what[n] == 'l') { /* literal code */
4780                     newcv = caller_cv;
4781                     nop = (OP*)rexi->data->data[n];
4782                     assert(CvDEPTH(newcv));
4783                 }
4784                 else {
4785                     /* literal with own CV */
4786                     assert(rexi->data->what[n] == 'L');
4787                     newcv = rex->qr_anoncv;
4788                     nop = (OP*)rexi->data->data[n];
4789                 }
4790
4791                 /* normally if we're about to execute code from the same
4792                  * CV that we used previously, we just use the existing
4793                  * CX stack entry. However, its possible that in the
4794                  * meantime we may have backtracked, popped from the save
4795                  * stack, and undone the SAVECOMPPAD(s) associated with
4796                  * PUSH_MULTICALL; in which case PL_comppad no longer
4797                  * points to newcv's pad. */
4798                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4799                 {
4800                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4801                     if (last_pushed_cv) {
4802                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4803                     }
4804                     else {
4805                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4806                     }
4807                     last_pushed_cv = newcv;
4808                 }
4809                 last_pad = PL_comppad;
4810
4811                 /* the initial nextstate you would normally execute
4812                  * at the start of an eval (which would cause error
4813                  * messages to come from the eval), may be optimised
4814                  * away from the execution path in the regex code blocks;
4815                  * so manually set PL_curcop to it initially */
4816                 {
4817                     OP *o = cUNOPx(nop)->op_first;
4818                     assert(o->op_type == OP_NULL);
4819                     if (o->op_targ == OP_SCOPE) {
4820                         o = cUNOPo->op_first;
4821                     }
4822                     else {
4823                         assert(o->op_targ == OP_LEAVE);
4824                         o = cUNOPo->op_first;
4825                         assert(o->op_type == OP_ENTER);
4826                         o = o->op_sibling;
4827                     }
4828
4829                     if (o->op_type != OP_STUB) {
4830                         assert(    o->op_type == OP_NEXTSTATE
4831                                 || o->op_type == OP_DBSTATE
4832                                 || (o->op_type == OP_NULL
4833                                     &&  (  o->op_targ == OP_NEXTSTATE
4834                                         || o->op_targ == OP_DBSTATE
4835                                         )
4836                                     )
4837                         );
4838                         PL_curcop = (COP*)o;
4839                     }
4840                 }
4841                 nop = nop->op_next;
4842
4843                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4844                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4845
4846                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4847
4848                 if (sv_yes_mark) {
4849                     SV *sv_mrk = get_sv("REGMARK", 1);
4850                     sv_setsv(sv_mrk, sv_yes_mark);
4851                 }
4852
4853                 /* we don't use MULTICALL here as we want to call the
4854                  * first op of the block of interest, rather than the
4855                  * first op of the sub */
4856                 before = SP;
4857                 PL_op = nop;
4858                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4859                 SPAGAIN;
4860                 if (SP == before)
4861                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4862                 else {
4863                     ret = POPs;
4864                     PUTBACK;
4865                 }
4866
4867                 /* before restoring everything, evaluate the returned
4868                  * value, so that 'uninit' warnings don't use the wrong
4869                  * PL_op or pad. Also need to process any magic vars
4870                  * (e.g. $1) *before* parentheses are restored */
4871
4872                 PL_op = NULL;
4873
4874                 re_sv = NULL;
4875                 if (logical == 0)        /*   (?{})/   */
4876                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4877                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4878                     sw = cBOOL(SvTRUE(ret));
4879                     logical = 0;
4880                 }
4881                 else {                   /*  /(??{})  */
4882                     /*  if its overloaded, let the regex compiler handle
4883                      *  it; otherwise extract regex, or stringify  */
4884                     if (!SvAMAGIC(ret)) {
4885                         SV *sv = ret;
4886                         if (SvROK(sv))
4887                             sv = SvRV(sv);
4888                         if (SvTYPE(sv) == SVt_REGEXP)
4889                             re_sv = (REGEXP*) sv;
4890                         else if (SvSMAGICAL(sv)) {
4891                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4892                             if (mg)
4893                                 re_sv = (REGEXP *) mg->mg_obj;
4894                         }
4895
4896                         /* force any magic, undef warnings here */
4897                         if (!re_sv) {
4898                             ret = sv_mortalcopy(ret);
4899                             (void) SvPV_force_nolen(ret);
4900                         }
4901                     }
4902
4903                 }
4904
4905                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4906
4907                 /* *** Note that at this point we don't restore
4908                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4909                  * be used again soon. This is safe as long as nothing
4910                  * in the regexp code uses the pad ! */
4911                 PL_op = oop;
4912                 PL_curcop = ocurcop;
4913                 PL_regeol = saved_regeol;
4914                 S_regcp_restore(aTHX_ rex, runops_cp);
4915
4916                 if (logical != 2)
4917                     break;
4918             }
4919
4920                 /* only /(??{})/  from now on */
4921                 logical = 0;
4922                 {
4923                     /* extract RE object from returned value; compiling if
4924                      * necessary */
4925
4926                     if (re_sv) {
4927                         re_sv = reg_temp_copy(NULL, re_sv);
4928                     }
4929                     else {
4930                         U32 pm_flags = 0;
4931                         const I32 osize = PL_regsize;
4932
4933                         if (SvUTF8(ret) && IN_BYTES) {
4934                             /* In use 'bytes': make a copy of the octet
4935                              * sequence, but without the flag on */
4936                             STRLEN len;
4937                             const char *const p = SvPV(ret, len);
4938                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4939                         }
4940                         if (rex->intflags & PREGf_USE_RE_EVAL)
4941                             pm_flags |= PMf_USE_RE_EVAL;
4942
4943                         /* if we got here, it should be an engine which
4944                          * supports compiling code blocks and stuff */
4945                         assert(rex->engine && rex->engine->op_comp);
4946                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4947                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4948                                     rex->engine, NULL, NULL,
4949                                     /* copy /msix etc to inner pattern */
4950                                     scan->flags,
4951                                     pm_flags);
4952
4953                         if (!(SvFLAGS(ret)
4954                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4955                                  | SVs_GMG))) {
4956                             /* This isn't a first class regexp. Instead, it's
4957                                caching a regexp onto an existing, Perl visible
4958                                scalar.  */
4959                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4960                         }
4961                         PL_regsize = osize;
4962                         /* safe to do now that any $1 etc has been
4963                          * interpolated into the new pattern string and
4964                          * compiled */
4965                         S_regcp_restore(aTHX_ rex, runops_cp);
4966                     }
4967                     re = (struct regexp *)SvANY(re_sv);
4968                 }
4969                 RXp_MATCH_COPIED_off(re);
4970                 re->subbeg = rex->subbeg;
4971                 re->sublen = rex->sublen;
4972                 re->suboffset = rex->suboffset;
4973                 re->subcoffset = rex->subcoffset;
4974                 rei = RXi_GET(re);
4975                 DEBUG_EXECUTE_r(
4976                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4977                         "Matching embedded");
4978                 );
4979                 startpoint = rei->program + 1;
4980                 ST.close_paren = 0; /* only used for GOSUB */
4981
4982         eval_recurse_doit: /* Share code with GOSUB below this line */
4983                 /* run the pattern returned from (??{...}) */
4984                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4985                 REGCP_SET(ST.lastcp);
4986
4987                 re->lastparen = 0;
4988                 re->lastcloseparen = 0;
4989
4990                 PL_regsize = 0;
4991
4992                 /* XXXX This is too dramatic a measure... */
4993                 PL_reg_maxiter = 0;
4994
4995                 ST.toggle_reg_flags = PL_reg_flags;
4996                 if (RX_UTF8(re_sv))
4997                     PL_reg_flags |= RF_utf8;
4998                 else
4999                     PL_reg_flags &= ~RF_utf8;
5000                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5001
5002                 ST.prev_rex = rex_sv;
5003                 ST.prev_curlyx = cur_curlyx;
5004                 rex_sv = re_sv;
5005                 SET_reg_curpm(rex_sv);
5006                 rex = re;
5007                 rexi = rei;
5008                 cur_curlyx = NULL;
5009                 ST.B = next;
5010                 ST.prev_eval = cur_eval;
5011                 cur_eval = st;
5012                 /* now continue from first node in postoned RE */
5013                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5014                 assert(0); /* NOTREACHED */
5015         }
5016
5017         case EVAL_AB: /* cleanup after a successful (??{A})B */
5018             /* note: this is called twice; first after popping B, then A */
5019             PL_reg_flags ^= ST.toggle_reg_flags;
5020             rex_sv = ST.prev_rex;
5021             SET_reg_curpm(rex_sv);
5022             rex = (struct regexp *)SvANY(rex_sv);
5023             rexi = RXi_GET(rex);
5024             regcpblow(ST.cp);
5025             cur_eval = ST.prev_eval;
5026             cur_curlyx = ST.prev_curlyx;
5027
5028             /* XXXX This is too dramatic a measure... */
5029             PL_reg_maxiter = 0;
5030             if ( nochange_depth )
5031                 nochange_depth--;
5032             sayYES;
5033
5034
5035         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5036             /* note: this is called twice; first after popping B, then A */
5037             PL_reg_flags ^= ST.toggle_reg_flags;
5038             rex_sv = ST.prev_rex;
5039             SET_reg_curpm(rex_sv);
5040             rex = (struct regexp *)SvANY(rex_sv);
5041             rexi = RXi_GET(rex);
5042
5043             REGCP_UNWIND(ST.lastcp);
5044             regcppop(rex);
5045             cur_eval = ST.prev_eval;
5046             cur_curlyx = ST.prev_curlyx;
5047             /* XXXX This is too dramatic a measure... */
5048             PL_reg_maxiter = 0;
5049             if ( nochange_depth )
5050                 nochange_depth--;
5051             sayNO_SILENT;
5052 #undef ST
5053
5054         case OPEN: /*  (  */
5055             n = ARG(scan);  /* which paren pair */
5056             rex->offs[n].start_tmp = locinput - PL_bostr;
5057             if (n > PL_regsize)
5058                 PL_regsize = n;
5059             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5060                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
5061                 PTR2UV(rex),
5062                 PTR2UV(rex->offs),
5063                 (UV)n,
5064                 (IV)rex->offs[n].start_tmp,
5065                 (UV)PL_regsize
5066             ));
5067             lastopen = n;
5068             break;
5069
5070 /* XXX really need to log other places start/end are set too */
5071 #define CLOSE_CAPTURE \
5072     rex->offs[n].start = rex->offs[n].start_tmp; \
5073     rex->offs[n].end = locinput - PL_bostr; \
5074     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5075         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5076         PTR2UV(rex), \
5077         PTR2UV(rex->offs), \
5078         (UV)n, \
5079         (IV)rex->offs[n].start, \
5080         (IV)rex->offs[n].end \
5081     ))
5082
5083         case CLOSE:  /*  )  */
5084             n = ARG(scan);  /* which paren pair */
5085             CLOSE_CAPTURE;
5086             /*if (n > PL_regsize)
5087                 PL_regsize = n;*/
5088             if (n > rex->lastparen)
5089                 rex->lastparen = n;
5090             rex->lastcloseparen = n;
5091             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5092                 goto fake_end;
5093             }
5094             break;
5095
5096         case ACCEPT:  /*  (*ACCEPT)  */
5097             if (ARG(scan)){
5098                 regnode *cursor;
5099                 for (cursor=scan;
5100                      cursor && OP(cursor)!=END;
5101                      cursor=regnext(cursor))
5102                 {
5103                     if ( OP(cursor)==CLOSE ){
5104                         n = ARG(cursor);
5105                         if ( n <= lastopen ) {
5106                             CLOSE_CAPTURE;
5107                             /*if (n > PL_regsize)
5108                             PL_regsize = n;*/
5109                             if (n > rex->lastparen)
5110                                 rex->lastparen = n;
5111                             rex->lastcloseparen = n;
5112                             if ( n == ARG(scan) || (cur_eval &&
5113                                 cur_eval->u.eval.close_paren == n))
5114                                 break;
5115                         }
5116                     }
5117                 }
5118             }
5119             goto fake_end;
5120             /*NOTREACHED*/
5121
5122         case GROUPP:  /*  (?(1))  */
5123             n = ARG(scan);  /* which paren pair */
5124             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5125             break;
5126
5127         case NGROUPP:  /*  (?(<name>))  */
5128             /* reg_check_named_buff_matched returns 0 for no match */
5129             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5130             break;
5131
5132         case INSUBP:   /*  (?(R))  */
5133             n = ARG(scan);
5134             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5135             break;
5136
5137         case DEFINEP:  /*  (?(DEFINE))  */
5138             sw = 0;
5139             break;
5140
5141         case IFTHEN:   /*  (?(cond)A|B)  */
5142             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
5143             if (sw)
5144                 next = NEXTOPER(NEXTOPER(scan));
5145             else {
5146                 next = scan + ARG(scan);
5147                 if (OP(next) == IFTHEN) /* Fake one. */
5148                     next = NEXTOPER(NEXTOPER(next));
5149             }
5150             break;
5151
5152         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5153             logical = scan->flags;
5154             break;
5155
5156 /*******************************************************************
5157
5158 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5159 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5160 STAR/PLUS/CURLY/CURLYN are used instead.)
5161
5162 A*B is compiled as <CURLYX><A><WHILEM><B>
5163
5164 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5165 state, which contains the current count, initialised to -1. It also sets
5166 cur_curlyx to point to this state, with any previous value saved in the
5167 state block.
5168
5169 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5170 since the pattern may possibly match zero times (i.e. it's a while {} loop
5171 rather than a do {} while loop).
5172
5173 Each entry to WHILEM represents a successful match of A. The count in the
5174 CURLYX block is incremented, another WHILEM state is pushed, and execution
5175 passes to A or B depending on greediness and the current count.
5176
5177 For example, if matching against the string a1a2a3b (where the aN are
5178 substrings that match /A/), then the match progresses as follows: (the
5179 pushed states are interspersed with the bits of strings matched so far):
5180
5181     <CURLYX cnt=-1>
5182     <CURLYX cnt=0><WHILEM>
5183     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5184     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5185     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5186     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5187
5188 (Contrast this with something like CURLYM, which maintains only a single
5189 backtrack state:
5190
5191     <CURLYM cnt=0> a1
5192     a1 <CURLYM cnt=1> a2
5193     a1 a2 <CURLYM cnt=2> a3
5194     a1 a2 a3 <CURLYM cnt=3> b
5195 )
5196
5197 Each WHILEM state block marks a point to backtrack to upon partial failure
5198 of A or B, and also contains some minor state data related to that
5199 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5200 overall state, such as the count, and pointers to the A and B ops.
5201
5202 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5203 must always point to the *current* CURLYX block, the rules are:
5204
5205 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5206 and set cur_curlyx to point the new block.
5207
5208 When popping the CURLYX block after a successful or unsuccessful match,
5209 restore the previous cur_curlyx.
5210
5211 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5212 to the outer one saved in the CURLYX block.
5213
5214 When popping the WHILEM block after a successful or unsuccessful B match,
5215 restore the previous cur_curlyx.
5216
5217 Here's an example for the pattern (AI* BI)*BO
5218 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5219
5220 cur_
5221 curlyx backtrack stack
5222 ------ ---------------
5223 NULL
5224 CO     <CO prev=NULL> <WO>
5225 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5226 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5227 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5228
5229 At this point the pattern succeeds, and we work back down the stack to
5230 clean up, restoring as we go:
5231
5232 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5233 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5234 CO     <CO prev=NULL> <WO>
5235 NULL
5236
5237 *******************************************************************/
5238
5239 #define ST st->u.curlyx
5240
5241         case CURLYX:    /* start of /A*B/  (for complex A) */
5242         {
5243             /* No need to save/restore up to this paren */
5244             I32 parenfloor = scan->flags;
5245
5246             assert(next); /* keep Coverity happy */
5247             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5248                 next += ARG(next);
5249
5250             /* XXXX Probably it is better to teach regpush to support
5251                parenfloor > PL_regsize... */
5252             if (parenfloor > (I32)rex->lastparen)
5253                 parenfloor = rex->lastparen; /* Pessimization... */
5254
5255             ST.prev_curlyx= cur_curlyx;
5256             cur_curlyx = st;
5257             ST.cp = PL_savestack_ix;
5258
5259             /* these fields contain the state of the current curly.
5260              * they are accessed by subsequent WHILEMs */
5261             ST.parenfloor = parenfloor;
5262             ST.me = scan;
5263             ST.B = next;
5264             ST.minmod = minmod;
5265             minmod = 0;
5266             ST.count = -1;      /* this will be updated by WHILEM */
5267             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5268
5269             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5270             assert(0); /* NOTREACHED */
5271         }
5272
5273         case CURLYX_end: /* just finished matching all of A*B */
5274             cur_curlyx = ST.prev_curlyx;
5275             sayYES;
5276             assert(0); /* NOTREACHED */
5277
5278         case CURLYX_end_fail: /* just failed to match all of A*B */
5279             regcpblow(ST.cp);
5280             cur_curlyx = ST.prev_curlyx;
5281             sayNO;
5282             assert(0); /* NOTREACHED */
5283
5284
5285 #undef ST
5286 #define ST st->u.whilem
5287
5288         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5289         {
5290             /* see the discussion above about CURLYX/WHILEM */
5291             I32 n;
5292             int min = ARG1(cur_curlyx->u.curlyx.me);
5293             int max = ARG2(cur_curlyx->u.curlyx.me);
5294             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5295
5296             assert(cur_curlyx); /* keep Coverity happy */
5297             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5298             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5299             ST.cache_offset = 0;
5300             ST.cache_mask = 0;
5301
5302
5303             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5304                   "%*s  whilem: matched %ld out of %d..%d\n",
5305                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5306             );
5307
5308             /* First just match a string of min A's. */
5309
5310             if (n < min) {
5311                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5312                 cur_curlyx->u.curlyx.lastloc = locinput;
5313                 REGCP_SET(ST.lastcp);
5314
5315                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5316                 assert(0); /* NOTREACHED */
5317             }
5318
5319             /* If degenerate A matches "", assume A done. */
5320
5321             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5322                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5323                    "%*s  whilem: empty match detected, trying continuation...\n",
5324                    REPORT_CODE_OFF+depth*2, "")
5325                 );
5326                 goto do_whilem_B_max;
5327             }
5328
5329             /* super-linear cache processing */
5330
5331             if (scan->flags) {
5332
5333                 if (!PL_reg_maxiter) {
5334                     /* start the countdown: Postpone detection until we
5335                      * know the match is not *that* much linear. */
5336                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5337                     /* possible overflow for long strings and many CURLYX's */
5338                     if (PL_reg_maxiter < 0)
5339                         PL_reg_maxiter = I32_MAX;
5340                     PL_reg_leftiter = PL_reg_maxiter;
5341                 }
5342
5343                 if (PL_reg_leftiter-- == 0) {
5344                     /* initialise cache */
5345                     const I32 size = (PL_reg_maxiter + 7)/8;
5346                     if (PL_reg_poscache) {
5347                         if ((I32)PL_reg_poscache_size < size) {
5348                             Renew(PL_reg_poscache, size, char);
5349                             PL_reg_poscache_size = size;
5350                         }
5351                         Zero(PL_reg_poscache, size, char);
5352                     }
5353                     else {
5354                         PL_reg_poscache_size = size;
5355                         Newxz(PL_reg_poscache, size, char);
5356                     }
5357                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5358       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5359                               PL_colors[4], PL_colors[5])
5360                     );
5361                 }
5362
5363                 if (PL_reg_leftiter < 0) {
5364                     /* have we already failed at this position? */
5365                     I32 offset, mask;
5366                     offset  = (scan->flags & 0xf) - 1
5367                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5368                     mask    = 1 << (offset % 8);
5369                     offset /= 8;
5370                     if (PL_reg_poscache[offset] & mask) {
5371                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5372                             "%*s  whilem: (cache) already tried at this position...\n",
5373                             REPORT_CODE_OFF+depth*2, "")
5374                         );
5375                         sayNO; /* cache records failure */
5376                     }
5377                     ST.cache_offset = offset;
5378                     ST.cache_mask   = mask;
5379                 }
5380             }
5381
5382             /* Prefer B over A for minimal matching. */
5383
5384             if (cur_curlyx->u.curlyx.minmod) {
5385                 ST.save_curlyx = cur_curlyx;
5386                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5387                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5388                 REGCP_SET(ST.lastcp);
5389                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5390                                     locinput);
5391                 assert(0); /* NOTREACHED */
5392             }
5393
5394             /* Prefer A over B for maximal matching. */
5395
5396             if (n < max) { /* More greed allowed? */
5397                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5398                 cur_curlyx->u.curlyx.lastloc = locinput;
5399                 REGCP_SET(ST.lastcp);
5400                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5401                 assert(0); /* NOTREACHED */
5402             }
5403             goto do_whilem_B_max;
5404         }
5405         assert(0); /* NOTREACHED */
5406
5407         case WHILEM_B_min: /* just matched B in a minimal match */
5408         case WHILEM_B_max: /* just matched B in a maximal match */
5409             cur_curlyx = ST.save_curlyx;
5410             sayYES;
5411             assert(0); /* NOTREACHED */
5412
5413         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5414             cur_curlyx = ST.save_curlyx;
5415             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5416             cur_curlyx->u.curlyx.count--;
5417             CACHEsayNO;
5418             assert(0); /* NOTREACHED */
5419
5420         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5421             /* FALL THROUGH */
5422         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5423             REGCP_UNWIND(ST.lastcp);
5424             regcppop(rex);
5425             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5426             cur_curlyx->u.curlyx.count--;
5427             CACHEsayNO;
5428             assert(0); /* NOTREACHED */
5429
5430         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5431             REGCP_UNWIND(ST.lastcp);
5432             regcppop(rex);      /* Restore some previous $<digit>s? */
5433             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5434                 "%*s  whilem: failed, trying continuation...\n",
5435                 REPORT_CODE_OFF+depth*2, "")
5436             );
5437           do_whilem_B_max:
5438             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5439                 && ckWARN(WARN_REGEXP)
5440                 && !(PL_reg_flags & RF_warned))
5441             {
5442                 PL_reg_flags |= RF_warned;
5443                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5444                      "Complex regular subexpression recursion limit (%d) "
5445                      "exceeded",
5446                      REG_INFTY - 1);
5447             }
5448
5449             /* now try B */
5450             ST.save_curlyx = cur_curlyx;
5451             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5452             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5453                                 locinput);
5454             assert(0); /* NOTREACHED */
5455
5456         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5457             cur_curlyx = ST.save_curlyx;
5458             REGCP_UNWIND(ST.lastcp);
5459             regcppop(rex);
5460
5461             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5462                 /* Maximum greed exceeded */
5463                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5464                     && ckWARN(WARN_REGEXP)
5465                     && !(PL_reg_flags & RF_warned))
5466                 {
5467                     PL_reg_flags |= RF_warned;
5468                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5469                         "Complex regular subexpression recursion "
5470                         "limit (%d) exceeded",
5471                         REG_INFTY - 1);
5472                 }
5473                 cur_curlyx->u.curlyx.count--;
5474                 CACHEsayNO;
5475             }
5476
5477             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5478                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5479             );
5480             /* Try grabbing another A and see if it helps. */
5481             cur_curlyx->u.curlyx.lastloc = locinput;
5482             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5483             REGCP_SET(ST.lastcp);
5484             PUSH_STATE_GOTO(WHILEM_A_min,
5485                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5486                 locinput);
5487             assert(0); /* NOTREACHED */
5488
5489 #undef  ST
5490 #define ST st->u.branch
5491
5492         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5493             next = scan + ARG(scan);
5494             if (next == scan)
5495                 next = NULL;
5496             scan = NEXTOPER(scan);
5497             /* FALL THROUGH */
5498
5499         case BRANCH:        /*  /(...|A|...)/ */
5500             scan = NEXTOPER(scan); /* scan now points to inner node */
5501             ST.lastparen = rex->lastparen;
5502             ST.lastcloseparen = rex->lastcloseparen;
5503             ST.next_branch = next;
5504             REGCP_SET(ST.cp);
5505
5506             /* Now go into the branch */
5507             if (has_cutgroup) {
5508                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5509             } else {
5510                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5511             }
5512             assert(0); /* NOTREACHED */
5513
5514         case CUTGROUP:  /*  /(*THEN)/  */
5515             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5516                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5517             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5518             assert(0); /* NOTREACHED */
5519
5520         case CUTGROUP_next_fail:
5521             do_cutgroup = 1;
5522             no_final = 1;
5523             if (st->u.mark.mark_name)
5524                 sv_commit = st->u.mark.mark_name;
5525             sayNO;
5526             assert(0); /* NOTREACHED */
5527
5528         case BRANCH_next:
5529             sayYES;
5530             assert(0); /* NOTREACHED */
5531
5532         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5533             if (do_cutgroup) {
5534                 do_cutgroup = 0;
5535                 no_final = 0;
5536             }
5537             REGCP_UNWIND(ST.cp);
5538             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5539             scan = ST.next_branch;
5540             /* no more branches? */
5541             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5542                 DEBUG_EXECUTE_r({
5543                     PerlIO_printf( Perl_debug_log,
5544                         "%*s  %sBRANCH failed...%s\n",
5545                         REPORT_CODE_OFF+depth*2, "",
5546                         PL_colors[4],
5547                         PL_colors[5] );
5548                 });
5549                 sayNO_SILENT;
5550             }
5551             continue; /* execute next BRANCH[J] op */
5552             assert(0); /* NOTREACHED */
5553
5554         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5555             minmod = 1;
5556             break;
5557
5558 #undef  ST
5559 #define ST st->u.curlym
5560
5561         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5562
5563             /* This is an optimisation of CURLYX that enables us to push
5564              * only a single backtracking state, no matter how many matches
5565              * there are in {m,n}. It relies on the pattern being constant
5566              * length, with no parens to influence future backrefs
5567              */
5568
5569             ST.me = scan;
5570             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5571
5572             ST.lastparen      = rex->lastparen;
5573             ST.lastcloseparen = rex->lastcloseparen;
5574
5575             /* if paren positive, emulate an OPEN/CLOSE around A */
5576             if (ST.me->flags) {
5577                 U32 paren = ST.me->flags;
5578                 if (paren > PL_regsize)
5579                     PL_regsize = paren;
5580                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5581             }
5582             ST.A = scan;
5583             ST.B = next;
5584             ST.alen = 0;
5585             ST.count = 0;
5586             ST.minmod = minmod;
5587             minmod = 0;
5588             ST.c1 = CHRTEST_UNINIT;
5589             REGCP_SET(ST.cp);
5590
5591             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5592                 goto curlym_do_B;
5593
5594           curlym_do_A: /* execute the A in /A{m,n}B/  */
5595             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5596             assert(0); /* NOTREACHED */
5597
5598         case CURLYM_A: /* we've just matched an A */
5599             ST.count++;
5600             /* after first match, determine A's length: u.curlym.alen */
5601             if (ST.count == 1) {
5602                 if (PL_reg_match_utf8) {
5603                     char *s = st->locinput;
5604                     while (s < locinput) {
5605                         ST.alen++;
5606                         s += UTF8SKIP(s);
5607                     }
5608                 }
5609                 else {
5610                     ST.alen = locinput - st->locinput;
5611                 }
5612                 if (ST.alen == 0)
5613                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5614             }
5615             DEBUG_EXECUTE_r(
5616                 PerlIO_printf(Perl_debug_log,
5617                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5618                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5619                           (IV) ST.count, (IV)ST.alen)
5620             );
5621
5622             if (cur_eval && cur_eval->u.eval.close_paren &&
5623                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5624                 goto fake_end;
5625
5626             {
5627                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5628                 if ( max == REG_INFTY || ST.count < max )
5629                     goto curlym_do_A; /* try to match another A */
5630             }
5631             goto curlym_do_B; /* try to match B */
5632
5633         case CURLYM_A_fail: /* just failed to match an A */
5634             REGCP_UNWIND(ST.cp);
5635
5636             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5637                 || (cur_eval && cur_eval->u.eval.close_paren &&
5638                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5639                 sayNO;
5640
5641           curlym_do_B: /* execute the B in /A{m,n}B/  */
5642             if (ST.c1 == CHRTEST_UNINIT) {
5643                 /* calculate c1 and c2 for possible match of 1st char
5644                  * following curly */
5645                 ST.c1 = ST.c2 = CHRTEST_VOID;
5646                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5647                     regnode *text_node = ST.B;
5648                     if (! HAS_TEXT(text_node))
5649                         FIND_NEXT_IMPT(text_node);
5650                     /* this used to be
5651
5652                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5653
5654                         But the former is redundant in light of the latter.
5655
5656                         if this changes back then the macro for
5657                         IS_TEXT and friends need to change.
5658                      */
5659                     if (PL_regkind[OP(text_node)] == EXACT) {
5660                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5661                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5662                         {
5663                             sayNO;
5664                         }
5665                     }
5666                 }
5667             }
5668
5669             DEBUG_EXECUTE_r(
5670                 PerlIO_printf(Perl_debug_log,
5671                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5672                     (int)(REPORT_CODE_OFF+(depth*2)),
5673                     "", (IV)ST.count)
5674                 );
5675             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5676                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5677                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5678                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5679                     {
5680                         /* simulate B failing */
5681                         DEBUG_OPTIMISE_r(
5682                             PerlIO_printf(Perl_debug_log,
5683                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5684                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5685                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5686                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5687                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5688                         );
5689                         state_num = CURLYM_B_fail;
5690                         goto reenter_switch;
5691                     }
5692                 }
5693                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5694                     /* simulate B failing */
5695                     DEBUG_OPTIMISE_r(
5696                         PerlIO_printf(Perl_debug_log,
5697                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5698                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5699                             (int) nextchr, ST.c1, ST.c2)
5700                     );
5701                     state_num = CURLYM_B_fail;
5702                     goto reenter_switch;
5703                 }
5704             }
5705
5706             if (ST.me->flags) {
5707                 /* emulate CLOSE: mark current A as captured */
5708                 I32 paren = ST.me->flags;
5709                 if (ST.count) {
5710                     rex->offs[paren].start
5711                         = HOPc(locinput, -ST.alen) - PL_bostr;
5712                     rex->offs[paren].end = locinput - PL_bostr;
5713                     if ((U32)paren > rex->lastparen)
5714                         rex->lastparen = paren;
5715                     rex->lastcloseparen = paren;
5716                 }
5717                 else
5718                     rex->offs[paren].end = -1;
5719                 if (cur_eval && cur_eval->u.eval.close_paren &&
5720                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5721                 {
5722                     if (ST.count)
5723                         goto fake_end;
5724                     else
5725                         sayNO;
5726                 }
5727             }
5728
5729             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5730             assert(0); /* NOTREACHED */
5731
5732         case CURLYM_B_fail: /* just failed to match a B */
5733             REGCP_UNWIND(ST.cp);
5734             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5735             if (ST.minmod) {
5736                 I32 max = ARG2(ST.me);
5737                 if (max != REG_INFTY && ST.count == max)
5738                     sayNO;
5739                 goto curlym_do_A; /* try to match a further A */
5740             }
5741             /* backtrack one A */
5742             if (ST.count == ARG1(ST.me) /* min */)
5743                 sayNO;
5744             ST.count--;
5745             SET_locinput(HOPc(locinput, -ST.alen));
5746             goto curlym_do_B; /* try to match B */
5747
5748 #undef ST
5749 #define ST st->u.curly
5750
5751 #define CURLY_SETPAREN(paren, success) \
5752     if (paren) { \
5753         if (success) { \
5754             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5755             rex->offs[paren].end = locinput - PL_bostr; \
5756             if (paren > rex->lastparen) \
5757                 rex->lastparen = paren; \
5758             rex->lastcloseparen = paren; \
5759         } \
5760         else { \
5761             rex->offs[paren].end = -1; \
5762             rex->lastparen      = ST.lastparen; \
5763             rex->lastcloseparen = ST.lastcloseparen; \
5764         } \
5765     }
5766
5767         case STAR:              /*  /A*B/ where A is width 1 char */
5768             ST.paren = 0;
5769             ST.min = 0;
5770             ST.max = REG_INFTY;
5771             scan = NEXTOPER(scan);
5772             goto repeat;
5773
5774         case PLUS:              /*  /A+B/ where A is width 1 char */
5775             ST.paren = 0;
5776             ST.min = 1;
5777             ST.max = REG_INFTY;
5778             scan = NEXTOPER(scan);
5779             goto repeat;
5780
5781         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5782             ST.paren = scan->flags;     /* Which paren to set */
5783             ST.lastparen      = rex->lastparen;
5784             ST.lastcloseparen = rex->lastcloseparen;
5785             if (ST.paren > PL_regsize)
5786                 PL_regsize = ST.paren;
5787             ST.min = ARG1(scan);  /* min to match */
5788             ST.max = ARG2(scan);  /* max to match */
5789             if (cur_eval && cur_eval->u.eval.close_paren &&
5790                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5791                 ST.min=1;
5792                 ST.max=1;
5793             }
5794             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5795             goto repeat;
5796
5797         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5798             ST.paren = 0;
5799             ST.min = ARG1(scan);  /* min to match */
5800             ST.max = ARG2(scan);  /* max to match */
5801             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5802           repeat:
5803             /*
5804             * Lookahead to avoid useless match attempts
5805             * when we know what character comes next.
5806             *
5807             * Used to only do .*x and .*?x, but now it allows
5808             * for )'s, ('s and (?{ ... })'s to be in the way
5809             * of the quantifier and the EXACT-like node.  -- japhy
5810             */
5811
5812             assert(ST.min <= ST.max);
5813             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5814                 ST.c1 = ST.c2 = CHRTEST_VOID;
5815             }
5816             else {
5817                 regnode *text_node = next;
5818
5819                 if (! HAS_TEXT(text_node))
5820                     FIND_NEXT_IMPT(text_node);
5821
5822                 if (! HAS_TEXT(text_node))
5823                     ST.c1 = ST.c2 = CHRTEST_VOID;
5824                 else {
5825                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5826                         ST.c1 = ST.c2 = CHRTEST_VOID;
5827                     }
5828                     else {
5829
5830                     /*  Currently we only get here when
5831
5832                         PL_rekind[OP(text_node)] == EXACT
5833
5834                         if this changes back then the macro for IS_TEXT and
5835                         friends need to change. */
5836                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5837                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5838                         {
5839                             sayNO;
5840                         }
5841                     }
5842                 }
5843             }
5844
5845             ST.A = scan;
5846             ST.B = next;
5847             if (minmod) {
5848                 char *li = locinput;
5849                 minmod = 0;
5850                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5851                     sayNO;
5852                 SET_locinput(li);
5853                 ST.count = ST.min;
5854                 REGCP_SET(ST.cp);
5855                 if (ST.c1 == CHRTEST_VOID)
5856                     goto curly_try_B_min;
5857
5858                 ST.oldloc = locinput;
5859
5860                 /* set ST.maxpos to the furthest point along the
5861                  * string that could possibly match */
5862                 if  (ST.max == REG_INFTY) {
5863                     ST.maxpos = PL_regeol - 1;
5864                     if (utf8_target)
5865                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5866                             ST.maxpos--;
5867                 }
5868                 else if (utf8_target) {
5869                     int m = ST.max - ST.min;
5870                     for (ST.maxpos = locinput;
5871                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5872                         ST.maxpos += UTF8SKIP(ST.maxpos);
5873                 }
5874                 else {
5875                     ST.maxpos = locinput + ST.max - ST.min;
5876                     if (ST.maxpos >= PL_regeol)
5877                         ST.maxpos = PL_regeol - 1;
5878                 }
5879                 goto curly_try_B_min_known;
5880
5881             }
5882             else {
5883                 /* avoid taking address of locinput, so it can remain
5884                  * a register var */
5885                 char *li = locinput;
5886                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5887                 if (ST.count < ST.min)
5888                     sayNO;
5889                 SET_locinput(li);
5890                 if ((ST.count > ST.min)
5891                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5892                 {
5893                     /* A{m,n} must come at the end of the string, there's
5894                      * no point in backing off ... */
5895                     ST.min = ST.count;
5896                     /* ...except that $ and \Z can match before *and* after
5897                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5898                        We may back off by one in this case. */
5899                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5900                         ST.min--;
5901                 }
5902                 REGCP_SET(ST.cp);
5903                 goto curly_try_B_max;
5904             }
5905             assert(0); /* NOTREACHED */
5906
5907
5908         case CURLY_B_min_known_fail:
5909             /* failed to find B in a non-greedy match where c1,c2 valid */
5910
5911             REGCP_UNWIND(ST.cp);
5912             if (ST.paren) {
5913                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5914             }
5915             /* Couldn't or didn't -- move forward. */
5916             ST.oldloc = locinput;
5917             if (utf8_target)
5918                 locinput += UTF8SKIP(locinput);
5919             else
5920                 locinput++;
5921             ST.count++;
5922           curly_try_B_min_known:
5923              /* find the next place where 'B' could work, then call B */
5924             {
5925                 int n;
5926                 if (utf8_target) {
5927                     n = (ST.oldloc == locinput) ? 0 : 1;
5928                     if (ST.c1 == ST.c2) {
5929                         /* set n to utf8_distance(oldloc, locinput) */
5930                         while (locinput <= ST.maxpos
5931                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
5932                         {
5933                             locinput += UTF8SKIP(locinput);
5934                             n++;
5935                         }
5936                     }
5937                     else {
5938                         /* set n to utf8_distance(oldloc, locinput) */
5939                         while (locinput <= ST.maxpos
5940                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5941                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5942                         {
5943                             locinput += UTF8SKIP(locinput);
5944                             n++;
5945                         }
5946                     }
5947                 }
5948                 else {  /* Not utf8_target */
5949                     if (ST.c1 == ST.c2) {
5950                         while (locinput <= ST.maxpos &&
5951                                UCHARAT(locinput) != ST.c1)
5952                             locinput++;
5953                     }
5954                     else {
5955                         while (locinput <= ST.maxpos
5956                                && UCHARAT(locinput) != ST.c1
5957                                && UCHARAT(locinput) != ST.c2)
5958                             locinput++;
5959                     }
5960                     n = locinput - ST.oldloc;
5961                 }
5962                 if (locinput > ST.maxpos)
5963                     sayNO;
5964                 if (n) {
5965                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5966                      * at b; check that everything between oldloc and
5967                      * locinput matches */
5968                     char *li = ST.oldloc;
5969                     ST.count += n;
5970                     if (regrepeat(rex, &li, ST.A, n, depth) < n)
5971                         sayNO;
5972                     assert(n == REG_INFTY || locinput == li);
5973                 }
5974                 CURLY_SETPAREN(ST.paren, ST.count);
5975                 if (cur_eval && cur_eval->u.eval.close_paren &&
5976                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5977                     goto fake_end;
5978                 }
5979                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
5980             }
5981             assert(0); /* NOTREACHED */
5982
5983
5984         case CURLY_B_min_fail:
5985             /* failed to find B in a non-greedy match where c1,c2 invalid */
5986
5987             REGCP_UNWIND(ST.cp);
5988             if (ST.paren) {
5989                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5990             }
5991             /* failed -- move forward one */
5992             {
5993                 char *li = locinput;
5994                 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
5995                     sayNO;
5996                 }
5997                 locinput = li;
5998             }
5999             {
6000                 ST.count++;
6001                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6002                         ST.count > 0)) /* count overflow ? */
6003                 {
6004                   curly_try_B_min:
6005                     CURLY_SETPAREN(ST.paren, ST.count);
6006                     if (cur_eval && cur_eval->u.eval.close_paren &&
6007                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6008                         goto fake_end;
6009                     }
6010                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6011                 }
6012             }
6013             sayNO;
6014             assert(0); /* NOTREACHED */
6015
6016
6017         curly_try_B_max:
6018             /* a successful greedy match: now try to match B */
6019             if (cur_eval && cur_eval->u.eval.close_paren &&
6020                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6021                 goto fake_end;
6022             }
6023             {
6024                 bool could_match = locinput < PL_regeol;
6025
6026                 /* If it could work, try it. */
6027                 if (ST.c1 != CHRTEST_VOID && could_match) {
6028                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6029                     {
6030                         could_match = memEQ(locinput,
6031                                             ST.c1_utf8,
6032                                             UTF8SKIP(locinput))
6033                                     || memEQ(locinput,
6034                                              ST.c2_utf8,
6035                                              UTF8SKIP(locinput));
6036                     }
6037                     else {
6038                         could_match = UCHARAT(locinput) == ST.c1
6039                                       || UCHARAT(locinput) == ST.c2;
6040                     }
6041                 }
6042                 if (ST.c1 == CHRTEST_VOID || could_match) {
6043                     CURLY_SETPAREN(ST.paren, ST.count);
6044                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6045                     assert(0); /* NOTREACHED */
6046                 }
6047             }
6048             /* FALL THROUGH */
6049
6050         case CURLY_B_max_fail:
6051             /* failed to find B in a greedy match */
6052
6053             REGCP_UNWIND(ST.cp);
6054             if (ST.paren) {
6055                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6056             }
6057             /*  back up. */
6058             if (--ST.count < ST.min)
6059                 sayNO;
6060             locinput = HOPc(locinput, -1);
6061             goto curly_try_B_max;
6062
6063 #undef ST
6064
6065         case END: /*  last op of main pattern  */
6066             fake_end:
6067             if (cur_eval) {
6068                 /* we've just finished A in /(??{A})B/; now continue with B */
6069                 st->u.eval.toggle_reg_flags
6070                             = cur_eval->u.eval.toggle_reg_flags;
6071                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6072
6073                 st->u.eval.prev_rex = rex_sv;           /* inner */
6074                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
6075                 rex_sv = cur_eval->u.eval.prev_rex;
6076                 SET_reg_curpm(rex_sv);
6077                 rex = (struct regexp *)SvANY(rex_sv);
6078                 rexi = RXi_GET(rex);
6079                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6080
6081                 REGCP_SET(st->u.eval.lastcp);
6082
6083                 /* Restore parens of the outer rex without popping the
6084                  * savestack */
6085                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
6086
6087                 st->u.eval.prev_eval = cur_eval;
6088                 cur_eval = cur_eval->u.eval.prev_eval;
6089                 DEBUG_EXECUTE_r(
6090                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6091                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6092                 if ( nochange_depth )
6093                     nochange_depth--;
6094
6095                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6096                                     locinput); /* match B */
6097             }
6098
6099             if (locinput < reginfo->till) {
6100                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6101                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6102                                       PL_colors[4],
6103                                       (long)(locinput - PL_reg_starttry),
6104                                       (long)(reginfo->till - PL_reg_starttry),
6105                                       PL_colors[5]));
6106
6107                 sayNO_SILENT;           /* Cannot match: too short. */
6108             }
6109             sayYES;                     /* Success! */
6110
6111         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6112             DEBUG_EXECUTE_r(
6113             PerlIO_printf(Perl_debug_log,
6114                 "%*s  %ssubpattern success...%s\n",
6115                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6116             sayYES;                     /* Success! */
6117
6118 #undef  ST
6119 #define ST st->u.ifmatch
6120
6121         {
6122             char *newstart;
6123
6124         case SUSPEND:   /* (?>A) */
6125             ST.wanted = 1;
6126             newstart = locinput;
6127             goto do_ifmatch;
6128
6129         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6130             ST.wanted = 0;
6131             goto ifmatch_trivial_fail_test;
6132
6133         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6134             ST.wanted = 1;
6135           ifmatch_trivial_fail_test:
6136             if (scan->flags) {
6137                 char * const s = HOPBACKc(locinput, scan->flags);
6138                 if (!s) {
6139                     /* trivial fail */
6140                     if (logical) {
6141                         logical = 0;
6142                         sw = 1 - cBOOL(ST.wanted);
6143                     }
6144                     else if (ST.wanted)
6145                         sayNO;
6146                     next = scan + ARG(scan);
6147                     if (next == scan)
6148                         next = NULL;
6149                     break;
6150                 }
6151                 newstart = s;
6152             }
6153             else
6154                 newstart = locinput;
6155
6156           do_ifmatch:
6157             ST.me = scan;
6158             ST.logical = logical;
6159             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6160
6161             /* execute body of (?...A) */
6162             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6163             assert(0); /* NOTREACHED */
6164         }
6165
6166         case IFMATCH_A_fail: /* body of (?...A) failed */
6167             ST.wanted = !ST.wanted;
6168             /* FALL THROUGH */
6169
6170         case IFMATCH_A: /* body of (?...A) succeeded */
6171             if (ST.logical) {
6172                 sw = cBOOL(ST.wanted);
6173             }
6174             else if (!ST.wanted)
6175                 sayNO;
6176
6177             if (OP(ST.me) != SUSPEND) {
6178                 /* restore old position except for (?>...) */
6179                 locinput = st->locinput;
6180             }
6181             scan = ST.me + ARG(ST.me);
6182             if (scan == ST.me)
6183                 scan = NULL;
6184             continue; /* execute B */
6185
6186 #undef ST
6187
6188         case LONGJMP: /*  alternative with many branches compiles to
6189                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6190             next = scan + ARG(scan);
6191             if (next == scan)
6192                 next = NULL;
6193             break;
6194
6195         case COMMIT:  /*  (*COMMIT)  */
6196             reginfo->cutpoint = PL_regeol;
6197             /* FALLTHROUGH */
6198
6199         case PRUNE:   /*  (*PRUNE)   */
6200             if (!scan->flags)
6201                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6202             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6203             assert(0); /* NOTREACHED */
6204
6205         case COMMIT_next_fail:
6206             no_final = 1;
6207             /* FALLTHROUGH */
6208
6209         case OPFAIL:   /* (*FAIL)  */
6210             sayNO;
6211             assert(0); /* NOTREACHED */
6212
6213 #define ST st->u.mark
6214         case MARKPOINT: /*  (*MARK:foo)  */
6215             ST.prev_mark = mark_state;
6216             ST.mark_name = sv_commit = sv_yes_mark
6217                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6218             mark_state = st;
6219             ST.mark_loc = locinput;
6220             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6221             assert(0); /* NOTREACHED */
6222
6223         case MARKPOINT_next:
6224             mark_state = ST.prev_mark;
6225             sayYES;
6226             assert(0); /* NOTREACHED */
6227
6228         case MARKPOINT_next_fail:
6229             if (popmark && sv_eq(ST.mark_name,popmark))
6230             {
6231                 if (ST.mark_loc > startpoint)
6232                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6233                 popmark = NULL; /* we found our mark */
6234                 sv_commit = ST.mark_name;
6235
6236                 DEBUG_EXECUTE_r({
6237                         PerlIO_printf(Perl_debug_log,
6238                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6239                             REPORT_CODE_OFF+depth*2, "",
6240                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6241                 });
6242             }
6243             mark_state = ST.prev_mark;
6244             sv_yes_mark = mark_state ?
6245                 mark_state->u.mark.mark_name : NULL;
6246             sayNO;
6247             assert(0); /* NOTREACHED */
6248
6249         case SKIP:  /*  (*SKIP)  */
6250             if (scan->flags) {
6251                 /* (*SKIP) : if we fail we cut here*/
6252                 ST.mark_name = NULL;
6253                 ST.mark_loc = locinput;
6254                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6255             } else {
6256                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6257                    otherwise do nothing.  Meaning we need to scan
6258                  */
6259                 regmatch_state *cur = mark_state;
6260                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6261
6262                 while (cur) {
6263                     if ( sv_eq( cur->u.mark.mark_name,
6264                                 find ) )
6265                     {
6266                         ST.mark_name = find;
6267                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6268                     }
6269                     cur = cur->u.mark.prev_mark;
6270                 }
6271             }
6272             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6273             break;
6274
6275         case SKIP_next_fail:
6276             if (ST.mark_name) {
6277                 /* (*CUT:NAME) - Set up to search for the name as we
6278                    collapse the stack*/
6279                 popmark = ST.mark_name;
6280             } else {
6281                 /* (*CUT) - No name, we cut here.*/
6282                 if (ST.mark_loc > startpoint)
6283                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6284                 /* but we set sv_commit to latest mark_name if there
6285                    is one so they can test to see how things lead to this
6286                    cut */
6287                 if (mark_state)
6288                     sv_commit=mark_state->u.mark.mark_name;
6289             }
6290             no_final = 1;
6291             sayNO;
6292             assert(0); /* NOTREACHED */
6293 #undef ST
6294
6295         case LNBREAK: /* \R */
6296             if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6297                 locinput += n;
6298             } else
6299                 sayNO;
6300             break;
6301
6302 #define CASE_CLASS(nAmE)                              \
6303         case nAmE:                                    \
6304             if (NEXTCHR_IS_EOS)                       \
6305                 sayNO;                                \
6306             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6307                 locinput += n;                        \
6308             } else                                    \
6309                 sayNO;                                \
6310             break;                                    \
6311         case N##nAmE:                                 \
6312             if (NEXTCHR_IS_EOS)                       \
6313                 sayNO;                                \
6314             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6315                 sayNO;                                \
6316             } else {                                  \
6317                 locinput += UTF8SKIP(locinput);       \
6318             }                                         \
6319             break
6320
6321         CASE_CLASS(VERTWS);  /*  \v \V  */
6322         CASE_CLASS(HORIZWS); /*  \h \H  */
6323 #undef CASE_CLASS
6324
6325         default:
6326             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6327                           PTR2UV(scan), OP(scan));
6328             Perl_croak(aTHX_ "regexp memory corruption");
6329
6330         /* this is a point to jump to in order to increment
6331          * locinput by one character */
6332         increment_locinput:
6333             assert(!NEXTCHR_IS_EOS);
6334             if (utf8_target) {
6335                 locinput += PL_utf8skip[nextchr];
6336                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6337                 if (locinput > PL_regeol)
6338                     sayNO;
6339             }
6340             else
6341                 locinput++;
6342             break;
6343
6344         } /* end switch */
6345
6346         /* switch break jumps here */
6347         scan = next; /* prepare to execute the next op and ... */
6348         continue;    /* ... jump back to the top, reusing st */
6349         assert(0); /* NOTREACHED */
6350
6351       push_yes_state:
6352         /* push a state that backtracks on success */
6353         st->u.yes.prev_yes_state = yes_state;
6354         yes_state = st;
6355         /* FALL THROUGH */
6356       push_state:
6357         /* push a new regex state, then continue at scan  */
6358         {
6359             regmatch_state *newst;
6360
6361             DEBUG_STACK_r({
6362                 regmatch_state *cur = st;
6363                 regmatch_state *curyes = yes_state;
6364                 int curd = depth;
6365                 regmatch_slab *slab = PL_regmatch_slab;
6366                 for (;curd > -1;cur--,curd--) {
6367                     if (cur < SLAB_FIRST(slab)) {
6368                         slab = slab->prev;
6369                         cur = SLAB_LAST(slab);
6370                     }
6371                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6372                         REPORT_CODE_OFF + 2 + depth * 2,"",
6373                         curd, PL_reg_name[cur->resume_state],
6374                         (curyes == cur) ? "yes" : ""
6375                     );
6376                     if (curyes == cur)
6377                         curyes = cur->u.yes.prev_yes_state;
6378                 }
6379             } else
6380                 DEBUG_STATE_pp("push")
6381             );
6382             depth++;
6383             st->locinput = locinput;
6384             newst = st+1;
6385             if (newst >  SLAB_LAST(PL_regmatch_slab))
6386                 newst = S_push_slab(aTHX);
6387             PL_regmatch_state = newst;
6388
6389             locinput = pushinput;
6390             st = newst;
6391             continue;
6392             assert(0); /* NOTREACHED */
6393         }
6394     }
6395
6396     /*
6397     * We get here only if there's trouble -- normally "case END" is
6398     * the terminating point.
6399     */
6400     Perl_croak(aTHX_ "corrupted regexp pointers");
6401     /*NOTREACHED*/
6402     sayNO;
6403
6404 yes:
6405     if (yes_state) {
6406         /* we have successfully completed a subexpression, but we must now
6407          * pop to the state marked by yes_state and continue from there */
6408         assert(st != yes_state);
6409 #ifdef DEBUGGING
6410         while (st != yes_state) {
6411             st--;
6412             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6413                 PL_regmatch_slab = PL_regmatch_slab->prev;
6414                 st = SLAB_LAST(PL_regmatch_slab);
6415             }
6416             DEBUG_STATE_r({
6417                 if (no_final) {
6418                     DEBUG_STATE_pp("pop (no final)");
6419                 } else {
6420                     DEBUG_STATE_pp("pop (yes)");
6421                 }
6422             });
6423             depth--;
6424         }
6425 #else
6426         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6427             || yes_state > SLAB_LAST(PL_regmatch_slab))
6428         {
6429             /* not in this slab, pop slab */
6430             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6431             PL_regmatch_slab = PL_regmatch_slab->prev;
6432             st = SLAB_LAST(PL_regmatch_slab);
6433         }
6434         depth -= (st - yes_state);
6435 #endif
6436         st = yes_state;
6437         yes_state = st->u.yes.prev_yes_state;
6438         PL_regmatch_state = st;
6439
6440         if (no_final)
6441             locinput= st->locinput;
6442         state_num = st->resume_state + no_final;
6443         goto reenter_switch;
6444     }
6445
6446     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6447                           PL_colors[4], PL_colors[5]));
6448
6449     if (PL_reg_state.re_state_eval_setup_done) {
6450         /* each successfully executed (?{...}) block does the equivalent of
6451          *   local $^R = do {...}
6452          * When popping the save stack, all these locals would be undone;
6453          * bypass this by setting the outermost saved $^R to the latest
6454          * value */
6455         if (oreplsv != GvSV(PL_replgv))
6456             sv_setsv(oreplsv, GvSV(PL_replgv));
6457     }
6458     result = 1;
6459     goto final_exit;
6460
6461 no:
6462     DEBUG_EXECUTE_r(
6463         PerlIO_printf(Perl_debug_log,
6464             "%*s  %sfailed...%s\n",
6465             REPORT_CODE_OFF+depth*2, "",
6466             PL_colors[4], PL_colors[5])
6467         );
6468
6469 no_silent:
6470     if (no_final) {
6471         if (yes_state) {
6472             goto yes;
6473         } else {
6474             goto final_exit;
6475         }
6476     }
6477     if (depth) {
6478         /* there's a previous state to backtrack to */
6479         st--;
6480         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6481             PL_regmatch_slab = PL_regmatch_slab->prev;
6482             st = SLAB_LAST(PL_regmatch_slab);
6483         }
6484         PL_regmatch_state = st;
6485         locinput= st->locinput;
6486
6487         DEBUG_STATE_pp("pop");
6488         depth--;
6489         if (yes_state == st)
6490             yes_state = st->u.yes.prev_yes_state;
6491
6492         state_num = st->resume_state + 1; /* failure = success + 1 */
6493         goto reenter_switch;
6494     }
6495     result = 0;
6496
6497   final_exit:
6498     if (rex->intflags & PREGf_VERBARG_SEEN) {
6499         SV *sv_err = get_sv("REGERROR", 1);
6500         SV *sv_mrk = get_sv("REGMARK", 1);
6501         if (result) {
6502             sv_commit = &PL_sv_no;
6503             if (!sv_yes_mark)
6504                 sv_yes_mark = &PL_sv_yes;
6505         } else {
6506             if (!sv_commit)
6507                 sv_commit = &PL_sv_yes;
6508             sv_yes_mark = &PL_sv_no;
6509         }
6510         sv_setsv(sv_err, sv_commit);
6511         sv_setsv(sv_mrk, sv_yes_mark);
6512     }
6513
6514
6515     if (last_pushed_cv) {
6516         dSP;
6517         POP_MULTICALL;
6518         PERL_UNUSED_VAR(SP);
6519     }
6520
6521     /* clean up; in particular, free all slabs above current one */
6522     LEAVE_SCOPE(oldsave);
6523
6524     assert(!result ||  locinput - PL_bostr >= 0);
6525     return result ?  locinput - PL_bostr : -1;
6526 }
6527
6528 /*
6529  - regrepeat - repeatedly match something simple, report how many
6530  *
6531  * startposp - pointer a pointer to the start position.  This is updated
6532  *             to point to the byte following the highest successful
6533  *             match.
6534  * p         - the regnode to be repeatedly matched against.
6535  * max       - maximum number of characters to match.
6536  * depth     - (for debugging) backtracking depth.
6537  */
6538 STATIC I32
6539 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6540 {
6541     dVAR;
6542     char *scan;
6543     I32 c;
6544     char *loceol = PL_regeol;
6545     I32 hardcount = 0;
6546     bool utf8_target = PL_reg_match_utf8;
6547     UV utf8_flags;
6548 #ifndef DEBUGGING
6549     PERL_UNUSED_ARG(depth);
6550 #endif
6551
6552     PERL_ARGS_ASSERT_REGREPEAT;
6553
6554     scan = *startposp;
6555     if (max == REG_INFTY)
6556         max = I32_MAX;
6557     else if (max < loceol - scan)
6558         loceol = scan + max;
6559     switch (OP(p)) {
6560     case REG_ANY:
6561         if (utf8_target) {
6562             loceol = PL_regeol;
6563             while (scan < loceol && hardcount < max && *scan != '\n') {
6564                 scan += UTF8SKIP(scan);
6565                 hardcount++;
6566             }
6567         } else {
6568             while (scan < loceol && *scan != '\n')
6569                 scan++;
6570         }
6571         break;
6572     case SANY:
6573         if (utf8_target) {
6574             loceol = PL_regeol;
6575             while (scan < loceol && hardcount < max) {
6576                 scan += UTF8SKIP(scan);
6577                 hardcount++;
6578             }
6579         }
6580         else
6581             scan = loceol;
6582         break;
6583     case CANY:
6584         scan = loceol;
6585         break;
6586     case EXACT:
6587         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6588
6589         c = (U8)*STRING(p);
6590
6591         /* Can use a simple loop if the pattern char to match on is invariant
6592          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6593          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6594          * true iff it doesn't matter if the argument is in UTF-8 or not */
6595         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6596             while (scan < loceol && UCHARAT(scan) == c) {
6597                 scan++;
6598             }
6599         }
6600         else if (UTF_PATTERN) {
6601             if (utf8_target) {
6602                 STRLEN scan_char_len;
6603                 loceol = PL_regeol;
6604
6605                 /* When both target and pattern are UTF-8, we have to do s
6606                  * string EQ */
6607                 while (hardcount < max
6608                        && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6609                        && scan_char_len <= STR_LEN(p)
6610                        && memEQ(scan, STRING(p), scan_char_len))
6611                 {
6612                     scan += scan_char_len;
6613                     hardcount++;
6614                 }
6615             }
6616             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6617
6618                 /* Target isn't utf8; convert the character in the UTF-8
6619                  * pattern to non-UTF8, and do a simple loop */
6620                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6621                 while (scan < loceol && UCHARAT(scan) == c) {
6622                     scan++;
6623                 }
6624             } /* else pattern char is above Latin1, can't possibly match the
6625                  non-UTF-8 target */
6626         }
6627         else {
6628
6629             /* Here, the string must be utf8; pattern isn't, and <c> is
6630              * different in utf8 than not, so can't compare them directly.
6631              * Outside the loop, find the two utf8 bytes that represent c, and
6632              * then look for those in sequence in the utf8 string */
6633             U8 high = UTF8_TWO_BYTE_HI(c);
6634             U8 low = UTF8_TWO_BYTE_LO(c);
6635             loceol = PL_regeol;
6636
6637             while (hardcount < max
6638                     && scan + 1 < loceol
6639                     && UCHARAT(scan) == high
6640                     && UCHARAT(scan + 1) == low)
6641             {
6642                 scan += 2;
6643                 hardcount++;
6644             }
6645         }
6646         break;
6647
6648     case EXACTFA:
6649         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6650         goto do_exactf;
6651
6652     case EXACTFL:
6653         PL_reg_flags |= RF_tainted;
6654         utf8_flags = FOLDEQ_UTF8_LOCALE;
6655         goto do_exactf;
6656
6657     case EXACTF:
6658             utf8_flags = 0;
6659             goto do_exactf;
6660
6661     case EXACTFU_SS:
6662     case EXACTFU_TRICKYFOLD:
6663     case EXACTFU:
6664         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6665
6666     do_exactf: {
6667         int c1, c2;
6668         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6669
6670         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6671
6672         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6673             if (c1 == CHRTEST_VOID) {
6674                 /* Use full Unicode fold matching */
6675                 char *tmpeol = loceol;
6676                 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6677                 while (hardcount < max
6678                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6679                                              STRING(p), NULL, pat_len,
6680                                              cBOOL(UTF_PATTERN), utf8_flags))
6681                 {
6682                     scan = tmpeol;
6683                     tmpeol = loceol;
6684                     hardcount++;
6685                 }
6686             }
6687             else if (utf8_target) {
6688                 if (c1 == c2) {
6689                     while (hardcount < max
6690                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6691                     {
6692                         scan += UTF8SKIP(scan);
6693                         hardcount++;
6694                     }
6695                 }
6696                 else {
6697                     while (hardcount < max
6698                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6699                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6700                     {
6701                         scan += UTF8SKIP(scan);
6702                         hardcount++;
6703                     }
6704                 }
6705             }
6706             else if (c1 == c2) {
6707                 while (scan < loceol && UCHARAT(scan) == c1) {
6708                     scan++;
6709                 }
6710             }
6711             else {
6712                 while (scan < loceol &&
6713                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6714                 {
6715                     scan++;
6716                 }
6717             }
6718         }
6719         break;
6720     }
6721     case ANYOF:
6722         if (utf8_target) {
6723             STRLEN inclasslen;
6724             loceol = PL_regeol;
6725             inclasslen = loceol - scan;
6726             while (hardcount < max
6727                    && ((inclasslen = loceol - scan) > 0)
6728                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6729             {
6730                 scan += inclasslen;
6731                 hardcount++;
6732             }
6733         } else {
6734             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6735                 scan++;
6736         }
6737         break;
6738     case ALNUMU:
6739         if (utf8_target) {
6740     utf8_wordchar:
6741             loceol = PL_regeol;
6742             LOAD_UTF8_CHARCLASS_ALNUM();
6743             while (hardcount < max && scan < loceol &&
6744                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6745             {
6746                 scan += UTF8SKIP(scan);
6747                 hardcount++;
6748             }
6749         } else {
6750             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6751                 scan++;
6752             }
6753         }
6754         break;
6755     case ALNUM:
6756         if (utf8_target)
6757             goto utf8_wordchar;
6758         while (scan < loceol && isALNUM((U8) *scan)) {
6759             scan++;
6760         }
6761         break;
6762     case ALNUMA:
6763         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6764             scan++;
6765         }
6766         break;
6767     case ALNUML:
6768         PL_reg_flags |= RF_tainted;
6769         if (utf8_target) {
6770             loceol = PL_regeol;
6771             while (hardcount < max && scan < loceol &&
6772                    isALNUM_LC_utf8((U8*)scan)) {
6773                 scan += UTF8SKIP(scan);
6774                 hardcount++;
6775             }
6776         } else {
6777             while (scan < loceol && isALNUM_LC(*scan))
6778                 scan++;
6779         }
6780         break;
6781     case NALNUMU:
6782         if (utf8_target) {
6783
6784     utf8_Nwordchar:
6785
6786             loceol = PL_regeol;
6787             LOAD_UTF8_CHARCLASS_ALNUM();
6788             while (hardcount < max && scan < loceol &&
6789                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6790             {
6791                 scan += UTF8SKIP(scan);
6792                 hardcount++;
6793             }
6794         } else {
6795             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6796                 scan++;
6797             }
6798         }
6799         break;
6800     case NALNUM:
6801         if (utf8_target)
6802             goto utf8_Nwordchar;
6803         while (scan < loceol && ! isALNUM((U8) *scan)) {
6804             scan++;
6805         }
6806         break;
6807
6808     case POSIXA:
6809        while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6810             scan++;
6811         }
6812         break;
6813     case NPOSIXA:
6814         if (utf8_target) {
6815             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6816                 scan += UTF8SKIP(scan);
6817             }
6818         }
6819         else {
6820             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6821                 scan++;
6822             }
6823         }
6824         break;
6825     case NALNUMA:
6826         if (utf8_target) {
6827             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6828                 scan += UTF8SKIP(scan);
6829             }
6830         }
6831         else {
6832             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6833                 scan++;
6834             }
6835         }
6836         break;
6837     case NALNUML:
6838         PL_reg_flags |= RF_tainted;
6839         if (utf8_target) {
6840             loceol = PL_regeol;
6841             while (hardcount < max && scan < loceol &&
6842                    !isALNUM_LC_utf8((U8*)scan)) {
6843                 scan += UTF8SKIP(scan);
6844                 hardcount++;
6845             }
6846         } else {
6847             while (scan < loceol && !isALNUM_LC(*scan))
6848                 scan++;
6849         }
6850         break;
6851     case SPACEU:
6852         if (utf8_target) {
6853
6854     utf8_space:
6855
6856             loceol = PL_regeol;
6857             LOAD_UTF8_CHARCLASS_SPACE();
6858             while (hardcount < max && scan < loceol &&
6859                    (*scan == ' ' ||
6860                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6861             {
6862                 scan += UTF8SKIP(scan);
6863                 hardcount++;
6864             }
6865             break;
6866         }
6867         else {
6868             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6869                 scan++;
6870             }
6871             break;
6872         }
6873     case SPACE:
6874         if (utf8_target)
6875             goto utf8_space;
6876
6877         while (scan < loceol && isSPACE((U8) *scan)) {
6878             scan++;
6879         }
6880         break;
6881     case SPACEA:
6882         while (scan < loceol && isSPACE_A((U8) *scan)) {
6883             scan++;
6884         }
6885         break;
6886     case SPACEL:
6887         PL_reg_flags |= RF_tainted;
6888         if (utf8_target) {
6889             loceol = PL_regeol;
6890             while (hardcount < max && scan < loceol &&
6891                    isSPACE_LC_utf8((U8*)scan)) {
6892                 scan += UTF8SKIP(scan);
6893                 hardcount++;
6894             }
6895         } else {
6896             while (scan < loceol && isSPACE_LC(*scan))
6897                 scan++;
6898         }
6899         break;
6900     case NSPACEU:
6901         if (utf8_target) {
6902
6903     utf8_Nspace:
6904
6905             loceol = PL_regeol;
6906             LOAD_UTF8_CHARCLASS_SPACE();
6907             while (hardcount < max && scan < loceol &&
6908                    ! (*scan == ' ' ||
6909                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6910             {
6911                 scan += UTF8SKIP(scan);
6912                 hardcount++;
6913             }
6914             break;
6915         }
6916         else {
6917             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6918                 scan++;
6919             }
6920         }
6921         break;
6922     case NSPACE:
6923         if (utf8_target)
6924             goto utf8_Nspace;
6925
6926         while (scan < loceol && ! isSPACE((U8) *scan)) {
6927             scan++;
6928         }
6929         break;
6930     case NSPACEA:
6931         if (utf8_target) {
6932             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6933                 scan += UTF8SKIP(scan);
6934             }
6935         }
6936         else {
6937             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6938                 scan++;
6939             }
6940         }
6941         break;
6942     case NSPACEL:
6943         PL_reg_flags |= RF_tainted;
6944         if (utf8_target) {
6945             loceol = PL_regeol;
6946             while (hardcount < max && scan < loceol &&
6947                    !isSPACE_LC_utf8((U8*)scan)) {
6948                 scan += UTF8SKIP(scan);
6949                 hardcount++;
6950             }
6951         } else {
6952             while (scan < loceol && !isSPACE_LC(*scan))
6953                 scan++;
6954         }
6955         break;
6956     case DIGIT:
6957         if (utf8_target) {
6958             loceol = PL_regeol;
6959             LOAD_UTF8_CHARCLASS_DIGIT();
6960             while (hardcount < max && scan < loceol &&
6961                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6962                 scan += UTF8SKIP(scan);
6963                 hardcount++;
6964             }
6965         } else {
6966             while (scan < loceol && isDIGIT(*scan))
6967                 scan++;
6968         }
6969         break;
6970     case DIGITA:
6971         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6972             scan++;
6973         }
6974         break;
6975     case DIGITL:
6976         PL_reg_flags |= RF_tainted;
6977         if (utf8_target) {
6978             loceol = PL_regeol;
6979             while (hardcount < max && scan < loceol &&
6980                    isDIGIT_LC_utf8((U8*)scan)) {
6981                 scan += UTF8SKIP(scan);
6982                 hardcount++;
6983             }
6984         } else {
6985             while (scan < loceol && isDIGIT_LC(*scan))
6986                 scan++;
6987         }
6988         break;
6989     case NDIGIT:
6990         if (utf8_target) {
6991             loceol = PL_regeol;
6992             LOAD_UTF8_CHARCLASS_DIGIT();
6993             while (hardcount < max && scan < loceol &&
6994                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6995                 scan += UTF8SKIP(scan);
6996                 hardcount++;
6997             }
6998         } else {
6999             while (scan < loceol && !isDIGIT(*scan))
7000                 scan++;
7001         }
7002         break;
7003     case NDIGITA:
7004         if (utf8_target) {
7005             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7006                 scan += UTF8SKIP(scan);
7007             }
7008         }
7009         else {
7010             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7011                 scan++;
7012             }
7013         }
7014         break;
7015     case NDIGITL:
7016         PL_reg_flags |= RF_tainted;
7017         if (utf8_target) {
7018             loceol = PL_regeol;
7019             while (hardcount < max && scan < loceol &&
7020                    !isDIGIT_LC_utf8((U8*)scan)) {
7021                 scan += UTF8SKIP(scan);
7022                 hardcount++;
7023             }
7024         } else {
7025             while (scan < loceol && !isDIGIT_LC(*scan))
7026                 scan++;
7027         }
7028         break;
7029     case LNBREAK:
7030         Perl_croak(aTHX_ "panic: regrepeat() should not be called with non-simple: LNBREAK");
7031         assert(0); /* NOTREACHED */
7032     case HORIZWS:
7033         if (utf8_target) {
7034             loceol = PL_regeol;
7035             while (hardcount < max && scan < loceol &&
7036                     (c=is_HORIZWS_utf8_safe(scan, loceol)))
7037             {
7038                 scan += c;
7039                 hardcount++;
7040             }
7041         } else {
7042             while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7043                 scan++;
7044         }
7045         break;
7046     case NHORIZWS:
7047         if (utf8_target) {
7048             loceol = PL_regeol;
7049             while (hardcount < max && scan < loceol &&
7050                         !is_HORIZWS_utf8_safe(scan, loceol))
7051             {
7052                 scan += UTF8SKIP(scan);
7053                 hardcount++;
7054             }
7055         } else {
7056             while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7057                 scan++;
7058
7059         }
7060         break;
7061     case VERTWS:
7062         if (utf8_target) {
7063             loceol = PL_regeol;
7064             while (hardcount < max && scan < loceol &&
7065                             (c=is_VERTWS_utf8_safe(scan, loceol)))
7066             {
7067                 scan += c;
7068                 hardcount++;
7069             }
7070         } else {
7071             while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7072                 scan++;
7073
7074         }
7075         break;
7076     case NVERTWS:
7077         if (utf8_target) {
7078             loceol = PL_regeol;
7079             while (hardcount < max && scan < loceol &&
7080                                 !is_VERTWS_utf8_safe(scan, loceol))
7081             {
7082                 scan += UTF8SKIP(scan);
7083                 hardcount++;
7084             }
7085         } else {
7086             while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7087                 scan++;
7088
7089         }
7090         break;
7091
7092     default:            /* Called on something of 0 width. */
7093         break;          /* So match right here or not at all. */
7094     }
7095
7096     if (hardcount)
7097         c = hardcount;
7098     else
7099         c = scan - *startposp;
7100     *startposp = scan;
7101
7102     DEBUG_r({
7103         GET_RE_DEBUG_FLAGS_DECL;
7104         DEBUG_EXECUTE_r({
7105             SV * const prop = sv_newmortal();
7106             regprop(prog, prop, p);
7107             PerlIO_printf(Perl_debug_log,
7108                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7109                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7110         });
7111     });
7112
7113     return(c);
7114 }
7115
7116
7117 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7118 /*
7119 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7120 create a copy so that changes the caller makes won't change the shared one.
7121 If <altsvp> is non-null, will return NULL in it, for back-compat.
7122  */
7123 SV *
7124 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7125 {
7126     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7127
7128     if (altsvp) {
7129         *altsvp = NULL;
7130     }
7131
7132     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7133 }
7134 #endif
7135
7136 STATIC SV *
7137 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
7138 {
7139     /* Returns the swash for the input 'node' in the regex 'prog'.
7140      * If <doinit> is true, will attempt to create the swash if not already
7141      *    done.
7142      * If <listsvp> is non-null, will return the swash initialization string in
7143      *    it.
7144      * Tied intimately to how regcomp.c sets up the data structure */
7145
7146     dVAR;
7147     SV *sw  = NULL;
7148     SV *si  = NULL;
7149     SV*  invlist = NULL;
7150
7151     RXi_GET_DECL(prog,progi);
7152     const struct reg_data * const data = prog ? progi->data : NULL;
7153
7154     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7155
7156     assert(ANYOF_NONBITMAP(node));
7157
7158     if (data && data->count) {
7159         const U32 n = ARG(node);
7160
7161         if (data->what[n] == 's') {
7162             SV * const rv = MUTABLE_SV(data->data[n]);
7163             AV * const av = MUTABLE_AV(SvRV(rv));
7164             SV **const ary = AvARRAY(av);
7165             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7166
7167             si = *ary;  /* ary[0] = the string to initialize the swash with */
7168
7169             /* Elements 2 and 3 are either both present or both absent. [2] is
7170              * any inversion list generated at compile time; [3] indicates if
7171              * that inversion list has any user-defined properties in it. */
7172             if (av_len(av) >= 2) {
7173                 invlist = ary[2];
7174                 if (SvUV(ary[3])) {
7175                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7176                 }
7177             }
7178             else {
7179                 invlist = NULL;
7180             }
7181
7182             /* Element [1] is reserved for the set-up swash.  If already there,
7183              * return it; if not, create it and store it there */
7184             if (SvROK(ary[1])) {
7185                 sw = ary[1];
7186             }
7187             else if (si && doinit) {
7188
7189                 sw = _core_swash_init("utf8", /* the utf8 package */
7190                                       "", /* nameless */
7191                                       si,
7192                                       1, /* binary */
7193                                       0, /* not from tr/// */
7194                                       invlist,
7195                                       &swash_init_flags);
7196                 (void)av_store(av, 1, sw);
7197             }
7198         }
7199     }
7200
7201     if (listsvp) {
7202         SV* matches_string = newSVpvn("", 0);
7203
7204         /* Use the swash, if any, which has to have incorporated into it all
7205          * possibilities */
7206         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7207             && (si && si != &PL_sv_undef))
7208         {
7209
7210             /* If no swash, use the input initialization string, if available */
7211             sv_catsv(matches_string, si);
7212         }
7213
7214         /* Add the inversion list to whatever we have.  This may have come from
7215          * the swash, or from an input parameter */
7216         if (invlist) {
7217             sv_catsv(matches_string, _invlist_contents(invlist));
7218         }
7219         *listsvp = matches_string;
7220     }
7221
7222     return sw;
7223 }
7224
7225 /*
7226  - reginclass - determine if a character falls into a character class
7227
7228   n is the ANYOF regnode
7229   p is the target string
7230   lenp is pointer to the maximum number of bytes of how far to go in p
7231     (This is assumed wthout checking to always be at least the current
7232     character's size)
7233   utf8_target tells whether p is in UTF-8.
7234
7235   Returns true if matched; false otherwise.  If lenp is not NULL, on return
7236   from a successful match, the value it points to will be updated to how many
7237   bytes in p were matched.  If there was no match, the value is undefined,
7238   possibly changed from the input.
7239
7240   Note that this can be a synthetic start class, a combination of various
7241   nodes, so things you think might be mutually exclusive, such as locale,
7242   aren't.  It can match both locale and non-locale
7243
7244  */
7245
7246 STATIC bool
7247 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
7248 {
7249     dVAR;
7250     const char flags = ANYOF_FLAGS(n);
7251     bool match = FALSE;
7252     UV c = *p;
7253     STRLEN c_len = 0;
7254     STRLEN maxlen;
7255
7256     PERL_ARGS_ASSERT_REGINCLASS;
7257
7258     /* If c is not already the code point, get it */
7259     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
7260         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7261                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7262                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7263                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7264                  * UTF8_ALLOW_FFFF */
7265         if (c_len == (STRLEN)-1)
7266             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7267     }
7268     else {
7269         c_len = 1;
7270     }
7271
7272     /* Use passed in max length, or one character if none passed in or less
7273      * than one character.  And assume will match just one character.  This is
7274      * overwritten later if matched more. */
7275     if (lenp) {
7276         maxlen = (*lenp > c_len) ? *lenp : c_len;
7277         *lenp = c_len;
7278
7279     }
7280     else {
7281         maxlen = c_len;
7282     }
7283
7284     /* If this character is potentially in the bitmap, check it */
7285     if (c < 256) {
7286         if (ANYOF_BITMAP_TEST(n, c))
7287             match = TRUE;
7288         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7289                 && ! utf8_target
7290                 && ! isASCII(c))
7291         {
7292             match = TRUE;
7293         }
7294         else if (flags & ANYOF_LOCALE) {
7295             PL_reg_flags |= RF_tainted;
7296
7297             if ((flags & ANYOF_LOC_FOLD)
7298                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7299             {
7300                 match = TRUE;
7301             }
7302             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7303                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
7304                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
7305                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
7306                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
7307                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
7308                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
7309                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
7310                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7311                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
7312                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
7313                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
7314                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
7315                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
7316                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
7317                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
7318                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
7319                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
7320                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
7321                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
7322                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
7323                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
7324                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
7325                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
7326                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
7327                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
7328                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
7329                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
7330                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
7331                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
7332                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
7333                      ) /* How's that for a conditional? */
7334             ) {
7335                 match = TRUE;
7336             }
7337         }
7338     }
7339
7340     /* If the bitmap didn't (or couldn't) match, and something outside the
7341      * bitmap could match, try that.  Locale nodes specify completely the
7342      * behavior of code points in the bit map (otherwise, a utf8 target would
7343      * cause them to be treated as Unicode and not locale), except in
7344      * the very unlikely event when this node is a synthetic start class, which
7345      * could be a combination of locale and non-locale nodes.  So allow locale
7346      * to match for the synthetic start class, which will give a false
7347      * positive that will be resolved when the match is done again as not part
7348      * of the synthetic start class */
7349     if (!match) {
7350         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7351             match = TRUE;       /* Everything above 255 matches */
7352         }
7353         else if (ANYOF_NONBITMAP(n)
7354                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7355                      || (utf8_target
7356                          && (c >=256
7357                              || (! (flags & ANYOF_LOCALE))
7358                              || (flags & ANYOF_IS_SYNTHETIC)))))
7359         {
7360             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7361             if (sw) {
7362                 U8 * utf8_p;
7363                 if (utf8_target) {
7364                     utf8_p = (U8 *) p;
7365                 } else { /* Convert to utf8 */
7366                     STRLEN len = 1;
7367                     utf8_p = bytes_to_utf8(p, &len);
7368                 }
7369
7370                 if (swash_fetch(sw, utf8_p, TRUE)) {
7371                     match = TRUE;
7372                 }
7373
7374                 /* If we allocated a string above, free it */
7375                 if (! utf8_target) Safefree(utf8_p);
7376             }
7377         }
7378
7379         if (UNICODE_IS_SUPER(c)
7380             && (flags & ANYOF_WARN_SUPER)
7381             && ckWARN_d(WARN_NON_UNICODE))
7382         {
7383             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7384                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7385         }
7386     }
7387
7388     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7389     return cBOOL(flags & ANYOF_INVERT) ^ match;
7390 }
7391
7392 STATIC U8 *
7393 S_reghop3(U8 *s, I32 off, const U8* lim)
7394 {
7395     /* return the position 'off' UTF-8 characters away from 's', forward if
7396      * 'off' >= 0, backwards if negative.  But don't go outside of position
7397      * 'lim', which better be < s  if off < 0 */
7398
7399     dVAR;
7400
7401     PERL_ARGS_ASSERT_REGHOP3;
7402
7403     if (off >= 0) {
7404         while (off-- && s < lim) {
7405             /* XXX could check well-formedness here */
7406             s += UTF8SKIP(s);
7407         }
7408     }
7409     else {
7410         while (off++ && s > lim) {
7411             s--;
7412             if (UTF8_IS_CONTINUED(*s)) {
7413                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7414                     s--;
7415             }
7416             /* XXX could check well-formedness here */
7417         }
7418     }
7419     return s;
7420 }
7421
7422 #ifdef XXX_dmq
7423 /* there are a bunch of places where we use two reghop3's that should
7424    be replaced with this routine. but since thats not done yet
7425    we ifdef it out - dmq
7426 */
7427 STATIC U8 *
7428 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7429 {
7430     dVAR;
7431
7432     PERL_ARGS_ASSERT_REGHOP4;
7433
7434     if (off >= 0) {
7435         while (off-- && s < rlim) {
7436             /* XXX could check well-formedness here */
7437             s += UTF8SKIP(s);
7438         }
7439     }
7440     else {
7441         while (off++ && s > llim) {
7442             s--;
7443             if (UTF8_IS_CONTINUED(*s)) {
7444                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7445                     s--;
7446             }
7447             /* XXX could check well-formedness here */
7448         }
7449     }
7450     return s;
7451 }
7452 #endif
7453
7454 STATIC U8 *
7455 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7456 {
7457     dVAR;
7458
7459     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7460
7461     if (off >= 0) {
7462         while (off-- && s < lim) {
7463             /* XXX could check well-formedness here */
7464             s += UTF8SKIP(s);
7465         }
7466         if (off >= 0)
7467             return NULL;
7468     }
7469     else {
7470         while (off++ && s > lim) {
7471             s--;
7472             if (UTF8_IS_CONTINUED(*s)) {
7473                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7474                     s--;
7475             }
7476             /* XXX could check well-formedness here */
7477         }
7478         if (off <= 0)
7479             return NULL;
7480     }
7481     return s;
7482 }
7483
7484 static void
7485 restore_pos(pTHX_ void *arg)
7486 {
7487     dVAR;
7488     regexp * const rex = (regexp *)arg;
7489     if (PL_reg_state.re_state_eval_setup_done) {
7490         if (PL_reg_oldsaved) {
7491             rex->subbeg = PL_reg_oldsaved;
7492             rex->sublen = PL_reg_oldsavedlen;
7493             rex->suboffset = PL_reg_oldsavedoffset;
7494             rex->subcoffset = PL_reg_oldsavedcoffset;
7495 #ifdef PERL_OLD_COPY_ON_WRITE
7496             rex->saved_copy = PL_nrs;
7497 #endif
7498             RXp_MATCH_COPIED_on(rex);
7499         }
7500         PL_reg_magic->mg_len = PL_reg_oldpos;
7501         PL_reg_state.re_state_eval_setup_done = FALSE;
7502         PL_curpm = PL_reg_oldcurpm;
7503     }
7504 }
7505
7506 STATIC void
7507 S_to_utf8_substr(pTHX_ register regexp *prog)
7508 {
7509     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7510      * on the converted value */
7511
7512     int i = 1;
7513
7514     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7515
7516     do {
7517         if (prog->substrs->data[i].substr
7518             && !prog->substrs->data[i].utf8_substr) {
7519             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7520             prog->substrs->data[i].utf8_substr = sv;
7521             sv_utf8_upgrade(sv);
7522             if (SvVALID(prog->substrs->data[i].substr)) {
7523                 if (SvTAIL(prog->substrs->data[i].substr)) {
7524                     /* Trim the trailing \n that fbm_compile added last
7525                        time.  */
7526                     SvCUR_set(sv, SvCUR(sv) - 1);
7527                     /* Whilst this makes the SV technically "invalid" (as its
7528                        buffer is no longer followed by "\0") when fbm_compile()
7529                        adds the "\n" back, a "\0" is restored.  */
7530                     fbm_compile(sv, FBMcf_TAIL);
7531                 } else
7532                     fbm_compile(sv, 0);
7533             }
7534             if (prog->substrs->data[i].substr == prog->check_substr)
7535                 prog->check_utf8 = sv;
7536         }
7537     } while (i--);
7538 }
7539
7540 STATIC bool
7541 S_to_byte_substr(pTHX_ register regexp *prog)
7542 {
7543     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7544      * on the converted value; returns FALSE if can't be converted. */
7545
7546     dVAR;
7547     int i = 1;
7548
7549     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7550
7551     do {
7552         if (prog->substrs->data[i].utf8_substr
7553             && !prog->substrs->data[i].substr) {
7554             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7555             if (! sv_utf8_downgrade(sv, TRUE)) {
7556                 return FALSE;
7557             }
7558             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7559                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7560                     /* Trim the trailing \n that fbm_compile added last
7561                         time.  */
7562                     SvCUR_set(sv, SvCUR(sv) - 1);
7563                     fbm_compile(sv, FBMcf_TAIL);
7564                 } else
7565                     fbm_compile(sv, 0);
7566             }
7567             prog->substrs->data[i].substr = sv;
7568             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7569                 prog->check_substr = sv;
7570         }
7571     } while (i--);
7572
7573     return TRUE;
7574 }
7575
7576 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
7577  * pre-composed Hangul syllables, which are all in a contiguous block and
7578  * arranged there in such a way so as to facilitate alorithmic determination of
7579  * their characteristics.  As such, they don't need a swash, but can be
7580  * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
7581  * is a GCB=LV */
7582 #define SBASE 0xAC00    /* Start of block */
7583 #define SCount 11172    /* Length of block */
7584 #define TCount 28
7585
7586 #if 0   /* This routine is not currently used */
7587 PERL_STATIC_INLINE bool
7588 S_is_utf8_X_LV(pTHX_ const U8 *p)
7589 {
7590     /* Unlike most other similarly named routines here, this does not create a
7591      * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7592
7593     dVAR;
7594
7595     UV cp = valid_utf8_to_uvchr(p, NULL);
7596
7597     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7598
7599     /* The earliest Unicode releases did not have these precomposed Hangul
7600      * syllables.  Set to point to undef in that case, so will return false on
7601      * every call */
7602     if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
7603         PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7604         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7605             SvREFCNT_dec(PL_utf8_X_LV);
7606             PL_utf8_X_LV = &PL_sv_undef;
7607         }
7608     }
7609
7610     return (PL_utf8_X_LV != &PL_sv_undef
7611             && cp >= SBASE && cp < SBASE + SCount
7612             && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7613 }
7614 #endif
7615
7616 PERL_STATIC_INLINE bool
7617 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7618 {
7619     /* Unlike most other similarly named routines here, this does not create a
7620      * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7621
7622     dVAR;
7623
7624     UV cp = valid_utf8_to_uvchr(p, NULL);
7625
7626     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7627
7628     /* The earliest Unicode releases did not have these precomposed Hangul
7629      * syllables.  Set to point to undef in that case, so will return false on
7630      * every call */
7631     if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
7632         PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7633         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7634             SvREFCNT_dec(PL_utf8_X_LVT);
7635             PL_utf8_X_LVT = &PL_sv_undef;
7636         }
7637     }
7638
7639     return (PL_utf8_X_LVT != &PL_sv_undef
7640             && cp >= SBASE && cp < SBASE + SCount
7641             && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7642 }
7643
7644 /*
7645  * Local variables:
7646  * c-indentation-style: bsd
7647  * c-basic-offset: 4
7648  * indent-tabs-mode: nil
7649  * End:
7650  *
7651  * ex: set ts=8 sts=4 sw=4 et:
7652  */