src/5017004/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #include "inline_invlist.c"
  85 #include "unicode_constants.h"
  86
  87 #define RF_tainted      1       /* tainted information used? e.g. locale */
  88 #define RF_warned       2               /* warned about big count? */
  89
  90 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  91
  92 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  93
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  99  * call if there are no complications: i.e., if everything matchable is
 100  * straight forward in the bitmap */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102                                               : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 110
 111 #define HOPc(pos,off) \
 112         (char *)(PL_reg_match_utf8 \
 113             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 114             : (U8*)(pos + off))
 115 #define HOPBACKc(pos, off) \
 116         (char*)(PL_reg_match_utf8\
 117             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 118             : (pos - off >= PL_bostr)           \
 119                 ? (U8*)pos - off                \
 120                 : NULL)
 121
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defined */
 126 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 127     if (!CAT2(PL_utf8_,class)) { \
 128         bool ok; \
 129         ENTER; save_re_context(); \
 130         ok=CAT2(is_utf8_,class)((const U8*)str); \
 131         PERL_UNUSED_VAR(ok); \
 132         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 133 /* Doesn't do an assert to verify that is correct */
 134 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 135     if (!CAT2(PL_utf8_,class)) { \
 136         bool throw_away PERL_UNUSED_DECL; \
 137         ENTER; save_re_context(); \
 138         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 139         LEAVE; } } STMT_END
 140
 141 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 142 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 143 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 144
 145 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 146         /* No asserts are done for some of these, in case called on a   */  \
 147         /* Unicode version in which they map to nothing */                  \
 148         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
 149         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 150
 151 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 152
 153 /* The actual code for CCC_TRY, which uses several variables from the routine
 154  * it's callable from.  It is designed to be the bulk of a case statement.
 155  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 156  *      nextchr matches the class.
 157  * UTF8_TEST is the whole test string to use for utf8 targets
 158  * LOAD is what to use to test, and if not present to load in the swash for the
 159  *      class
 160  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 161  *      UTF8_TEST test.
 162  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 163  * utf8 and a variant, load the swash if necessary and test using the utf8
 164  * test.  Advance to the next character if test is ok, otherwise fail; If not
 165  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 166  * fails, or advance to the next character */
 167
 168 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 169     if (locinput >= PL_regeol) {                                              \
 170         sayNO;                                                                \
 171     }                                                                         \
 172     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 173         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 174         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 175             sayNO;                                                            \
 176         }                                                                     \
 177         locinput += PL_utf8skip[nextchr];                                     \
 178         nextchr = UCHARAT(locinput);                                          \
 179         break;                                                                \
 180     }                                                                         \
 181     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 182         sayNO;                                                                \
 183     }                                                                         \
 184     nextchr = UCHARAT(++locinput);                                            \
 185     break;
 186
 187 /* Handle the non-locale cases for a character class and its complement.  It
 188  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 189  * This is because that code fails when the test succeeds, so we want to have
 190  * the test fail so that the code succeeds.  The swash is stored in a
 191  * predictable PL_ place */
 192 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 193                            CLASS, STR)                                        \
 194     case NAME:                                                                \
 195         _CCC_TRY_CODE( !, FUNC,                                               \
 196                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 197                                             (U8*)locinput, TRUE)),            \
 198                           CLASS, STR)                                         \
 199     case NNAME:                                                               \
 200         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 201                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 202                                             (U8*)locinput, TRUE)),            \
 203                           CLASS, STR)                                         \
 204
 205 /* Generate the case statements for both locale and non-locale character
 206  * classes in regmatch for classes that don't have special unicode semantics.
 207  * Locales don't use an immediate swash, but an intermediary special locale
 208  * function that is called on the pointer to the current place in the input
 209  * string.  That function will resolve to needing the same swash.  One might
 210  * think that because we don't know what the locale will match, we shouldn't
 211  * check with the swash loading function that it loaded properly; ie, that we
 212  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 213  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 214  * irrelevant here */
 215 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 216                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 217                 NAMEA, NNAMEA, FUNCA,                                         \
 218                 CLASS, STR)                                                   \
 219     case NAMEL:                                                               \
 220         PL_reg_flags |= RF_tainted;                                           \
 221         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 222     case NNAMEL:                                                              \
 223         PL_reg_flags |= RF_tainted;                                           \
 224         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 225                        CLASS, STR)                                            \
 226     case NAMEA:                                                               \
 227         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 228             sayNO;                                                            \
 229         }                                                                     \
 230         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 231         nextchr = UCHARAT(++locinput);                                        \
 232         break;                                                                \
 233     case NNAMEA:                                                              \
 234         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 235             sayNO;                                                            \
 236         }                                                                     \
 237         if (utf8_target) {                                                    \
 238             locinput += PL_utf8skip[nextchr];                                 \
 239             nextchr = UCHARAT(locinput);                                      \
 240         }                                                                     \
 241         else {                                                                \
 242             nextchr = UCHARAT(++locinput);                                    \
 243         }                                                                     \
 244         break;                                                                \
 245     /* Generate the non-locale cases */                                       \
 246     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 247
 248 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 249  * statements to handle separate Unicode semantics nodes */
 250 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 251                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 252                   NAMEU, NNAMEU, FUNCU,                                        \
 253                   NAMEA, NNAMEA, FUNCA,                                        \
 254                   CLASS, STR)                                                  \
 255     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 256             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 257             NAMEA, NNAMEA, FUNCA,                                              \
 258             CLASS, STR)                                                        \
 259     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 260
 261 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 262
 263 /* for use after a quantifier and before an EXACT-like node -- japhy */
 264 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 265  *
 266  * NOTE that *nothing* that affects backtracking should be in here, specifically
 267  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 268  * node that is in between two EXACT like nodes when ascertaining what the required
 269  * "follow" character is. This should probably be moved to regex compile time
 270  * although it may be done at run time beause of the REF possibility - more
 271  * investigation required. -- demerphq
 272 */
 273 #define JUMPABLE(rn) (      \
 274     OP(rn) == OPEN ||       \
 275     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 276     OP(rn) == EVAL ||   \
 277     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 278     OP(rn) == PLUS || OP(rn) == MINMOD || \
 279     OP(rn) == KEEPS || \
 280     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 281 )
 282 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 283
 284 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 285
 286 #if 0
 287 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 288    we don't need this definition. */
 289 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 290 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 291 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 292
 293 #else
 294 /* ... so we use this as its faster. */
 295 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 296 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 297 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 298 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 299
 300 #endif
 301
 302 /*
 303   Search for mandatory following text node; for lookahead, the text must
 304   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 305 */
 306 #define FIND_NEXT_IMPT(rn) STMT_START { \
 307     while (JUMPABLE(rn)) { \
 308         const OPCODE type = OP(rn); \
 309         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 310             rn = NEXTOPER(NEXTOPER(rn)); \
 311         else if (type == PLUS) \
 312             rn = NEXTOPER(rn); \
 313         else if (type == IFMATCH) \
 314             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 315         else rn += NEXT_OFF(rn); \
 316     } \
 317 } STMT_END
 318
 319
 320 static void restore_pos(pTHX_ void *arg);
 321
 322 #define REGCP_PAREN_ELEMS 3
 323 #define REGCP_OTHER_ELEMS 3
 324 #define REGCP_FRAME_ELEMS 1
 325 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 326  * are needed for the regexp context stack bookkeeping. */
 327
 328 STATIC CHECKPOINT
 329 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 330 {
 331     dVAR;
 332     const int retval = PL_savestack_ix;
 333     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 334     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 335     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 336     I32 p;
 337     GET_RE_DEBUG_FLAGS_DECL;
 338
 339     PERL_ARGS_ASSERT_REGCPPUSH;
 340
 341     if (paren_elems_to_push < 0)
 342         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 343                    paren_elems_to_push);
 344
 345     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 346         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 347                    " out of range (%lu-%ld)",
 348                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 349
 350     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 351
 352     DEBUG_BUFFERS_r(
 353         if ((int)PL_regsize > (int)parenfloor)
 354             PerlIO_printf(Perl_debug_log,
 355                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 356                 PTR2UV(rex),
 357                 PTR2UV(rex->offs)
 358             );
 359     );
 360     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 361 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 362         SSPUSHINT(rex->offs[p].end);
 363         SSPUSHINT(rex->offs[p].start);
 364         SSPUSHINT(rex->offs[p].start_tmp);
 365         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 366             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 367             (UV)p,
 368             (IV)rex->offs[p].start,
 369             (IV)rex->offs[p].start_tmp,
 370             (IV)rex->offs[p].end
 371         ));
 372     }
 373 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 374     SSPUSHINT(PL_regsize);
 375     SSPUSHINT(rex->lastparen);
 376     SSPUSHINT(rex->lastcloseparen);
 377     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 378
 379     return retval;
 380 }
 381
 382 /* These are needed since we do not localize EVAL nodes: */
 383 #define REGCP_SET(cp)                                           \
 384     DEBUG_STATE_r(                                              \
 385             PerlIO_printf(Perl_debug_log,                       \
 386                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 387                 (IV)PL_savestack_ix));                          \
 388     cp = PL_savestack_ix
 389
 390 #define REGCP_UNWIND(cp)                                        \
 391     DEBUG_STATE_r(                                              \
 392         if (cp != PL_savestack_ix)                              \
 393             PerlIO_printf(Perl_debug_log,                       \
 394                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 395                 (IV)(cp), (IV)PL_savestack_ix));                \
 396     regcpblow(cp)
 397
 398 #define UNWIND_PAREN(lp, lcp)               \
 399     for (n = rex->lastparen; n > lp; n--)   \
 400         rex->offs[n].end = -1;              \
 401     rex->lastparen = n;                     \
 402     rex->lastcloseparen = lcp;
 403
 404
 405 STATIC void
 406 S_regcppop(pTHX_ regexp *rex)
 407 {
 408     dVAR;
 409     UV i;
 410     U32 paren;
 411     GET_RE_DEBUG_FLAGS_DECL;
 412
 413     PERL_ARGS_ASSERT_REGCPPOP;
 414
 415     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 416     i = SSPOPUV;
 417     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 418     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 419     rex->lastcloseparen = SSPOPINT;
 420     rex->lastparen = SSPOPINT;
 421     PL_regsize = SSPOPINT;
 422
 423     i -= REGCP_OTHER_ELEMS;
 424     /* Now restore the parentheses context. */
 425     DEBUG_BUFFERS_r(
 426         if (i || rex->lastparen + 1 <= rex->nparens)
 427             PerlIO_printf(Perl_debug_log,
 428                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 429                 PTR2UV(rex),
 430                 PTR2UV(rex->offs)
 431             );
 432     );
 433     paren = PL_regsize;
 434     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 435         I32 tmps;
 436         rex->offs[paren].start_tmp = SSPOPINT;
 437         rex->offs[paren].start = SSPOPINT;
 438         tmps = SSPOPINT;
 439         if (paren <= rex->lastparen)
 440             rex->offs[paren].end = tmps;
 441         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 442             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 443             (UV)paren,
 444             (IV)rex->offs[paren].start,
 445             (IV)rex->offs[paren].start_tmp,
 446             (IV)rex->offs[paren].end,
 447             (paren > rex->lastparen ? "(skipped)" : ""));
 448         );
 449         paren--;
 450     }
 451 #if 1
 452     /* It would seem that the similar code in regtry()
 453      * already takes care of this, and in fact it is in
 454      * a better location to since this code can #if 0-ed out
 455      * but the code in regtry() is needed or otherwise tests
 456      * requiring null fields (pat.t#187 and split.t#{13,14}
 457      * (as of patchlevel 7877)  will fail.  Then again,
 458      * this code seems to be necessary or otherwise
 459      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 460      * --jhi updated by dapm */
 461     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 462         if (i > PL_regsize)
 463             rex->offs[i].start = -1;
 464         rex->offs[i].end = -1;
 465         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 466             "    \\%"UVuf": %s   ..-1 undeffing\n",
 467             (UV)i,
 468             (i > PL_regsize) ? "-1" : "  "
 469         ));
 470     }
 471 #endif
 472 }
 473
 474 /* restore the parens and associated vars at savestack position ix,
 475  * but without popping the stack */
 476
 477 STATIC void
 478 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 479 {
 480     I32 tmpix = PL_savestack_ix;
 481     PL_savestack_ix = ix;
 482     regcppop(rex);
 483     PL_savestack_ix = tmpix;
 484 }
 485
 486 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 487
 488 /*
 489  * pregexec and friends
 490  */
 491
 492 #ifndef PERL_IN_XSUB_RE
 493 /*
 494  - pregexec - match a regexp against a string
 495  */
 496 I32
 497 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 498          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 499 /* stringarg: the point in the string at which to begin matching */
 500 /* strend:    pointer to null at end of string */
 501 /* strbeg:    real beginning of string */
 502 /* minend:    end of match must be >= minend bytes after stringarg. */
 503 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 504  *            itself is accessed via the pointers above */
 505 /* nosave:    For optimizations. */
 506 {
 507     PERL_ARGS_ASSERT_PREGEXEC;
 508
 509     return
 510         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 511                       nosave ? 0 : REXEC_COPY_STR);
 512 }
 513 #endif
 514
 515 /*
 516  * Need to implement the following flags for reg_anch:
 517  *
 518  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 519  * USE_INTUIT_ML
 520  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 521  * INTUIT_AUTORITATIVE_ML
 522  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 523  * INTUIT_ONCE_ML
 524  *
 525  * Another flag for this function: SECOND_TIME (so that float substrs
 526  * with giant delta may be not rechecked).
 527  */
 528
 529 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 530
 531 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 532    Otherwise, only SvCUR(sv) is used to get strbeg. */
 533
 534 /* XXXX We assume that strpos is strbeg unless sv. */
 535
 536 /* XXXX Some places assume that there is a fixed substring.
 537         An update may be needed if optimizer marks as "INTUITable"
 538         RExen without fixed substrings.  Similarly, it is assumed that
 539         lengths of all the strings are no more than minlen, thus they
 540         cannot come from lookahead.
 541         (Or minlen should take into account lookahead.)
 542   NOTE: Some of this comment is not correct. minlen does now take account
 543   of lookahead/behind. Further research is required. -- demerphq
 544
 545 */
 546
 547 /* A failure to find a constant substring means that there is no need to make
 548    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 549    finding a substring too deep into the string means that less calls to
 550    regtry() should be needed.
 551
 552    REx compiler's optimizer found 4 possible hints:
 553         a) Anchored substring;
 554         b) Fixed substring;
 555         c) Whether we are anchored (beginning-of-line or \G);
 556         d) First node (of those at offset 0) which may distinguish positions;
 557    We use a)b)d) and multiline-part of c), and try to find a position in the
 558    string which does not contradict any of them.
 559  */
 560
 561 /* Most of decisions we do here should have been done at compile time.
 562    The nodes of the REx which we used for the search should have been
 563    deleted from the finite automaton. */
 564
 565 char *
 566 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 567                      char *strend, const U32 flags, re_scream_pos_data *data)
 568 {
 569     dVAR;
 570     struct regexp *const prog = (struct regexp *)SvANY(rx);
 571     I32 start_shift = 0;
 572     /* Should be nonnegative! */
 573     I32 end_shift   = 0;
 574     char *s;
 575     SV *check;
 576     char *strbeg;
 577     char *t;
 578     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 579     I32 ml_anch;
 580     char *other_last = NULL;    /* other substr checked before this */
 581     char *check_at = NULL;              /* check substr found at this pos */
 582     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 583     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 584     RXi_GET_DECL(prog,progi);
 585 #ifdef DEBUGGING
 586     const char * const i_strpos = strpos;
 587 #endif
 588     GET_RE_DEBUG_FLAGS_DECL;
 589
 590     PERL_ARGS_ASSERT_RE_INTUIT_START;
 591     PERL_UNUSED_ARG(flags);
 592     PERL_UNUSED_ARG(data);
 593
 594     RX_MATCH_UTF8_set(rx,utf8_target);
 595
 596     if (RX_UTF8(rx)) {
 597         PL_reg_flags |= RF_utf8;
 598     }
 599     DEBUG_EXECUTE_r(
 600         debug_start_match(rx, utf8_target, strpos, strend,
 601             sv ? "Guessing start of match in sv for"
 602                : "Guessing start of match in string for");
 603               );
 604
 605     /* CHR_DIST() would be more correct here but it makes things slow. */
 606     if (prog->minlen > strend - strpos) {
 607         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 608                               "String too short... [re_intuit_start]\n"));
 609         goto fail;
 610     }
 611
 612     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 613     PL_regeol = strend;
 614     if (utf8_target) {
 615         if (!prog->check_utf8 && prog->check_substr)
 616             to_utf8_substr(prog);
 617         check = prog->check_utf8;
 618     } else {
 619         if (!prog->check_substr && prog->check_utf8)
 620             to_byte_substr(prog);
 621         check = prog->check_substr;
 622     }
 623     if (check == &PL_sv_undef) {
 624         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 625                 "Non-utf8 string cannot match utf8 check string\n"));
 626         goto fail;
 627     }
 628     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 629         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 630                      || ( (prog->extflags & RXf_ANCH_BOL)
 631                           && !multiline ) );    /* Check after \n? */
 632
 633         if (!ml_anch) {
 634           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 635                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 636                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 637                && sv && !SvROK(sv)
 638                && (strpos != strbeg)) {
 639               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 640               goto fail;
 641           }
 642           if (prog->check_offset_min == prog->check_offset_max &&
 643               !(prog->extflags & RXf_CANY_SEEN)) {
 644             /* Substring at constant offset from beg-of-str... */
 645             I32 slen;
 646
 647             s = HOP3c(strpos, prog->check_offset_min, strend);
 648
 649             if (SvTAIL(check)) {
 650                 slen = SvCUR(check);    /* >= 1 */
 651
 652                 if ( strend - s > slen || strend - s < slen - 1
 653                      || (strend - s == slen && strend[-1] != '\n')) {
 654                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 655                     goto fail_finish;
 656                 }
 657                 /* Now should match s[0..slen-2] */
 658                 slen--;
 659                 if (slen && (*SvPVX_const(check) != *s
 660                              || (slen > 1
 661                                  && memNE(SvPVX_const(check), s, slen)))) {
 662                   report_neq:
 663                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 664                     goto fail_finish;
 665                 }
 666             }
 667             else if (*SvPVX_const(check) != *s
 668                      || ((slen = SvCUR(check)) > 1
 669                          && memNE(SvPVX_const(check), s, slen)))
 670                 goto report_neq;
 671             check_at = s;
 672             goto success_at_start;
 673           }
 674         }
 675         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 676         s = strpos;
 677         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 678         end_shift = prog->check_end_shift;
 679
 680         if (!ml_anch) {
 681             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 682                                          - (SvTAIL(check) != 0);
 683             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 684
 685             if (end_shift < eshift)
 686                 end_shift = eshift;
 687         }
 688     }
 689     else {                              /* Can match at random position */
 690         ml_anch = 0;
 691         s = strpos;
 692         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 693         end_shift = prog->check_end_shift;
 694
 695         /* end shift should be non negative here */
 696     }
 697
 698 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 699     if (end_shift < 0)
 700         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 701                    (IV)end_shift, RX_PRECOMP(prog));
 702 #endif
 703
 704   restart:
 705     /* Find a possible match in the region s..strend by looking for
 706        the "check" substring in the region corrected by start/end_shift. */
 707
 708     {
 709         I32 srch_start_shift = start_shift;
 710         I32 srch_end_shift = end_shift;
 711         U8* start_point;
 712         U8* end_point;
 713         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 714             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 715             srch_start_shift = strbeg - s;
 716         }
 717     DEBUG_OPTIMISE_MORE_r({
 718         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 719             (IV)prog->check_offset_min,
 720             (IV)srch_start_shift,
 721             (IV)srch_end_shift,
 722             (IV)prog->check_end_shift);
 723     });
 724
 725         if (prog->extflags & RXf_CANY_SEEN) {
 726             start_point= (U8*)(s + srch_start_shift);
 727             end_point= (U8*)(strend - srch_end_shift);
 728         } else {
 729             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 730             end_point= HOP3(strend, -srch_end_shift, strbeg);
 731         }
 732         DEBUG_OPTIMISE_MORE_r({
 733             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 734                 (int)(end_point - start_point),
 735                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 736                 start_point);
 737         });
 738
 739         s = fbm_instr( start_point, end_point,
 740                       check, multiline ? FBMrf_MULTILINE : 0);
 741     }
 742     /* Update the count-of-usability, remove useless subpatterns,
 743         unshift s.  */
 744
 745     DEBUG_EXECUTE_r({
 746         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 747             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 748         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 749                           (s ? "Found" : "Did not find"),
 750             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 751                 ? "anchored" : "floating"),
 752             quoted,
 753             RE_SV_TAIL(check),
 754             (s ? " at offset " : "...\n") );
 755     });
 756
 757     if (!s)
 758         goto fail_finish;
 759     /* Finish the diagnostic message */
 760     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 761
 762     /* XXX dmq: first branch is for positive lookbehind...
 763        Our check string is offset from the beginning of the pattern.
 764        So we need to do any stclass tests offset forward from that
 765        point. I think. :-(
 766      */
 767
 768
 769
 770     check_at=s;
 771
 772
 773     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 774        Start with the other substr.
 775        XXXX no SCREAM optimization yet - and a very coarse implementation
 776        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 777                 *always* match.  Probably should be marked during compile...
 778        Probably it is right to do no SCREAM here...
 779      */
 780
 781     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 782                 : (prog->float_substr && prog->anchored_substr))
 783     {
 784         /* Take into account the "other" substring. */
 785         /* XXXX May be hopelessly wrong for UTF... */
 786         if (!other_last)
 787             other_last = strpos;
 788         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 789           do_other_anchored:
 790             {
 791                 char * const last = HOP3c(s, -start_shift, strbeg);
 792                 char *last1, *last2;
 793                 char * const saved_s = s;
 794                 SV* must;
 795
 796                 t = s - prog->check_offset_max;
 797                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 798                     && (!utf8_target
 799                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 800                             && t > strpos)))
 801                     NOOP;
 802                 else
 803                     t = strpos;
 804                 t = HOP3c(t, prog->anchored_offset, strend);
 805                 if (t < other_last)     /* These positions already checked */
 806                     t = other_last;
 807                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 808                 if (last < last1)
 809                     last1 = last;
 810                 /* XXXX It is not documented what units *_offsets are in.
 811                    We assume bytes, but this is clearly wrong.
 812                    Meaning this code needs to be carefully reviewed for errors.
 813                    dmq.
 814                   */
 815
 816                 /* On end-of-str: see comment below. */
 817                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 818                 if (must == &PL_sv_undef) {
 819                     s = (char*)NULL;
 820                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 821                 }
 822                 else
 823                     s = fbm_instr(
 824                         (unsigned char*)t,
 825                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 826                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 827                         must,
 828                         multiline ? FBMrf_MULTILINE : 0
 829                     );
 830                 DEBUG_EXECUTE_r({
 831                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 832                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 833                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 834                         (s ? "Found" : "Contradicts"),
 835                         quoted, RE_SV_TAIL(must));
 836                 });
 837
 838
 839                 if (!s) {
 840                     if (last1 >= last2) {
 841                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 842                                                 ", giving up...\n"));
 843                         goto fail_finish;
 844                     }
 845                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 846                         ", trying floating at offset %ld...\n",
 847                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 848                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 849                     s = HOP3c(last, 1, strend);
 850                     goto restart;
 851                 }
 852                 else {
 853                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 854                           (long)(s - i_strpos)));
 855                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 856                     other_last = HOP3c(s, 1, strend);
 857                     s = saved_s;
 858                     if (t == strpos)
 859                         goto try_at_start;
 860                     goto try_at_offset;
 861                 }
 862             }
 863         }
 864         else {          /* Take into account the floating substring. */
 865             char *last, *last1;
 866             char * const saved_s = s;
 867             SV* must;
 868
 869             t = HOP3c(s, -start_shift, strbeg);
 870             last1 = last =
 871                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 872             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 873                 last = HOP3c(t, prog->float_max_offset, strend);
 874             s = HOP3c(t, prog->float_min_offset, strend);
 875             if (s < other_last)
 876                 s = other_last;
 877  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 878             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 879             /* fbm_instr() takes into account exact value of end-of-str
 880                if the check is SvTAIL(ed).  Since false positives are OK,
 881                and end-of-str is not later than strend we are OK. */
 882             if (must == &PL_sv_undef) {
 883                 s = (char*)NULL;
 884                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 885             }
 886             else
 887                 s = fbm_instr((unsigned char*)s,
 888                               (unsigned char*)last + SvCUR(must)
 889                                   - (SvTAIL(must)!=0),
 890                               must, multiline ? FBMrf_MULTILINE : 0);
 891             DEBUG_EXECUTE_r({
 892                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 893                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 894                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 895                     (s ? "Found" : "Contradicts"),
 896                     quoted, RE_SV_TAIL(must));
 897             });
 898             if (!s) {
 899                 if (last1 == last) {
 900                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 901                                             ", giving up...\n"));
 902                     goto fail_finish;
 903                 }
 904                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 905                     ", trying anchored starting at offset %ld...\n",
 906                     (long)(saved_s + 1 - i_strpos)));
 907                 other_last = last;
 908                 s = HOP3c(t, 1, strend);
 909                 goto restart;
 910             }
 911             else {
 912                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 913                       (long)(s - i_strpos)));
 914                 other_last = s; /* Fix this later. --Hugo */
 915                 s = saved_s;
 916                 if (t == strpos)
 917                     goto try_at_start;
 918                 goto try_at_offset;
 919             }
 920         }
 921     }
 922
 923
 924     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 925
 926     DEBUG_OPTIMISE_MORE_r(
 927         PerlIO_printf(Perl_debug_log,
 928             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 929             (IV)prog->check_offset_min,
 930             (IV)prog->check_offset_max,
 931             (IV)(s-strpos),
 932             (IV)(t-strpos),
 933             (IV)(t-s),
 934             (IV)(strend-strpos)
 935         )
 936     );
 937
 938     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 939         && (!utf8_target
 940             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 941                  && t > strpos)))
 942     {
 943         /* Fixed substring is found far enough so that the match
 944            cannot start at strpos. */
 945       try_at_offset:
 946         if (ml_anch && t[-1] != '\n') {
 947             /* Eventually fbm_*() should handle this, but often
 948                anchored_offset is not 0, so this check will not be wasted. */
 949             /* XXXX In the code below we prefer to look for "^" even in
 950                presence of anchored substrings.  And we search even
 951                beyond the found float position.  These pessimizations
 952                are historical artefacts only.  */
 953           find_anchor:
 954             while (t < strend - prog->minlen) {
 955                 if (*t == '\n') {
 956                     if (t < check_at - prog->check_offset_min) {
 957                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 958                             /* Since we moved from the found position,
 959                                we definitely contradict the found anchored
 960                                substr.  Due to the above check we do not
 961                                contradict "check" substr.
 962                                Thus we can arrive here only if check substr
 963                                is float.  Redo checking for "other"=="fixed".
 964                              */
 965                             strpos = t + 1;
 966                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 967                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 968                             goto do_other_anchored;
 969                         }
 970                         /* We don't contradict the found floating substring. */
 971                         /* XXXX Why not check for STCLASS? */
 972                         s = t + 1;
 973                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 974                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 975                         goto set_useful;
 976                     }
 977                     /* Position contradicts check-string */
 978                     /* XXXX probably better to look for check-string
 979                        than for "\n", so one should lower the limit for t? */
 980                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 981                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 982                     other_last = strpos = s = t + 1;
 983                     goto restart;
 984                 }
 985                 t++;
 986             }
 987             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 988                         PL_colors[0], PL_colors[1]));
 989             goto fail_finish;
 990         }
 991         else {
 992             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 993                         PL_colors[0], PL_colors[1]));
 994         }
 995         s = t;
 996       set_useful:
 997         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 998     }
 999     else {
1000         /* The found string does not prohibit matching at strpos,
1001            - no optimization of calling REx engine can be performed,
1002            unless it was an MBOL and we are not after MBOL,
1003            or a future STCLASS check will fail this. */
1004       try_at_start:
1005         /* Even in this situation we may use MBOL flag if strpos is offset
1006            wrt the start of the string. */
1007         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1008             && (strpos != strbeg) && strpos[-1] != '\n'
1009             /* May be due to an implicit anchor of m{.*foo}  */
1010             && !(prog->intflags & PREGf_IMPLICIT))
1011         {
1012             t = strpos;
1013             goto find_anchor;
1014         }
1015         DEBUG_EXECUTE_r( if (ml_anch)
1016             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1017                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1018         );
1019       success_at_start:
1020         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1021             && (utf8_target ? (
1022                 prog->check_utf8                /* Could be deleted already */
1023                 && --BmUSEFUL(prog->check_utf8) < 0
1024                 && (prog->check_utf8 == prog->float_utf8)
1025             ) : (
1026                 prog->check_substr              /* Could be deleted already */
1027                 && --BmUSEFUL(prog->check_substr) < 0
1028                 && (prog->check_substr == prog->float_substr)
1029             )))
1030         {
1031             /* If flags & SOMETHING - do not do it many times on the same match */
1032             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1033             /* XXX Does the destruction order has to change with utf8_target? */
1034             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1035             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1036             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1037             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1038             check = NULL;                       /* abort */
1039             s = strpos;
1040             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1041                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1042             if (prog->intflags & PREGf_IMPLICIT)
1043                 prog->extflags &= ~RXf_ANCH_MBOL;
1044             /* XXXX This is a remnant of the old implementation.  It
1045                     looks wasteful, since now INTUIT can use many
1046                     other heuristics. */
1047             prog->extflags &= ~RXf_USE_INTUIT;
1048             /* XXXX What other flags might need to be cleared in this branch? */
1049         }
1050         else
1051             s = strpos;
1052     }
1053
1054     /* Last resort... */
1055     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1056     /* trie stclasses are too expensive to use here, we are better off to
1057        leave it to regmatch itself */
1058     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1059         /* minlen == 0 is possible if regstclass is \b or \B,
1060            and the fixed substr is ''$.
1061            Since minlen is already taken into account, s+1 is before strend;
1062            accidentally, minlen >= 1 guaranties no false positives at s + 1
1063            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1064            regstclass does not come from lookahead...  */
1065         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1066            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1067         const U8* const str = (U8*)STRING(progi->regstclass);
1068         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1069                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1070                     : 1);
1071         char * endpos;
1072         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1073             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1074         else if (prog->float_substr || prog->float_utf8)
1075             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1076         else
1077             endpos= strend;
1078
1079         if (checked_upto < s)
1080            checked_upto = s;
1081         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1082                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1083
1084         t = s;
1085         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1086         if (s) {
1087             checked_upto = s;
1088         } else {
1089 #ifdef DEBUGGING
1090             const char *what = NULL;
1091 #endif
1092             if (endpos == strend) {
1093                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1094                                 "Could not match STCLASS...\n") );
1095                 goto fail;
1096             }
1097             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1098                                    "This position contradicts STCLASS...\n") );
1099             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1100                 goto fail;
1101             checked_upto = HOPBACKc(endpos, start_shift);
1102             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1103                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1104             /* Contradict one of substrings */
1105             if (prog->anchored_substr || prog->anchored_utf8) {
1106                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1107                     DEBUG_EXECUTE_r( what = "anchored" );
1108                   hop_and_restart:
1109                     s = HOP3c(t, 1, strend);
1110                     if (s + start_shift + end_shift > strend) {
1111                         /* XXXX Should be taken into account earlier? */
1112                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1113                                                "Could not match STCLASS...\n") );
1114                         goto fail;
1115                     }
1116                     if (!check)
1117                         goto giveup;
1118                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119                                 "Looking for %s substr starting at offset %ld...\n",
1120                                  what, (long)(s + start_shift - i_strpos)) );
1121                     goto restart;
1122                 }
1123                 /* Have both, check_string is floating */
1124                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1125                     goto retry_floating_check;
1126                 /* Recheck anchored substring, but not floating... */
1127                 s = check_at;
1128                 if (!check)
1129                     goto giveup;
1130                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1131                           "Looking for anchored substr starting at offset %ld...\n",
1132                           (long)(other_last - i_strpos)) );
1133                 goto do_other_anchored;
1134             }
1135             /* Another way we could have checked stclass at the
1136                current position only: */
1137             if (ml_anch) {
1138                 s = t = t + 1;
1139                 if (!check)
1140                     goto giveup;
1141                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1142                           "Looking for /%s^%s/m starting at offset %ld...\n",
1143                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1144                 goto try_at_offset;
1145             }
1146             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1147                 goto fail;
1148             /* Check is floating substring. */
1149           retry_floating_check:
1150             t = check_at - start_shift;
1151             DEBUG_EXECUTE_r( what = "floating" );
1152             goto hop_and_restart;
1153         }
1154         if (t != s) {
1155             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1156                         "By STCLASS: moving %ld --> %ld\n",
1157                                   (long)(t - i_strpos), (long)(s - i_strpos))
1158                    );
1159         }
1160         else {
1161             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1162                                   "Does not contradict STCLASS...\n");
1163                    );
1164         }
1165     }
1166   giveup:
1167     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1168                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1169                           PL_colors[5], (long)(s - i_strpos)) );
1170     return s;
1171
1172   fail_finish:                          /* Substring not found */
1173     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1174         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1175   fail:
1176     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1177                           PL_colors[4], PL_colors[5]));
1178     return NULL;
1179 }
1180
1181 #define DECL_TRIE_TYPE(scan) \
1182     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1183                     trie_type = ((scan->flags == EXACT) \
1184                               ? (utf8_target ? trie_utf8 : trie_plain) \
1185                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1186
1187 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1188 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1189     STRLEN skiplen;                                                                 \
1190     switch (trie_type) {                                                            \
1191     case trie_utf8_fold:                                                            \
1192         if ( foldlen>0 ) {                                                          \
1193             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1194             foldlen -= len;                                                         \
1195             uscan += len;                                                           \
1196             len=0;                                                                  \
1197         } else {                                                                    \
1198             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1199             len = UTF8SKIP(uc);                                                     \
1200             skiplen = UNISKIP( uvc );                                               \
1201             foldlen -= skiplen;                                                     \
1202             uscan = foldbuf + skiplen;                                              \
1203         }                                                                           \
1204         break;                                                                      \
1205     case trie_latin_utf8_fold:                                                      \
1206         if ( foldlen>0 ) {                                                          \
1207             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1208             foldlen -= len;                                                         \
1209             uscan += len;                                                           \
1210             len=0;                                                                  \
1211         } else {                                                                    \
1212             len = 1;                                                                \
1213             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1214             skiplen = UNISKIP( uvc );                                               \
1215             foldlen -= skiplen;                                                     \
1216             uscan = foldbuf + skiplen;                                              \
1217         }                                                                           \
1218         break;                                                                      \
1219     case trie_utf8:                                                                 \
1220         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1221         break;                                                                      \
1222     case trie_plain:                                                                \
1223         uvc = (UV)*uc;                                                              \
1224         len = 1;                                                                    \
1225     }                                                                               \
1226     if (uvc < 256) {                                                                \
1227         charid = trie->charmap[ uvc ];                                              \
1228     }                                                                               \
1229     else {                                                                          \
1230         charid = 0;                                                                 \
1231         if (widecharmap) {                                                          \
1232             SV** const svpp = hv_fetch(widecharmap,                                 \
1233                         (char*)&uvc, sizeof(UV), 0);                                \
1234             if (svpp)                                                               \
1235                 charid = (U16)SvIV(*svpp);                                          \
1236         }                                                                           \
1237     }                                                                               \
1238 } STMT_END
1239
1240 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1241 STMT_START {                                              \
1242     while (s <= e) {                                      \
1243         if ( (CoNd)                                       \
1244              && (ln == 1 || folder(s, pat_string, ln))    \
1245              && (!reginfo || regtry(reginfo, &s)) )       \
1246             goto got_it;                                  \
1247         s++;                                              \
1248     }                                                     \
1249 } STMT_END
1250
1251 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1252 STMT_START {                                          \
1253     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1254         CoDe                                          \
1255         s += uskip;                                   \
1256     }                                                 \
1257 } STMT_END
1258
1259 #define REXEC_FBC_SCAN(CoDe)                          \
1260 STMT_START {                                          \
1261     while (s < strend) {                              \
1262         CoDe                                          \
1263         s++;                                          \
1264     }                                                 \
1265 } STMT_END
1266
1267 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1268 REXEC_FBC_UTF8_SCAN(                                  \
1269     if (CoNd) {                                       \
1270         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1271             goto got_it;                              \
1272         else                                          \
1273             tmp = doevery;                            \
1274     }                                                 \
1275     else                                              \
1276         tmp = 1;                                      \
1277 )
1278
1279 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1280 REXEC_FBC_SCAN(                                       \
1281     if (CoNd) {                                       \
1282         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1283             goto got_it;                              \
1284         else                                          \
1285             tmp = doevery;                            \
1286     }                                                 \
1287     else                                              \
1288         tmp = 1;                                      \
1289 )
1290
1291 #define REXEC_FBC_TRYIT               \
1292 if ((!reginfo || regtry(reginfo, &s))) \
1293     goto got_it
1294
1295 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1296     if (utf8_target) {                                             \
1297         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1298     }                                                          \
1299     else {                                                     \
1300         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1301     }
1302
1303 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1304     if (utf8_target) {                                             \
1305         UtFpReLoAd;                                            \
1306         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1307     }                                                          \
1308     else {                                                     \
1309         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1310     }
1311
1312 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1313     PL_reg_flags |= RF_tainted;                                \
1314     if (utf8_target) {                                             \
1315         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1316     }                                                          \
1317     else {                                                     \
1318         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1319     }
1320
1321 #define DUMP_EXEC_POS(li,s,doutf8) \
1322     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1323
1324
1325 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1326         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1327         tmp = TEST_NON_UTF8(tmp);                                              \
1328         REXEC_FBC_UTF8_SCAN(                                                   \
1329             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1330                 tmp = !tmp;                                                    \
1331                 IF_SUCCESS;                                                    \
1332             }                                                                  \
1333             else {                                                             \
1334                 IF_FAIL;                                                       \
1335             }                                                                  \
1336         );                                                                     \
1337
1338 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1339         if (s == PL_bostr) {                                                   \
1340             tmp = '\n';                                                        \
1341         }                                                                      \
1342         else {                                                                 \
1343             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1344             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1345         }                                                                      \
1346         tmp = TeSt1_UtF8;                                                      \
1347         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1348         REXEC_FBC_UTF8_SCAN(                                                   \
1349             if (tmp == ! (TeSt2_UtF8)) { \
1350                 tmp = !tmp;                                                    \
1351                 IF_SUCCESS;                                                    \
1352             }                                                                  \
1353             else {                                                             \
1354                 IF_FAIL;                                                       \
1355             }                                                                  \
1356         );                                                                     \
1357
1358 /* The only difference between the BOUND and NBOUND cases is that
1359  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1360  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1361  * with the other one being empty */
1362 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1363     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1364
1365 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1366     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1367
1368 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1369     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1370
1371 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1372     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1373
1374
1375 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1376  * be passed in completely with the variable name being tested, which isn't
1377  * such a clean interface, but this is easier to read than it was before.  We
1378  * are looking for the boundary (or non-boundary between a word and non-word
1379  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1380  * must be different.  Find the "wordness" of the character just prior to this
1381  * one, and compare it with the wordness of this one.  If they differ, we have
1382  * a boundary.  At the beginning of the string, pretend that the previous
1383  * character was a new-line */
1384 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1385     if (utf8_target) {                                                         \
1386                 UTF8_CODE \
1387     }                                                                          \
1388     else {  /* Not utf8 */                                                     \
1389         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1390         tmp = TEST_NON_UTF8(tmp);                                              \
1391         REXEC_FBC_SCAN(                                                        \
1392             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1393                 tmp = !tmp;                                                    \
1394                 IF_SUCCESS;                                                    \
1395             }                                                                  \
1396             else {                                                             \
1397                 IF_FAIL;                                                       \
1398             }                                                                  \
1399         );                                                                     \
1400     }                                                                          \
1401     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1402         goto got_it;
1403
1404 /* We know what class REx starts with.  Try to find this position... */
1405 /* if reginfo is NULL, its a dryrun */
1406 /* annoyingly all the vars in this routine have different names from their counterparts
1407    in regmatch. /grrr */
1408
1409 STATIC char *
1410 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1411     const char *strend, regmatch_info *reginfo)
1412 {
1413         dVAR;
1414         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1415         char *pat_string;   /* The pattern's exactish string */
1416         char *pat_end;      /* ptr to end char of pat_string */
1417         re_fold_t folder;       /* Function for computing non-utf8 folds */
1418         const U8 *fold_array;   /* array for folding ords < 256 */
1419         STRLEN ln;
1420         STRLEN lnc;
1421         STRLEN uskip;
1422         U8 c1;
1423         U8 c2;
1424         char *e;
1425         I32 tmp = 1;    /* Scratch variable? */
1426         const bool utf8_target = PL_reg_match_utf8;
1427         UV utf8_fold_flags = 0;
1428         RXi_GET_DECL(prog,progi);
1429
1430         PERL_ARGS_ASSERT_FIND_BYCLASS;
1431
1432         /* We know what class it must start with. */
1433         switch (OP(c)) {
1434         case ANYOFV:
1435         case ANYOF:
1436             if (utf8_target || OP(c) == ANYOFV) {
1437                 STRLEN inclasslen = strend - s;
1438                 REXEC_FBC_UTF8_CLASS_SCAN(
1439                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1440             }
1441             else {
1442                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1443             }
1444             break;
1445         case CANY:
1446             REXEC_FBC_SCAN(
1447                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1448                     goto got_it;
1449                 else
1450                     tmp = doevery;
1451             );
1452             break;
1453
1454         case EXACTFA:
1455             if (UTF_PATTERN || utf8_target) {
1456                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1457                 goto do_exactf_utf8;
1458             }
1459             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1460             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1461             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1462
1463         case EXACTF:
1464             if (utf8_target) {
1465
1466                 /* regcomp.c already folded this if pattern is in UTF-8 */
1467                 utf8_fold_flags = 0;
1468                 goto do_exactf_utf8;
1469             }
1470             fold_array = PL_fold;
1471             folder = foldEQ;
1472             goto do_exactf_non_utf8;
1473
1474         case EXACTFL:
1475             if (UTF_PATTERN || utf8_target) {
1476                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1477                 goto do_exactf_utf8;
1478             }
1479             fold_array = PL_fold_locale;
1480             folder = foldEQ_locale;
1481             goto do_exactf_non_utf8;
1482
1483         case EXACTFU_SS:
1484             if (UTF_PATTERN) {
1485                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1486             }
1487             goto do_exactf_utf8;
1488
1489         case EXACTFU_TRICKYFOLD:
1490         case EXACTFU:
1491             if (UTF_PATTERN || utf8_target) {
1492                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1493                 goto do_exactf_utf8;
1494             }
1495
1496             /* Any 'ss' in the pattern should have been replaced by regcomp,
1497              * so we don't have to worry here about this single special case
1498              * in the Latin1 range */
1499             fold_array = PL_fold_latin1;
1500             folder = foldEQ_latin1;
1501
1502             /* FALL THROUGH */
1503
1504         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1505                                are no glitches with fold-length differences
1506                                between the target string and pattern */
1507
1508             /* The idea in the non-utf8 EXACTF* cases is to first find the
1509              * first character of the EXACTF* node and then, if necessary,
1510              * case-insensitively compare the full text of the node.  c1 is the
1511              * first character.  c2 is its fold.  This logic will not work for
1512              * Unicode semantics and the german sharp ss, which hence should
1513              * not be compiled into a node that gets here. */
1514             pat_string = STRING(c);
1515             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1516
1517             /* We know that we have to match at least 'ln' bytes (which is the
1518              * same as characters, since not utf8).  If we have to match 3
1519              * characters, and there are only 2 availabe, we know without
1520              * trying that it will fail; so don't start a match past the
1521              * required minimum number from the far end */
1522             e = HOP3c(strend, -((I32)ln), s);
1523
1524             if (!reginfo && e < s) {
1525                 e = s;                  /* Due to minlen logic of intuit() */
1526             }
1527
1528             c1 = *pat_string;
1529             c2 = fold_array[c1];
1530             if (c1 == c2) { /* If char and fold are the same */
1531                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1532             }
1533             else {
1534                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1535             }
1536             break;
1537
1538         do_exactf_utf8:
1539         {
1540             unsigned expansion;
1541
1542
1543             /* If one of the operands is in utf8, we can't use the simpler
1544              * folding above, due to the fact that many different characters
1545              * can have the same fold, or portion of a fold, or different-
1546              * length fold */
1547             pat_string = STRING(c);
1548             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1549             pat_end = pat_string + ln;
1550             lnc = (UTF_PATTERN) /* length to match in characters */
1551                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1552                     : ln;
1553
1554             /* We have 'lnc' characters to match in the pattern, but because of
1555              * multi-character folding, each character in the target can match
1556              * up to 3 characters (Unicode guarantees it will never exceed
1557              * this) if it is utf8-encoded; and up to 2 if not (based on the
1558              * fact that the Latin 1 folds are already determined, and the
1559              * only multi-char fold in that range is the sharp-s folding to
1560              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1561              * string character.  Adjust lnc accordingly, rounding up, so that
1562              * if we need to match at least 4+1/3 chars, that really is 5. */
1563             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1564             lnc = (lnc + expansion - 1) / expansion;
1565
1566             /* As in the non-UTF8 case, if we have to match 3 characters, and
1567              * only 2 are left, it's guaranteed to fail, so don't start a
1568              * match that would require us to go beyond the end of the string
1569              */
1570             e = HOP3c(strend, -((I32)lnc), s);
1571
1572             if (!reginfo && e < s) {
1573                 e = s;                  /* Due to minlen logic of intuit() */
1574             }
1575
1576             /* XXX Note that we could recalculate e to stop the loop earlier,
1577              * as the worst case expansion above will rarely be met, and as we
1578              * go along we would usually find that e moves further to the left.
1579              * This would happen only after we reached the point in the loop
1580              * where if there were no expansion we should fail.  Unclear if
1581              * worth the expense */
1582
1583             while (s <= e) {
1584                 char *my_strend= (char *)strend;
1585                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1586                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1587                     && (!reginfo || regtry(reginfo, &s)) )
1588                 {
1589                     goto got_it;
1590                 }
1591                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1592             }
1593             break;
1594         }
1595         case BOUNDL:
1596             PL_reg_flags |= RF_tainted;
1597             FBC_BOUND(isALNUM_LC,
1598                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1599                       isALNUM_LC_utf8((U8*)s));
1600             break;
1601         case NBOUNDL:
1602             PL_reg_flags |= RF_tainted;
1603             FBC_NBOUND(isALNUM_LC,
1604                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1605                        isALNUM_LC_utf8((U8*)s));
1606             break;
1607         case BOUND:
1608             FBC_BOUND(isWORDCHAR,
1609                       isALNUM_uni(tmp),
1610                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1611             break;
1612         case BOUNDA:
1613             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1614                              isWORDCHAR_A(tmp),
1615                              isWORDCHAR_A((U8*)s));
1616             break;
1617         case NBOUND:
1618             FBC_NBOUND(isWORDCHAR,
1619                        isALNUM_uni(tmp),
1620                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1621             break;
1622         case NBOUNDA:
1623             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1624                               isWORDCHAR_A(tmp),
1625                               isWORDCHAR_A((U8*)s));
1626             break;
1627         case BOUNDU:
1628             FBC_BOUND(isWORDCHAR_L1,
1629                       isALNUM_uni(tmp),
1630                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1631             break;
1632         case NBOUNDU:
1633             FBC_NBOUND(isWORDCHAR_L1,
1634                        isALNUM_uni(tmp),
1635                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1636             break;
1637         case ALNUML:
1638             REXEC_FBC_CSCAN_TAINT(
1639                 isALNUM_LC_utf8((U8*)s),
1640                 isALNUM_LC(*s)
1641             );
1642             break;
1643         case ALNUMU:
1644             REXEC_FBC_CSCAN_PRELOAD(
1645                 LOAD_UTF8_CHARCLASS_ALNUM(),
1646                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1647                 isWORDCHAR_L1((U8) *s)
1648             );
1649             break;
1650         case ALNUM:
1651             REXEC_FBC_CSCAN_PRELOAD(
1652                 LOAD_UTF8_CHARCLASS_ALNUM(),
1653                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1654                 isWORDCHAR((U8) *s)
1655             );
1656             break;
1657         case ALNUMA:
1658             /* Don't need to worry about utf8, as it can match only a single
1659              * byte invariant character */
1660             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1661             break;
1662         case NALNUMU:
1663             REXEC_FBC_CSCAN_PRELOAD(
1664                 LOAD_UTF8_CHARCLASS_ALNUM(),
1665                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1666                 ! isWORDCHAR_L1((U8) *s)
1667             );
1668             break;
1669         case NALNUM:
1670             REXEC_FBC_CSCAN_PRELOAD(
1671                 LOAD_UTF8_CHARCLASS_ALNUM(),
1672                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1673                 ! isALNUM(*s)
1674             );
1675             break;
1676         case NALNUMA:
1677             REXEC_FBC_CSCAN(
1678                 !isWORDCHAR_A(*s),
1679                 !isWORDCHAR_A(*s)
1680             );
1681             break;
1682         case NALNUML:
1683             REXEC_FBC_CSCAN_TAINT(
1684                 !isALNUM_LC_utf8((U8*)s),
1685                 !isALNUM_LC(*s)
1686             );
1687             break;
1688         case SPACEU:
1689             REXEC_FBC_CSCAN_PRELOAD(
1690                 LOAD_UTF8_CHARCLASS_SPACE(),
1691                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1692                 isSPACE_L1((U8) *s)
1693             );
1694             break;
1695         case SPACE:
1696             REXEC_FBC_CSCAN_PRELOAD(
1697                 LOAD_UTF8_CHARCLASS_SPACE(),
1698                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1699                 isSPACE((U8) *s)
1700             );
1701             break;
1702         case SPACEA:
1703             /* Don't need to worry about utf8, as it can match only a single
1704              * byte invariant character */
1705             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1706             break;
1707         case SPACEL:
1708             REXEC_FBC_CSCAN_TAINT(
1709                 isSPACE_LC_utf8((U8*)s),
1710                 isSPACE_LC(*s)
1711             );
1712             break;
1713         case NSPACEU:
1714             REXEC_FBC_CSCAN_PRELOAD(
1715                 LOAD_UTF8_CHARCLASS_SPACE(),
1716                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1717                 ! isSPACE_L1((U8) *s)
1718             );
1719             break;
1720         case NSPACE:
1721             REXEC_FBC_CSCAN_PRELOAD(
1722                 LOAD_UTF8_CHARCLASS_SPACE(),
1723                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1724                 ! isSPACE((U8) *s)
1725             );
1726             break;
1727         case NSPACEA:
1728             REXEC_FBC_CSCAN(
1729                 !isSPACE_A(*s),
1730                 !isSPACE_A(*s)
1731             );
1732             break;
1733         case NSPACEL:
1734             REXEC_FBC_CSCAN_TAINT(
1735                 !isSPACE_LC_utf8((U8*)s),
1736                 !isSPACE_LC(*s)
1737             );
1738             break;
1739         case DIGIT:
1740             REXEC_FBC_CSCAN_PRELOAD(
1741                 LOAD_UTF8_CHARCLASS_DIGIT(),
1742                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1743                 isDIGIT(*s)
1744             );
1745             break;
1746         case DIGITA:
1747             /* Don't need to worry about utf8, as it can match only a single
1748              * byte invariant character */
1749             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1750             break;
1751         case DIGITL:
1752             REXEC_FBC_CSCAN_TAINT(
1753                 isDIGIT_LC_utf8((U8*)s),
1754                 isDIGIT_LC(*s)
1755             );
1756             break;
1757         case NDIGIT:
1758             REXEC_FBC_CSCAN_PRELOAD(
1759                 LOAD_UTF8_CHARCLASS_DIGIT(),
1760                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1761                 !isDIGIT(*s)
1762             );
1763             break;
1764         case NDIGITA:
1765             REXEC_FBC_CSCAN(
1766                 !isDIGIT_A(*s),
1767                 !isDIGIT_A(*s)
1768             );
1769             break;
1770         case NDIGITL:
1771             REXEC_FBC_CSCAN_TAINT(
1772                 !isDIGIT_LC_utf8((U8*)s),
1773                 !isDIGIT_LC(*s)
1774             );
1775             break;
1776         case LNBREAK:
1777             REXEC_FBC_CSCAN(
1778                 is_LNBREAK_utf8(s),
1779                 is_LNBREAK_latin1(s)
1780             );
1781             break;
1782         case VERTWS:
1783             REXEC_FBC_CSCAN(
1784                 is_VERTWS_utf8(s),
1785                 is_VERTWS_latin1(s)
1786             );
1787             break;
1788         case NVERTWS:
1789             REXEC_FBC_CSCAN(
1790                 !is_VERTWS_utf8(s),
1791                 !is_VERTWS_latin1(s)
1792             );
1793             break;
1794         case HORIZWS:
1795             REXEC_FBC_CSCAN(
1796                 is_HORIZWS_utf8(s),
1797                 is_HORIZWS_latin1(s)
1798             );
1799             break;
1800         case NHORIZWS:
1801             REXEC_FBC_CSCAN(
1802                 !is_HORIZWS_utf8(s),
1803                 !is_HORIZWS_latin1(s)
1804             );
1805             break;
1806         case POSIXA:
1807             /* Don't need to worry about utf8, as it can match only a single
1808             * byte invariant character.  The flag in this node type is the
1809             * class number to pass to _generic_isCC() to build a mask for
1810             * searching in PL_charclass[] */
1811             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1812             break;
1813         case NPOSIXA:
1814             REXEC_FBC_CSCAN(
1815                 !_generic_isCC_A(*s, FLAGS(c)),
1816                 !_generic_isCC_A(*s, FLAGS(c))
1817             );
1818             break;
1819
1820         case AHOCORASICKC:
1821         case AHOCORASICK:
1822             {
1823                 DECL_TRIE_TYPE(c);
1824                 /* what trie are we using right now */
1825                 reg_ac_data *aho
1826                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1827                 reg_trie_data *trie
1828                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1829                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1830
1831                 const char *last_start = strend - trie->minlen;
1832 #ifdef DEBUGGING
1833                 const char *real_start = s;
1834 #endif
1835                 STRLEN maxlen = trie->maxlen;
1836                 SV *sv_points;
1837                 U8 **points; /* map of where we were in the input string
1838                                 when reading a given char. For ASCII this
1839                                 is unnecessary overhead as the relationship
1840                                 is always 1:1, but for Unicode, especially
1841                                 case folded Unicode this is not true. */
1842                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1843                 U8 *bitmap=NULL;
1844
1845
1846                 GET_RE_DEBUG_FLAGS_DECL;
1847
1848                 /* We can't just allocate points here. We need to wrap it in
1849                  * an SV so it gets freed properly if there is a croak while
1850                  * running the match */
1851                 ENTER;
1852                 SAVETMPS;
1853                 sv_points=newSV(maxlen * sizeof(U8 *));
1854                 SvCUR_set(sv_points,
1855                     maxlen * sizeof(U8 *));
1856                 SvPOK_on(sv_points);
1857                 sv_2mortal(sv_points);
1858                 points=(U8**)SvPV_nolen(sv_points );
1859                 if ( trie_type != trie_utf8_fold
1860                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1861                 {
1862                     if (trie->bitmap)
1863                         bitmap=(U8*)trie->bitmap;
1864                     else
1865                         bitmap=(U8*)ANYOF_BITMAP(c);
1866                 }
1867                 /* this is the Aho-Corasick algorithm modified a touch
1868                    to include special handling for long "unknown char"
1869                    sequences. The basic idea being that we use AC as long
1870                    as we are dealing with a possible matching char, when
1871                    we encounter an unknown char (and we have not encountered
1872                    an accepting state) we scan forward until we find a legal
1873                    starting char.
1874                    AC matching is basically that of trie matching, except
1875                    that when we encounter a failing transition, we fall back
1876                    to the current states "fail state", and try the current char
1877                    again, a process we repeat until we reach the root state,
1878                    state 1, or a legal transition. If we fail on the root state
1879                    then we can either terminate if we have reached an accepting
1880                    state previously, or restart the entire process from the beginning
1881                    if we have not.
1882
1883                  */
1884                 while (s <= last_start) {
1885                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1886                     U8 *uc = (U8*)s;
1887                     U16 charid = 0;
1888                     U32 base = 1;
1889                     U32 state = 1;
1890                     UV uvc = 0;
1891                     STRLEN len = 0;
1892                     STRLEN foldlen = 0;
1893                     U8 *uscan = (U8*)NULL;
1894                     U8 *leftmost = NULL;
1895 #ifdef DEBUGGING
1896                     U32 accepted_word= 0;
1897 #endif
1898                     U32 pointpos = 0;
1899
1900                     while ( state && uc <= (U8*)strend ) {
1901                         int failed=0;
1902                         U32 word = aho->states[ state ].wordnum;
1903
1904                         if( state==1 ) {
1905                             if ( bitmap ) {
1906                                 DEBUG_TRIE_EXECUTE_r(
1907                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1908                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1909                                             (char *)uc, utf8_target );
1910                                         PerlIO_printf( Perl_debug_log,
1911                                             " Scanning for legal start char...\n");
1912                                     }
1913                                 );
1914                                 if (utf8_target) {
1915                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1916                                         uc += UTF8SKIP(uc);
1917                                     }
1918                                 } else {
1919                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1920                                         uc++;
1921                                     }
1922                                 }
1923                                 s= (char *)uc;
1924                             }
1925                             if (uc >(U8*)last_start) break;
1926                         }
1927
1928                         if ( word ) {
1929                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1930                             if (!leftmost || lpos < leftmost) {
1931                                 DEBUG_r(accepted_word=word);
1932                                 leftmost= lpos;
1933                             }
1934                             if (base==0) break;
1935
1936                         }
1937                         points[pointpos++ % maxlen]= uc;
1938                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1939                                              uscan, len, uvc, charid, foldlen,
1940                                              foldbuf, uniflags);
1941                         DEBUG_TRIE_EXECUTE_r({
1942                             dump_exec_pos( (char *)uc, c, strend, real_start,
1943                                 s,   utf8_target );
1944                             PerlIO_printf(Perl_debug_log,
1945                                 " Charid:%3u CP:%4"UVxf" ",
1946                                  charid, uvc);
1947                         });
1948
1949                         do {
1950 #ifdef DEBUGGING
1951                             word = aho->states[ state ].wordnum;
1952 #endif
1953                             base = aho->states[ state ].trans.base;
1954
1955                             DEBUG_TRIE_EXECUTE_r({
1956                                 if (failed)
1957                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1958                                         s,   utf8_target );
1959                                 PerlIO_printf( Perl_debug_log,
1960                                     "%sState: %4"UVxf", word=%"UVxf,
1961                                     failed ? " Fail transition to " : "",
1962                                     (UV)state, (UV)word);
1963                             });
1964                             if ( base ) {
1965                                 U32 tmp;
1966                                 I32 offset;
1967                                 if (charid &&
1968                                      ( ((offset = base + charid
1969                                         - 1 - trie->uniquecharcount)) >= 0)
1970                                      && ((U32)offset < trie->lasttrans)
1971                                      && trie->trans[offset].check == state
1972                                      && (tmp=trie->trans[offset].next))
1973                                 {
1974                                     DEBUG_TRIE_EXECUTE_r(
1975                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1976                                     state = tmp;
1977                                     break;
1978                                 }
1979                                 else {
1980                                     DEBUG_TRIE_EXECUTE_r(
1981                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1982                                     failed = 1;
1983                                     state = aho->fail[state];
1984                                 }
1985                             }
1986                             else {
1987                                 /* we must be accepting here */
1988                                 DEBUG_TRIE_EXECUTE_r(
1989                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1990                                 failed = 1;
1991                                 break;
1992                             }
1993                         } while(state);
1994                         uc += len;
1995                         if (failed) {
1996                             if (leftmost)
1997                                 break;
1998                             if (!state) state = 1;
1999                         }
2000                     }
2001                     if ( aho->states[ state ].wordnum ) {
2002                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2003                         if (!leftmost || lpos < leftmost) {
2004                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2005                             leftmost = lpos;
2006                         }
2007                     }
2008                     if (leftmost) {
2009                         s = (char*)leftmost;
2010                         DEBUG_TRIE_EXECUTE_r({
2011                             PerlIO_printf(
2012                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2013                                 (UV)accepted_word, (IV)(s - real_start)
2014                             );
2015                         });
2016                         if (!reginfo || regtry(reginfo, &s)) {
2017                             FREETMPS;
2018                             LEAVE;
2019                             goto got_it;
2020                         }
2021                         s = HOPc(s,1);
2022                         DEBUG_TRIE_EXECUTE_r({
2023                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2024                         });
2025                     } else {
2026                         DEBUG_TRIE_EXECUTE_r(
2027                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2028                         break;
2029                     }
2030                 }
2031                 FREETMPS;
2032                 LEAVE;
2033             }
2034             break;
2035         default:
2036             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2037             break;
2038         }
2039         return 0;
2040       got_it:
2041         return s;
2042 }
2043
2044
2045 /*
2046  - regexec_flags - match a regexp against a string
2047  */
2048 I32
2049 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2050               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2051 /* stringarg: the point in the string at which to begin matching */
2052 /* strend:    pointer to null at end of string */
2053 /* strbeg:    real beginning of string */
2054 /* minend:    end of match must be >= minend bytes after stringarg. */
2055 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2056  *            itself is accessed via the pointers above */
2057 /* data:      May be used for some additional optimizations.
2058               Currently its only used, with a U32 cast, for transmitting
2059               the ganch offset when doing a /g match. This will change */
2060 /* nosave:    For optimizations. */
2061
2062 {
2063     dVAR;
2064     struct regexp *const prog = (struct regexp *)SvANY(rx);
2065     /*register*/ char *s;
2066     regnode *c;
2067     /*register*/ char *startpos = stringarg;
2068     I32 minlen;         /* must match at least this many chars */
2069     I32 dontbother = 0; /* how many characters not to try at end */
2070     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2071     I32 scream_pos = -1;                /* Internal iterator of scream. */
2072     char *scream_olds = NULL;
2073     const bool utf8_target = cBOOL(DO_UTF8(sv));
2074     I32 multiline;
2075     RXi_GET_DECL(prog,progi);
2076     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2077     regexp_paren_pair *swap = NULL;
2078     GET_RE_DEBUG_FLAGS_DECL;
2079
2080     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2081     PERL_UNUSED_ARG(data);
2082
2083     /* Be paranoid... */
2084     if (prog == NULL || startpos == NULL) {
2085         Perl_croak(aTHX_ "NULL regexp parameter");
2086         return 0;
2087     }
2088
2089     multiline = prog->extflags & RXf_PMf_MULTILINE;
2090     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2091
2092     RX_MATCH_UTF8_set(rx, utf8_target);
2093     DEBUG_EXECUTE_r(
2094         debug_start_match(rx, utf8_target, startpos, strend,
2095         "Matching");
2096     );
2097
2098     minlen = prog->minlen;
2099
2100     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2101         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2102                               "String too short [regexec_flags]...\n"));
2103         goto phooey;
2104     }
2105
2106
2107     /* Check validity of program. */
2108     if (UCHARAT(progi->program) != REG_MAGIC) {
2109         Perl_croak(aTHX_ "corrupted regexp program");
2110     }
2111
2112     PL_reg_flags = 0;
2113     PL_reg_state.re_state_eval_setup_done = FALSE;
2114     PL_reg_maxiter = 0;
2115
2116     if (RX_UTF8(rx))
2117         PL_reg_flags |= RF_utf8;
2118
2119     /* Mark beginning of line for ^ and lookbehind. */
2120     reginfo.bol = startpos; /* XXX not used ??? */
2121     PL_bostr  = strbeg;
2122     reginfo.sv = sv;
2123
2124     /* Mark end of line for $ (and such) */
2125     PL_regeol = strend;
2126
2127     /* see how far we have to get to not match where we matched before */
2128     reginfo.till = startpos+minend;
2129
2130     /* If there is a "must appear" string, look for it. */
2131     s = startpos;
2132
2133     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2134         MAGIC *mg;
2135         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2136             reginfo.ganch = startpos + prog->gofs;
2137             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2138               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2139         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2140                   && SvMAGIC(sv)
2141                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2142                   && mg->mg_len >= 0) {
2143             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2144             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2145                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2146
2147             if (prog->extflags & RXf_ANCH_GPOS) {
2148                 if (s > reginfo.ganch)
2149                     goto phooey;
2150                 s = reginfo.ganch - prog->gofs;
2151                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2152                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2153                 if (s < strbeg)
2154                     goto phooey;
2155             }
2156         }
2157         else if (data) {
2158             reginfo.ganch = strbeg + PTR2UV(data);
2159             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2160                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2161
2162         } else {                                /* pos() not defined */
2163             reginfo.ganch = strbeg;
2164             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2165                  "GPOS: reginfo.ganch = strbeg\n"));
2166         }
2167     }
2168     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2169         /* We have to be careful. If the previous successful match
2170            was from this regex we don't want a subsequent partially
2171            successful match to clobber the old results.
2172            So when we detect this possibility we add a swap buffer
2173            to the re, and switch the buffer each match. If we fail
2174            we switch it back, otherwise we leave it swapped.
2175         */
2176         swap = prog->offs;
2177         /* do we need a save destructor here for eval dies? */
2178         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2179         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2180             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2181             PTR2UV(prog),
2182             PTR2UV(swap),
2183             PTR2UV(prog->offs)
2184         ));
2185     }
2186     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2187         re_scream_pos_data d;
2188
2189         d.scream_olds = &scream_olds;
2190         d.scream_pos = &scream_pos;
2191         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2192         if (!s) {
2193             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2194             goto phooey;        /* not present */
2195         }
2196     }
2197
2198
2199
2200     /* Simplest case:  anchored match need be tried only once. */
2201     /*  [unless only anchor is BOL and multiline is set] */
2202     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2203         if (s == startpos && regtry(&reginfo, &startpos))
2204             goto got_it;
2205         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2206                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2207         {
2208             char *end;
2209
2210             if (minlen)
2211                 dontbother = minlen - 1;
2212             end = HOP3c(strend, -dontbother, strbeg) - 1;
2213             /* for multiline we only have to try after newlines */
2214             if (prog->check_substr || prog->check_utf8) {
2215                 /* because of the goto we can not easily reuse the macros for bifurcating the
2216                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2217                 if (utf8_target) {
2218                     if (s == startpos)
2219                         goto after_try_utf8;
2220                     while (1) {
2221                         if (regtry(&reginfo, &s)) {
2222                             goto got_it;
2223                         }
2224                       after_try_utf8:
2225                         if (s > end) {
2226                             goto phooey;
2227                         }
2228                         if (prog->extflags & RXf_USE_INTUIT) {
2229                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2230                             if (!s) {
2231                                 goto phooey;
2232                             }
2233                         }
2234                         else {
2235                             s += UTF8SKIP(s);
2236                         }
2237                     }
2238                 } /* end search for check string in unicode */
2239                 else {
2240                     if (s == startpos) {
2241                         goto after_try_latin;
2242                     }
2243                     while (1) {
2244                         if (regtry(&reginfo, &s)) {
2245                             goto got_it;
2246                         }
2247                       after_try_latin:
2248                         if (s > end) {
2249                             goto phooey;
2250                         }
2251                         if (prog->extflags & RXf_USE_INTUIT) {
2252                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2253                             if (!s) {
2254                                 goto phooey;
2255                             }
2256                         }
2257                         else {
2258                             s++;
2259                         }
2260                     }
2261                 } /* end search for check string in latin*/
2262             } /* end search for check string */
2263             else { /* search for newline */
2264                 if (s > startpos) {
2265                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2266                     s--;
2267                 }
2268                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2269                 while (s <= end) { /* note it could be possible to match at the end of the string */
2270                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2271                         if (regtry(&reginfo, &s))
2272                             goto got_it;
2273                     }
2274                 }
2275             } /* end search for newline */
2276         } /* end anchored/multiline check string search */
2277         goto phooey;
2278     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2279     {
2280         /* the warning about reginfo.ganch being used without initialization
2281            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2282            and we only enter this block when the same bit is set. */
2283         char *tmp_s = reginfo.ganch - prog->gofs;
2284
2285         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2286             goto got_it;
2287         goto phooey;
2288     }
2289
2290     /* Messy cases:  unanchored match. */
2291     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2292         /* we have /x+whatever/ */
2293         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2294         char ch;
2295 #ifdef DEBUGGING
2296         int did_match = 0;
2297 #endif
2298         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2299             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2300         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2301
2302         if (utf8_target) {
2303             REXEC_FBC_SCAN(
2304                 if (*s == ch) {
2305                     DEBUG_EXECUTE_r( did_match = 1 );
2306                     if (regtry(&reginfo, &s)) goto got_it;
2307                     s += UTF8SKIP(s);
2308                     while (s < strend && *s == ch)
2309                         s += UTF8SKIP(s);
2310                 }
2311             );
2312         }
2313         else {
2314             REXEC_FBC_SCAN(
2315                 if (*s == ch) {
2316                     DEBUG_EXECUTE_r( did_match = 1 );
2317                     if (regtry(&reginfo, &s)) goto got_it;
2318                     s++;
2319                     while (s < strend && *s == ch)
2320                         s++;
2321                 }
2322             );
2323         }
2324         DEBUG_EXECUTE_r(if (!did_match)
2325                 PerlIO_printf(Perl_debug_log,
2326                                   "Did not find anchored character...\n")
2327                );
2328     }
2329     else if (prog->anchored_substr != NULL
2330               || prog->anchored_utf8 != NULL
2331               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2332                   && prog->float_max_offset < strend - s)) {
2333         SV *must;
2334         I32 back_max;
2335         I32 back_min;
2336         char *last;
2337         char *last1;            /* Last position checked before */
2338 #ifdef DEBUGGING
2339         int did_match = 0;
2340 #endif
2341         if (prog->anchored_substr || prog->anchored_utf8) {
2342             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2343                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2344             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2345             back_max = back_min = prog->anchored_offset;
2346         } else {
2347             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2348                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2349             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2350             back_max = prog->float_max_offset;
2351             back_min = prog->float_min_offset;
2352         }
2353
2354
2355         if (must == &PL_sv_undef)
2356             /* could not downgrade utf8 check substring, so must fail */
2357             goto phooey;
2358
2359         if (back_min<0) {
2360             last = strend;
2361         } else {
2362             last = HOP3c(strend,        /* Cannot start after this */
2363                   -(I32)(CHR_SVLEN(must)
2364                          - (SvTAIL(must) != 0) + back_min), strbeg);
2365         }
2366         if (s > PL_bostr)
2367             last1 = HOPc(s, -1);
2368         else
2369             last1 = s - 1;      /* bogus */
2370
2371         /* XXXX check_substr already used to find "s", can optimize if
2372            check_substr==must. */
2373         scream_pos = -1;
2374         dontbother = end_shift;
2375         strend = HOPc(strend, -dontbother);
2376         while ( (s <= last) &&
2377                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2378                                   (unsigned char*)strend, must,
2379                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2380             DEBUG_EXECUTE_r( did_match = 1 );
2381             if (HOPc(s, -back_max) > last1) {
2382                 last1 = HOPc(s, -back_min);
2383                 s = HOPc(s, -back_max);
2384             }
2385             else {
2386                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2387
2388                 last1 = HOPc(s, -back_min);
2389                 s = t;
2390             }
2391             if (utf8_target) {
2392                 while (s <= last1) {
2393                     if (regtry(&reginfo, &s))
2394                         goto got_it;
2395                     s += UTF8SKIP(s);
2396                 }
2397             }
2398             else {
2399                 while (s <= last1) {
2400                     if (regtry(&reginfo, &s))
2401                         goto got_it;
2402                     s++;
2403                 }
2404             }
2405         }
2406         DEBUG_EXECUTE_r(if (!did_match) {
2407             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2408                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2409             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2410                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2411                                ? "anchored" : "floating"),
2412                 quoted, RE_SV_TAIL(must));
2413         });
2414         goto phooey;
2415     }
2416     else if ( (c = progi->regstclass) ) {
2417         if (minlen) {
2418             const OPCODE op = OP(progi->regstclass);
2419             /* don't bother with what can't match */
2420             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2421                 strend = HOPc(strend, -(minlen - 1));
2422         }
2423         DEBUG_EXECUTE_r({
2424             SV * const prop = sv_newmortal();
2425             regprop(prog, prop, c);
2426             {
2427                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2428                     s,strend-s,60);
2429                 PerlIO_printf(Perl_debug_log,
2430                     "Matching stclass %.*s against %s (%d bytes)\n",
2431                     (int)SvCUR(prop), SvPVX_const(prop),
2432                      quoted, (int)(strend - s));
2433             }
2434         });
2435         if (find_byclass(prog, c, s, strend, &reginfo))
2436             goto got_it;
2437         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2438     }
2439     else {
2440         dontbother = 0;
2441         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2442             /* Trim the end. */
2443             char *last= NULL;
2444             SV* float_real;
2445             STRLEN len;
2446             const char *little;
2447
2448             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2449                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2450             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2451
2452             little = SvPV_const(float_real, len);
2453             if (SvTAIL(float_real)) {
2454                     /* This means that float_real contains an artificial \n on the end
2455                      * due to the presence of something like this: /foo$/
2456                      * where we can match both "foo" and "foo\n" at the end of the string.
2457                      * So we have to compare the end of the string first against the float_real
2458                      * without the \n and then against the full float_real with the string.
2459                      * We have to watch out for cases where the string might be smaller
2460                      * than the float_real or the float_real without the \n.
2461                      */
2462                     char *checkpos= strend - len;
2463                     DEBUG_OPTIMISE_r(
2464                         PerlIO_printf(Perl_debug_log,
2465                             "%sChecking for float_real.%s\n",
2466                             PL_colors[4], PL_colors[5]));
2467                     if (checkpos + 1 < strbeg) {
2468                         /* can't match, even if we remove the trailing \n string is too short to match */
2469                         DEBUG_EXECUTE_r(
2470                             PerlIO_printf(Perl_debug_log,
2471                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2472                                 PL_colors[4], PL_colors[5]));
2473                         goto phooey;
2474                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2475                         /* can match, the end of the string matches without the "\n" */
2476                         last = checkpos + 1;
2477                     } else if (checkpos < strbeg) {
2478                         /* cant match, string is too short when the "\n" is included */
2479                         DEBUG_EXECUTE_r(
2480                             PerlIO_printf(Perl_debug_log,
2481                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2482                                 PL_colors[4], PL_colors[5]));
2483                         goto phooey;
2484                     } else if (!multiline) {
2485                         /* non multiline match, so compare with the "\n" at the end of the string */
2486                         if (memEQ(checkpos, little, len)) {
2487                             last= checkpos;
2488                         } else {
2489                             DEBUG_EXECUTE_r(
2490                                 PerlIO_printf(Perl_debug_log,
2491                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2492                                     PL_colors[4], PL_colors[5]));
2493                             goto phooey;
2494                         }
2495                     } else {
2496                         /* multiline match, so we have to search for a place where the full string is located */
2497                         goto find_last;
2498                     }
2499             } else {
2500                   find_last:
2501                     if (len)
2502                         last = rninstr(s, strend, little, little + len);
2503                     else
2504                         last = strend;  /* matching "$" */
2505             }
2506             if (!last) {
2507                 /* at one point this block contained a comment which was probably
2508                  * incorrect, which said that this was a "should not happen" case.
2509                  * Even if it was true when it was written I am pretty sure it is
2510                  * not anymore, so I have removed the comment and replaced it with
2511                  * this one. Yves */
2512                 DEBUG_EXECUTE_r(
2513                     PerlIO_printf(Perl_debug_log,
2514                         "String does not contain required substring, cannot match.\n"
2515                     ));
2516                 goto phooey;
2517             }
2518             dontbother = strend - last + prog->float_min_offset;
2519         }
2520         if (minlen && (dontbother < minlen))
2521             dontbother = minlen - 1;
2522         strend -= dontbother;              /* this one's always in bytes! */
2523         /* We don't know much -- general case. */
2524         if (utf8_target) {
2525             for (;;) {
2526                 if (regtry(&reginfo, &s))
2527                     goto got_it;
2528                 if (s >= strend)
2529                     break;
2530                 s += UTF8SKIP(s);
2531             };
2532         }
2533         else {
2534             do {
2535                 if (regtry(&reginfo, &s))
2536                     goto got_it;
2537             } while (s++ < strend);
2538         }
2539     }
2540
2541     /* Failure. */
2542     goto phooey;
2543
2544 got_it:
2545     DEBUG_BUFFERS_r(
2546         if (swap)
2547             PerlIO_printf(Perl_debug_log,
2548                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2549                 PTR2UV(prog),
2550                 PTR2UV(swap)
2551             );
2552     );
2553     Safefree(swap);
2554     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2555
2556     if (PL_reg_state.re_state_eval_setup_done)
2557         restore_pos(aTHX_ prog);
2558     if (RXp_PAREN_NAMES(prog))
2559         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2560
2561     /* make sure $`, $&, $', and $digit will work later */
2562     if ( !(flags & REXEC_NOT_FIRST) ) {
2563         if (flags & REXEC_COPY_STR) {
2564 #ifdef PERL_OLD_COPY_ON_WRITE
2565             if ((SvIsCOW(sv)
2566                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2567                 if (DEBUG_C_TEST) {
2568                     PerlIO_printf(Perl_debug_log,
2569                                   "Copy on write: regexp capture, type %d\n",
2570                                   (int) SvTYPE(sv));
2571                 }
2572                 RX_MATCH_COPY_FREE(rx);
2573                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2574                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2575                 assert (SvPOKp(prog->saved_copy));
2576                 prog->sublen  = PL_regeol - strbeg;
2577                 prog->suboffset = 0;
2578                 prog->subcoffset = 0;
2579             } else
2580 #endif
2581             {
2582                 I32 min = 0;
2583                 I32 max = PL_regeol - strbeg;
2584                 I32 sublen;
2585
2586                 if (    (flags & REXEC_COPY_SKIP_POST)
2587                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2588                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2589                 ) { /* don't copy $' part of string */
2590                     U32 n = 0;
2591                     max = -1;
2592                     /* calculate the right-most part of the string covered
2593                      * by a capture. Due to look-ahead, this may be to
2594                      * the right of $&, so we have to scan all captures */
2595                     while (n <= prog->lastparen) {
2596                         if (prog->offs[n].end > max)
2597                             max = prog->offs[n].end;
2598                         n++;
2599                     }
2600                     if (max == -1)
2601                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2602                                 ? prog->offs[0].start
2603                                 : 0;
2604                     assert(max >= 0 && max <= PL_regeol - strbeg);
2605                 }
2606
2607                 if (    (flags & REXEC_COPY_SKIP_PRE)
2608                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2609                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2610                 ) { /* don't copy $` part of string */
2611                     U32 n = 0;
2612                     min = max;
2613                     /* calculate the left-most part of the string covered
2614                      * by a capture. Due to look-behind, this may be to
2615                      * the left of $&, so we have to scan all captures */
2616                     while (min && n <= prog->lastparen) {
2617                         if (   prog->offs[n].start != -1
2618                             && prog->offs[n].start < min)
2619                         {
2620                             min = prog->offs[n].start;
2621                         }
2622                         n++;
2623                     }
2624                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2625                         && min >  prog->offs[0].end
2626                     )
2627                         min = prog->offs[0].end;
2628
2629                 }
2630
2631                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2632                 sublen = max - min;
2633
2634                 if (RX_MATCH_COPIED(rx)) {
2635                     if (sublen > prog->sublen)
2636                         prog->subbeg =
2637                                 (char*)saferealloc(prog->subbeg, sublen+1);
2638                 }
2639                 else
2640                     prog->subbeg = (char*)safemalloc(sublen+1);
2641                 Copy(strbeg + min, prog->subbeg, sublen, char);
2642                 prog->subbeg[sublen] = '\0';
2643                 prog->suboffset = min;
2644                 prog->sublen = sublen;
2645                 RX_MATCH_COPIED_on(rx);
2646             }
2647             prog->subcoffset = prog->suboffset;
2648             if (prog->suboffset && utf8_target) {
2649                 /* Convert byte offset to chars.
2650                  * XXX ideally should only compute this if @-/@+
2651                  * has been seen, a la PL_sawampersand ??? */
2652
2653                 /* If there's a direct correspondence between the
2654                  * string which we're matching and the original SV,
2655                  * then we can use the utf8 len cache associated with
2656                  * the SV. In particular, it means that under //g,
2657                  * sv_pos_b2u() will use the previously cached
2658                  * position to speed up working out the new length of
2659                  * subcoffset, rather than counting from the start of
2660                  * the string each time. This stops
2661                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2662                  * from going quadratic */
2663                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2664                     sv_pos_b2u(sv, &(prog->subcoffset));
2665                 else
2666                     prog->subcoffset = utf8_length((U8*)strbeg,
2667                                         (U8*)(strbeg+prog->suboffset));
2668             }
2669         }
2670         else {
2671             RX_MATCH_COPY_FREE(rx);
2672             prog->subbeg = strbeg;
2673             prog->suboffset = 0;
2674             prog->subcoffset = 0;
2675             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2676         }
2677     }
2678
2679     return 1;
2680
2681 phooey:
2682     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2683                           PL_colors[4], PL_colors[5]));
2684     if (PL_reg_state.re_state_eval_setup_done)
2685         restore_pos(aTHX_ prog);
2686     if (swap) {
2687         /* we failed :-( roll it back */
2688         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2689             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2690             PTR2UV(prog),
2691             PTR2UV(prog->offs),
2692             PTR2UV(swap)
2693         ));
2694         Safefree(prog->offs);
2695         prog->offs = swap;
2696     }
2697
2698     return 0;
2699 }
2700
2701
2702 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2703  * Do inc before dec, in case old and new rex are the same */
2704 #define SET_reg_curpm(Re2) \
2705     if (PL_reg_state.re_state_eval_setup_done) {    \
2706         (void)ReREFCNT_inc(Re2);                    \
2707         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2708         PM_SETRE((PL_reg_curpm), (Re2));            \
2709     }
2710
2711
2712 /*
2713  - regtry - try match at specific point
2714  */
2715 STATIC I32                      /* 0 failure, 1 success */
2716 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2717 {
2718     dVAR;
2719     CHECKPOINT lastcp;
2720     REGEXP *const rx = reginfo->prog;
2721     regexp *const prog = (struct regexp *)SvANY(rx);
2722     I32 result;
2723     RXi_GET_DECL(prog,progi);
2724     GET_RE_DEBUG_FLAGS_DECL;
2725
2726     PERL_ARGS_ASSERT_REGTRY;
2727
2728     reginfo->cutpoint=NULL;
2729
2730     if ((prog->extflags & RXf_EVAL_SEEN)
2731         && !PL_reg_state.re_state_eval_setup_done)
2732     {
2733         MAGIC *mg;
2734
2735         PL_reg_state.re_state_eval_setup_done = TRUE;
2736         if (reginfo->sv) {
2737             /* Make $_ available to executed code. */
2738             if (reginfo->sv != DEFSV) {
2739                 SAVE_DEFSV;
2740                 DEFSV_set(reginfo->sv);
2741             }
2742
2743             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2744                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2745                 /* prepare for quick setting of pos */
2746 #ifdef PERL_OLD_COPY_ON_WRITE
2747                 if (SvIsCOW(reginfo->sv))
2748                     sv_force_normal_flags(reginfo->sv, 0);
2749 #endif
2750                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2751                                  &PL_vtbl_mglob, NULL, 0);
2752                 mg->mg_len = -1;
2753             }
2754             PL_reg_magic    = mg;
2755             PL_reg_oldpos   = mg->mg_len;
2756             SAVEDESTRUCTOR_X(restore_pos, prog);
2757         }
2758         if (!PL_reg_curpm) {
2759             Newxz(PL_reg_curpm, 1, PMOP);
2760 #ifdef USE_ITHREADS
2761             {
2762                 SV* const repointer = &PL_sv_undef;
2763                 /* this regexp is also owned by the new PL_reg_curpm, which
2764                    will try to free it.  */
2765                 av_push(PL_regex_padav, repointer);
2766                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2767                 PL_regex_pad = AvARRAY(PL_regex_padav);
2768             }
2769 #endif
2770         }
2771         SET_reg_curpm(rx);
2772         PL_reg_oldcurpm = PL_curpm;
2773         PL_curpm = PL_reg_curpm;
2774         if (RXp_MATCH_COPIED(prog)) {
2775             /*  Here is a serious problem: we cannot rewrite subbeg,
2776                 since it may be needed if this match fails.  Thus
2777                 $` inside (?{}) could fail... */
2778             PL_reg_oldsaved = prog->subbeg;
2779             PL_reg_oldsavedlen = prog->sublen;
2780             PL_reg_oldsavedoffset = prog->suboffset;
2781             PL_reg_oldsavedcoffset = prog->suboffset;
2782 #ifdef PERL_OLD_COPY_ON_WRITE
2783             PL_nrs = prog->saved_copy;
2784 #endif
2785             RXp_MATCH_COPIED_off(prog);
2786         }
2787         else
2788             PL_reg_oldsaved = NULL;
2789         prog->subbeg = PL_bostr;
2790         prog->suboffset = 0;
2791         prog->subcoffset = 0;
2792         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2793     }
2794 #ifdef DEBUGGING
2795     PL_reg_starttry = *startposp;
2796 #endif
2797     prog->offs[0].start = *startposp - PL_bostr;
2798     prog->lastparen = 0;
2799     prog->lastcloseparen = 0;
2800     PL_regsize = 0;
2801
2802     /* XXXX What this code is doing here?!!!  There should be no need
2803        to do this again and again, prog->lastparen should take care of
2804        this!  --ilya*/
2805
2806     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2807      * Actually, the code in regcppop() (which Ilya may be meaning by
2808      * prog->lastparen), is not needed at all by the test suite
2809      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2810      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2811      * Meanwhile, this code *is* needed for the
2812      * above-mentioned test suite tests to succeed.  The common theme
2813      * on those tests seems to be returning null fields from matches.
2814      * --jhi updated by dapm */
2815 #if 1
2816     if (prog->nparens) {
2817         regexp_paren_pair *pp = prog->offs;
2818         I32 i;
2819         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2820             ++pp;
2821             pp->start = -1;
2822             pp->end = -1;
2823         }
2824     }
2825 #endif
2826     REGCP_SET(lastcp);
2827     result = regmatch(reginfo, *startposp, progi->program + 1);
2828     if (result != -1) {
2829         prog->offs[0].end = result;
2830         return 1;
2831     }
2832     if (reginfo->cutpoint)
2833         *startposp= reginfo->cutpoint;
2834     REGCP_UNWIND(lastcp);
2835     return 0;
2836 }
2837
2838
2839 #define sayYES goto yes
2840 #define sayNO goto no
2841 #define sayNO_SILENT goto no_silent
2842
2843 /* we dont use STMT_START/END here because it leads to
2844    "unreachable code" warnings, which are bogus, but distracting. */
2845 #define CACHEsayNO \
2846     if (ST.cache_mask) \
2847        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2848     sayNO
2849
2850 /* this is used to determine how far from the left messages like
2851    'failed...' are printed. It should be set such that messages
2852    are inline with the regop output that created them.
2853 */
2854 #define REPORT_CODE_OFF 32
2855
2856
2857 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2858 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2859
2860 #define SLAB_FIRST(s) (&(s)->states[0])
2861 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2862
2863 /* grab a new slab and return the first slot in it */
2864
2865 STATIC regmatch_state *
2866 S_push_slab(pTHX)
2867 {
2868 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2869     dMY_CXT;
2870 #endif
2871     regmatch_slab *s = PL_regmatch_slab->next;
2872     if (!s) {
2873         Newx(s, 1, regmatch_slab);
2874         s->prev = PL_regmatch_slab;
2875         s->next = NULL;
2876         PL_regmatch_slab->next = s;
2877     }
2878     PL_regmatch_slab = s;
2879     return SLAB_FIRST(s);
2880 }
2881
2882
2883 /* push a new state then goto it */
2884
2885 #define PUSH_STATE_GOTO(state, node, input) \
2886     pushinput = input; \
2887     scan = node; \
2888     st->resume_state = state; \
2889     goto push_state;
2890
2891 /* push a new state with success backtracking, then goto it */
2892
2893 #define PUSH_YES_STATE_GOTO(state, node, input) \
2894     pushinput = input; \
2895     scan = node; \
2896     st->resume_state = state; \
2897     goto push_yes_state;
2898
2899
2900
2901
2902 /*
2903
2904 regmatch() - main matching routine
2905
2906 This is basically one big switch statement in a loop. We execute an op,
2907 set 'next' to point the next op, and continue. If we come to a point which
2908 we may need to backtrack to on failure such as (A|B|C), we push a
2909 backtrack state onto the backtrack stack. On failure, we pop the top
2910 state, and re-enter the loop at the state indicated. If there are no more
2911 states to pop, we return failure.
2912
2913 Sometimes we also need to backtrack on success; for example /A+/, where
2914 after successfully matching one A, we need to go back and try to
2915 match another one; similarly for lookahead assertions: if the assertion
2916 completes successfully, we backtrack to the state just before the assertion
2917 and then carry on.  In these cases, the pushed state is marked as
2918 'backtrack on success too'. This marking is in fact done by a chain of
2919 pointers, each pointing to the previous 'yes' state. On success, we pop to
2920 the nearest yes state, discarding any intermediate failure-only states.
2921 Sometimes a yes state is pushed just to force some cleanup code to be
2922 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2923 it to free the inner regex.
2924
2925 Note that failure backtracking rewinds the cursor position, while
2926 success backtracking leaves it alone.
2927
2928 A pattern is complete when the END op is executed, while a subpattern
2929 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2930 ops trigger the "pop to last yes state if any, otherwise return true"
2931 behaviour.
2932
2933 A common convention in this function is to use A and B to refer to the two
2934 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2935 the subpattern to be matched possibly multiple times, while B is the entire
2936 rest of the pattern. Variable and state names reflect this convention.
2937
2938 The states in the main switch are the union of ops and failure/success of
2939 substates associated with with that op.  For example, IFMATCH is the op
2940 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2941 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2942 successfully matched A and IFMATCH_A_fail is a state saying that we have
2943 just failed to match A. Resume states always come in pairs. The backtrack
2944 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2945 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2946 on success or failure.
2947
2948 The struct that holds a backtracking state is actually a big union, with
2949 one variant for each major type of op. The variable st points to the
2950 top-most backtrack struct. To make the code clearer, within each
2951 block of code we #define ST to alias the relevant union.
2952
2953 Here's a concrete example of a (vastly oversimplified) IFMATCH
2954 implementation:
2955
2956     switch (state) {
2957     ....
2958
2959 #define ST st->u.ifmatch
2960
2961     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2962         ST.foo = ...; // some state we wish to save
2963         ...
2964         // push a yes backtrack state with a resume value of
2965         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2966         // first node of A:
2967         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
2968         // NOTREACHED
2969
2970     case IFMATCH_A: // we have successfully executed A; now continue with B
2971         next = B;
2972         bar = ST.foo; // do something with the preserved value
2973         break;
2974
2975     case IFMATCH_A_fail: // A failed, so the assertion failed
2976         ...;   // do some housekeeping, then ...
2977         sayNO; // propagate the failure
2978
2979 #undef ST
2980
2981     ...
2982     }
2983
2984 For any old-timers reading this who are familiar with the old recursive
2985 approach, the code above is equivalent to:
2986
2987     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2988     {
2989         int foo = ...
2990         ...
2991         if (regmatch(A)) {
2992             next = B;
2993             bar = foo;
2994             break;
2995         }
2996         ...;   // do some housekeeping, then ...
2997         sayNO; // propagate the failure
2998     }
2999
3000 The topmost backtrack state, pointed to by st, is usually free. If you
3001 want to claim it, populate any ST.foo fields in it with values you wish to
3002 save, then do one of
3003
3004         PUSH_STATE_GOTO(resume_state, node, newinput);
3005         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3006
3007 which sets that backtrack state's resume value to 'resume_state', pushes a
3008 new free entry to the top of the backtrack stack, then goes to 'node'.
3009 On backtracking, the free slot is popped, and the saved state becomes the
3010 new free state. An ST.foo field in this new top state can be temporarily
3011 accessed to retrieve values, but once the main loop is re-entered, it
3012 becomes available for reuse.
3013
3014 Note that the depth of the backtrack stack constantly increases during the
3015 left-to-right execution of the pattern, rather than going up and down with
3016 the pattern nesting. For example the stack is at its maximum at Z at the
3017 end of the pattern, rather than at X in the following:
3018
3019     /(((X)+)+)+....(Y)+....Z/
3020
3021 The only exceptions to this are lookahead/behind assertions and the cut,
3022 (?>A), which pop all the backtrack states associated with A before
3023 continuing.
3024
3025 Backtrack state structs are allocated in slabs of about 4K in size.
3026 PL_regmatch_state and st always point to the currently active state,
3027 and PL_regmatch_slab points to the slab currently containing
3028 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3029 allocated, and is never freed until interpreter destruction. When the slab
3030 is full, a new one is allocated and chained to the end. At exit from
3031 regmatch(), slabs allocated since entry are freed.
3032
3033 */
3034
3035
3036 #define DEBUG_STATE_pp(pp)                                  \
3037     DEBUG_STATE_r({                                         \
3038         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3039         PerlIO_printf(Perl_debug_log,                       \
3040             "    %*s"pp" %s%s%s%s%s\n",                     \
3041             depth*2, "",                                    \
3042             PL_reg_name[st->resume_state],                     \
3043             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3044             ((st==yes_state) ? "Y" : ""),                   \
3045             ((st==mark_state) ? "M" : ""),                  \
3046             ((st==yes_state||st==mark_state) ? "]" : "")    \
3047         );                                                  \
3048     });
3049
3050
3051 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3052
3053 #ifdef DEBUGGING
3054
3055 STATIC void
3056 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3057     const char *start, const char *end, const char *blurb)
3058 {
3059     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3060
3061     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3062
3063     if (!PL_colorset)
3064             reginitcolors();
3065     {
3066         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3067             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3068
3069         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3070             start, end - start, 60);
3071
3072         PerlIO_printf(Perl_debug_log,
3073             "%s%s REx%s %s against %s\n",
3074                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3075
3076         if (utf8_target||utf8_pat)
3077             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3078                 utf8_pat ? "pattern" : "",
3079                 utf8_pat && utf8_target ? " and " : "",
3080                 utf8_target ? "string" : ""
3081             );
3082     }
3083 }
3084
3085 STATIC void
3086 S_dump_exec_pos(pTHX_ const char *locinput,
3087                       const regnode *scan,
3088                       const char *loc_regeol,
3089                       const char *loc_bostr,
3090                       const char *loc_reg_starttry,
3091                       const bool utf8_target)
3092 {
3093     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3094     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3095     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3096     /* The part of the string before starttry has one color
3097        (pref0_len chars), between starttry and current
3098        position another one (pref_len - pref0_len chars),
3099        after the current position the third one.
3100        We assume that pref0_len <= pref_len, otherwise we
3101        decrease pref0_len.  */
3102     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3103         ? (5 + taill) - l : locinput - loc_bostr;
3104     int pref0_len;
3105
3106     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3107
3108     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3109         pref_len++;
3110     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3111     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3112         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3113               ? (5 + taill) - pref_len : loc_regeol - locinput);
3114     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3115         l--;
3116     if (pref0_len < 0)
3117         pref0_len = 0;
3118     if (pref0_len > pref_len)
3119         pref0_len = pref_len;
3120     {
3121         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3122
3123         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3124             (locinput - pref_len),pref0_len, 60, 4, 5);
3125
3126         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3127                     (locinput - pref_len + pref0_len),
3128                     pref_len - pref0_len, 60, 2, 3);
3129
3130         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3131                     locinput, loc_regeol - locinput, 10, 0, 1);
3132
3133         const STRLEN tlen=len0+len1+len2;
3134         PerlIO_printf(Perl_debug_log,
3135                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3136                     (IV)(locinput - loc_bostr),
3137                     len0, s0,
3138                     len1, s1,
3139                     (docolor ? "" : "> <"),
3140                     len2, s2,
3141                     (int)(tlen > 19 ? 0 :  19 - tlen),
3142                     "");
3143     }
3144 }
3145
3146 #endif
3147
3148 /* reg_check_named_buff_matched()
3149  * Checks to see if a named buffer has matched. The data array of
3150  * buffer numbers corresponding to the buffer is expected to reside
3151  * in the regexp->data->data array in the slot stored in the ARG() of
3152  * node involved. Note that this routine doesn't actually care about the
3153  * name, that information is not preserved from compilation to execution.
3154  * Returns the index of the leftmost defined buffer with the given name
3155  * or 0 if non of the buffers matched.
3156  */
3157 STATIC I32
3158 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3159 {
3160     I32 n;
3161     RXi_GET_DECL(rex,rexi);
3162     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3163     I32 *nums=(I32*)SvPVX(sv_dat);
3164
3165     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3166
3167     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3168         if ((I32)rex->lastparen >= nums[n] &&
3169             rex->offs[nums[n]].end != -1)
3170         {
3171             return nums[n];
3172         }
3173     }
3174     return 0;
3175 }
3176
3177
3178 /* free all slabs above current one  - called during LEAVE_SCOPE */
3179
3180 STATIC void
3181 S_clear_backtrack_stack(pTHX_ void *p)
3182 {
3183     regmatch_slab *s = PL_regmatch_slab->next;
3184     PERL_UNUSED_ARG(p);
3185
3186     if (!s)
3187         return;
3188     PL_regmatch_slab->next = NULL;
3189     while (s) {
3190         regmatch_slab * const osl = s;
3191         s = s->next;
3192         Safefree(osl);
3193     }
3194 }
3195
3196
3197 /* returns -1 on failure, $+[0] on success */
3198 STATIC I32
3199 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3200 {
3201 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3202     dMY_CXT;
3203 #endif
3204     dVAR;
3205     const bool utf8_target = PL_reg_match_utf8;
3206     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3207     REGEXP *rex_sv = reginfo->prog;
3208     regexp *rex = (struct regexp *)SvANY(rex_sv);
3209     RXi_GET_DECL(rex,rexi);
3210     I32 oldsave;
3211     /* the current state. This is a cached copy of PL_regmatch_state */
3212     regmatch_state *st;
3213     /* cache heavy used fields of st in registers */
3214     regnode *scan;
3215     regnode *next;
3216     U32 n = 0;  /* general value; init to avoid compiler warning */
3217     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3218     char *locinput = startpos;
3219     char *pushinput; /* where to continue after a PUSH */
3220     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3221
3222     bool result = 0;        /* return value of S_regmatch */
3223     int depth = 0;          /* depth of backtrack stack */
3224     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3225     const U32 max_nochange_depth =
3226         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3227         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3228     regmatch_state *yes_state = NULL; /* state to pop to on success of
3229                                                             subpattern */
3230     /* mark_state piggy backs on the yes_state logic so that when we unwind
3231        the stack on success we can update the mark_state as we go */
3232     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3233     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3234     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3235     U32 state_num;
3236     bool no_final = 0;      /* prevent failure from backtracking? */
3237     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3238     char *startpoint = locinput;
3239     SV *popmark = NULL;     /* are we looking for a mark? */
3240     SV *sv_commit = NULL;   /* last mark name seen in failure */
3241     SV *sv_yes_mark = NULL; /* last mark name we have seen
3242                                during a successful match */
3243     U32 lastopen = 0;       /* last open we saw */
3244     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3245     SV* const oreplsv = GvSV(PL_replgv);
3246     /* these three flags are set by various ops to signal information to
3247      * the very next op. They have a useful lifetime of exactly one loop
3248      * iteration, and are not preserved or restored by state pushes/pops
3249      */
3250     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3251     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3252     int logical = 0;        /* the following EVAL is:
3253                                 0: (?{...})
3254                                 1: (?(?{...})X|Y)
3255                                 2: (??{...})
3256                                or the following IFMATCH/UNLESSM is:
3257                                 false: plain (?=foo)
3258                                 true:  used as a condition: (?(?=foo))
3259                             */
3260     PAD* last_pad = NULL;
3261     dMULTICALL;
3262     I32 gimme = G_SCALAR;
3263     CV *caller_cv = NULL;       /* who called us */
3264     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3265     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3266
3267 #ifdef DEBUGGING
3268     GET_RE_DEBUG_FLAGS_DECL;
3269 #endif
3270
3271     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3272     multicall_oldcatch = 0;
3273     multicall_cv = NULL;
3274     cx = NULL;
3275     PERL_UNUSED_VAR(multicall_cop);
3276     PERL_UNUSED_VAR(newsp);
3277
3278
3279     PERL_ARGS_ASSERT_REGMATCH;
3280
3281     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3282             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3283     }));
3284     /* on first ever call to regmatch, allocate first slab */
3285     if (!PL_regmatch_slab) {
3286         Newx(PL_regmatch_slab, 1, regmatch_slab);
3287         PL_regmatch_slab->prev = NULL;
3288         PL_regmatch_slab->next = NULL;
3289         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3290     }
3291
3292     oldsave = PL_savestack_ix;
3293     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3294     SAVEVPTR(PL_regmatch_slab);
3295     SAVEVPTR(PL_regmatch_state);
3296
3297     /* grab next free state slot */
3298     st = ++PL_regmatch_state;
3299     if (st >  SLAB_LAST(PL_regmatch_slab))
3300         st = PL_regmatch_state = S_push_slab(aTHX);
3301
3302     /* Note that nextchr is a byte even in UTF */
3303     nextchr = UCHARAT(locinput);
3304     scan = prog;
3305     while (scan != NULL) {
3306
3307         DEBUG_EXECUTE_r( {
3308             SV * const prop = sv_newmortal();
3309             regnode *rnext=regnext(scan);
3310             DUMP_EXEC_POS( locinput, scan, utf8_target );
3311             regprop(rex, prop, scan);
3312
3313             PerlIO_printf(Perl_debug_log,
3314                     "%3"IVdf":%*s%s(%"IVdf")\n",
3315                     (IV)(scan - rexi->program), depth*2, "",
3316                     SvPVX_const(prop),
3317                     (PL_regkind[OP(scan)] == END || !rnext) ?
3318                         0 : (IV)(rnext - rexi->program));
3319         });
3320
3321         next = scan + NEXT_OFF(scan);
3322         if (next == scan)
3323             next = NULL;
3324         state_num = OP(scan);
3325
3326         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3327       reenter_switch:
3328
3329         switch (state_num) {
3330         case BOL:
3331             if (locinput == PL_bostr)
3332             {
3333                 /* reginfo->till = reginfo->bol; */
3334                 break;
3335             }
3336             sayNO;
3337         case MBOL:
3338             if (locinput == PL_bostr ||
3339                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3340             {
3341                 break;
3342             }
3343             sayNO;
3344         case SBOL:
3345             if (locinput == PL_bostr)
3346                 break;
3347             sayNO;
3348         case GPOS:
3349             if (locinput == reginfo->ganch)
3350                 break;
3351             sayNO;
3352
3353         case KEEPS:
3354             /* update the startpoint */
3355             st->u.keeper.val = rex->offs[0].start;
3356             rex->offs[0].start = locinput - PL_bostr;
3357             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3358             /*NOT-REACHED*/
3359         case KEEPS_next_fail:
3360             /* rollback the start point change */
3361             rex->offs[0].start = st->u.keeper.val;
3362             sayNO_SILENT;
3363             /*NOT-REACHED*/
3364         case EOL:
3365                 goto seol;
3366         case MEOL:
3367             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3368                 sayNO;
3369             break;
3370         case SEOL:
3371           seol:
3372             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3373                 sayNO;
3374             if (PL_regeol - locinput > 1)
3375                 sayNO;
3376             break;
3377         case EOS:
3378             if (PL_regeol != locinput)
3379                 sayNO;
3380             break;
3381         case SANY:
3382             if (!nextchr && locinput >= PL_regeol)
3383                 sayNO;
3384             if (utf8_target) {
3385                 locinput += PL_utf8skip[nextchr];
3386                 if (locinput > PL_regeol)
3387                     sayNO;
3388                 nextchr = UCHARAT(locinput);
3389             }
3390             else
3391                 nextchr = UCHARAT(++locinput);
3392             break;
3393         case CANY:
3394             if (!nextchr && locinput >= PL_regeol)
3395                 sayNO;
3396             nextchr = UCHARAT(++locinput);
3397             break;
3398         case REG_ANY:
3399             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3400                 sayNO;
3401             if (utf8_target) {
3402                 locinput += PL_utf8skip[nextchr];
3403                 if (locinput > PL_regeol)
3404                     sayNO;
3405                 nextchr = UCHARAT(locinput);
3406             }
3407             else
3408                 nextchr = UCHARAT(++locinput);
3409             break;
3410
3411 #undef  ST
3412 #define ST st->u.trie
3413         case TRIEC:
3414             /* In this case the charclass data is available inline so
3415                we can fail fast without a lot of extra overhead.
3416              */
3417             if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3418                 DEBUG_EXECUTE_r(
3419                     PerlIO_printf(Perl_debug_log,
3420                               "%*s  %sfailed to match trie start class...%s\n",
3421                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3422                 );
3423                 sayNO_SILENT;
3424                 assert(0); /* NOTREACHED */
3425             }
3426             /* FALL THROUGH */
3427         case TRIE:
3428             /* the basic plan of execution of the trie is:
3429              * At the beginning, run though all the states, and
3430              * find the longest-matching word. Also remember the position
3431              * of the shortest matching word. For example, this pattern:
3432              *    1  2 3 4    5
3433              *    ab|a|x|abcd|abc
3434              * when matched against the string "abcde", will generate
3435              * accept states for all words except 3, with the longest
3436              * matching word being 4, and the shortest being 1 (with
3437              * the position being after char 1 of the string).
3438              *
3439              * Then for each matching word, in word order (i.e. 1,2,4,5),
3440              * we run the remainder of the pattern; on each try setting
3441              * the current position to the character following the word,
3442              * returning to try the next word on failure.
3443              *
3444              * We avoid having to build a list of words at runtime by
3445              * using a compile-time structure, wordinfo[].prev, which
3446              * gives, for each word, the previous accepting word (if any).
3447              * In the case above it would contain the mappings 1->2, 2->0,
3448              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3449              * the longest word (4 above), a list of all words, by
3450              * following the list of prev pointers; this gives us the
3451              * unordered list 4,5,1,2. Then given the current word we have
3452              * just tried, we can go through the list and find the
3453              * next-biggest word to try (so if we just failed on word 2,
3454              * the next in the list is 4).
3455              *
3456              * Since at runtime we don't record the matching position in
3457              * the string for each word, we have to work that out for
3458              * each word we're about to process. The wordinfo table holds
3459              * the character length of each word; given that we recorded
3460              * at the start: the position of the shortest word and its
3461              * length in chars, we just need to move the pointer the
3462              * difference between the two char lengths. Depending on
3463              * Unicode status and folding, that's cheap or expensive.
3464              *
3465              * This algorithm is optimised for the case where are only a
3466              * small number of accept states, i.e. 0,1, or maybe 2.
3467              * With lots of accepts states, and having to try all of them,
3468              * it becomes quadratic on number of accept states to find all
3469              * the next words.
3470              */
3471
3472             {
3473                 /* what type of TRIE am I? (utf8 makes this contextual) */
3474                 DECL_TRIE_TYPE(scan);
3475
3476                 /* what trie are we using right now */
3477                 reg_trie_data * const trie
3478                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3479                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3480                 U32 state = trie->startstate;
3481
3482                 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3483                     if (trie->states[ state ].wordnum) {
3484                          DEBUG_EXECUTE_r(
3485                             PerlIO_printf(Perl_debug_log,
3486                                           "%*s  %smatched empty string...%s\n",
3487                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3488                         );
3489                         if (!trie->jump)
3490                             break;
3491                     } else {
3492                         DEBUG_EXECUTE_r(
3493                             PerlIO_printf(Perl_debug_log,
3494                                           "%*s  %sfailed to match trie start class...%s\n",
3495                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3496                         );
3497                         sayNO_SILENT;
3498                    }
3499                 }
3500
3501             {
3502                 U8 *uc = ( U8* )locinput;
3503
3504                 STRLEN len = 0;
3505                 STRLEN foldlen = 0;
3506                 U8 *uscan = (U8*)NULL;
3507                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3508                 U32 charcount = 0; /* how many input chars we have matched */
3509                 U32 accepted = 0; /* have we seen any accepting states? */
3510
3511                 ST.jump = trie->jump;
3512                 ST.me = scan;
3513                 ST.firstpos = NULL;
3514                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3515                 ST.nextword = 0;
3516
3517                 /* fully traverse the TRIE; note the position of the
3518                    shortest accept state and the wordnum of the longest
3519                    accept state */
3520
3521                 while ( state && uc <= (U8*)PL_regeol ) {
3522                     U32 base = trie->states[ state ].trans.base;
3523                     UV uvc = 0;
3524                     U16 charid = 0;
3525                     U16 wordnum;
3526                     wordnum = trie->states[ state ].wordnum;
3527
3528                     if (wordnum) { /* it's an accept state */
3529                         if (!accepted) {
3530                             accepted = 1;
3531                             /* record first match position */
3532                             if (ST.longfold) {
3533                                 ST.firstpos = (U8*)locinput;
3534                                 ST.firstchars = 0;
3535                             }
3536                             else {
3537                                 ST.firstpos = uc;
3538                                 ST.firstchars = charcount;
3539                             }
3540                         }
3541                         if (!ST.nextword || wordnum < ST.nextword)
3542                             ST.nextword = wordnum;
3543                         ST.topword = wordnum;
3544                     }
3545
3546                     DEBUG_TRIE_EXECUTE_r({
3547                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3548                                 PerlIO_printf( Perl_debug_log,
3549                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3550                                     2+depth * 2, "", PL_colors[4],
3551                                     (UV)state, (accepted ? 'Y' : 'N'));
3552                     });
3553
3554                     /* read a char and goto next state */
3555                     if ( base ) {
3556                         I32 offset;
3557                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3558                                              uscan, len, uvc, charid, foldlen,
3559                                              foldbuf, uniflags);
3560                         charcount++;
3561                         if (foldlen>0)
3562                             ST.longfold = TRUE;
3563                         if (charid &&
3564                              ( ((offset =
3565                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3566
3567                              && ((U32)offset < trie->lasttrans)
3568                              && trie->trans[offset].check == state)
3569                         {
3570                             state = trie->trans[offset].next;
3571                         }
3572                         else {
3573                             state = 0;
3574                         }
3575                         uc += len;
3576
3577                     }
3578                     else {
3579                         state = 0;
3580                     }
3581                     DEBUG_TRIE_EXECUTE_r(
3582                         PerlIO_printf( Perl_debug_log,
3583                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3584                             charid, uvc, (UV)state, PL_colors[5] );
3585                     );
3586                 }
3587                 if (!accepted)
3588                    sayNO;
3589
3590                 /* calculate total number of accept states */
3591                 {
3592                     U16 w = ST.topword;
3593                     accepted = 0;
3594                     while (w) {
3595                         w = trie->wordinfo[w].prev;
3596                         accepted++;
3597                     }
3598                     ST.accepted = accepted;
3599                 }
3600
3601                 DEBUG_EXECUTE_r(
3602                     PerlIO_printf( Perl_debug_log,
3603                         "%*s  %sgot %"IVdf" possible matches%s\n",
3604                         REPORT_CODE_OFF + depth * 2, "",
3605                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3606                 );
3607                 goto trie_first_try; /* jump into the fail handler */
3608             }}
3609             assert(0); /* NOTREACHED */
3610
3611         case TRIE_next_fail: /* we failed - try next alternative */
3612         {
3613             U8 *uc;
3614             if ( ST.jump) {
3615                 REGCP_UNWIND(ST.cp);
3616                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3617             }
3618             if (!--ST.accepted) {
3619                 DEBUG_EXECUTE_r({
3620                     PerlIO_printf( Perl_debug_log,
3621                         "%*s  %sTRIE failed...%s\n",
3622                         REPORT_CODE_OFF+depth*2, "",
3623                         PL_colors[4],
3624                         PL_colors[5] );
3625                 });
3626                 sayNO_SILENT;
3627             }
3628             {
3629                 /* Find next-highest word to process.  Note that this code
3630                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3631                 U16 min = 0;
3632                 U16 word;
3633                 U16 const nextword = ST.nextword;
3634                 reg_trie_wordinfo * const wordinfo
3635                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3636                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3637                     if (word > nextword && (!min || word < min))
3638                         min = word;
3639                 }
3640                 ST.nextword = min;
3641             }
3642
3643           trie_first_try:
3644             if (do_cutgroup) {
3645                 do_cutgroup = 0;
3646                 no_final = 0;
3647             }
3648
3649             if ( ST.jump) {
3650                 ST.lastparen = rex->lastparen;
3651                 ST.lastcloseparen = rex->lastcloseparen;
3652                 REGCP_SET(ST.cp);
3653             }
3654
3655             /* find start char of end of current word */
3656             {
3657                 U32 chars; /* how many chars to skip */
3658                 reg_trie_data * const trie
3659                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3660
3661                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3662                             >=  ST.firstchars);
3663                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3664                             - ST.firstchars;
3665                 uc = ST.firstpos;
3666
3667                 if (ST.longfold) {
3668                     /* the hard option - fold each char in turn and find
3669                      * its folded length (which may be different */
3670                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3671                     STRLEN foldlen;
3672                     STRLEN len;
3673                     UV uvc;
3674                     U8 *uscan;
3675
3676                     while (chars) {
3677                         if (utf8_target) {
3678                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3679                                                     uniflags);
3680                             uc += len;
3681                         }
3682                         else {
3683                             uvc = *uc;
3684                             uc++;
3685                         }
3686                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3687                         uscan = foldbuf;
3688                         while (foldlen) {
3689                             if (!--chars)
3690                                 break;
3691                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3692                                             uniflags);
3693                             uscan += len;
3694                             foldlen -= len;
3695                         }
3696                     }
3697                 }
3698                 else {
3699                     if (utf8_target)
3700                         while (chars--)
3701                             uc += UTF8SKIP(uc);
3702                     else
3703                         uc += chars;
3704                 }
3705             }
3706
3707             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3708                             ? ST.jump[ST.nextword]
3709                             : NEXT_OFF(ST.me));
3710
3711             DEBUG_EXECUTE_r({
3712                 PerlIO_printf( Perl_debug_log,
3713                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3714                     REPORT_CODE_OFF+depth*2, "",
3715                     PL_colors[4],
3716                     ST.nextword,
3717                     PL_colors[5]
3718                     );
3719             });
3720
3721             if (ST.accepted > 1 || has_cutgroup) {
3722                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
3723                 assert(0); /* NOTREACHED */
3724             }
3725             /* only one choice left - just continue */
3726             DEBUG_EXECUTE_r({
3727                 AV *const trie_words
3728                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3729                 SV ** const tmp = av_fetch( trie_words,
3730                     ST.nextword-1, 0 );
3731                 SV *sv= tmp ? sv_newmortal() : NULL;
3732
3733                 PerlIO_printf( Perl_debug_log,
3734                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3735                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3736                     ST.nextword,
3737                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3738                             PL_colors[0], PL_colors[1],
3739                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3740                         )
3741                     : "not compiled under -Dr",
3742                     PL_colors[5] );
3743             });
3744
3745             locinput = (char*)uc;
3746             nextchr = UCHARAT(locinput);
3747             continue; /* execute rest of RE */
3748             assert(0); /* NOTREACHED */
3749         }
3750 #undef  ST
3751
3752         case EXACT: {
3753             char *s = STRING(scan);
3754             ln = STR_LEN(scan);
3755             if (utf8_target != UTF_PATTERN) {
3756                 /* The target and the pattern have differing utf8ness. */
3757                 char *l = locinput;
3758                 const char * const e = s + ln;
3759
3760                 if (utf8_target) {
3761                     /* The target is utf8, the pattern is not utf8.
3762                      * Above-Latin1 code points can't match the pattern;
3763                      * invariants match exactly, and the other Latin1 ones need
3764                      * to be downgraded to a single byte in order to do the
3765                      * comparison.  (If we could be confident that the target
3766                      * is not malformed, this could be refactored to have fewer
3767                      * tests by just assuming that if the first bytes match, it
3768                      * is an invariant, but there are tests in the test suite
3769                      * dealing with (??{...}) which violate this) */
3770                     while (s < e) {
3771                         if (l >= PL_regeol)
3772                              sayNO;
3773                         if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
3774                             sayNO;
3775                         }
3776                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
3777                             if (*l != *s) {
3778                                 sayNO;
3779                             }
3780                             l++;
3781                         }
3782                         else {
3783                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
3784                                 sayNO;
3785                             }
3786                             l += 2;
3787                         }
3788                         s++;
3789                     }
3790                 }
3791                 else {
3792                     /* The target is not utf8, the pattern is utf8. */
3793                     while (s < e) {
3794                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
3795                         {
3796                             sayNO;
3797                         }
3798                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
3799                             if (*s != *l) {
3800                                 sayNO;
3801                             }
3802                             s++;
3803                         }
3804                         else {
3805                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
3806                                 sayNO;
3807                             }
3808                             s += 2;
3809                         }
3810                         l++;
3811                     }
3812                 }
3813                 locinput = l;
3814                 nextchr = UCHARAT(locinput);
3815                 break;
3816             }
3817             /* The target and the pattern have the same utf8ness. */
3818             /* Inline the first character, for speed. */
3819             if (UCHARAT(s) != nextchr)
3820                 sayNO;
3821             if (PL_regeol - locinput < ln)
3822                 sayNO;
3823             if (ln > 1 && memNE(s, locinput, ln))
3824                 sayNO;
3825             locinput += ln;
3826             nextchr = UCHARAT(locinput);
3827             break;
3828             }
3829         case EXACTFL: {
3830             re_fold_t folder;
3831             const U8 * fold_array;
3832             const char * s;
3833             U32 fold_utf8_flags;
3834
3835             PL_reg_flags |= RF_tainted;
3836             folder = foldEQ_locale;
3837             fold_array = PL_fold_locale;
3838             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3839             goto do_exactf;
3840
3841         case EXACTFU_SS:
3842         case EXACTFU_TRICKYFOLD:
3843         case EXACTFU:
3844             folder = foldEQ_latin1;
3845             fold_array = PL_fold_latin1;
3846             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3847             goto do_exactf;
3848
3849         case EXACTFA:
3850             folder = foldEQ_latin1;
3851             fold_array = PL_fold_latin1;
3852             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3853             goto do_exactf;
3854
3855         case EXACTF:
3856             folder = foldEQ;
3857             fold_array = PL_fold;
3858             fold_utf8_flags = 0;
3859
3860           do_exactf:
3861             s = STRING(scan);
3862             ln = STR_LEN(scan);
3863
3864             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3865               /* Either target or the pattern are utf8, or has the issue where
3866                * the fold lengths may differ. */
3867                 const char * const l = locinput;
3868                 char *e = PL_regeol;
3869
3870                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3871                                         l, &e, 0,  utf8_target, fold_utf8_flags))
3872                 {
3873                     sayNO;
3874                 }
3875                 locinput = e;
3876                 nextchr = UCHARAT(locinput);
3877                 break;
3878             }
3879
3880             /* Neither the target nor the pattern are utf8 */
3881             if (UCHARAT(s) != nextchr &&
3882                 UCHARAT(s) != fold_array[nextchr])
3883             {
3884                 sayNO;
3885             }
3886             if (PL_regeol - locinput < ln)
3887                 sayNO;
3888             if (ln > 1 && ! folder(s, locinput, ln))
3889                 sayNO;
3890             locinput += ln;
3891             nextchr = UCHARAT(locinput);
3892             break;
3893         }
3894
3895         /* XXX Could improve efficiency by separating these all out using a
3896          * macro or in-line function.  At that point regcomp.c would no longer
3897          * have to set the FLAGS fields of these */
3898         case BOUNDL:
3899         case NBOUNDL:
3900             PL_reg_flags |= RF_tainted;
3901             /* FALL THROUGH */
3902         case BOUND:
3903         case BOUNDU:
3904         case BOUNDA:
3905         case NBOUND:
3906         case NBOUNDU:
3907         case NBOUNDA:
3908             /* was last char in word? */
3909             if (utf8_target
3910                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3911                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3912             {
3913                 if (locinput == PL_bostr)
3914                     ln = '\n';
3915                 else {
3916                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3917
3918                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3919                 }
3920                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3921                     ln = isALNUM_uni(ln);
3922                     LOAD_UTF8_CHARCLASS_ALNUM();
3923                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3924                 }
3925                 else {
3926                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3927                     n = isALNUM_LC_utf8((U8*)locinput);
3928                 }
3929             }
3930             else {
3931
3932                 /* Here the string isn't utf8, or is utf8 and only ascii
3933                  * characters are to match \w.  In the latter case looking at
3934                  * the byte just prior to the current one may be just the final
3935                  * byte of a multi-byte character.  This is ok.  There are two
3936                  * cases:
3937                  * 1) it is a single byte character, and then the test is doing
3938                  *      just what it's supposed to.
3939                  * 2) it is a multi-byte character, in which case the final
3940                  *      byte is never mistakable for ASCII, and so the test
3941                  *      will say it is not a word character, which is the
3942                  *      correct answer. */
3943                 ln = (locinput != PL_bostr) ?
3944                     UCHARAT(locinput - 1) : '\n';
3945                 switch (FLAGS(scan)) {
3946                     case REGEX_UNICODE_CHARSET:
3947                         ln = isWORDCHAR_L1(ln);
3948                         n = isWORDCHAR_L1(nextchr);
3949                         break;
3950                     case REGEX_LOCALE_CHARSET:
3951                         ln = isALNUM_LC(ln);
3952                         n = isALNUM_LC(nextchr);
3953                         break;
3954                     case REGEX_DEPENDS_CHARSET:
3955                         ln = isALNUM(ln);
3956                         n = isALNUM(nextchr);
3957                         break;
3958                     case REGEX_ASCII_RESTRICTED_CHARSET:
3959                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3960                         ln = isWORDCHAR_A(ln);
3961                         n = isWORDCHAR_A(nextchr);
3962                         break;
3963                     default:
3964                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3965                         break;
3966                 }
3967             }
3968             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3969              * regcomp.sym */
3970             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3971                     sayNO;
3972             break;
3973         case ANYOFV:
3974         case ANYOF:
3975             if (utf8_target || state_num == ANYOFV) {
3976                 STRLEN inclasslen = PL_regeol - locinput;
3977                 if (locinput >= PL_regeol)
3978                     sayNO;
3979
3980                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3981                     sayNO;
3982                 locinput += inclasslen;
3983                 nextchr = UCHARAT(locinput);
3984                 break;
3985             }
3986             else {
3987                 if (nextchr < 0)
3988                     nextchr = UCHARAT(locinput);
3989                 if (!nextchr && locinput >= PL_regeol)
3990                     sayNO;
3991                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3992                     sayNO;
3993                 nextchr = UCHARAT(++locinput);
3994                 break;
3995             }
3996             break;
3997         /* Special char classes - The defines start on line 129 or so */
3998         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3999                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4000                   ALNUMU, NALNUMU, isWORDCHAR_L1,
4001                   ALNUMA, NALNUMA, isWORDCHAR_A,
4002                   alnum, "a");
4003
4004         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
4005                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4006                   SPACEU, NSPACEU, isSPACE_L1,
4007                   SPACEA, NSPACEA, isSPACE_A,
4008                   space, " ");
4009
4010         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
4011                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4012                 DIGITA, NDIGITA, isDIGIT_A,
4013                 digit, "0");
4014
4015         case POSIXA:
4016             if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4017                 sayNO;
4018             }
4019             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4020             nextchr = UCHARAT(++locinput);
4021             break;
4022         case NPOSIXA:
4023             if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
4024                 sayNO;
4025             }
4026             if (utf8_target) {
4027                 locinput += PL_utf8skip[nextchr];
4028                 nextchr = UCHARAT(locinput);
4029             }
4030             else {
4031                 nextchr = UCHARAT(++locinput);
4032             }
4033             break;
4034
4035         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4036                        a Unicode extended Grapheme Cluster */
4037             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4038               extended Grapheme Cluster is:
4039
4040                CR LF
4041                | Prepend* Begin Extend*
4042                | .
4043
4044                Begin is:           ( Special_Begin | ! Control )
4045                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4046                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4047                Control is:         [ GCB_Control  CR  LF ]
4048                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4049
4050                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4051                we can rewrite
4052
4053                    Begin is ( Regular_Begin + Special Begin )
4054
4055                It turns out that 98.4% of all Unicode code points match
4056                Regular_Begin.  Doing it this way eliminates a table match in
4057                the previous implementation for almost all Unicode code points.
4058
4059                There is a subtlety with Prepend* which showed up in testing.
4060                Note that the Begin, and only the Begin is required in:
4061                 | Prepend* Begin Extend*
4062                Also, Begin contains '! Control'.  A Prepend must be a
4063                '!  Control', which means it must also be a Begin.  What it
4064                comes down to is that if we match Prepend* and then find no
4065                suitable Begin afterwards, that if we backtrack the last
4066                Prepend, that one will be a suitable Begin.
4067             */
4068
4069             if (locinput >= PL_regeol)
4070                 sayNO;
4071             if  (! utf8_target) {
4072
4073                 /* Match either CR LF  or '.', as all the other possibilities
4074                  * require utf8 */
4075                 locinput++;         /* Match the . or CR */
4076                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4077                                        match the LF */
4078                     && locinput < PL_regeol
4079                     && UCHARAT(locinput) == '\n') locinput++;
4080             }
4081             else {
4082
4083                 /* Utf8: See if is ( CR LF ); already know that locinput <
4084                  * PL_regeol, so locinput+1 is in bounds */
4085                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
4086                     locinput += 2;
4087                 }
4088                 else {
4089                     STRLEN len;
4090
4091                     /* In case have to backtrack to beginning, then match '.' */
4092                     char *starting = locinput;
4093
4094                     /* In case have to backtrack the last prepend */
4095                     char *previous_prepend = 0;
4096
4097                     LOAD_UTF8_CHARCLASS_GCB();
4098
4099                     /* Match (prepend)*   */
4100                     while (locinput < PL_regeol
4101                            && (len = is_GCB_Prepend_utf8(locinput)))
4102                     {
4103                         previous_prepend = locinput;
4104                         locinput += len;
4105                     }
4106
4107                     /* As noted above, if we matched a prepend character, but
4108                      * the next thing won't match, back off the last prepend we
4109                      * matched, as it is guaranteed to match the begin */
4110                     if (previous_prepend
4111                         && (locinput >=  PL_regeol
4112                             || (! swash_fetch(PL_utf8_X_regular_begin,
4113                                              (U8*)locinput, utf8_target)
4114                                  && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4115                         )
4116                     {
4117                         locinput = previous_prepend;
4118                     }
4119
4120                     /* Note that here we know PL_regeol > locinput, as we
4121                      * tested that upon input to this switch case, and if we
4122                      * moved locinput forward, we tested the result just above
4123                      * and it either passed, or we backed off so that it will
4124                      * now pass */
4125                     if (swash_fetch(PL_utf8_X_regular_begin,
4126                                     (U8*)locinput, utf8_target)) {
4127                         locinput += UTF8SKIP(locinput);
4128                     }
4129                     else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4130
4131                         /* Here did not match the required 'Begin' in the
4132                          * second term.  So just match the very first
4133                          * character, the '.' of the final term of the regex */
4134                         locinput = starting + UTF8SKIP(starting);
4135                         goto exit_utf8;
4136                     } else {
4137
4138                         /* Here is a special begin.  It can be composed of
4139                          * several individual characters.  One possibility is
4140                          * RI+ */
4141                         if ((len = is_GCB_RI_utf8(locinput))) {
4142                             locinput += len;
4143                             while (locinput < PL_regeol
4144                                    && (len = is_GCB_RI_utf8(locinput)))
4145                             {
4146                                 locinput += len;
4147                             }
4148                         } else if ((len = is_GCB_T_utf8(locinput))) {
4149                             /* Another possibility is T+ */
4150                             locinput += len;
4151                             while (locinput < PL_regeol
4152                                 && (len = is_GCB_T_utf8(locinput)))
4153                             {
4154                                 locinput += len;
4155                             }
4156                         } else {
4157
4158                             /* Here, neither RI+ nor T+; must be some other
4159                              * Hangul.  That means it is one of the others: L,
4160                              * LV, LVT or V, and matches:
4161                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4162
4163                             /* Match L*           */
4164                             while (locinput < PL_regeol
4165                                    && (len = is_GCB_L_utf8(locinput)))
4166                             {
4167                                 locinput += len;
4168                             }
4169
4170                             /* Here, have exhausted L*.  If the next character
4171                              * is not an LV, LVT nor V, it means we had to have
4172                              * at least one L, so matches L+ in the original
4173                              * equation, we have a complete hangul syllable.
4174                              * Are done. */
4175
4176                             if (locinput < PL_regeol
4177                                 && is_GCB_LV_LVT_V_utf8(locinput))
4178                             {
4179
4180                                 /* Otherwise keep going.  Must be LV, LVT or V.
4181                                  * See if LVT */
4182                                 if (is_utf8_X_LVT((U8*)locinput)) {
4183                                     locinput += UTF8SKIP(locinput);
4184                                 } else {
4185
4186                                     /* Must be  V or LV.  Take it, then match
4187                                      * V*     */
4188                                     locinput += UTF8SKIP(locinput);
4189                                     while (locinput < PL_regeol
4190                                            && (len = is_GCB_V_utf8(locinput)))
4191                                     {
4192                                         locinput += len;
4193                                     }
4194                                 }
4195
4196                                 /* And any of LV, LVT, or V can be followed
4197                                  * by T*            */
4198                                 while (locinput < PL_regeol
4199                                        && (len = is_GCB_T_utf8(locinput)))
4200                                 {
4201                                     locinput += len;
4202                                 }
4203                             }
4204                         }
4205                     }
4206
4207                     /* Match any extender */
4208                     while (locinput < PL_regeol
4209                             && swash_fetch(PL_utf8_X_extend,
4210                                             (U8*)locinput, utf8_target))
4211                     {
4212                         locinput += UTF8SKIP(locinput);
4213                     }
4214                 }
4215             exit_utf8:
4216                 if (locinput > PL_regeol) sayNO;
4217             }
4218             nextchr = UCHARAT(locinput);
4219             break;
4220
4221         case NREFFL:
4222         {   /* The capture buffer cases.  The ones beginning with N for the
4223                named buffers just convert to the equivalent numbered and
4224                pretend they were called as the corresponding numbered buffer
4225                op.  */
4226             /* don't initialize these in the declaration, it makes C++
4227                unhappy */
4228             char *s;
4229             char type;
4230             re_fold_t folder;
4231             const U8 *fold_array;
4232             UV utf8_fold_flags;
4233
4234             PL_reg_flags |= RF_tainted;
4235             folder = foldEQ_locale;
4236             fold_array = PL_fold_locale;
4237             type = REFFL;
4238             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4239             goto do_nref;
4240
4241         case NREFFA:
4242             folder = foldEQ_latin1;
4243             fold_array = PL_fold_latin1;
4244             type = REFFA;
4245             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4246             goto do_nref;
4247
4248         case NREFFU:
4249             folder = foldEQ_latin1;
4250             fold_array = PL_fold_latin1;
4251             type = REFFU;
4252             utf8_fold_flags = 0;
4253             goto do_nref;
4254
4255         case NREFF:
4256             folder = foldEQ;
4257             fold_array = PL_fold;
4258             type = REFF;
4259             utf8_fold_flags = 0;
4260             goto do_nref;
4261
4262         case NREF:
4263             type = REF;
4264             folder = NULL;
4265             fold_array = NULL;
4266             utf8_fold_flags = 0;
4267           do_nref:
4268
4269             /* For the named back references, find the corresponding buffer
4270              * number */
4271             n = reg_check_named_buff_matched(rex,scan);
4272
4273             if ( ! n ) {
4274                 sayNO;
4275             }
4276             goto do_nref_ref_common;
4277
4278         case REFFL:
4279             PL_reg_flags |= RF_tainted;
4280             folder = foldEQ_locale;
4281             fold_array = PL_fold_locale;
4282             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4283             goto do_ref;
4284
4285         case REFFA:
4286             folder = foldEQ_latin1;
4287             fold_array = PL_fold_latin1;
4288             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4289             goto do_ref;
4290
4291         case REFFU:
4292             folder = foldEQ_latin1;
4293             fold_array = PL_fold_latin1;
4294             utf8_fold_flags = 0;
4295             goto do_ref;
4296
4297         case REFF:
4298             folder = foldEQ;
4299             fold_array = PL_fold;
4300             utf8_fold_flags = 0;
4301             goto do_ref;
4302
4303         case REF:
4304             folder = NULL;
4305             fold_array = NULL;
4306             utf8_fold_flags = 0;
4307
4308           do_ref:
4309             type = OP(scan);
4310             n = ARG(scan);  /* which paren pair */
4311
4312           do_nref_ref_common:
4313             ln = rex->offs[n].start;
4314             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4315             if (rex->lastparen < n || ln == -1)
4316                 sayNO;                  /* Do not match unless seen CLOSEn. */
4317             if (ln == rex->offs[n].end)
4318                 break;
4319
4320             s = PL_bostr + ln;
4321             if (type != REF     /* REF can do byte comparison */
4322                 && (utf8_target || type == REFFU))
4323             { /* XXX handle REFFL better */
4324                 char * limit = PL_regeol;
4325
4326                 /* This call case insensitively compares the entire buffer
4327                     * at s, with the current input starting at locinput, but
4328                     * not going off the end given by PL_regeol, and returns in
4329                     * limit upon success, how much of the current input was
4330                     * matched */
4331                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4332                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4333                 {
4334                     sayNO;
4335                 }
4336                 locinput = limit;
4337                 nextchr = UCHARAT(locinput);
4338                 break;
4339             }
4340
4341             /* Not utf8:  Inline the first character, for speed. */
4342             if (UCHARAT(s) != nextchr &&
4343                 (type == REF ||
4344                  UCHARAT(s) != fold_array[nextchr]))
4345                 sayNO;
4346             ln = rex->offs[n].end - ln;
4347             if (locinput + ln > PL_regeol)
4348                 sayNO;
4349             if (ln > 1 && (type == REF
4350                            ? memNE(s, locinput, ln)
4351                            : ! folder(s, locinput, ln)))
4352                 sayNO;
4353             locinput += ln;
4354             nextchr = UCHARAT(locinput);
4355             break;
4356         }
4357         case NOTHING:
4358         case TAIL:
4359             break;
4360         case BACK:
4361             break;
4362
4363 #undef  ST
4364 #define ST st->u.eval
4365         {
4366             SV *ret;
4367             REGEXP *re_sv;
4368             regexp *re;
4369             regexp_internal *rei;
4370             regnode *startpoint;
4371
4372         case GOSTART:
4373         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4374             if (cur_eval && cur_eval->locinput==locinput) {
4375                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4376                     Perl_croak(aTHX_ "Infinite recursion in regex");
4377                 if ( ++nochange_depth > max_nochange_depth )
4378                     Perl_croak(aTHX_
4379                         "Pattern subroutine nesting without pos change"
4380                         " exceeded limit in regex");
4381             } else {
4382                 nochange_depth = 0;
4383             }
4384             re_sv = rex_sv;
4385             re = rex;
4386             rei = rexi;
4387             if (OP(scan)==GOSUB) {
4388                 startpoint = scan + ARG2L(scan);
4389                 ST.close_paren = ARG(scan);
4390             } else {
4391                 startpoint = rei->program+1;
4392                 ST.close_paren = 0;
4393             }
4394             goto eval_recurse_doit;
4395             assert(0); /* NOTREACHED */
4396         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4397             if (cur_eval && cur_eval->locinput==locinput) {
4398                 if ( ++nochange_depth > max_nochange_depth )
4399                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4400             } else {
4401                 nochange_depth = 0;
4402             }
4403             {
4404                 /* execute the code in the {...} */
4405
4406                 dSP;
4407                 SV ** before;
4408                 OP * const oop = PL_op;
4409                 COP * const ocurcop = PL_curcop;
4410                 OP *nop;
4411                 char *saved_regeol = PL_regeol;
4412                 struct re_save_state saved_state;
4413                 CV *newcv;
4414
4415                 /* save *all* paren positions */
4416                 regcppush(rex, 0);
4417                 REGCP_SET(runops_cp);
4418
4419                 /* To not corrupt the existing regex state while executing the
4420                  * eval we would normally put it on the save stack, like with
4421                  * save_re_context. However, re-evals have a weird scoping so we
4422                  * can't just add ENTER/LEAVE here. With that, things like
4423                  *
4424                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4425                  *
4426                  * would break, as they expect the localisation to be unwound
4427                  * only when the re-engine backtracks through the bit that
4428                  * localised it.
4429                  *
4430                  * What we do instead is just saving the state in a local c
4431                  * variable.
4432                  */
4433                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4434
4435                 PL_reg_state.re_reparsing = FALSE;
4436
4437                 if (!caller_cv)
4438                     caller_cv = find_runcv(NULL);
4439
4440                 n = ARG(scan);
4441
4442                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4443                     newcv = ((struct regexp *)SvANY(
4444                                                 (REGEXP*)(rexi->data->data[n])
4445                                             ))->qr_anoncv
4446                                         ;
4447                     nop = (OP*)rexi->data->data[n+1];
4448                 }
4449                 else if (rexi->data->what[n] == 'l') { /* literal code */
4450                     newcv = caller_cv;
4451                     nop = (OP*)rexi->data->data[n];
4452                     assert(CvDEPTH(newcv));
4453                 }
4454                 else {
4455                     /* literal with own CV */
4456                     assert(rexi->data->what[n] == 'L');
4457                     newcv = rex->qr_anoncv;
4458                     nop = (OP*)rexi->data->data[n];
4459                 }
4460
4461                 /* normally if we're about to execute code from the same
4462                  * CV that we used previously, we just use the existing
4463                  * CX stack entry. However, its possible that in the
4464                  * meantime we may have backtracked, popped from the save
4465                  * stack, and undone the SAVECOMPPAD(s) associated with
4466                  * PUSH_MULTICALL; in which case PL_comppad no longer
4467                  * points to newcv's pad. */
4468                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4469                 {
4470                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4471                     if (last_pushed_cv) {
4472                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4473                     }
4474                     else {
4475                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4476                     }
4477                     last_pushed_cv = newcv;
4478                 }
4479                 last_pad = PL_comppad;
4480
4481                 /* the initial nextstate you would normally execute
4482                  * at the start of an eval (which would cause error
4483                  * messages to come from the eval), may be optimised
4484                  * away from the execution path in the regex code blocks;
4485                  * so manually set PL_curcop to it initially */
4486                 {
4487                     OP *o = cUNOPx(nop)->op_first;
4488                     assert(o->op_type == OP_NULL);
4489                     if (o->op_targ == OP_SCOPE) {
4490                         o = cUNOPo->op_first;
4491                     }
4492                     else {
4493                         assert(o->op_targ == OP_LEAVE);
4494                         o = cUNOPo->op_first;
4495                         assert(o->op_type == OP_ENTER);
4496                         o = o->op_sibling;
4497                     }
4498
4499                     if (o->op_type != OP_STUB) {
4500                         assert(    o->op_type == OP_NEXTSTATE
4501                                 || o->op_type == OP_DBSTATE
4502                                 || (o->op_type == OP_NULL
4503                                     &&  (  o->op_targ == OP_NEXTSTATE
4504                                         || o->op_targ == OP_DBSTATE
4505                                         )
4506                                     )
4507                         );
4508                         PL_curcop = (COP*)o;
4509                     }
4510                 }
4511                 nop = nop->op_next;
4512
4513                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4514                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4515
4516                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4517
4518                 if (sv_yes_mark) {
4519                     SV *sv_mrk = get_sv("REGMARK", 1);
4520                     sv_setsv(sv_mrk, sv_yes_mark);
4521                 }
4522
4523                 /* we don't use MULTICALL here as we want to call the
4524                  * first op of the block of interest, rather than the
4525                  * first op of the sub */
4526                 before = SP;
4527                 PL_op = nop;
4528                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4529                 SPAGAIN;
4530                 if (SP == before)
4531                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4532                 else {
4533                     ret = POPs;
4534                     PUTBACK;
4535                 }
4536
4537                 /* before restoring everything, evaluate the returned
4538                  * value, so that 'uninit' warnings don't use the wrong
4539                  * PL_op or pad. Also need to process any magic vars
4540                  * (e.g. $1) *before* parentheses are restored */
4541
4542                 PL_op = NULL;
4543
4544                 re_sv = NULL;
4545                 if (logical == 0)        /*   (?{})/   */
4546                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4547                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4548                     sw = cBOOL(SvTRUE(ret));
4549                     logical = 0;
4550                 }
4551                 else {                   /*  /(??{})  */
4552                     /*  if its overloaded, let the regex compiler handle
4553                      *  it; otherwise extract regex, or stringify  */
4554                     if (!SvAMAGIC(ret)) {
4555                         SV *sv = ret;
4556                         if (SvROK(sv))
4557                             sv = SvRV(sv);
4558                         if (SvTYPE(sv) == SVt_REGEXP)
4559                             re_sv = (REGEXP*) sv;
4560                         else if (SvSMAGICAL(sv)) {
4561                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4562                             if (mg)
4563                                 re_sv = (REGEXP *) mg->mg_obj;
4564                         }
4565
4566                         /* force any magic, undef warnings here */
4567                         if (!re_sv) {
4568                             ret = sv_mortalcopy(ret);
4569                             (void) SvPV_force_nolen(ret);
4570                         }
4571                     }
4572
4573                 }
4574
4575                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4576
4577                 /* *** Note that at this point we don't restore
4578                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4579                  * be used again soon. This is safe as long as nothing
4580                  * in the regexp code uses the pad ! */
4581                 PL_op = oop;
4582                 PL_curcop = ocurcop;
4583                 PL_regeol = saved_regeol;
4584                 S_regcp_restore(aTHX_ rex, runops_cp);
4585
4586                 if (logical != 2)
4587                     break;
4588             }
4589
4590                 /* only /(??{})/  from now on */
4591                 logical = 0;
4592                 {
4593                     /* extract RE object from returned value; compiling if
4594                      * necessary */
4595
4596                     if (re_sv) {
4597                         re_sv = reg_temp_copy(NULL, re_sv);
4598                     }
4599                     else {
4600                         U32 pm_flags = 0;
4601                         const I32 osize = PL_regsize;
4602
4603                         if (SvUTF8(ret) && IN_BYTES) {
4604                             /* In use 'bytes': make a copy of the octet
4605                              * sequence, but without the flag on */
4606                             STRLEN len;
4607                             const char *const p = SvPV(ret, len);
4608                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4609                         }
4610                         if (rex->intflags & PREGf_USE_RE_EVAL)
4611                             pm_flags |= PMf_USE_RE_EVAL;
4612
4613                         /* if we got here, it should be an engine which
4614                          * supports compiling code blocks and stuff */
4615                         assert(rex->engine && rex->engine->op_comp);
4616                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4617                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4618                                     rex->engine, NULL, NULL,
4619                                     /* copy /msix etc to inner pattern */
4620                                     scan->flags,
4621                                     pm_flags);
4622
4623                         if (!(SvFLAGS(ret)
4624                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4625                                  | SVs_GMG))) {
4626                             /* This isn't a first class regexp. Instead, it's
4627                                caching a regexp onto an existing, Perl visible
4628                                scalar.  */
4629                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4630                         }
4631                         PL_regsize = osize;
4632                         /* safe to do now that any $1 etc has been
4633                          * interpolated into the new pattern string and
4634                          * compiled */
4635                         S_regcp_restore(aTHX_ rex, runops_cp);
4636                     }
4637                     re = (struct regexp *)SvANY(re_sv);
4638                 }
4639                 RXp_MATCH_COPIED_off(re);
4640                 re->subbeg = rex->subbeg;
4641                 re->sublen = rex->sublen;
4642                 re->suboffset = rex->suboffset;
4643                 re->subcoffset = rex->subcoffset;
4644                 rei = RXi_GET(re);
4645                 DEBUG_EXECUTE_r(
4646                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4647                         "Matching embedded");
4648                 );
4649                 startpoint = rei->program + 1;
4650                 ST.close_paren = 0; /* only used for GOSUB */
4651
4652         eval_recurse_doit: /* Share code with GOSUB below this line */
4653                 /* run the pattern returned from (??{...}) */
4654                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4655                 REGCP_SET(ST.lastcp);
4656
4657                 re->lastparen = 0;
4658                 re->lastcloseparen = 0;
4659
4660                 PL_regsize = 0;
4661
4662                 /* XXXX This is too dramatic a measure... */
4663                 PL_reg_maxiter = 0;
4664
4665                 ST.toggle_reg_flags = PL_reg_flags;
4666                 if (RX_UTF8(re_sv))
4667                     PL_reg_flags |= RF_utf8;
4668                 else
4669                     PL_reg_flags &= ~RF_utf8;
4670                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4671
4672                 ST.prev_rex = rex_sv;
4673                 ST.prev_curlyx = cur_curlyx;
4674                 rex_sv = re_sv;
4675                 SET_reg_curpm(rex_sv);
4676                 rex = re;
4677                 rexi = rei;
4678                 cur_curlyx = NULL;
4679                 ST.B = next;
4680                 ST.prev_eval = cur_eval;
4681                 cur_eval = st;
4682                 /* now continue from first node in postoned RE */
4683                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
4684                 assert(0); /* NOTREACHED */
4685         }
4686
4687         case EVAL_AB: /* cleanup after a successful (??{A})B */
4688             /* note: this is called twice; first after popping B, then A */
4689             PL_reg_flags ^= ST.toggle_reg_flags;
4690             rex_sv = ST.prev_rex;
4691             SET_reg_curpm(rex_sv);
4692             rex = (struct regexp *)SvANY(rex_sv);
4693             rexi = RXi_GET(rex);
4694             regcpblow(ST.cp);
4695             cur_eval = ST.prev_eval;
4696             cur_curlyx = ST.prev_curlyx;
4697
4698             /* XXXX This is too dramatic a measure... */
4699             PL_reg_maxiter = 0;
4700             if ( nochange_depth )
4701                 nochange_depth--;
4702             sayYES;
4703
4704
4705         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4706             /* note: this is called twice; first after popping B, then A */
4707             PL_reg_flags ^= ST.toggle_reg_flags;
4708             rex_sv = ST.prev_rex;
4709             SET_reg_curpm(rex_sv);
4710             rex = (struct regexp *)SvANY(rex_sv);
4711             rexi = RXi_GET(rex);
4712
4713             REGCP_UNWIND(ST.lastcp);
4714             regcppop(rex);
4715             cur_eval = ST.prev_eval;
4716             cur_curlyx = ST.prev_curlyx;
4717             /* XXXX This is too dramatic a measure... */
4718             PL_reg_maxiter = 0;
4719             if ( nochange_depth )
4720                 nochange_depth--;
4721             sayNO_SILENT;
4722 #undef ST
4723
4724         case OPEN:
4725             n = ARG(scan);  /* which paren pair */
4726             rex->offs[n].start_tmp = locinput - PL_bostr;
4727             if (n > PL_regsize)
4728                 PL_regsize = n;
4729             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4730                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4731                 PTR2UV(rex),
4732                 PTR2UV(rex->offs),
4733                 (UV)n,
4734                 (IV)rex->offs[n].start_tmp,
4735                 (UV)PL_regsize
4736             ));
4737             lastopen = n;
4738             break;
4739
4740 /* XXX really need to log other places start/end are set too */
4741 #define CLOSE_CAPTURE \
4742     rex->offs[n].start = rex->offs[n].start_tmp; \
4743     rex->offs[n].end = locinput - PL_bostr; \
4744     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4745         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4746         PTR2UV(rex), \
4747         PTR2UV(rex->offs), \
4748         (UV)n, \
4749         (IV)rex->offs[n].start, \
4750         (IV)rex->offs[n].end \
4751     ))
4752
4753         case CLOSE:
4754             n = ARG(scan);  /* which paren pair */
4755             CLOSE_CAPTURE;
4756             /*if (n > PL_regsize)
4757                 PL_regsize = n;*/
4758             if (n > rex->lastparen)
4759                 rex->lastparen = n;
4760             rex->lastcloseparen = n;
4761             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4762                 goto fake_end;
4763             }
4764             break;
4765         case ACCEPT:
4766             if (ARG(scan)){
4767                 regnode *cursor;
4768                 for (cursor=scan;
4769                      cursor && OP(cursor)!=END;
4770                      cursor=regnext(cursor))
4771                 {
4772                     if ( OP(cursor)==CLOSE ){
4773                         n = ARG(cursor);
4774                         if ( n <= lastopen ) {
4775                             CLOSE_CAPTURE;
4776                             /*if (n > PL_regsize)
4777                             PL_regsize = n;*/
4778                             if (n > rex->lastparen)
4779                                 rex->lastparen = n;
4780                             rex->lastcloseparen = n;
4781                             if ( n == ARG(scan) || (cur_eval &&
4782                                 cur_eval->u.eval.close_paren == n))
4783                                 break;
4784                         }
4785                     }
4786                 }
4787             }
4788             goto fake_end;
4789             /*NOTREACHED*/
4790         case GROUPP:
4791             n = ARG(scan);  /* which paren pair */
4792             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
4793             break;
4794         case NGROUPP:
4795             /* reg_check_named_buff_matched returns 0 for no match */
4796             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4797             break;
4798         case INSUBP:
4799             n = ARG(scan);
4800             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4801             break;
4802         case DEFINEP:
4803             sw = 0;
4804             break;
4805         case IFTHEN:
4806             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4807             if (sw)
4808                 next = NEXTOPER(NEXTOPER(scan));
4809             else {
4810                 next = scan + ARG(scan);
4811                 if (OP(next) == IFTHEN) /* Fake one. */
4812                     next = NEXTOPER(NEXTOPER(next));
4813             }
4814             break;
4815         case LOGICAL:
4816             logical = scan->flags;
4817             break;
4818
4819 /*******************************************************************
4820
4821 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4822 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4823 STAR/PLUS/CURLY/CURLYN are used instead.)
4824
4825 A*B is compiled as <CURLYX><A><WHILEM><B>
4826
4827 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4828 state, which contains the current count, initialised to -1. It also sets
4829 cur_curlyx to point to this state, with any previous value saved in the
4830 state block.
4831
4832 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4833 since the pattern may possibly match zero times (i.e. it's a while {} loop
4834 rather than a do {} while loop).
4835
4836 Each entry to WHILEM represents a successful match of A. The count in the
4837 CURLYX block is incremented, another WHILEM state is pushed, and execution
4838 passes to A or B depending on greediness and the current count.
4839
4840 For example, if matching against the string a1a2a3b (where the aN are
4841 substrings that match /A/), then the match progresses as follows: (the
4842 pushed states are interspersed with the bits of strings matched so far):
4843
4844     <CURLYX cnt=-1>
4845     <CURLYX cnt=0><WHILEM>
4846     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4847     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4848     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4849     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4850
4851 (Contrast this with something like CURLYM, which maintains only a single
4852 backtrack state:
4853
4854     <CURLYM cnt=0> a1
4855     a1 <CURLYM cnt=1> a2
4856     a1 a2 <CURLYM cnt=2> a3
4857     a1 a2 a3 <CURLYM cnt=3> b
4858 )
4859
4860 Each WHILEM state block marks a point to backtrack to upon partial failure
4861 of A or B, and also contains some minor state data related to that
4862 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4863 overall state, such as the count, and pointers to the A and B ops.
4864
4865 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4866 must always point to the *current* CURLYX block, the rules are:
4867
4868 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4869 and set cur_curlyx to point the new block.
4870
4871 When popping the CURLYX block after a successful or unsuccessful match,
4872 restore the previous cur_curlyx.
4873
4874 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4875 to the outer one saved in the CURLYX block.
4876
4877 When popping the WHILEM block after a successful or unsuccessful B match,
4878 restore the previous cur_curlyx.
4879
4880 Here's an example for the pattern (AI* BI)*BO
4881 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4882
4883 cur_
4884 curlyx backtrack stack
4885 ------ ---------------
4886 NULL
4887 CO     <CO prev=NULL> <WO>
4888 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4889 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4890 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4891
4892 At this point the pattern succeeds, and we work back down the stack to
4893 clean up, restoring as we go:
4894
4895 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4896 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4897 CO     <CO prev=NULL> <WO>
4898 NULL
4899
4900 *******************************************************************/
4901
4902 #define ST st->u.curlyx
4903
4904         case CURLYX:    /* start of /A*B/  (for complex A) */
4905         {
4906             /* No need to save/restore up to this paren */
4907             I32 parenfloor = scan->flags;
4908
4909             assert(next); /* keep Coverity happy */
4910             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4911                 next += ARG(next);
4912
4913             /* XXXX Probably it is better to teach regpush to support
4914                parenfloor > PL_regsize... */
4915             if (parenfloor > (I32)rex->lastparen)
4916                 parenfloor = rex->lastparen; /* Pessimization... */
4917
4918             ST.prev_curlyx= cur_curlyx;
4919             cur_curlyx = st;
4920             ST.cp = PL_savestack_ix;
4921
4922             /* these fields contain the state of the current curly.
4923              * they are accessed by subsequent WHILEMs */
4924             ST.parenfloor = parenfloor;
4925             ST.me = scan;
4926             ST.B = next;
4927             ST.minmod = minmod;
4928             minmod = 0;
4929             ST.count = -1;      /* this will be updated by WHILEM */
4930             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4931
4932             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
4933             assert(0); /* NOTREACHED */
4934         }
4935
4936         case CURLYX_end: /* just finished matching all of A*B */
4937             cur_curlyx = ST.prev_curlyx;
4938             sayYES;
4939             assert(0); /* NOTREACHED */
4940
4941         case CURLYX_end_fail: /* just failed to match all of A*B */
4942             regcpblow(ST.cp);
4943             cur_curlyx = ST.prev_curlyx;
4944             sayNO;
4945             assert(0); /* NOTREACHED */
4946
4947
4948 #undef ST
4949 #define ST st->u.whilem
4950
4951         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4952         {
4953             /* see the discussion above about CURLYX/WHILEM */
4954             I32 n;
4955             int min = ARG1(cur_curlyx->u.curlyx.me);
4956             int max = ARG2(cur_curlyx->u.curlyx.me);
4957             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4958
4959             assert(cur_curlyx); /* keep Coverity happy */
4960             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4961             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4962             ST.cache_offset = 0;
4963             ST.cache_mask = 0;
4964
4965
4966             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4967                   "%*s  whilem: matched %ld out of %d..%d\n",
4968                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4969             );
4970
4971             /* First just match a string of min A's. */
4972
4973             if (n < min) {
4974                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4975                 cur_curlyx->u.curlyx.lastloc = locinput;
4976                 REGCP_SET(ST.lastcp);
4977
4978                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
4979                 assert(0); /* NOTREACHED */
4980             }
4981
4982             /* If degenerate A matches "", assume A done. */
4983
4984             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4985                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4986                    "%*s  whilem: empty match detected, trying continuation...\n",
4987                    REPORT_CODE_OFF+depth*2, "")
4988                 );
4989                 goto do_whilem_B_max;
4990             }
4991
4992             /* super-linear cache processing */
4993
4994             if (scan->flags) {
4995
4996                 if (!PL_reg_maxiter) {
4997                     /* start the countdown: Postpone detection until we
4998                      * know the match is not *that* much linear. */
4999                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5000                     /* possible overflow for long strings and many CURLYX's */
5001                     if (PL_reg_maxiter < 0)
5002                         PL_reg_maxiter = I32_MAX;
5003                     PL_reg_leftiter = PL_reg_maxiter;
5004                 }
5005
5006                 if (PL_reg_leftiter-- == 0) {
5007                     /* initialise cache */
5008                     const I32 size = (PL_reg_maxiter + 7)/8;
5009                     if (PL_reg_poscache) {
5010                         if ((I32)PL_reg_poscache_size < size) {
5011                             Renew(PL_reg_poscache, size, char);
5012                             PL_reg_poscache_size = size;
5013                         }
5014                         Zero(PL_reg_poscache, size, char);
5015                     }
5016                     else {
5017                         PL_reg_poscache_size = size;
5018                         Newxz(PL_reg_poscache, size, char);
5019                     }
5020                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5021       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5022                               PL_colors[4], PL_colors[5])
5023                     );
5024                 }
5025
5026                 if (PL_reg_leftiter < 0) {
5027                     /* have we already failed at this position? */
5028                     I32 offset, mask;
5029                     offset  = (scan->flags & 0xf) - 1
5030                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5031                     mask    = 1 << (offset % 8);
5032                     offset /= 8;
5033                     if (PL_reg_poscache[offset] & mask) {
5034                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5035                             "%*s  whilem: (cache) already tried at this position...\n",
5036                             REPORT_CODE_OFF+depth*2, "")
5037                         );
5038                         sayNO; /* cache records failure */
5039                     }
5040                     ST.cache_offset = offset;
5041                     ST.cache_mask   = mask;
5042                 }
5043             }
5044
5045             /* Prefer B over A for minimal matching. */
5046
5047             if (cur_curlyx->u.curlyx.minmod) {
5048                 ST.save_curlyx = cur_curlyx;
5049                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5050                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5051                 REGCP_SET(ST.lastcp);
5052                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5053                                     locinput);
5054                 assert(0); /* NOTREACHED */
5055             }
5056
5057             /* Prefer A over B for maximal matching. */
5058
5059             if (n < max) { /* More greed allowed? */
5060                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5061                 cur_curlyx->u.curlyx.lastloc = locinput;
5062                 REGCP_SET(ST.lastcp);
5063                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5064                 assert(0); /* NOTREACHED */
5065             }
5066             goto do_whilem_B_max;
5067         }
5068         assert(0); /* NOTREACHED */
5069
5070         case WHILEM_B_min: /* just matched B in a minimal match */
5071         case WHILEM_B_max: /* just matched B in a maximal match */
5072             cur_curlyx = ST.save_curlyx;
5073             sayYES;
5074             assert(0); /* NOTREACHED */
5075
5076         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5077             cur_curlyx = ST.save_curlyx;
5078             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5079             cur_curlyx->u.curlyx.count--;
5080             CACHEsayNO;
5081             assert(0); /* NOTREACHED */
5082
5083         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5084             /* FALL THROUGH */
5085         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5086             REGCP_UNWIND(ST.lastcp);
5087             regcppop(rex);
5088             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5089             cur_curlyx->u.curlyx.count--;
5090             CACHEsayNO;
5091             assert(0); /* NOTREACHED */
5092
5093         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5094             REGCP_UNWIND(ST.lastcp);
5095             regcppop(rex);      /* Restore some previous $<digit>s? */
5096             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5097                 "%*s  whilem: failed, trying continuation...\n",
5098                 REPORT_CODE_OFF+depth*2, "")
5099             );
5100           do_whilem_B_max:
5101             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5102                 && ckWARN(WARN_REGEXP)
5103                 && !(PL_reg_flags & RF_warned))
5104             {
5105                 PL_reg_flags |= RF_warned;
5106                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5107                      "Complex regular subexpression recursion limit (%d) "
5108                      "exceeded",
5109                      REG_INFTY - 1);
5110             }
5111
5112             /* now try B */
5113             ST.save_curlyx = cur_curlyx;
5114             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5115             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5116                                 locinput);
5117             assert(0); /* NOTREACHED */
5118
5119         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5120             cur_curlyx = ST.save_curlyx;
5121             REGCP_UNWIND(ST.lastcp);
5122             regcppop(rex);
5123
5124             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5125                 /* Maximum greed exceeded */
5126                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5127                     && ckWARN(WARN_REGEXP)
5128                     && !(PL_reg_flags & RF_warned))
5129                 {
5130                     PL_reg_flags |= RF_warned;
5131                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5132                         "Complex regular subexpression recursion "
5133                         "limit (%d) exceeded",
5134                         REG_INFTY - 1);
5135                 }
5136                 cur_curlyx->u.curlyx.count--;
5137                 CACHEsayNO;
5138             }
5139
5140             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5141                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5142             );
5143             /* Try grabbing another A and see if it helps. */
5144             cur_curlyx->u.curlyx.lastloc = locinput;
5145             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5146             REGCP_SET(ST.lastcp);
5147             PUSH_STATE_GOTO(WHILEM_A_min,
5148                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5149                 locinput);
5150             assert(0); /* NOTREACHED */
5151
5152 #undef  ST
5153 #define ST st->u.branch
5154
5155         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5156             next = scan + ARG(scan);
5157             if (next == scan)
5158                 next = NULL;
5159             scan = NEXTOPER(scan);
5160             /* FALL THROUGH */
5161
5162         case BRANCH:        /*  /(...|A|...)/ */
5163             scan = NEXTOPER(scan); /* scan now points to inner node */
5164             ST.lastparen = rex->lastparen;
5165             ST.lastcloseparen = rex->lastcloseparen;
5166             ST.next_branch = next;
5167             REGCP_SET(ST.cp);
5168
5169             /* Now go into the branch */
5170             if (has_cutgroup) {
5171                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5172             } else {
5173                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5174             }
5175             assert(0); /* NOTREACHED */
5176         case CUTGROUP:
5177             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5178                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5179             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5180             assert(0); /* NOTREACHED */
5181         case CUTGROUP_next_fail:
5182             do_cutgroup = 1;
5183             no_final = 1;
5184             if (st->u.mark.mark_name)
5185                 sv_commit = st->u.mark.mark_name;
5186             sayNO;
5187             assert(0); /* NOTREACHED */
5188         case BRANCH_next:
5189             sayYES;
5190             assert(0); /* NOTREACHED */
5191         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5192             if (do_cutgroup) {
5193                 do_cutgroup = 0;
5194                 no_final = 0;
5195             }
5196             REGCP_UNWIND(ST.cp);
5197             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5198             scan = ST.next_branch;
5199             /* no more branches? */
5200             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5201                 DEBUG_EXECUTE_r({
5202                     PerlIO_printf( Perl_debug_log,
5203                         "%*s  %sBRANCH failed...%s\n",
5204                         REPORT_CODE_OFF+depth*2, "",
5205                         PL_colors[4],
5206                         PL_colors[5] );
5207                 });
5208                 sayNO_SILENT;
5209             }
5210             continue; /* execute next BRANCH[J] op */
5211             assert(0); /* NOTREACHED */
5212
5213         case MINMOD:
5214             minmod = 1;
5215             break;
5216
5217 #undef  ST
5218 #define ST st->u.curlym
5219
5220         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5221
5222             /* This is an optimisation of CURLYX that enables us to push
5223              * only a single backtracking state, no matter how many matches
5224              * there are in {m,n}. It relies on the pattern being constant
5225              * length, with no parens to influence future backrefs
5226              */
5227
5228             ST.me = scan;
5229             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5230
5231             ST.lastparen      = rex->lastparen;
5232             ST.lastcloseparen = rex->lastcloseparen;
5233
5234             /* if paren positive, emulate an OPEN/CLOSE around A */
5235             if (ST.me->flags) {
5236                 U32 paren = ST.me->flags;
5237                 if (paren > PL_regsize)
5238                     PL_regsize = paren;
5239                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5240             }
5241             ST.A = scan;
5242             ST.B = next;
5243             ST.alen = 0;
5244             ST.count = 0;
5245             ST.minmod = minmod;
5246             minmod = 0;
5247             ST.c1 = CHRTEST_UNINIT;
5248             REGCP_SET(ST.cp);
5249
5250             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5251                 goto curlym_do_B;
5252
5253           curlym_do_A: /* execute the A in /A{m,n}B/  */
5254             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5255             assert(0); /* NOTREACHED */
5256
5257         case CURLYM_A: /* we've just matched an A */
5258             ST.count++;
5259             /* after first match, determine A's length: u.curlym.alen */
5260             if (ST.count == 1) {
5261                 if (PL_reg_match_utf8) {
5262                     char *s = st->locinput;
5263                     while (s < locinput) {
5264                         ST.alen++;
5265                         s += UTF8SKIP(s);
5266                     }
5267                 }
5268                 else {
5269                     ST.alen = locinput - st->locinput;
5270                 }
5271                 if (ST.alen == 0)
5272                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5273             }
5274             DEBUG_EXECUTE_r(
5275                 PerlIO_printf(Perl_debug_log,
5276                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5277                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5278                           (IV) ST.count, (IV)ST.alen)
5279             );
5280
5281             if (cur_eval && cur_eval->u.eval.close_paren &&
5282                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5283                 goto fake_end;
5284
5285             {
5286                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5287                 if ( max == REG_INFTY || ST.count < max )
5288                     goto curlym_do_A; /* try to match another A */
5289             }
5290             goto curlym_do_B; /* try to match B */
5291
5292         case CURLYM_A_fail: /* just failed to match an A */
5293             REGCP_UNWIND(ST.cp);
5294
5295             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5296                 || (cur_eval && cur_eval->u.eval.close_paren &&
5297                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5298                 sayNO;
5299
5300           curlym_do_B: /* execute the B in /A{m,n}B/  */
5301             if (ST.c1 == CHRTEST_UNINIT) {
5302                 /* calculate c1 and c2 for possible match of 1st char
5303                  * following curly */
5304                 ST.c1 = ST.c2 = CHRTEST_VOID;
5305                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5306                     regnode *text_node = ST.B;
5307                     if (! HAS_TEXT(text_node))
5308                         FIND_NEXT_IMPT(text_node);
5309                     /* this used to be
5310
5311                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5312
5313                         But the former is redundant in light of the latter.
5314
5315                         if this changes back then the macro for
5316                         IS_TEXT and friends need to change.
5317                      */
5318                     if (PL_regkind[OP(text_node)] == EXACT)
5319                     {
5320
5321                         ST.c1 = (U8)*STRING(text_node);
5322                         switch (OP(text_node)) {
5323                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5324                             case EXACTFA:
5325                             case EXACTFU_SS:
5326                             case EXACTFU_TRICKYFOLD:
5327                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5328                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5329                             default: ST.c2 = ST.c1;
5330                         }
5331                     }
5332                 }
5333             }
5334
5335             DEBUG_EXECUTE_r(
5336                 PerlIO_printf(Perl_debug_log,
5337                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5338                     (int)(REPORT_CODE_OFF+(depth*2)),
5339                     "", (IV)ST.count)
5340                 );
5341             if (ST.c1 != CHRTEST_VOID
5342                     && nextchr != ST.c1
5343                     && nextchr != ST.c2)
5344             {
5345                 /* simulate B failing */
5346                 DEBUG_OPTIMISE_r(
5347                     PerlIO_printf(Perl_debug_log,
5348                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5349                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5350                         (IV)ST.c1,(IV)ST.c2
5351                 ));
5352                 state_num = CURLYM_B_fail;
5353                 goto reenter_switch;
5354             }
5355
5356             if (ST.me->flags) {
5357                 /* emulate CLOSE: mark current A as captured */
5358                 I32 paren = ST.me->flags;
5359                 if (ST.count) {
5360                     rex->offs[paren].start
5361                         = HOPc(locinput, -ST.alen) - PL_bostr;
5362                     rex->offs[paren].end = locinput - PL_bostr;
5363                     if ((U32)paren > rex->lastparen)
5364                         rex->lastparen = paren;
5365                     rex->lastcloseparen = paren;
5366                 }
5367                 else
5368                     rex->offs[paren].end = -1;
5369                 if (cur_eval && cur_eval->u.eval.close_paren &&
5370                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5371                 {
5372                     if (ST.count)
5373                         goto fake_end;
5374                     else
5375                         sayNO;
5376                 }
5377             }
5378
5379             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5380             assert(0); /* NOTREACHED */
5381
5382         case CURLYM_B_fail: /* just failed to match a B */
5383             REGCP_UNWIND(ST.cp);
5384             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5385             if (ST.minmod) {
5386                 I32 max = ARG2(ST.me);
5387                 if (max != REG_INFTY && ST.count == max)
5388                     sayNO;
5389                 goto curlym_do_A; /* try to match a further A */
5390             }
5391             /* backtrack one A */
5392             if (ST.count == ARG1(ST.me) /* min */)
5393                 sayNO;
5394             ST.count--;
5395             locinput = HOPc(locinput, -ST.alen);
5396             nextchr = UCHARAT(locinput);
5397             goto curlym_do_B; /* try to match B */
5398
5399 #undef ST
5400 #define ST st->u.curly
5401
5402 #define CURLY_SETPAREN(paren, success) \
5403     if (paren) { \
5404         if (success) { \
5405             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5406             rex->offs[paren].end = locinput - PL_bostr; \
5407             if (paren > rex->lastparen) \
5408                 rex->lastparen = paren; \
5409             rex->lastcloseparen = paren; \
5410         } \
5411         else { \
5412             rex->offs[paren].end = -1; \
5413             rex->lastparen      = ST.lastparen; \
5414             rex->lastcloseparen = ST.lastcloseparen; \
5415         } \
5416     }
5417
5418         case STAR:              /*  /A*B/ where A is width 1 */
5419             ST.paren = 0;
5420             ST.min = 0;
5421             ST.max = REG_INFTY;
5422             scan = NEXTOPER(scan);
5423             goto repeat;
5424         case PLUS:              /*  /A+B/ where A is width 1 */
5425             ST.paren = 0;
5426             ST.min = 1;
5427             ST.max = REG_INFTY;
5428             scan = NEXTOPER(scan);
5429             goto repeat;
5430         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5431             ST.paren = scan->flags;     /* Which paren to set */
5432             ST.lastparen      = rex->lastparen;
5433             ST.lastcloseparen = rex->lastcloseparen;
5434             if (ST.paren > PL_regsize)
5435                 PL_regsize = ST.paren;
5436             ST.min = ARG1(scan);  /* min to match */
5437             ST.max = ARG2(scan);  /* max to match */
5438             if (cur_eval && cur_eval->u.eval.close_paren &&
5439                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5440                 ST.min=1;
5441                 ST.max=1;
5442             }
5443             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5444             goto repeat;
5445         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5446             ST.paren = 0;
5447             ST.min = ARG1(scan);  /* min to match */
5448             ST.max = ARG2(scan);  /* max to match */
5449             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5450           repeat:
5451             /*
5452             * Lookahead to avoid useless match attempts
5453             * when we know what character comes next.
5454             *
5455             * Used to only do .*x and .*?x, but now it allows
5456             * for )'s, ('s and (?{ ... })'s to be in the way
5457             * of the quantifier and the EXACT-like node.  -- japhy
5458             */
5459
5460             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5461                 sayNO;
5462             if (HAS_TEXT(next) || JUMPABLE(next)) {
5463                 U8 *s;
5464                 regnode *text_node = next;
5465
5466                 if (! HAS_TEXT(text_node))
5467                     FIND_NEXT_IMPT(text_node);
5468
5469                 if (! HAS_TEXT(text_node))
5470                     ST.c1 = ST.c2 = CHRTEST_VOID;
5471                 else {
5472                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5473                         ST.c1 = ST.c2 = CHRTEST_VOID;
5474                         goto assume_ok_easy;
5475                     }
5476                     else
5477                         s = (U8*)STRING(text_node);
5478
5479                     /*  Currently we only get here when
5480
5481                         PL_rekind[OP(text_node)] == EXACT
5482
5483                         if this changes back then the macro for IS_TEXT and
5484                         friends need to change. */
5485                     if (!UTF_PATTERN) {
5486                         ST.c1 = *s;
5487                         switch (OP(text_node)) {
5488                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5489                             case EXACTFA:
5490                             case EXACTFU_SS:
5491                             case EXACTFU_TRICKYFOLD:
5492                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5493                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5494                             default: ST.c2 = ST.c1; break;
5495                         }
5496                     }
5497                     else { /* UTF_PATTERN */
5498                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5499                              STRLEN ulen;
5500                              U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5501
5502                              to_utf8_fold((U8*)s, tmpbuf, &ulen);
5503                              ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
5504                                                     uniflags);
5505                         }
5506                         else {
5507                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5508                                                      uniflags);
5509                         }
5510                     }
5511                 }
5512             }
5513             else
5514                 ST.c1 = ST.c2 = CHRTEST_VOID;
5515         assume_ok_easy:
5516
5517             ST.A = scan;
5518             ST.B = next;
5519             if (minmod) {
5520                 char *li = locinput;
5521                 minmod = 0;
5522                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5523                     sayNO;
5524                 locinput = li;
5525                 nextchr = UCHARAT(locinput);
5526                 ST.count = ST.min;
5527                 REGCP_SET(ST.cp);
5528                 if (ST.c1 == CHRTEST_VOID)
5529                     goto curly_try_B_min;
5530
5531                 ST.oldloc = locinput;
5532
5533                 /* set ST.maxpos to the furthest point along the
5534                  * string that could possibly match */
5535                 if  (ST.max == REG_INFTY) {
5536                     ST.maxpos = PL_regeol - 1;
5537                     if (utf8_target)
5538                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5539                             ST.maxpos--;
5540                 }
5541                 else if (utf8_target) {
5542                     int m = ST.max - ST.min;
5543                     for (ST.maxpos = locinput;
5544                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5545                         ST.maxpos += UTF8SKIP(ST.maxpos);
5546                 }
5547                 else {
5548                     ST.maxpos = locinput + ST.max - ST.min;
5549                     if (ST.maxpos >= PL_regeol)
5550                         ST.maxpos = PL_regeol - 1;
5551                 }
5552                 goto curly_try_B_min_known;
5553
5554             }
5555             else {
5556                 /* avoid taking address of locinput, so it can remain
5557                  * a register var */
5558                 char *li = locinput;
5559                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5560                 if (ST.count < ST.min)
5561                     sayNO;
5562                 locinput = li;
5563                 nextchr = UCHARAT(locinput);
5564                 if ((ST.count > ST.min)
5565                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5566                 {
5567                     /* A{m,n} must come at the end of the string, there's
5568                      * no point in backing off ... */
5569                     ST.min = ST.count;
5570                     /* ...except that $ and \Z can match before *and* after
5571                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5572                        We may back off by one in this case. */
5573                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5574                         ST.min--;
5575                 }
5576                 REGCP_SET(ST.cp);
5577                 goto curly_try_B_max;
5578             }
5579             assert(0); /* NOTREACHED */
5580
5581
5582         case CURLY_B_min_known_fail:
5583             /* failed to find B in a non-greedy match where c1,c2 valid */
5584
5585             REGCP_UNWIND(ST.cp);
5586             if (ST.paren) {
5587                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5588             }
5589             /* Couldn't or didn't -- move forward. */
5590             ST.oldloc = locinput;
5591             if (utf8_target)
5592                 locinput += UTF8SKIP(locinput);
5593             else
5594                 locinput++;
5595             ST.count++;
5596           curly_try_B_min_known:
5597              /* find the next place where 'B' could work, then call B */
5598             {
5599                 int n;
5600                 if (utf8_target) {
5601                     n = (ST.oldloc == locinput) ? 0 : 1;
5602                     if (ST.c1 == ST.c2) {
5603                         STRLEN len;
5604                         /* set n to utf8_distance(oldloc, locinput) */
5605                         while (locinput <= ST.maxpos &&
5606                                utf8n_to_uvchr((U8*)locinput,
5607                                               UTF8_MAXBYTES, &len,
5608                                               uniflags) != (UV)ST.c1) {
5609                             locinput += len;
5610                             n++;
5611                         }
5612                     }
5613                     else {
5614                         /* set n to utf8_distance(oldloc, locinput) */
5615                         while (locinput <= ST.maxpos) {
5616                             STRLEN len;
5617                             const UV c = utf8n_to_uvchr((U8*)locinput,
5618                                                   UTF8_MAXBYTES, &len,
5619                                                   uniflags);
5620                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5621                                 break;
5622                             locinput += len;
5623                             n++;
5624                         }
5625                     }
5626                 }
5627                 else {
5628                     if (ST.c1 == ST.c2) {
5629                         while (locinput <= ST.maxpos &&
5630                                UCHARAT(locinput) != ST.c1)
5631                             locinput++;
5632                     }
5633                     else {
5634                         while (locinput <= ST.maxpos
5635                                && UCHARAT(locinput) != ST.c1
5636                                && UCHARAT(locinput) != ST.c2)
5637                             locinput++;
5638                     }
5639                     n = locinput - ST.oldloc;
5640                 }
5641                 if (locinput > ST.maxpos)
5642                     sayNO;
5643                 if (n) {
5644                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5645                      * at b; check that everything between oldloc and
5646                      * locinput matches */
5647                     char *li = ST.oldloc;
5648                     ST.count += n;
5649                     if (regrepeat(rex, &li, ST.A, n, depth) < n)
5650                         sayNO;
5651                     assert(n == REG_INFTY || locinput == li);
5652                 }
5653                 CURLY_SETPAREN(ST.paren, ST.count);
5654                 if (cur_eval && cur_eval->u.eval.close_paren &&
5655                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5656                     goto fake_end;
5657                 }
5658                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
5659             }
5660             assert(0); /* NOTREACHED */
5661
5662
5663         case CURLY_B_min_fail:
5664             /* failed to find B in a non-greedy match where c1,c2 invalid */
5665
5666             REGCP_UNWIND(ST.cp);
5667             if (ST.paren) {
5668                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5669             }
5670             /* failed -- move forward one */
5671             {
5672                 char *li = locinput;
5673                 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
5674                     sayNO;
5675                 }
5676                 locinput = li;
5677             }
5678             {
5679                 ST.count++;
5680                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5681                         ST.count > 0)) /* count overflow ? */
5682                 {
5683                   curly_try_B_min:
5684                     CURLY_SETPAREN(ST.paren, ST.count);
5685                     if (cur_eval && cur_eval->u.eval.close_paren &&
5686                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5687                         goto fake_end;
5688                     }
5689                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
5690                 }
5691             }
5692             assert(0); /* NOTREACHED */
5693
5694
5695         curly_try_B_max:
5696             /* a successful greedy match: now try to match B */
5697             if (cur_eval && cur_eval->u.eval.close_paren &&
5698                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5699                 goto fake_end;
5700             }
5701             {
5702                 UV c = 0;
5703                 if (ST.c1 != CHRTEST_VOID)
5704                     c = utf8_target ? utf8n_to_uvchr((U8*)locinput,
5705                                            UTF8_MAXBYTES, 0, uniflags)
5706                                 : (UV) UCHARAT(locinput);
5707                 /* If it could work, try it. */
5708                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5709                     CURLY_SETPAREN(ST.paren, ST.count);
5710                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
5711                     assert(0); /* NOTREACHED */
5712                 }
5713             }
5714             /* FALL THROUGH */
5715         case CURLY_B_max_fail:
5716             /* failed to find B in a greedy match */
5717
5718             REGCP_UNWIND(ST.cp);
5719             if (ST.paren) {
5720                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5721             }
5722             /*  back up. */
5723             if (--ST.count < ST.min)
5724                 sayNO;
5725             locinput = HOPc(locinput, -1);
5726             goto curly_try_B_max;
5727
5728 #undef ST
5729
5730         case END:
5731             fake_end:
5732             if (cur_eval) {
5733                 /* we've just finished A in /(??{A})B/; now continue with B */
5734                 st->u.eval.toggle_reg_flags
5735                             = cur_eval->u.eval.toggle_reg_flags;
5736                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5737
5738                 st->u.eval.prev_rex = rex_sv;           /* inner */
5739                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
5740                 rex_sv = cur_eval->u.eval.prev_rex;
5741                 SET_reg_curpm(rex_sv);
5742                 rex = (struct regexp *)SvANY(rex_sv);
5743                 rexi = RXi_GET(rex);
5744                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5745
5746                 REGCP_SET(st->u.eval.lastcp);
5747
5748                 /* Restore parens of the outer rex without popping the
5749                  * savestack */
5750                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
5751
5752                 st->u.eval.prev_eval = cur_eval;
5753                 cur_eval = cur_eval->u.eval.prev_eval;
5754                 DEBUG_EXECUTE_r(
5755                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5756                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5757                 if ( nochange_depth )
5758                     nochange_depth--;
5759
5760                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
5761                                     locinput); /* match B */
5762             }
5763
5764             if (locinput < reginfo->till) {
5765                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5766                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5767                                       PL_colors[4],
5768                                       (long)(locinput - PL_reg_starttry),
5769                                       (long)(reginfo->till - PL_reg_starttry),
5770                                       PL_colors[5]));
5771
5772                 sayNO_SILENT;           /* Cannot match: too short. */
5773             }
5774             sayYES;                     /* Success! */
5775
5776         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5777             DEBUG_EXECUTE_r(
5778             PerlIO_printf(Perl_debug_log,
5779                 "%*s  %ssubpattern success...%s\n",
5780                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5781             sayYES;                     /* Success! */
5782
5783 #undef  ST
5784 #define ST st->u.ifmatch
5785
5786         {
5787             char *newstart;
5788
5789         case SUSPEND:   /* (?>A) */
5790             ST.wanted = 1;
5791             newstart = locinput;
5792             goto do_ifmatch;
5793
5794         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5795             ST.wanted = 0;
5796             goto ifmatch_trivial_fail_test;
5797
5798         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5799             ST.wanted = 1;
5800           ifmatch_trivial_fail_test:
5801             if (scan->flags) {
5802                 char * const s = HOPBACKc(locinput, scan->flags);
5803                 if (!s) {
5804                     /* trivial fail */
5805                     if (logical) {
5806                         logical = 0;
5807                         sw = 1 - cBOOL(ST.wanted);
5808                     }
5809                     else if (ST.wanted)
5810                         sayNO;
5811                     next = scan + ARG(scan);
5812                     if (next == scan)
5813                         next = NULL;
5814                     break;
5815                 }
5816                 newstart = s;
5817             }
5818             else
5819                 newstart = locinput;
5820
5821           do_ifmatch:
5822             ST.me = scan;
5823             ST.logical = logical;
5824             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5825
5826             /* execute body of (?...A) */
5827             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
5828             assert(0); /* NOTREACHED */
5829         }
5830
5831         case IFMATCH_A_fail: /* body of (?...A) failed */
5832             ST.wanted = !ST.wanted;
5833             /* FALL THROUGH */
5834
5835         case IFMATCH_A: /* body of (?...A) succeeded */
5836             if (ST.logical) {
5837                 sw = cBOOL(ST.wanted);
5838             }
5839             else if (!ST.wanted)
5840                 sayNO;
5841
5842             if (OP(ST.me) != SUSPEND) {
5843                 /* restore old position except for (?>...) */
5844                 locinput = st->locinput;
5845                 nextchr = UCHARAT(locinput);
5846             }
5847             scan = ST.me + ARG(ST.me);
5848             if (scan == ST.me)
5849                 scan = NULL;
5850             continue; /* execute B */
5851
5852 #undef ST
5853
5854         case LONGJMP:
5855             next = scan + ARG(scan);
5856             if (next == scan)
5857                 next = NULL;
5858             break;
5859         case COMMIT:
5860             reginfo->cutpoint = PL_regeol;
5861             /* FALLTHROUGH */
5862         case PRUNE:
5863             if (!scan->flags)
5864                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5865             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
5866             assert(0); /* NOTREACHED */
5867         case COMMIT_next_fail:
5868             no_final = 1;
5869             /* FALLTHROUGH */
5870         case OPFAIL:
5871             sayNO;
5872             assert(0); /* NOTREACHED */
5873
5874 #define ST st->u.mark
5875         case MARKPOINT:
5876             ST.prev_mark = mark_state;
5877             ST.mark_name = sv_commit = sv_yes_mark
5878                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5879             mark_state = st;
5880             ST.mark_loc = locinput;
5881             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
5882             assert(0); /* NOTREACHED */
5883         case MARKPOINT_next:
5884             mark_state = ST.prev_mark;
5885             sayYES;
5886             assert(0); /* NOTREACHED */
5887         case MARKPOINT_next_fail:
5888             if (popmark && sv_eq(ST.mark_name,popmark))
5889             {
5890                 if (ST.mark_loc > startpoint)
5891                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5892                 popmark = NULL; /* we found our mark */
5893                 sv_commit = ST.mark_name;
5894
5895                 DEBUG_EXECUTE_r({
5896                         PerlIO_printf(Perl_debug_log,
5897                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5898                             REPORT_CODE_OFF+depth*2, "",
5899                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5900                 });
5901             }
5902             mark_state = ST.prev_mark;
5903             sv_yes_mark = mark_state ?
5904                 mark_state->u.mark.mark_name : NULL;
5905             sayNO;
5906             assert(0); /* NOTREACHED */
5907         case SKIP:
5908             if (scan->flags) {
5909                 /* (*SKIP) : if we fail we cut here*/
5910                 ST.mark_name = NULL;
5911                 ST.mark_loc = locinput;
5912                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5913             } else {
5914                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5915                    otherwise do nothing.  Meaning we need to scan
5916                  */
5917                 regmatch_state *cur = mark_state;
5918                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5919
5920                 while (cur) {
5921                     if ( sv_eq( cur->u.mark.mark_name,
5922                                 find ) )
5923                     {
5924                         ST.mark_name = find;
5925                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
5926                     }
5927                     cur = cur->u.mark.prev_mark;
5928                 }
5929             }
5930             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5931             break;
5932         case SKIP_next_fail:
5933             if (ST.mark_name) {
5934                 /* (*CUT:NAME) - Set up to search for the name as we
5935                    collapse the stack*/
5936                 popmark = ST.mark_name;
5937             } else {
5938                 /* (*CUT) - No name, we cut here.*/
5939                 if (ST.mark_loc > startpoint)
5940                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5941                 /* but we set sv_commit to latest mark_name if there
5942                    is one so they can test to see how things lead to this
5943                    cut */
5944                 if (mark_state)
5945                     sv_commit=mark_state->u.mark.mark_name;
5946             }
5947             no_final = 1;
5948             sayNO;
5949             assert(0); /* NOTREACHED */
5950 #undef ST
5951         case LNBREAK:
5952             if ((n=is_LNBREAK(locinput,utf8_target))) {
5953                 locinput += n;
5954                 nextchr = UCHARAT(locinput);
5955             } else
5956                 sayNO;
5957             break;
5958
5959 #define CASE_CLASS(nAmE)                              \
5960         case nAmE:                                    \
5961             if (locinput >= PL_regeol)                \
5962                 sayNO;                                \
5963             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5964                 locinput += n;                        \
5965                 nextchr = UCHARAT(locinput);          \
5966             } else                                    \
5967                 sayNO;                                \
5968             break;                                    \
5969         case N##nAmE:                                 \
5970             if (locinput >= PL_regeol)                \
5971                 sayNO;                                \
5972             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5973                 sayNO;                                \
5974             } else {                                  \
5975                 locinput += UTF8SKIP(locinput);       \
5976                 nextchr = UCHARAT(locinput);          \
5977             }                                         \
5978             break
5979
5980         CASE_CLASS(VERTWS);
5981         CASE_CLASS(HORIZWS);
5982 #undef CASE_CLASS
5983
5984         default:
5985             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5986                           PTR2UV(scan), OP(scan));
5987             Perl_croak(aTHX_ "regexp memory corruption");
5988
5989         } /* end switch */
5990
5991         /* switch break jumps here */
5992         scan = next; /* prepare to execute the next op and ... */
5993         continue;    /* ... jump back to the top, reusing st */
5994         assert(0); /* NOTREACHED */
5995
5996       push_yes_state:
5997         /* push a state that backtracks on success */
5998         st->u.yes.prev_yes_state = yes_state;
5999         yes_state = st;
6000         /* FALL THROUGH */
6001       push_state:
6002         /* push a new regex state, then continue at scan  */
6003         {
6004             regmatch_state *newst;
6005
6006             DEBUG_STACK_r({
6007                 regmatch_state *cur = st;
6008                 regmatch_state *curyes = yes_state;
6009                 int curd = depth;
6010                 regmatch_slab *slab = PL_regmatch_slab;
6011                 for (;curd > -1;cur--,curd--) {
6012                     if (cur < SLAB_FIRST(slab)) {
6013                         slab = slab->prev;
6014                         cur = SLAB_LAST(slab);
6015                     }
6016                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6017                         REPORT_CODE_OFF + 2 + depth * 2,"",
6018                         curd, PL_reg_name[cur->resume_state],
6019                         (curyes == cur) ? "yes" : ""
6020                     );
6021                     if (curyes == cur)
6022                         curyes = cur->u.yes.prev_yes_state;
6023                 }
6024             } else
6025                 DEBUG_STATE_pp("push")
6026             );
6027             depth++;
6028             st->locinput = locinput;
6029             newst = st+1;
6030             if (newst >  SLAB_LAST(PL_regmatch_slab))
6031                 newst = S_push_slab(aTHX);
6032             PL_regmatch_state = newst;
6033
6034             locinput = pushinput;
6035             nextchr = UCHARAT(locinput);
6036             st = newst;
6037             continue;
6038             assert(0); /* NOTREACHED */
6039         }
6040     }
6041
6042     /*
6043     * We get here only if there's trouble -- normally "case END" is
6044     * the terminating point.
6045     */
6046     Perl_croak(aTHX_ "corrupted regexp pointers");
6047     /*NOTREACHED*/
6048     sayNO;
6049
6050 yes:
6051     if (yes_state) {
6052         /* we have successfully completed a subexpression, but we must now
6053          * pop to the state marked by yes_state and continue from there */
6054         assert(st != yes_state);
6055 #ifdef DEBUGGING
6056         while (st != yes_state) {
6057             st--;
6058             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6059                 PL_regmatch_slab = PL_regmatch_slab->prev;
6060                 st = SLAB_LAST(PL_regmatch_slab);
6061             }
6062             DEBUG_STATE_r({
6063                 if (no_final) {
6064                     DEBUG_STATE_pp("pop (no final)");
6065                 } else {
6066                     DEBUG_STATE_pp("pop (yes)");
6067                 }
6068             });
6069             depth--;
6070         }
6071 #else
6072         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6073             || yes_state > SLAB_LAST(PL_regmatch_slab))
6074         {
6075             /* not in this slab, pop slab */
6076             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6077             PL_regmatch_slab = PL_regmatch_slab->prev;
6078             st = SLAB_LAST(PL_regmatch_slab);
6079         }
6080         depth -= (st - yes_state);
6081 #endif
6082         st = yes_state;
6083         yes_state = st->u.yes.prev_yes_state;
6084         PL_regmatch_state = st;
6085
6086         if (no_final) {
6087             locinput= st->locinput;
6088             nextchr = UCHARAT(locinput);
6089         }
6090         state_num = st->resume_state + no_final;
6091         goto reenter_switch;
6092     }
6093
6094     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6095                           PL_colors[4], PL_colors[5]));
6096
6097     if (PL_reg_state.re_state_eval_setup_done) {
6098         /* each successfully executed (?{...}) block does the equivalent of
6099          *   local $^R = do {...}
6100          * When popping the save stack, all these locals would be undone;
6101          * bypass this by setting the outermost saved $^R to the latest
6102          * value */
6103         if (oreplsv != GvSV(PL_replgv))
6104             sv_setsv(oreplsv, GvSV(PL_replgv));
6105     }
6106     result = 1;
6107     goto final_exit;
6108
6109 no:
6110     DEBUG_EXECUTE_r(
6111         PerlIO_printf(Perl_debug_log,
6112             "%*s  %sfailed...%s\n",
6113             REPORT_CODE_OFF+depth*2, "",
6114             PL_colors[4], PL_colors[5])
6115         );
6116
6117 no_silent:
6118     if (no_final) {
6119         if (yes_state) {
6120             goto yes;
6121         } else {
6122             goto final_exit;
6123         }
6124     }
6125     if (depth) {
6126         /* there's a previous state to backtrack to */
6127         st--;
6128         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6129             PL_regmatch_slab = PL_regmatch_slab->prev;
6130             st = SLAB_LAST(PL_regmatch_slab);
6131         }
6132         PL_regmatch_state = st;
6133         locinput= st->locinput;
6134         nextchr = UCHARAT(locinput);
6135
6136         DEBUG_STATE_pp("pop");
6137         depth--;
6138         if (yes_state == st)
6139             yes_state = st->u.yes.prev_yes_state;
6140
6141         state_num = st->resume_state + 1; /* failure = success + 1 */
6142         goto reenter_switch;
6143     }
6144     result = 0;
6145
6146   final_exit:
6147     if (rex->intflags & PREGf_VERBARG_SEEN) {
6148         SV *sv_err = get_sv("REGERROR", 1);
6149         SV *sv_mrk = get_sv("REGMARK", 1);
6150         if (result) {
6151             sv_commit = &PL_sv_no;
6152             if (!sv_yes_mark)
6153                 sv_yes_mark = &PL_sv_yes;
6154         } else {
6155             if (!sv_commit)
6156                 sv_commit = &PL_sv_yes;
6157             sv_yes_mark = &PL_sv_no;
6158         }
6159         sv_setsv(sv_err, sv_commit);
6160         sv_setsv(sv_mrk, sv_yes_mark);
6161     }
6162
6163
6164     if (last_pushed_cv) {
6165         dSP;
6166         POP_MULTICALL;
6167         PERL_UNUSED_VAR(SP);
6168     }
6169
6170     /* clean up; in particular, free all slabs above current one */
6171     LEAVE_SCOPE(oldsave);
6172
6173     assert(!result ||  locinput - PL_bostr >= 0);
6174     return result ?  locinput - PL_bostr : -1;
6175 }
6176
6177 /*
6178  - regrepeat - repeatedly match something simple, report how many
6179  *
6180  * startposp - pointer a pointer to the start position.  This is updated
6181  *             to point to the byte following the highest successful
6182  *             match.
6183  * p         - the regnode to be repeatedly matched against.
6184  * max       - maximum number of characters to match.
6185  * depth     - (for debugging) backtracking depth.
6186  */
6187 STATIC I32
6188 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6189 {
6190     dVAR;
6191     char *scan;
6192     I32 c;
6193     char *loceol = PL_regeol;
6194     I32 hardcount = 0;
6195     bool utf8_target = PL_reg_match_utf8;
6196     UV utf8_flags;
6197 #ifndef DEBUGGING
6198     PERL_UNUSED_ARG(depth);
6199 #endif
6200
6201     PERL_ARGS_ASSERT_REGREPEAT;
6202
6203     scan = *startposp;
6204     if (max == REG_INFTY)
6205         max = I32_MAX;
6206     else if (max < loceol - scan)
6207         loceol = scan + max;
6208     switch (OP(p)) {
6209     case REG_ANY:
6210         if (utf8_target) {
6211             loceol = PL_regeol;
6212             while (scan < loceol && hardcount < max && *scan != '\n') {
6213                 scan += UTF8SKIP(scan);
6214                 hardcount++;
6215             }
6216         } else {
6217             while (scan < loceol && *scan != '\n')
6218                 scan++;
6219         }
6220         break;
6221     case SANY:
6222         if (utf8_target) {
6223             loceol = PL_regeol;
6224             while (scan < loceol && hardcount < max) {
6225                 scan += UTF8SKIP(scan);
6226                 hardcount++;
6227             }
6228         }
6229         else
6230             scan = loceol;
6231         break;
6232     case CANY:
6233         scan = loceol;
6234         break;
6235     case EXACT:
6236         /* To get here, EXACTish nodes must have *byte* length == 1.  That
6237          * means they match only characters in the string that can be expressed
6238          * as a single byte.  For non-utf8 strings, that means a simple match.
6239          * For utf8 strings, the character matched must be an invariant, or
6240          * downgradable to a single byte.  The pattern's utf8ness is
6241          * irrelevant, as since it's a single byte, it either isn't utf8, or if
6242          * it is, it's an invariant */
6243
6244         c = (U8)*STRING(p);
6245         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6246
6247         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6248             while (scan < loceol && UCHARAT(scan) == c) {
6249                 scan++;
6250             }
6251         }
6252         else {
6253
6254             /* Here, the string is utf8, and the pattern char is different
6255              * in utf8 than not, so can't compare them directly.  Outside the
6256              * loop, find the two utf8 bytes that represent c, and then
6257              * look for those in sequence in the utf8 string */
6258             U8 high = UTF8_TWO_BYTE_HI(c);
6259             U8 low = UTF8_TWO_BYTE_LO(c);
6260             loceol = PL_regeol;
6261
6262             while (hardcount < max
6263                     && scan + 1 < loceol
6264                     && UCHARAT(scan) == high
6265                     && UCHARAT(scan + 1) == low)
6266             {
6267                 scan += 2;
6268                 hardcount++;
6269             }
6270         }
6271         break;
6272     case EXACTFA:
6273         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6274         goto do_exactf;
6275
6276     case EXACTFL:
6277         PL_reg_flags |= RF_tainted;
6278         utf8_flags = FOLDEQ_UTF8_LOCALE;
6279         goto do_exactf;
6280
6281     case EXACTF:
6282             utf8_flags = 0;
6283             goto do_exactf;
6284
6285     case EXACTFU_SS:
6286     case EXACTFU_TRICKYFOLD:
6287     case EXACTFU:
6288         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6289
6290         /* The comments for the EXACT case above apply as well to these fold
6291          * ones */
6292
6293     do_exactf:
6294         c = (U8)*STRING(p);
6295         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6296
6297         if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6298             char *tmpeol = loceol;
6299             while (hardcount < max
6300                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6301                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6302             {
6303                 scan = tmpeol;
6304                 tmpeol = loceol;
6305                 hardcount++;
6306             }
6307
6308             /* XXX Note that the above handles properly the German sharp s in
6309              * the pattern matching ss in the string.  But it doesn't handle
6310              * properly cases where the string contains say 'LIGATURE ff' and
6311              * the pattern is 'f+'.  This would require, say, a new function or
6312              * revised interface to foldEQ_utf8(), in which the maximum number
6313              * of characters to match could be passed and it would return how
6314              * many actually did.  This is just one of many cases where
6315              * multi-char folds don't work properly, and so the fix is being
6316              * deferred */
6317         }
6318         else {
6319             U8 folded;
6320
6321             /* Here, the string isn't utf8 and c is a single byte; and either
6322              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6323              * doesn't affect c.  Can just do simple comparisons for exact or
6324              * fold matching. */
6325             switch (OP(p)) {
6326                 case EXACTF: folded = PL_fold[c]; break;
6327                 case EXACTFA:
6328                 case EXACTFU_TRICKYFOLD:
6329                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6330                 case EXACTFL: folded = PL_fold_locale[c]; break;
6331                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6332             }
6333             while (scan < loceol &&
6334                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6335             {
6336                 scan++;
6337             }
6338         }
6339         break;
6340     case ANYOFV:
6341     case ANYOF:
6342         if (utf8_target || OP(p) == ANYOFV) {
6343             STRLEN inclasslen;
6344             loceol = PL_regeol;
6345             inclasslen = loceol - scan;
6346             while (hardcount < max
6347                    && ((inclasslen = loceol - scan) > 0)
6348                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6349             {
6350                 scan += inclasslen;
6351                 hardcount++;
6352             }
6353         } else {
6354             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6355                 scan++;
6356         }
6357         break;
6358     case ALNUMU:
6359         if (utf8_target) {
6360     utf8_wordchar:
6361             loceol = PL_regeol;
6362             LOAD_UTF8_CHARCLASS_ALNUM();
6363             while (hardcount < max && scan < loceol &&
6364                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6365             {
6366                 scan += UTF8SKIP(scan);
6367                 hardcount++;
6368             }
6369         } else {
6370             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6371                 scan++;
6372             }
6373         }
6374         break;
6375     case ALNUM:
6376         if (utf8_target)
6377             goto utf8_wordchar;
6378         while (scan < loceol && isALNUM((U8) *scan)) {
6379             scan++;
6380         }
6381         break;
6382     case ALNUMA:
6383         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6384             scan++;
6385         }
6386         break;
6387     case ALNUML:
6388         PL_reg_flags |= RF_tainted;
6389         if (utf8_target) {
6390             loceol = PL_regeol;
6391             while (hardcount < max && scan < loceol &&
6392                    isALNUM_LC_utf8((U8*)scan)) {
6393                 scan += UTF8SKIP(scan);
6394                 hardcount++;
6395             }
6396         } else {
6397             while (scan < loceol && isALNUM_LC(*scan))
6398                 scan++;
6399         }
6400         break;
6401     case NALNUMU:
6402         if (utf8_target) {
6403
6404     utf8_Nwordchar:
6405
6406             loceol = PL_regeol;
6407             LOAD_UTF8_CHARCLASS_ALNUM();
6408             while (hardcount < max && scan < loceol &&
6409                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6410             {
6411                 scan += UTF8SKIP(scan);
6412                 hardcount++;
6413             }
6414         } else {
6415             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6416                 scan++;
6417             }
6418         }
6419         break;
6420     case NALNUM:
6421         if (utf8_target)
6422             goto utf8_Nwordchar;
6423         while (scan < loceol && ! isALNUM((U8) *scan)) {
6424             scan++;
6425         }
6426         break;
6427
6428     case POSIXA:
6429        while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6430             scan++;
6431         }
6432         break;
6433     case NPOSIXA:
6434         if (utf8_target) {
6435             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6436                 scan += UTF8SKIP(scan);
6437             }
6438         }
6439         else {
6440             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6441                 scan++;
6442             }
6443         }
6444         break;
6445     case NALNUMA:
6446         if (utf8_target) {
6447             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6448                 scan += UTF8SKIP(scan);
6449             }
6450         }
6451         else {
6452             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6453                 scan++;
6454             }
6455         }
6456         break;
6457     case NALNUML:
6458         PL_reg_flags |= RF_tainted;
6459         if (utf8_target) {
6460             loceol = PL_regeol;
6461             while (hardcount < max && scan < loceol &&
6462                    !isALNUM_LC_utf8((U8*)scan)) {
6463                 scan += UTF8SKIP(scan);
6464                 hardcount++;
6465             }
6466         } else {
6467             while (scan < loceol && !isALNUM_LC(*scan))
6468                 scan++;
6469         }
6470         break;
6471     case SPACEU:
6472         if (utf8_target) {
6473
6474     utf8_space:
6475
6476             loceol = PL_regeol;
6477             LOAD_UTF8_CHARCLASS_SPACE();
6478             while (hardcount < max && scan < loceol &&
6479                    (*scan == ' ' ||
6480                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6481             {
6482                 scan += UTF8SKIP(scan);
6483                 hardcount++;
6484             }
6485             break;
6486         }
6487         else {
6488             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6489                 scan++;
6490             }
6491             break;
6492         }
6493     case SPACE:
6494         if (utf8_target)
6495             goto utf8_space;
6496
6497         while (scan < loceol && isSPACE((U8) *scan)) {
6498             scan++;
6499         }
6500         break;
6501     case SPACEA:
6502         while (scan < loceol && isSPACE_A((U8) *scan)) {
6503             scan++;
6504         }
6505         break;
6506     case SPACEL:
6507         PL_reg_flags |= RF_tainted;
6508         if (utf8_target) {
6509             loceol = PL_regeol;
6510             while (hardcount < max && scan < loceol &&
6511                    isSPACE_LC_utf8((U8*)scan)) {
6512                 scan += UTF8SKIP(scan);
6513                 hardcount++;
6514             }
6515         } else {
6516             while (scan < loceol && isSPACE_LC(*scan))
6517                 scan++;
6518         }
6519         break;
6520     case NSPACEU:
6521         if (utf8_target) {
6522
6523     utf8_Nspace:
6524
6525             loceol = PL_regeol;
6526             LOAD_UTF8_CHARCLASS_SPACE();
6527             while (hardcount < max && scan < loceol &&
6528                    ! (*scan == ' ' ||
6529                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6530             {
6531                 scan += UTF8SKIP(scan);
6532                 hardcount++;
6533             }
6534             break;
6535         }
6536         else {
6537             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6538                 scan++;
6539             }
6540         }
6541         break;
6542     case NSPACE:
6543         if (utf8_target)
6544             goto utf8_Nspace;
6545
6546         while (scan < loceol && ! isSPACE((U8) *scan)) {
6547             scan++;
6548         }
6549         break;
6550     case NSPACEA:
6551         if (utf8_target) {
6552             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6553                 scan += UTF8SKIP(scan);
6554             }
6555         }
6556         else {
6557             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6558                 scan++;
6559             }
6560         }
6561         break;
6562     case NSPACEL:
6563         PL_reg_flags |= RF_tainted;
6564         if (utf8_target) {
6565             loceol = PL_regeol;
6566             while (hardcount < max && scan < loceol &&
6567                    !isSPACE_LC_utf8((U8*)scan)) {
6568                 scan += UTF8SKIP(scan);
6569                 hardcount++;
6570             }
6571         } else {
6572             while (scan < loceol && !isSPACE_LC(*scan))
6573                 scan++;
6574         }
6575         break;
6576     case DIGIT:
6577         if (utf8_target) {
6578             loceol = PL_regeol;
6579             LOAD_UTF8_CHARCLASS_DIGIT();
6580             while (hardcount < max && scan < loceol &&
6581                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6582                 scan += UTF8SKIP(scan);
6583                 hardcount++;
6584             }
6585         } else {
6586             while (scan < loceol && isDIGIT(*scan))
6587                 scan++;
6588         }
6589         break;
6590     case DIGITA:
6591         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6592             scan++;
6593         }
6594         break;
6595     case DIGITL:
6596         PL_reg_flags |= RF_tainted;
6597         if (utf8_target) {
6598             loceol = PL_regeol;
6599             while (hardcount < max && scan < loceol &&
6600                    isDIGIT_LC_utf8((U8*)scan)) {
6601                 scan += UTF8SKIP(scan);
6602                 hardcount++;
6603             }
6604         } else {
6605             while (scan < loceol && isDIGIT_LC(*scan))
6606                 scan++;
6607         }
6608         break;
6609     case NDIGIT:
6610         if (utf8_target) {
6611             loceol = PL_regeol;
6612             LOAD_UTF8_CHARCLASS_DIGIT();
6613             while (hardcount < max && scan < loceol &&
6614                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6615                 scan += UTF8SKIP(scan);
6616                 hardcount++;
6617             }
6618         } else {
6619             while (scan < loceol && !isDIGIT(*scan))
6620                 scan++;
6621         }
6622         break;
6623     case NDIGITA:
6624         if (utf8_target) {
6625             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6626                 scan += UTF8SKIP(scan);
6627             }
6628         }
6629         else {
6630             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6631                 scan++;
6632             }
6633         }
6634         break;
6635     case NDIGITL:
6636         PL_reg_flags |= RF_tainted;
6637         if (utf8_target) {
6638             loceol = PL_regeol;
6639             while (hardcount < max && scan < loceol &&
6640                    !isDIGIT_LC_utf8((U8*)scan)) {
6641                 scan += UTF8SKIP(scan);
6642                 hardcount++;
6643             }
6644         } else {
6645             while (scan < loceol && !isDIGIT_LC(*scan))
6646                 scan++;
6647         }
6648         break;
6649     case LNBREAK:
6650         if (utf8_target) {
6651             loceol = PL_regeol;
6652             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6653                 scan += c;
6654                 hardcount++;
6655             }
6656         } else {
6657             /*
6658               LNBREAK can match two latin chars, which is ok,
6659               because we have a null terminated string, but we
6660               have to use hardcount in this situation
6661             */
6662             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6663                 scan+=c;
6664                 hardcount++;
6665             }
6666         }
6667         break;
6668     case HORIZWS:
6669         if (utf8_target) {
6670             loceol = PL_regeol;
6671             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6672                 scan += c;
6673                 hardcount++;
6674             }
6675         } else {
6676             while (scan < loceol && is_HORIZWS_latin1(scan))
6677                 scan++;
6678         }
6679         break;
6680     case NHORIZWS:
6681         if (utf8_target) {
6682             loceol = PL_regeol;
6683             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6684                 scan += UTF8SKIP(scan);
6685                 hardcount++;
6686             }
6687         } else {
6688             while (scan < loceol && !is_HORIZWS_latin1(scan))
6689                 scan++;
6690
6691         }
6692         break;
6693     case VERTWS:
6694         if (utf8_target) {
6695             loceol = PL_regeol;
6696             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6697                 scan += c;
6698                 hardcount++;
6699             }
6700         } else {
6701             while (scan < loceol && is_VERTWS_latin1(scan))
6702                 scan++;
6703
6704         }
6705         break;
6706     case NVERTWS:
6707         if (utf8_target) {
6708             loceol = PL_regeol;
6709             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6710                 scan += UTF8SKIP(scan);
6711                 hardcount++;
6712             }
6713         } else {
6714             while (scan < loceol && !is_VERTWS_latin1(scan))
6715                 scan++;
6716
6717         }
6718         break;
6719
6720     default:            /* Called on something of 0 width. */
6721         break;          /* So match right here or not at all. */
6722     }
6723
6724     if (hardcount)
6725         c = hardcount;
6726     else
6727         c = scan - *startposp;
6728     *startposp = scan;
6729
6730     DEBUG_r({
6731         GET_RE_DEBUG_FLAGS_DECL;
6732         DEBUG_EXECUTE_r({
6733             SV * const prop = sv_newmortal();
6734             regprop(prog, prop, p);
6735             PerlIO_printf(Perl_debug_log,
6736                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6737                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6738         });
6739     });
6740
6741     return(c);
6742 }
6743
6744
6745 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6746 /*
6747 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
6748 create a copy so that changes the caller makes won't change the shared one
6749  */
6750 SV *
6751 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6752 {
6753     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6754     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6755 }
6756 #endif
6757
6758 STATIC SV *
6759 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6760 {
6761     /* Returns the swash for the input 'node' in the regex 'prog'.
6762      * If <doinit> is true, will attempt to create the swash if not already
6763      *    done.
6764      * If <listsvp> is non-null, will return the swash initialization string in
6765      *    it.
6766      * If <altsvp> is non-null, will return the alternates to the regular swash
6767      *    in it
6768      * Tied intimately to how regcomp.c sets up the data structure */
6769
6770     dVAR;
6771     SV *sw  = NULL;
6772     SV *si  = NULL;
6773     SV *alt = NULL;
6774     SV*  invlist = NULL;
6775
6776     RXi_GET_DECL(prog,progi);
6777     const struct reg_data * const data = prog ? progi->data : NULL;
6778
6779     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6780
6781     assert(ANYOF_NONBITMAP(node));
6782
6783     if (data && data->count) {
6784         const U32 n = ARG(node);
6785
6786         if (data->what[n] == 's') {
6787             SV * const rv = MUTABLE_SV(data->data[n]);
6788             AV * const av = MUTABLE_AV(SvRV(rv));
6789             SV **const ary = AvARRAY(av);
6790             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6791
6792             si = *ary;  /* ary[0] = the string to initialize the swash with */
6793
6794             /* Elements 3 and 4 are either both present or both absent. [3] is
6795              * any inversion list generated at compile time; [4] indicates if
6796              * that inversion list has any user-defined properties in it. */
6797             if (av_len(av) >= 3) {
6798                 invlist = ary[3];
6799                 if (SvUV(ary[4])) {
6800                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
6801                 }
6802             }
6803             else {
6804                 invlist = NULL;
6805             }
6806
6807             /* Element [1] is reserved for the set-up swash.  If already there,
6808              * return it; if not, create it and store it there */
6809             if (SvROK(ary[1])) {
6810                 sw = ary[1];
6811             }
6812             else if (si && doinit) {
6813
6814                 sw = _core_swash_init("utf8", /* the utf8 package */
6815                                       "", /* nameless */
6816                                       si,
6817                                       1, /* binary */
6818                                       0, /* not from tr/// */
6819                                       invlist,
6820                                       &swash_init_flags);
6821                 (void)av_store(av, 1, sw);
6822             }
6823
6824             /* Element [2] is for any multi-char folds.  Note that is a
6825              * fundamentally flawed design, because can't backtrack and try
6826              * again.  See [perl #89774] */
6827             if (SvTYPE(ary[2]) == SVt_PVAV) {
6828                 alt = ary[2];
6829             }
6830         }
6831     }
6832
6833     if (listsvp) {
6834         SV* matches_string = newSVpvn("", 0);
6835
6836         /* Use the swash, if any, which has to have incorporated into it all
6837          * possibilities */
6838         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
6839             && (si && si != &PL_sv_undef))
6840         {
6841
6842             /* If no swash, use the input initialization string, if available */
6843             sv_catsv(matches_string, si);
6844         }
6845
6846         /* Add the inversion list to whatever we have.  This may have come from
6847          * the swash, or from an input parameter */
6848         if (invlist) {
6849             sv_catsv(matches_string, _invlist_contents(invlist));
6850         }
6851         *listsvp = matches_string;
6852     }
6853
6854     if (altsvp)
6855         *altsvp  = alt;
6856
6857     return sw;
6858 }
6859
6860 /*
6861  - reginclass - determine if a character falls into a character class
6862
6863   n is the ANYOF regnode
6864   p is the target string
6865   lenp is pointer to the maximum number of bytes of how far to go in p
6866     (This is assumed wthout checking to always be at least the current
6867     character's size)
6868   utf8_target tells whether p is in UTF-8.
6869
6870   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6871   from a successful match, the value it points to will be updated to how many
6872   bytes in p were matched.  If there was no match, the value is undefined,
6873   possibly changed from the input.
6874
6875   Note that this can be a synthetic start class, a combination of various
6876   nodes, so things you think might be mutually exclusive, such as locale,
6877   aren't.  It can match both locale and non-locale
6878
6879  */
6880
6881 STATIC bool
6882 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6883 {
6884     dVAR;
6885     const char flags = ANYOF_FLAGS(n);
6886     bool match = FALSE;
6887     UV c = *p;
6888     STRLEN c_len = 0;
6889     STRLEN maxlen;
6890
6891     PERL_ARGS_ASSERT_REGINCLASS;
6892
6893     /* If c is not already the code point, get it */
6894     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6895         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6896                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6897                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6898                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6899                  * UTF8_ALLOW_FFFF */
6900         if (c_len == (STRLEN)-1)
6901             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6902     }
6903     else {
6904         c_len = 1;
6905     }
6906
6907     /* Use passed in max length, or one character if none passed in or less
6908      * than one character.  And assume will match just one character.  This is
6909      * overwritten later if matched more. */
6910     if (lenp) {
6911         maxlen = (*lenp > c_len) ? *lenp : c_len;
6912         *lenp = c_len;
6913
6914     }
6915     else {
6916         maxlen = c_len;
6917     }
6918
6919     /* If this character is potentially in the bitmap, check it */
6920     if (c < 256) {
6921         if (ANYOF_BITMAP_TEST(n, c))
6922             match = TRUE;
6923         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6924                 && ! utf8_target
6925                 && ! isASCII(c))
6926         {
6927             match = TRUE;
6928         }
6929
6930         else if (flags & ANYOF_LOCALE) {
6931             PL_reg_flags |= RF_tainted;
6932
6933             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6934                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6935             {
6936                 match = TRUE;
6937             }
6938             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6939                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6940                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6941                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6942                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6943                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6944                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6945                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6946                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6947                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6948                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6949                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
6950                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
6951                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6952                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6953                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6954                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6955                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6956                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6957                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6958                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6959                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6960                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6961                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6962                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6963                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6964                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6965                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6966                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6967                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
6968                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
6969                      ) /* How's that for a conditional? */
6970             ) {
6971                 match = TRUE;
6972             }
6973         }
6974     }
6975
6976     /* If the bitmap didn't (or couldn't) match, and something outside the
6977      * bitmap could match, try that.  Locale nodes specifiy completely the
6978      * behavior of code points in the bit map (otherwise, a utf8 target would
6979      * cause them to be treated as Unicode and not locale), except in
6980      * the very unlikely event when this node is a synthetic start class, which
6981      * could be a combination of locale and non-locale nodes.  So allow locale
6982      * to match for the synthetic start class, which will give a false
6983      * positive that will be resolved when the match is done again as not part
6984      * of the synthetic start class */
6985     if (!match) {
6986         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6987             match = TRUE;       /* Everything above 255 matches */
6988         }
6989         else if (ANYOF_NONBITMAP(n)
6990                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6991                      || (utf8_target
6992                          && (c >=256
6993                              || (! (flags & ANYOF_LOCALE))
6994                              || (flags & ANYOF_IS_SYNTHETIC)))))
6995         {
6996             AV *av;
6997             SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6998
6999             if (sw) {
7000                 U8 * utf8_p;
7001                 if (utf8_target) {
7002                     utf8_p = (U8 *) p;
7003                 } else {
7004
7005                     /* Not utf8.  Convert as much of the string as available up
7006                      * to the limit of how far the (single) character in the
7007                      * pattern can possibly match (no need to go further).  If
7008                      * the node is a straight ANYOF or not folding, it can't
7009                      * match more than one.  Otherwise, It can match up to how
7010                      * far a single char can fold to.  Since not utf8, each
7011                      * character is a single byte, so the max it can be in
7012                      * bytes is the same as the max it can be in characters */
7013                     STRLEN len = (OP(n) == ANYOF
7014                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
7015                                   ? 1
7016                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
7017                                     ? maxlen
7018                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
7019                     utf8_p = bytes_to_utf8(p, &len);
7020                 }
7021
7022                 if (swash_fetch(sw, utf8_p, TRUE))
7023                     match = TRUE;
7024                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
7025
7026                     /* Here, we need to test if the fold of the target string
7027                      * matches.  The non-multi char folds have all been moved to
7028                      * the compilation phase, and the multi-char folds have
7029                      * been stored by regcomp into 'av'; we linearly check to
7030                      * see if any match the target string (folded).   We know
7031                      * that the originals were each one character, but we don't
7032                      * currently know how many characters/bytes each folded to,
7033                      * except we do know that there are small limits imposed by
7034                      * Unicode.  XXX A performance enhancement would be to have
7035                      * regcomp.c store the max number of chars/bytes that are
7036                      * in an av entry, as, say the 0th element.  Even better
7037                      * would be to have a hash of the few characters that can
7038                      * start a multi-char fold to the max number of chars of
7039                      * those folds.
7040                      *
7041                      * If there is a match, we will need to advance (if lenp is
7042                      * specified) the match pointer in the target string.  But
7043                      * what we are comparing here isn't that string directly,
7044                      * but its fold, whose length may differ from the original.
7045                      * As we go along in constructing the fold, therefore, we
7046                      * create a map so that we know how many bytes in the
7047                      * source to advance given that we have matched a certain
7048                      * number of bytes in the fold.  This map is stored in
7049                      * 'map_fold_len_back'.  Let n mean the number of bytes in
7050                      * the fold of the first character that we are folding.
7051                      * Then map_fold_len_back[n] is set to the number of bytes
7052                      * in that first character.  Similarly let m be the
7053                      * corresponding number for the second character to be
7054                      * folded.  Then map_fold_len_back[n+m] is set to the
7055                      * number of bytes occupied by the first two source
7056                      * characters. ... */
7057                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
7058                     U8 folded[UTF8_MAXBYTES_CASE+1];
7059                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
7060                     STRLEN total_foldlen = 0; /* num bytes in fold of all
7061                                                   chars */
7062
7063                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
7064
7065                         /* Here, only need to fold the first char of the target
7066                          * string.  It the source wasn't utf8, is 1 byte long */
7067                         to_utf8_fold(utf8_p, folded, &foldlen);
7068                         total_foldlen = foldlen;
7069                         map_fold_len_back[foldlen] = (utf8_target)
7070                                                      ? UTF8SKIP(utf8_p)
7071                                                      : 1;
7072                     }
7073                     else {
7074
7075                         /* Here, need to fold more than the first char.  Do so
7076                          * up to the limits */
7077                         U8* source_ptr = utf8_p;    /* The source for the fold
7078                                                        is the regex target
7079                                                        string */
7080                         U8* folded_ptr = folded;
7081                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
7082                                                        available byte in the
7083                                                        target string */
7084                         U8 i;
7085                         for (i = 0;
7086                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
7087                              i++)
7088                         {
7089
7090                             /* Fold the next character */
7091                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
7092                             STRLEN this_char_foldlen;
7093                             to_utf8_fold(source_ptr,
7094                                          this_char_folded,
7095                                          &this_char_foldlen);
7096
7097                             /* Bail if it would exceed the byte limit for
7098                              * folding a single char. */
7099                             if (this_char_foldlen + folded_ptr - folded >
7100                                                             UTF8_MAXBYTES_CASE)
7101                             {
7102                                 break;
7103                             }
7104
7105                             /* Add the fold of this character */
7106                             Copy(this_char_folded,
7107                                  folded_ptr,
7108                                  this_char_foldlen,
7109                                  U8);
7110                             source_ptr += UTF8SKIP(source_ptr);
7111                             folded_ptr += this_char_foldlen;
7112                             total_foldlen = folded_ptr - folded;
7113
7114                             /* Create map from the number of bytes in the fold
7115                              * back to the number of bytes in the source.  If
7116                              * the source isn't utf8, the byte count is just
7117                              * the number of characters so far */
7118                             map_fold_len_back[total_foldlen]
7119                                                       = (utf8_target)
7120                                                         ? source_ptr - utf8_p
7121                                                         : i + 1;
7122                         }
7123                         *folded_ptr = '\0';
7124                     }
7125
7126
7127                     /* Do the linear search to see if the fold is in the list
7128                      * of multi-char folds. */
7129                     if (av) {
7130                         I32 i;
7131                         for (i = 0; i <= av_len(av); i++) {
7132                             SV* const sv = *av_fetch(av, i, FALSE);
7133                             STRLEN len;
7134                             const char * const s = SvPV_const(sv, len);
7135
7136                             if (len <= total_foldlen
7137                                 && memEQ(s, (char*)folded, len)
7138
7139                                    /* If 0, means matched a partial char. See
7140                                     * [perl #90536] */
7141                                 && map_fold_len_back[len])
7142                             {
7143
7144                                 /* Advance the target string ptr to account for
7145                                  * this fold, but have to translate from the
7146                                  * folded length to the corresponding source
7147                                  * length. */
7148                                 if (lenp) {
7149                                     *lenp = map_fold_len_back[len];
7150                                 }
7151                                 match = TRUE;
7152                                 break;
7153                             }
7154                         }
7155                     }
7156                 }
7157
7158                 /* If we allocated a string above, free it */
7159                 if (! utf8_target) Safefree(utf8_p);
7160             }
7161         }
7162
7163         if (UNICODE_IS_SUPER(c)
7164             && (flags & ANYOF_WARN_SUPER)
7165             && ckWARN_d(WARN_NON_UNICODE))
7166         {
7167             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7168                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7169         }
7170     }
7171
7172     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7173     return cBOOL(flags & ANYOF_INVERT) ^ match;
7174 }
7175
7176 STATIC U8 *
7177 S_reghop3(U8 *s, I32 off, const U8* lim)
7178 {
7179     /* return the position 'off' UTF-8 characters away from 's', forward if
7180      * 'off' >= 0, backwards if negative.  But don't go outside of position
7181      * 'lim', which better be < s  if off < 0 */
7182
7183     dVAR;
7184
7185     PERL_ARGS_ASSERT_REGHOP3;
7186
7187     if (off >= 0) {
7188         while (off-- && s < lim) {
7189             /* XXX could check well-formedness here */
7190             s += UTF8SKIP(s);
7191         }
7192     }
7193     else {
7194         while (off++ && s > lim) {
7195             s--;
7196             if (UTF8_IS_CONTINUED(*s)) {
7197                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7198                     s--;
7199             }
7200             /* XXX could check well-formedness here */
7201         }
7202     }
7203     return s;
7204 }
7205
7206 #ifdef XXX_dmq
7207 /* there are a bunch of places where we use two reghop3's that should
7208    be replaced with this routine. but since thats not done yet
7209    we ifdef it out - dmq
7210 */
7211 STATIC U8 *
7212 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7213 {
7214     dVAR;
7215
7216     PERL_ARGS_ASSERT_REGHOP4;
7217
7218     if (off >= 0) {
7219         while (off-- && s < rlim) {
7220             /* XXX could check well-formedness here */
7221             s += UTF8SKIP(s);
7222         }
7223     }
7224     else {
7225         while (off++ && s > llim) {
7226             s--;
7227             if (UTF8_IS_CONTINUED(*s)) {
7228                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7229                     s--;
7230             }
7231             /* XXX could check well-formedness here */
7232         }
7233     }
7234     return s;
7235 }
7236 #endif
7237
7238 STATIC U8 *
7239 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7240 {
7241     dVAR;
7242
7243     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7244
7245     if (off >= 0) {
7246         while (off-- && s < lim) {
7247             /* XXX could check well-formedness here */
7248             s += UTF8SKIP(s);
7249         }
7250         if (off >= 0)
7251             return NULL;
7252     }
7253     else {
7254         while (off++ && s > lim) {
7255             s--;
7256             if (UTF8_IS_CONTINUED(*s)) {
7257                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7258                     s--;
7259             }
7260             /* XXX could check well-formedness here */
7261         }
7262         if (off <= 0)
7263             return NULL;
7264     }
7265     return s;
7266 }
7267
7268 static void
7269 restore_pos(pTHX_ void *arg)
7270 {
7271     dVAR;
7272     regexp * const rex = (regexp *)arg;
7273     if (PL_reg_state.re_state_eval_setup_done) {
7274         if (PL_reg_oldsaved) {
7275             rex->subbeg = PL_reg_oldsaved;
7276             rex->sublen = PL_reg_oldsavedlen;
7277             rex->suboffset = PL_reg_oldsavedoffset;
7278             rex->subcoffset = PL_reg_oldsavedcoffset;
7279 #ifdef PERL_OLD_COPY_ON_WRITE
7280             rex->saved_copy = PL_nrs;
7281 #endif
7282             RXp_MATCH_COPIED_on(rex);
7283         }
7284         PL_reg_magic->mg_len = PL_reg_oldpos;
7285         PL_reg_state.re_state_eval_setup_done = FALSE;
7286         PL_curpm = PL_reg_oldcurpm;
7287     }
7288 }
7289
7290 STATIC void
7291 S_to_utf8_substr(pTHX_ register regexp *prog)
7292 {
7293     int i = 1;
7294
7295     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7296
7297     do {
7298         if (prog->substrs->data[i].substr
7299             && !prog->substrs->data[i].utf8_substr) {
7300             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7301             prog->substrs->data[i].utf8_substr = sv;
7302             sv_utf8_upgrade(sv);
7303             if (SvVALID(prog->substrs->data[i].substr)) {
7304                 if (SvTAIL(prog->substrs->data[i].substr)) {
7305                     /* Trim the trailing \n that fbm_compile added last
7306                        time.  */
7307                     SvCUR_set(sv, SvCUR(sv) - 1);
7308                     /* Whilst this makes the SV technically "invalid" (as its
7309                        buffer is no longer followed by "\0") when fbm_compile()
7310                        adds the "\n" back, a "\0" is restored.  */
7311                     fbm_compile(sv, FBMcf_TAIL);
7312                 } else
7313                     fbm_compile(sv, 0);
7314             }
7315             if (prog->substrs->data[i].substr == prog->check_substr)
7316                 prog->check_utf8 = sv;
7317         }
7318     } while (i--);
7319 }
7320
7321 STATIC void
7322 S_to_byte_substr(pTHX_ register regexp *prog)
7323 {
7324     dVAR;
7325     int i = 1;
7326
7327     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7328
7329     do {
7330         if (prog->substrs->data[i].utf8_substr
7331             && !prog->substrs->data[i].substr) {
7332             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7333             if (sv_utf8_downgrade(sv, TRUE)) {
7334                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7335                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7336                         /* Trim the trailing \n that fbm_compile added last
7337                            time.  */
7338                         SvCUR_set(sv, SvCUR(sv) - 1);
7339                         fbm_compile(sv, FBMcf_TAIL);
7340                     } else
7341                         fbm_compile(sv, 0);
7342                 }
7343             } else {
7344                 SvREFCNT_dec(sv);
7345                 sv = &PL_sv_undef;
7346             }
7347             prog->substrs->data[i].substr = sv;
7348             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7349                 prog->check_substr = sv;
7350         }
7351     } while (i--);
7352 }
7353
7354 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
7355  * pre-composed Hangul syllables, which are all in a contiguous block and
7356  * arranged there in such a way so as to facilitate alorithmic determination of
7357  * their characteristics.  As such, they don't need a swash, but can be
7358  * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
7359  * is a GCB=LV */
7360 #define SBASE 0xAC00    /* Start of block */
7361 #define SCount 11172    /* Length of block */
7362 #define TCount 28
7363
7364 #if 0   /* This routine is not currently used */
7365 PERL_STATIC_INLINE bool
7366 S_is_utf8_X_LV(pTHX_ const U8 *p)
7367 {
7368     /* Unlike most other similarly named routines here, this does not create a
7369      * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7370
7371     dVAR;
7372
7373     UV cp = valid_utf8_to_uvchr(p, NULL);
7374
7375     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7376
7377     /* The earliest Unicode releases did not have these precomposed Hangul
7378      * syllables.  Set to point to undef in that case, so will return false on
7379      * every call */
7380     if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
7381         PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7382         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7383             SvREFCNT_dec(PL_utf8_X_LV);
7384             PL_utf8_X_LV = &PL_sv_undef;
7385         }
7386     }
7387
7388     return (PL_utf8_X_LV != &PL_sv_undef
7389             && cp >= SBASE && cp < SBASE + SCount
7390             && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7391 }
7392 #endif
7393
7394 PERL_STATIC_INLINE bool
7395 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7396 {
7397     /* Unlike most other similarly named routines here, this does not create a
7398      * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7399
7400     dVAR;
7401
7402     UV cp = valid_utf8_to_uvchr(p, NULL);
7403
7404     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7405
7406     /* The earliest Unicode releases did not have these precomposed Hangul
7407      * syllables.  Set to point to undef in that case, so will return false on
7408      * every call */
7409     if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
7410         PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7411         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7412             SvREFCNT_dec(PL_utf8_X_LVT);
7413             PL_utf8_X_LVT = &PL_sv_undef;
7414         }
7415     }
7416
7417     return (PL_utf8_X_LVT != &PL_sv_undef
7418             && cp >= SBASE && cp < SBASE + SCount
7419             && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7420 }
7421
7422 /*
7423  * Local variables:
7424  * c-indentation-style: bsd
7425  * c-basic-offset: 4
7426  * indent-tabs-mode: nil
7427  * End:
7428  *
7429  * ex: set ts=8 sts=4 sw=4 et:
7430  */