src/5017002/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1       /* tainted information used? e.g. locale */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #ifndef STATIC
  92 #define STATIC  static
  93 #endif
  94
  95 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  96  * call if there are no complications: i.e., if everything matchable is
  97  * straight forward in the bitmap */
  98 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
  99                                               : ANYOF_BITMAP_TEST(p,*(c)))
 100
 101 /*
 102  * Forwards.
 103  */
 104
 105 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 106 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 107
 108 #define HOPc(pos,off) \
 109         (char *)(PL_reg_match_utf8 \
 110             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 111             : (U8*)(pos + off))
 112 #define HOPBACKc(pos, off) \
 113         (char*)(PL_reg_match_utf8\
 114             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 115             : (pos - off >= PL_bostr)           \
 116                 ? (U8*)pos - off                \
 117                 : NULL)
 118
 119 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 120 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 121
 122 /* these are unrolled below in the CCC_TRY_XXX defined */
 123 #ifdef EBCDIC
 124     /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
 125      * skip the check on EBCDIC platforms */
 126 #   define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
 127 #else
 128 #   define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 129     if (!CAT2(PL_utf8_,class)) { \
 130         bool ok; \
 131         ENTER; save_re_context(); \
 132         ok=CAT2(is_utf8_,class)((const U8*)str); \
 133         PERL_UNUSED_VAR(ok); \
 134         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 135 #endif
 136
 137 /* Doesn't do an assert to verify that is correct */
 138 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 139     if (!CAT2(PL_utf8_,class)) { \
 140         bool throw_away PERL_UNUSED_DECL; \
 141         ENTER; save_re_context(); \
 142         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 143         LEAVE; } } STMT_END
 144
 145 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 146 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 147 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 148
 149 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 150         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 151         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 152         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 153             * assert should likely and hopefully fail on an EBCDIC machine */ \
 154         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 155                                                                             \
 156         /* No asserts are done for these, in case called on an early        \
 157             * Unicode version in which they map to nothing */               \
 158         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 159         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 160         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 161         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 162         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 163         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 164         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 165
 166 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 167
 168 /* The actual code for CCC_TRY, which uses several variables from the routine
 169  * it's callable from.  It is designed to be the bulk of a case statement.
 170  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 171  *      nextchr matches the class.
 172  * UTF8_TEST is the whole test string to use for utf8 targets
 173  * LOAD is what to use to test, and if not present to load in the swash for the
 174  *      class
 175  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 176  *      UTF8_TEST test.
 177  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 178  * utf8 and a variant, load the swash if necessary and test using the utf8
 179  * test.  Advance to the next character if test is ok, otherwise fail; If not
 180  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 181  * fails, or advance to the next character */
 182
 183 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 184     if (locinput >= PL_regeol) {                                              \
 185         sayNO;                                                                \
 186     }                                                                         \
 187     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 188         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 189         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 190             sayNO;                                                            \
 191         }                                                                     \
 192         locinput += PL_utf8skip[nextchr];                                     \
 193         nextchr = UCHARAT(locinput);                                          \
 194         break;                                                                \
 195     }                                                                         \
 196     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 197         sayNO;                                                                \
 198     }                                                                         \
 199     nextchr = UCHARAT(++locinput);                                            \
 200     break;
 201
 202 /* Handle the non-locale cases for a character class and its complement.  It
 203  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 204  * This is because that code fails when the test succeeds, so we want to have
 205  * the test fail so that the code succeeds.  The swash is stored in a
 206  * predictable PL_ place */
 207 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 208                            CLASS, STR)                                        \
 209     case NAME:                                                                \
 210         _CCC_TRY_CODE( !, FUNC,                                               \
 211                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 212                                             (U8*)locinput, TRUE)),            \
 213                           CLASS, STR)                                         \
 214     case NNAME:                                                               \
 215         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 216                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 217                                             (U8*)locinput, TRUE)),            \
 218                           CLASS, STR)                                         \
 219
 220 /* Generate the case statements for both locale and non-locale character
 221  * classes in regmatch for classes that don't have special unicode semantics.
 222  * Locales don't use an immediate swash, but an intermediary special locale
 223  * function that is called on the pointer to the current place in the input
 224  * string.  That function will resolve to needing the same swash.  One might
 225  * think that because we don't know what the locale will match, we shouldn't
 226  * check with the swash loading function that it loaded properly; ie, that we
 227  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 228  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 229  * irrelevant here */
 230 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 231                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 232                 NAMEA, NNAMEA, FUNCA,                                         \
 233                 CLASS, STR)                                                   \
 234     case NAMEL:                                                               \
 235         PL_reg_flags |= RF_tainted;                                           \
 236         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 237     case NNAMEL:                                                              \
 238         PL_reg_flags |= RF_tainted;                                           \
 239         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 240                        CLASS, STR)                                            \
 241     case NAMEA:                                                               \
 242         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 243             sayNO;                                                            \
 244         }                                                                     \
 245         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 246         nextchr = UCHARAT(++locinput);                                        \
 247         break;                                                                \
 248     case NNAMEA:                                                              \
 249         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 250             sayNO;                                                            \
 251         }                                                                     \
 252         if (utf8_target) {                                                    \
 253             locinput += PL_utf8skip[nextchr];                                 \
 254             nextchr = UCHARAT(locinput);                                      \
 255         }                                                                     \
 256         else {                                                                \
 257             nextchr = UCHARAT(++locinput);                                    \
 258         }                                                                     \
 259         break;                                                                \
 260     /* Generate the non-locale cases */                                       \
 261     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 262
 263 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 264  * statements to handle separate Unicode semantics nodes */
 265 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 266                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 267                   NAMEU, NNAMEU, FUNCU,                                        \
 268                   NAMEA, NNAMEA, FUNCA,                                        \
 269                   CLASS, STR)                                                  \
 270     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 271             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 272             NAMEA, NNAMEA, FUNCA,                                              \
 273             CLASS, STR)                                                        \
 274     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 275
 276 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 277
 278 /* for use after a quantifier and before an EXACT-like node -- japhy */
 279 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 280  *
 281  * NOTE that *nothing* that affects backtracking should be in here, specifically
 282  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 283  * node that is in between two EXACT like nodes when ascertaining what the required
 284  * "follow" character is. This should probably be moved to regex compile time
 285  * although it may be done at run time beause of the REF possibility - more
 286  * investigation required. -- demerphq
 287 */
 288 #define JUMPABLE(rn) (      \
 289     OP(rn) == OPEN ||       \
 290     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 291     OP(rn) == EVAL ||   \
 292     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 293     OP(rn) == PLUS || OP(rn) == MINMOD || \
 294     OP(rn) == KEEPS || \
 295     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 296 )
 297 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 298
 299 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 300
 301 #if 0
 302 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 303    we don't need this definition. */
 304 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 305 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 306 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 307
 308 #else
 309 /* ... so we use this as its faster. */
 310 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 311 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 312 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 313 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 314
 315 #endif
 316
 317 /*
 318   Search for mandatory following text node; for lookahead, the text must
 319   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 320 */
 321 #define FIND_NEXT_IMPT(rn) STMT_START { \
 322     while (JUMPABLE(rn)) { \
 323         const OPCODE type = OP(rn); \
 324         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 325             rn = NEXTOPER(NEXTOPER(rn)); \
 326         else if (type == PLUS) \
 327             rn = NEXTOPER(rn); \
 328         else if (type == IFMATCH) \
 329             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 330         else rn += NEXT_OFF(rn); \
 331     } \
 332 } STMT_END
 333
 334
 335 static void restore_pos(pTHX_ void *arg);
 336
 337 #define REGCP_PAREN_ELEMS 3
 338 #define REGCP_OTHER_ELEMS 3
 339 #define REGCP_FRAME_ELEMS 1
 340 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 341  * are needed for the regexp context stack bookkeeping. */
 342
 343 STATIC CHECKPOINT
 344 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 345 {
 346     dVAR;
 347     const int retval = PL_savestack_ix;
 348     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 349     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 350     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 351     I32 p;
 352     GET_RE_DEBUG_FLAGS_DECL;
 353
 354     PERL_ARGS_ASSERT_REGCPPUSH;
 355
 356     if (paren_elems_to_push < 0)
 357         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 358                    paren_elems_to_push);
 359
 360     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 361         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 362                    " out of range (%lu-%ld)",
 363                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 364
 365     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 366
 367     DEBUG_BUFFERS_r(
 368         if ((int)PL_regsize > (int)parenfloor)
 369             PerlIO_printf(Perl_debug_log,
 370                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 371                 PTR2UV(rex),
 372                 PTR2UV(rex->offs)
 373             );
 374     );
 375     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 376 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 377         SSPUSHINT(rex->offs[p].end);
 378         SSPUSHINT(rex->offs[p].start);
 379         SSPUSHINT(rex->offs[p].start_tmp);
 380         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 381             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 382             (UV)p,
 383             (IV)rex->offs[p].start,
 384             (IV)rex->offs[p].start_tmp,
 385             (IV)rex->offs[p].end
 386         ));
 387     }
 388 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 389     SSPUSHINT(PL_regsize);
 390     SSPUSHINT(rex->lastparen);
 391     SSPUSHINT(rex->lastcloseparen);
 392     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 393
 394     return retval;
 395 }
 396
 397 /* These are needed since we do not localize EVAL nodes: */
 398 #define REGCP_SET(cp)                                           \
 399     DEBUG_STATE_r(                                              \
 400             PerlIO_printf(Perl_debug_log,                       \
 401                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 402                 (IV)PL_savestack_ix));                          \
 403     cp = PL_savestack_ix
 404
 405 #define REGCP_UNWIND(cp)                                        \
 406     DEBUG_STATE_r(                                              \
 407         if (cp != PL_savestack_ix)                              \
 408             PerlIO_printf(Perl_debug_log,                       \
 409                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 410                 (IV)(cp), (IV)PL_savestack_ix));                \
 411     regcpblow(cp)
 412
 413 #define UNWIND_PAREN(lp, lcp)               \
 414     for (n = rex->lastparen; n > lp; n--)   \
 415         rex->offs[n].end = -1;              \
 416     rex->lastparen = n;                     \
 417     rex->lastcloseparen = lcp;
 418
 419
 420 STATIC void
 421 S_regcppop(pTHX_ regexp *rex)
 422 {
 423     dVAR;
 424     UV i;
 425     U32 paren;
 426     GET_RE_DEBUG_FLAGS_DECL;
 427
 428     PERL_ARGS_ASSERT_REGCPPOP;
 429
 430     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 431     i = SSPOPUV;
 432     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 433     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 434     rex->lastcloseparen = SSPOPINT;
 435     rex->lastparen = SSPOPINT;
 436     PL_regsize = SSPOPINT;
 437
 438     i -= REGCP_OTHER_ELEMS;
 439     /* Now restore the parentheses context. */
 440     DEBUG_BUFFERS_r(
 441         if (i || rex->lastparen + 1 <= rex->nparens)
 442             PerlIO_printf(Perl_debug_log,
 443                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 444                 PTR2UV(rex),
 445                 PTR2UV(rex->offs)
 446             );
 447     );
 448     paren = PL_regsize;
 449     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 450         I32 tmps;
 451         rex->offs[paren].start_tmp = SSPOPINT;
 452         rex->offs[paren].start = SSPOPINT;
 453         tmps = SSPOPINT;
 454         if (paren <= rex->lastparen)
 455             rex->offs[paren].end = tmps;
 456         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 457             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 458             (UV)paren,
 459             (IV)rex->offs[paren].start,
 460             (IV)rex->offs[paren].start_tmp,
 461             (IV)rex->offs[paren].end,
 462             (paren > rex->lastparen ? "(skipped)" : ""));
 463         );
 464         paren--;
 465     }
 466 #if 1
 467     /* It would seem that the similar code in regtry()
 468      * already takes care of this, and in fact it is in
 469      * a better location to since this code can #if 0-ed out
 470      * but the code in regtry() is needed or otherwise tests
 471      * requiring null fields (pat.t#187 and split.t#{13,14}
 472      * (as of patchlevel 7877)  will fail.  Then again,
 473      * this code seems to be necessary or otherwise
 474      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 475      * --jhi updated by dapm */
 476     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 477         if (i > PL_regsize)
 478             rex->offs[i].start = -1;
 479         rex->offs[i].end = -1;
 480         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 481             "    \\%"UVuf": %s   ..-1 undeffing\n",
 482             (UV)i,
 483             (i > PL_regsize) ? "-1" : "  "
 484         ));
 485     }
 486 #endif
 487 }
 488
 489 /* restore the parens and associated vars at savestack position ix,
 490  * but without popping the stack */
 491
 492 STATIC void
 493 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 494 {
 495     I32 tmpix = PL_savestack_ix;
 496     PL_savestack_ix = ix;
 497     regcppop(rex);
 498     PL_savestack_ix = tmpix;
 499 }
 500
 501 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 502
 503 /*
 504  * pregexec and friends
 505  */
 506
 507 #ifndef PERL_IN_XSUB_RE
 508 /*
 509  - pregexec - match a regexp against a string
 510  */
 511 I32
 512 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 513          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 514 /* strend: pointer to null at end of string */
 515 /* strbeg: real beginning of string */
 516 /* minend: end of match must be >=minend after stringarg. */
 517 /* nosave: For optimizations. */
 518 {
 519     PERL_ARGS_ASSERT_PREGEXEC;
 520
 521     return
 522         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 523                       nosave ? 0 : REXEC_COPY_STR);
 524 }
 525 #endif
 526
 527 /*
 528  * Need to implement the following flags for reg_anch:
 529  *
 530  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 531  * USE_INTUIT_ML
 532  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 533  * INTUIT_AUTORITATIVE_ML
 534  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 535  * INTUIT_ONCE_ML
 536  *
 537  * Another flag for this function: SECOND_TIME (so that float substrs
 538  * with giant delta may be not rechecked).
 539  */
 540
 541 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 542
 543 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 544    Otherwise, only SvCUR(sv) is used to get strbeg. */
 545
 546 /* XXXX We assume that strpos is strbeg unless sv. */
 547
 548 /* XXXX Some places assume that there is a fixed substring.
 549         An update may be needed if optimizer marks as "INTUITable"
 550         RExen without fixed substrings.  Similarly, it is assumed that
 551         lengths of all the strings are no more than minlen, thus they
 552         cannot come from lookahead.
 553         (Or minlen should take into account lookahead.)
 554   NOTE: Some of this comment is not correct. minlen does now take account
 555   of lookahead/behind. Further research is required. -- demerphq
 556
 557 */
 558
 559 /* A failure to find a constant substring means that there is no need to make
 560    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 561    finding a substring too deep into the string means that less calls to
 562    regtry() should be needed.
 563
 564    REx compiler's optimizer found 4 possible hints:
 565         a) Anchored substring;
 566         b) Fixed substring;
 567         c) Whether we are anchored (beginning-of-line or \G);
 568         d) First node (of those at offset 0) which may distinguish positions;
 569    We use a)b)d) and multiline-part of c), and try to find a position in the
 570    string which does not contradict any of them.
 571  */
 572
 573 /* Most of decisions we do here should have been done at compile time.
 574    The nodes of the REx which we used for the search should have been
 575    deleted from the finite automaton. */
 576
 577 char *
 578 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 579                      char *strend, const U32 flags, re_scream_pos_data *data)
 580 {
 581     dVAR;
 582     struct regexp *const prog = (struct regexp *)SvANY(rx);
 583     register I32 start_shift = 0;
 584     /* Should be nonnegative! */
 585     register I32 end_shift   = 0;
 586     register char *s;
 587     register SV *check;
 588     char *strbeg;
 589     char *t;
 590     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 591     I32 ml_anch;
 592     register char *other_last = NULL;   /* other substr checked before this */
 593     char *check_at = NULL;              /* check substr found at this pos */
 594     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 595     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 596     RXi_GET_DECL(prog,progi);
 597 #ifdef DEBUGGING
 598     const char * const i_strpos = strpos;
 599 #endif
 600     GET_RE_DEBUG_FLAGS_DECL;
 601
 602     PERL_ARGS_ASSERT_RE_INTUIT_START;
 603     PERL_UNUSED_ARG(flags);
 604     PERL_UNUSED_ARG(data);
 605
 606     RX_MATCH_UTF8_set(rx,utf8_target);
 607
 608     if (RX_UTF8(rx)) {
 609         PL_reg_flags |= RF_utf8;
 610     }
 611     DEBUG_EXECUTE_r(
 612         debug_start_match(rx, utf8_target, strpos, strend,
 613             sv ? "Guessing start of match in sv for"
 614                : "Guessing start of match in string for");
 615               );
 616
 617     /* CHR_DIST() would be more correct here but it makes things slow. */
 618     if (prog->minlen > strend - strpos) {
 619         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 620                               "String too short... [re_intuit_start]\n"));
 621         goto fail;
 622     }
 623
 624     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 625     PL_regeol = strend;
 626     if (utf8_target) {
 627         if (!prog->check_utf8 && prog->check_substr)
 628             to_utf8_substr(prog);
 629         check = prog->check_utf8;
 630     } else {
 631         if (!prog->check_substr && prog->check_utf8)
 632             to_byte_substr(prog);
 633         check = prog->check_substr;
 634     }
 635     if (check == &PL_sv_undef) {
 636         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 637                 "Non-utf8 string cannot match utf8 check string\n"));
 638         goto fail;
 639     }
 640     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 641         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 642                      || ( (prog->extflags & RXf_ANCH_BOL)
 643                           && !multiline ) );    /* Check after \n? */
 644
 645         if (!ml_anch) {
 646           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 647                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 648                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 649                && sv && !SvROK(sv)
 650                && (strpos != strbeg)) {
 651               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 652               goto fail;
 653           }
 654           if (prog->check_offset_min == prog->check_offset_max &&
 655               !(prog->extflags & RXf_CANY_SEEN)) {
 656             /* Substring at constant offset from beg-of-str... */
 657             I32 slen;
 658
 659             s = HOP3c(strpos, prog->check_offset_min, strend);
 660
 661             if (SvTAIL(check)) {
 662                 slen = SvCUR(check);    /* >= 1 */
 663
 664                 if ( strend - s > slen || strend - s < slen - 1
 665                      || (strend - s == slen && strend[-1] != '\n')) {
 666                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 667                     goto fail_finish;
 668                 }
 669                 /* Now should match s[0..slen-2] */
 670                 slen--;
 671                 if (slen && (*SvPVX_const(check) != *s
 672                              || (slen > 1
 673                                  && memNE(SvPVX_const(check), s, slen)))) {
 674                   report_neq:
 675                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 676                     goto fail_finish;
 677                 }
 678             }
 679             else if (*SvPVX_const(check) != *s
 680                      || ((slen = SvCUR(check)) > 1
 681                          && memNE(SvPVX_const(check), s, slen)))
 682                 goto report_neq;
 683             check_at = s;
 684             goto success_at_start;
 685           }
 686         }
 687         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 688         s = strpos;
 689         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 690         end_shift = prog->check_end_shift;
 691
 692         if (!ml_anch) {
 693             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 694                                          - (SvTAIL(check) != 0);
 695             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 696
 697             if (end_shift < eshift)
 698                 end_shift = eshift;
 699         }
 700     }
 701     else {                              /* Can match at random position */
 702         ml_anch = 0;
 703         s = strpos;
 704         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 705         end_shift = prog->check_end_shift;
 706
 707         /* end shift should be non negative here */
 708     }
 709
 710 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 711     if (end_shift < 0)
 712         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 713                    (IV)end_shift, RX_PRECOMP(prog));
 714 #endif
 715
 716   restart:
 717     /* Find a possible match in the region s..strend by looking for
 718        the "check" substring in the region corrected by start/end_shift. */
 719
 720     {
 721         I32 srch_start_shift = start_shift;
 722         I32 srch_end_shift = end_shift;
 723         U8* start_point;
 724         U8* end_point;
 725         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 726             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 727             srch_start_shift = strbeg - s;
 728         }
 729     DEBUG_OPTIMISE_MORE_r({
 730         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 731             (IV)prog->check_offset_min,
 732             (IV)srch_start_shift,
 733             (IV)srch_end_shift,
 734             (IV)prog->check_end_shift);
 735     });
 736
 737         if (prog->extflags & RXf_CANY_SEEN) {
 738             start_point= (U8*)(s + srch_start_shift);
 739             end_point= (U8*)(strend - srch_end_shift);
 740         } else {
 741             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 742             end_point= HOP3(strend, -srch_end_shift, strbeg);
 743         }
 744         DEBUG_OPTIMISE_MORE_r({
 745             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 746                 (int)(end_point - start_point),
 747                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 748                 start_point);
 749         });
 750
 751         s = fbm_instr( start_point, end_point,
 752                       check, multiline ? FBMrf_MULTILINE : 0);
 753     }
 754     /* Update the count-of-usability, remove useless subpatterns,
 755         unshift s.  */
 756
 757     DEBUG_EXECUTE_r({
 758         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 759             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 760         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 761                           (s ? "Found" : "Did not find"),
 762             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 763                 ? "anchored" : "floating"),
 764             quoted,
 765             RE_SV_TAIL(check),
 766             (s ? " at offset " : "...\n") );
 767     });
 768
 769     if (!s)
 770         goto fail_finish;
 771     /* Finish the diagnostic message */
 772     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 773
 774     /* XXX dmq: first branch is for positive lookbehind...
 775        Our check string is offset from the beginning of the pattern.
 776        So we need to do any stclass tests offset forward from that
 777        point. I think. :-(
 778      */
 779
 780
 781
 782     check_at=s;
 783
 784
 785     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 786        Start with the other substr.
 787        XXXX no SCREAM optimization yet - and a very coarse implementation
 788        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 789                 *always* match.  Probably should be marked during compile...
 790        Probably it is right to do no SCREAM here...
 791      */
 792
 793     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 794                 : (prog->float_substr && prog->anchored_substr))
 795     {
 796         /* Take into account the "other" substring. */
 797         /* XXXX May be hopelessly wrong for UTF... */
 798         if (!other_last)
 799             other_last = strpos;
 800         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 801           do_other_anchored:
 802             {
 803                 char * const last = HOP3c(s, -start_shift, strbeg);
 804                 char *last1, *last2;
 805                 char * const saved_s = s;
 806                 SV* must;
 807
 808                 t = s - prog->check_offset_max;
 809                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 810                     && (!utf8_target
 811                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 812                             && t > strpos)))
 813                     NOOP;
 814                 else
 815                     t = strpos;
 816                 t = HOP3c(t, prog->anchored_offset, strend);
 817                 if (t < other_last)     /* These positions already checked */
 818                     t = other_last;
 819                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 820                 if (last < last1)
 821                     last1 = last;
 822                 /* XXXX It is not documented what units *_offsets are in.
 823                    We assume bytes, but this is clearly wrong.
 824                    Meaning this code needs to be carefully reviewed for errors.
 825                    dmq.
 826                   */
 827
 828                 /* On end-of-str: see comment below. */
 829                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 830                 if (must == &PL_sv_undef) {
 831                     s = (char*)NULL;
 832                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 833                 }
 834                 else
 835                     s = fbm_instr(
 836                         (unsigned char*)t,
 837                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 838                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 839                         must,
 840                         multiline ? FBMrf_MULTILINE : 0
 841                     );
 842                 DEBUG_EXECUTE_r({
 843                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 844                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 845                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 846                         (s ? "Found" : "Contradicts"),
 847                         quoted, RE_SV_TAIL(must));
 848                 });
 849
 850
 851                 if (!s) {
 852                     if (last1 >= last2) {
 853                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 854                                                 ", giving up...\n"));
 855                         goto fail_finish;
 856                     }
 857                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 858                         ", trying floating at offset %ld...\n",
 859                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 860                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 861                     s = HOP3c(last, 1, strend);
 862                     goto restart;
 863                 }
 864                 else {
 865                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 866                           (long)(s - i_strpos)));
 867                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 868                     other_last = HOP3c(s, 1, strend);
 869                     s = saved_s;
 870                     if (t == strpos)
 871                         goto try_at_start;
 872                     goto try_at_offset;
 873                 }
 874             }
 875         }
 876         else {          /* Take into account the floating substring. */
 877             char *last, *last1;
 878             char * const saved_s = s;
 879             SV* must;
 880
 881             t = HOP3c(s, -start_shift, strbeg);
 882             last1 = last =
 883                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 884             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 885                 last = HOP3c(t, prog->float_max_offset, strend);
 886             s = HOP3c(t, prog->float_min_offset, strend);
 887             if (s < other_last)
 888                 s = other_last;
 889  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 890             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 891             /* fbm_instr() takes into account exact value of end-of-str
 892                if the check is SvTAIL(ed).  Since false positives are OK,
 893                and end-of-str is not later than strend we are OK. */
 894             if (must == &PL_sv_undef) {
 895                 s = (char*)NULL;
 896                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 897             }
 898             else
 899                 s = fbm_instr((unsigned char*)s,
 900                               (unsigned char*)last + SvCUR(must)
 901                                   - (SvTAIL(must)!=0),
 902                               must, multiline ? FBMrf_MULTILINE : 0);
 903             DEBUG_EXECUTE_r({
 904                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 905                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 906                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 907                     (s ? "Found" : "Contradicts"),
 908                     quoted, RE_SV_TAIL(must));
 909             });
 910             if (!s) {
 911                 if (last1 == last) {
 912                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 913                                             ", giving up...\n"));
 914                     goto fail_finish;
 915                 }
 916                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 917                     ", trying anchored starting at offset %ld...\n",
 918                     (long)(saved_s + 1 - i_strpos)));
 919                 other_last = last;
 920                 s = HOP3c(t, 1, strend);
 921                 goto restart;
 922             }
 923             else {
 924                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 925                       (long)(s - i_strpos)));
 926                 other_last = s; /* Fix this later. --Hugo */
 927                 s = saved_s;
 928                 if (t == strpos)
 929                     goto try_at_start;
 930                 goto try_at_offset;
 931             }
 932         }
 933     }
 934
 935
 936     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 937
 938     DEBUG_OPTIMISE_MORE_r(
 939         PerlIO_printf(Perl_debug_log,
 940             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 941             (IV)prog->check_offset_min,
 942             (IV)prog->check_offset_max,
 943             (IV)(s-strpos),
 944             (IV)(t-strpos),
 945             (IV)(t-s),
 946             (IV)(strend-strpos)
 947         )
 948     );
 949
 950     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 951         && (!utf8_target
 952             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 953                  && t > strpos)))
 954     {
 955         /* Fixed substring is found far enough so that the match
 956            cannot start at strpos. */
 957       try_at_offset:
 958         if (ml_anch && t[-1] != '\n') {
 959             /* Eventually fbm_*() should handle this, but often
 960                anchored_offset is not 0, so this check will not be wasted. */
 961             /* XXXX In the code below we prefer to look for "^" even in
 962                presence of anchored substrings.  And we search even
 963                beyond the found float position.  These pessimizations
 964                are historical artefacts only.  */
 965           find_anchor:
 966             while (t < strend - prog->minlen) {
 967                 if (*t == '\n') {
 968                     if (t < check_at - prog->check_offset_min) {
 969                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 970                             /* Since we moved from the found position,
 971                                we definitely contradict the found anchored
 972                                substr.  Due to the above check we do not
 973                                contradict "check" substr.
 974                                Thus we can arrive here only if check substr
 975                                is float.  Redo checking for "other"=="fixed".
 976                              */
 977                             strpos = t + 1;
 978                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 979                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 980                             goto do_other_anchored;
 981                         }
 982                         /* We don't contradict the found floating substring. */
 983                         /* XXXX Why not check for STCLASS? */
 984                         s = t + 1;
 985                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 986                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 987                         goto set_useful;
 988                     }
 989                     /* Position contradicts check-string */
 990                     /* XXXX probably better to look for check-string
 991                        than for "\n", so one should lower the limit for t? */
 992                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 993                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 994                     other_last = strpos = s = t + 1;
 995                     goto restart;
 996                 }
 997                 t++;
 998             }
 999             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1000                         PL_colors[0], PL_colors[1]));
1001             goto fail_finish;
1002         }
1003         else {
1004             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1005                         PL_colors[0], PL_colors[1]));
1006         }
1007         s = t;
1008       set_useful:
1009         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1010     }
1011     else {
1012         /* The found string does not prohibit matching at strpos,
1013            - no optimization of calling REx engine can be performed,
1014            unless it was an MBOL and we are not after MBOL,
1015            or a future STCLASS check will fail this. */
1016       try_at_start:
1017         /* Even in this situation we may use MBOL flag if strpos is offset
1018            wrt the start of the string. */
1019         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1020             && (strpos != strbeg) && strpos[-1] != '\n'
1021             /* May be due to an implicit anchor of m{.*foo}  */
1022             && !(prog->intflags & PREGf_IMPLICIT))
1023         {
1024             t = strpos;
1025             goto find_anchor;
1026         }
1027         DEBUG_EXECUTE_r( if (ml_anch)
1028             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1029                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1030         );
1031       success_at_start:
1032         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1033             && (utf8_target ? (
1034                 prog->check_utf8                /* Could be deleted already */
1035                 && --BmUSEFUL(prog->check_utf8) < 0
1036                 && (prog->check_utf8 == prog->float_utf8)
1037             ) : (
1038                 prog->check_substr              /* Could be deleted already */
1039                 && --BmUSEFUL(prog->check_substr) < 0
1040                 && (prog->check_substr == prog->float_substr)
1041             )))
1042         {
1043             /* If flags & SOMETHING - do not do it many times on the same match */
1044             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1045             /* XXX Does the destruction order has to change with utf8_target? */
1046             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1047             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1048             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1049             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1050             check = NULL;                       /* abort */
1051             s = strpos;
1052             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1053                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1054             if (prog->intflags & PREGf_IMPLICIT)
1055                 prog->extflags &= ~RXf_ANCH_MBOL;
1056             /* XXXX This is a remnant of the old implementation.  It
1057                     looks wasteful, since now INTUIT can use many
1058                     other heuristics. */
1059             prog->extflags &= ~RXf_USE_INTUIT;
1060             /* XXXX What other flags might need to be cleared in this branch? */
1061         }
1062         else
1063             s = strpos;
1064     }
1065
1066     /* Last resort... */
1067     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1068     /* trie stclasses are too expensive to use here, we are better off to
1069        leave it to regmatch itself */
1070     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1071         /* minlen == 0 is possible if regstclass is \b or \B,
1072            and the fixed substr is ''$.
1073            Since minlen is already taken into account, s+1 is before strend;
1074            accidentally, minlen >= 1 guaranties no false positives at s + 1
1075            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1076            regstclass does not come from lookahead...  */
1077         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1078            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1079         const U8* const str = (U8*)STRING(progi->regstclass);
1080         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1081                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1082                     : 1);
1083         char * endpos;
1084         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1085             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1086         else if (prog->float_substr || prog->float_utf8)
1087             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1088         else
1089             endpos= strend;
1090
1091         if (checked_upto < s)
1092            checked_upto = s;
1093         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1094                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1095
1096         t = s;
1097         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1098         if (s) {
1099             checked_upto = s;
1100         } else {
1101 #ifdef DEBUGGING
1102             const char *what = NULL;
1103 #endif
1104             if (endpos == strend) {
1105                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1106                                 "Could not match STCLASS...\n") );
1107                 goto fail;
1108             }
1109             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1110                                    "This position contradicts STCLASS...\n") );
1111             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1112                 goto fail;
1113             checked_upto = HOPBACKc(endpos, start_shift);
1114             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1115                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1116             /* Contradict one of substrings */
1117             if (prog->anchored_substr || prog->anchored_utf8) {
1118                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1119                     DEBUG_EXECUTE_r( what = "anchored" );
1120                   hop_and_restart:
1121                     s = HOP3c(t, 1, strend);
1122                     if (s + start_shift + end_shift > strend) {
1123                         /* XXXX Should be taken into account earlier? */
1124                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1125                                                "Could not match STCLASS...\n") );
1126                         goto fail;
1127                     }
1128                     if (!check)
1129                         goto giveup;
1130                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1131                                 "Looking for %s substr starting at offset %ld...\n",
1132                                  what, (long)(s + start_shift - i_strpos)) );
1133                     goto restart;
1134                 }
1135                 /* Have both, check_string is floating */
1136                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1137                     goto retry_floating_check;
1138                 /* Recheck anchored substring, but not floating... */
1139                 s = check_at;
1140                 if (!check)
1141                     goto giveup;
1142                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1143                           "Looking for anchored substr starting at offset %ld...\n",
1144                           (long)(other_last - i_strpos)) );
1145                 goto do_other_anchored;
1146             }
1147             /* Another way we could have checked stclass at the
1148                current position only: */
1149             if (ml_anch) {
1150                 s = t = t + 1;
1151                 if (!check)
1152                     goto giveup;
1153                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1154                           "Looking for /%s^%s/m starting at offset %ld...\n",
1155                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1156                 goto try_at_offset;
1157             }
1158             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1159                 goto fail;
1160             /* Check is floating substring. */
1161           retry_floating_check:
1162             t = check_at - start_shift;
1163             DEBUG_EXECUTE_r( what = "floating" );
1164             goto hop_and_restart;
1165         }
1166         if (t != s) {
1167             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1168                         "By STCLASS: moving %ld --> %ld\n",
1169                                   (long)(t - i_strpos), (long)(s - i_strpos))
1170                    );
1171         }
1172         else {
1173             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1174                                   "Does not contradict STCLASS...\n");
1175                    );
1176         }
1177     }
1178   giveup:
1179     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1180                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1181                           PL_colors[5], (long)(s - i_strpos)) );
1182     return s;
1183
1184   fail_finish:                          /* Substring not found */
1185     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1186         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1187   fail:
1188     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1189                           PL_colors[4], PL_colors[5]));
1190     return NULL;
1191 }
1192
1193 #define DECL_TRIE_TYPE(scan) \
1194     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1195                     trie_type = ((scan->flags == EXACT) \
1196                               ? (utf8_target ? trie_utf8 : trie_plain) \
1197                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1198
1199 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1200 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1201     STRLEN skiplen;                                                                 \
1202     switch (trie_type) {                                                            \
1203     case trie_utf8_fold:                                                            \
1204         if ( foldlen>0 ) {                                                          \
1205             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1206             foldlen -= len;                                                         \
1207             uscan += len;                                                           \
1208             len=0;                                                                  \
1209         } else {                                                                    \
1210             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1211             len = UTF8SKIP(uc);                                                     \
1212             skiplen = UNISKIP( uvc );                                               \
1213             foldlen -= skiplen;                                                     \
1214             uscan = foldbuf + skiplen;                                              \
1215         }                                                                           \
1216         break;                                                                      \
1217     case trie_latin_utf8_fold:                                                      \
1218         if ( foldlen>0 ) {                                                          \
1219             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1220             foldlen -= len;                                                         \
1221             uscan += len;                                                           \
1222             len=0;                                                                  \
1223         } else {                                                                    \
1224             len = 1;                                                                \
1225             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1226             skiplen = UNISKIP( uvc );                                               \
1227             foldlen -= skiplen;                                                     \
1228             uscan = foldbuf + skiplen;                                              \
1229         }                                                                           \
1230         break;                                                                      \
1231     case trie_utf8:                                                                 \
1232         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1233         break;                                                                      \
1234     case trie_plain:                                                                \
1235         uvc = (UV)*uc;                                                              \
1236         len = 1;                                                                    \
1237     }                                                                               \
1238     if (uvc < 256) {                                                                \
1239         charid = trie->charmap[ uvc ];                                              \
1240     }                                                                               \
1241     else {                                                                          \
1242         charid = 0;                                                                 \
1243         if (widecharmap) {                                                          \
1244             SV** const svpp = hv_fetch(widecharmap,                                 \
1245                         (char*)&uvc, sizeof(UV), 0);                                \
1246             if (svpp)                                                               \
1247                 charid = (U16)SvIV(*svpp);                                          \
1248         }                                                                           \
1249     }                                                                               \
1250 } STMT_END
1251
1252 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1253 STMT_START {                                              \
1254     while (s <= e) {                                      \
1255         if ( (CoNd)                                       \
1256              && (ln == 1 || folder(s, pat_string, ln))    \
1257              && (!reginfo || regtry(reginfo, &s)) )       \
1258             goto got_it;                                  \
1259         s++;                                              \
1260     }                                                     \
1261 } STMT_END
1262
1263 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1264 STMT_START {                                          \
1265     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1266         CoDe                                          \
1267         s += uskip;                                   \
1268     }                                                 \
1269 } STMT_END
1270
1271 #define REXEC_FBC_SCAN(CoDe)                          \
1272 STMT_START {                                          \
1273     while (s < strend) {                              \
1274         CoDe                                          \
1275         s++;                                          \
1276     }                                                 \
1277 } STMT_END
1278
1279 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1280 REXEC_FBC_UTF8_SCAN(                                  \
1281     if (CoNd) {                                       \
1282         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1283             goto got_it;                              \
1284         else                                          \
1285             tmp = doevery;                            \
1286     }                                                 \
1287     else                                              \
1288         tmp = 1;                                      \
1289 )
1290
1291 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1292 REXEC_FBC_SCAN(                                       \
1293     if (CoNd) {                                       \
1294         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1295             goto got_it;                              \
1296         else                                          \
1297             tmp = doevery;                            \
1298     }                                                 \
1299     else                                              \
1300         tmp = 1;                                      \
1301 )
1302
1303 #define REXEC_FBC_TRYIT               \
1304 if ((!reginfo || regtry(reginfo, &s))) \
1305     goto got_it
1306
1307 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1308     if (utf8_target) {                                             \
1309         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1310     }                                                          \
1311     else {                                                     \
1312         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1313     }
1314
1315 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1316     if (utf8_target) {                                             \
1317         UtFpReLoAd;                                            \
1318         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1319     }                                                          \
1320     else {                                                     \
1321         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1322     }
1323
1324 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1325     PL_reg_flags |= RF_tainted;                                \
1326     if (utf8_target) {                                             \
1327         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1328     }                                                          \
1329     else {                                                     \
1330         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1331     }
1332
1333 #define DUMP_EXEC_POS(li,s,doutf8) \
1334     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1335
1336
1337 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1338         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1339         tmp = TEST_NON_UTF8(tmp);                                              \
1340         REXEC_FBC_UTF8_SCAN(                                                   \
1341             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1342                 tmp = !tmp;                                                    \
1343                 IF_SUCCESS;                                                    \
1344             }                                                                  \
1345             else {                                                             \
1346                 IF_FAIL;                                                       \
1347             }                                                                  \
1348         );                                                                     \
1349
1350 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1351         if (s == PL_bostr) {                                                   \
1352             tmp = '\n';                                                        \
1353         }                                                                      \
1354         else {                                                                 \
1355             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1356             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1357         }                                                                      \
1358         tmp = TeSt1_UtF8;                                                      \
1359         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1360         REXEC_FBC_UTF8_SCAN(                                                   \
1361             if (tmp == ! (TeSt2_UtF8)) { \
1362                 tmp = !tmp;                                                    \
1363                 IF_SUCCESS;                                                    \
1364             }                                                                  \
1365             else {                                                             \
1366                 IF_FAIL;                                                       \
1367             }                                                                  \
1368         );                                                                     \
1369
1370 /* The only difference between the BOUND and NBOUND cases is that
1371  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1372  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1373  * with the other one being empty */
1374 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1375     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1376
1377 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1378     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1379
1380 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1381     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1382
1383 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1384     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1385
1386
1387 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1388  * be passed in completely with the variable name being tested, which isn't
1389  * such a clean interface, but this is easier to read than it was before.  We
1390  * are looking for the boundary (or non-boundary between a word and non-word
1391  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1392  * must be different.  Find the "wordness" of the character just prior to this
1393  * one, and compare it with the wordness of this one.  If they differ, we have
1394  * a boundary.  At the beginning of the string, pretend that the previous
1395  * character was a new-line */
1396 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1397     if (utf8_target) {                                                         \
1398                 UTF8_CODE \
1399     }                                                                          \
1400     else {  /* Not utf8 */                                                     \
1401         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1402         tmp = TEST_NON_UTF8(tmp);                                              \
1403         REXEC_FBC_SCAN(                                                        \
1404             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1405                 tmp = !tmp;                                                    \
1406                 IF_SUCCESS;                                                    \
1407             }                                                                  \
1408             else {                                                             \
1409                 IF_FAIL;                                                       \
1410             }                                                                  \
1411         );                                                                     \
1412     }                                                                          \
1413     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1414         goto got_it;
1415
1416 /* We know what class REx starts with.  Try to find this position... */
1417 /* if reginfo is NULL, its a dryrun */
1418 /* annoyingly all the vars in this routine have different names from their counterparts
1419    in regmatch. /grrr */
1420
1421 STATIC char *
1422 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1423     const char *strend, regmatch_info *reginfo)
1424 {
1425         dVAR;
1426         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1427         char *pat_string;   /* The pattern's exactish string */
1428         char *pat_end;      /* ptr to end char of pat_string */
1429         re_fold_t folder;       /* Function for computing non-utf8 folds */
1430         const U8 *fold_array;   /* array for folding ords < 256 */
1431         STRLEN ln;
1432         STRLEN lnc;
1433         register STRLEN uskip;
1434         U8 c1;
1435         U8 c2;
1436         char *e;
1437         register I32 tmp = 1;   /* Scratch variable? */
1438         register const bool utf8_target = PL_reg_match_utf8;
1439         UV utf8_fold_flags = 0;
1440         RXi_GET_DECL(prog,progi);
1441
1442         PERL_ARGS_ASSERT_FIND_BYCLASS;
1443
1444         /* We know what class it must start with. */
1445         switch (OP(c)) {
1446         case ANYOFV:
1447         case ANYOF:
1448             if (utf8_target || OP(c) == ANYOFV) {
1449                 STRLEN inclasslen = strend - s;
1450                 REXEC_FBC_UTF8_CLASS_SCAN(
1451                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1452             }
1453             else {
1454                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1455             }
1456             break;
1457         case CANY:
1458             REXEC_FBC_SCAN(
1459                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1460                     goto got_it;
1461                 else
1462                     tmp = doevery;
1463             );
1464             break;
1465
1466         case EXACTFA:
1467             if (UTF_PATTERN || utf8_target) {
1468                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1469                 goto do_exactf_utf8;
1470             }
1471             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1472             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1473             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1474
1475         case EXACTF:
1476             if (utf8_target) {
1477
1478                 /* regcomp.c already folded this if pattern is in UTF-8 */
1479                 utf8_fold_flags = 0;
1480                 goto do_exactf_utf8;
1481             }
1482             fold_array = PL_fold;
1483             folder = foldEQ;
1484             goto do_exactf_non_utf8;
1485
1486         case EXACTFL:
1487             if (UTF_PATTERN || utf8_target) {
1488                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1489                 goto do_exactf_utf8;
1490             }
1491             fold_array = PL_fold_locale;
1492             folder = foldEQ_locale;
1493             goto do_exactf_non_utf8;
1494
1495         case EXACTFU_SS:
1496             if (UTF_PATTERN) {
1497                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1498             }
1499             goto do_exactf_utf8;
1500
1501         case EXACTFU_TRICKYFOLD:
1502         case EXACTFU:
1503             if (UTF_PATTERN || utf8_target) {
1504                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1505                 goto do_exactf_utf8;
1506             }
1507
1508             /* Any 'ss' in the pattern should have been replaced by regcomp,
1509              * so we don't have to worry here about this single special case
1510              * in the Latin1 range */
1511             fold_array = PL_fold_latin1;
1512             folder = foldEQ_latin1;
1513
1514             /* FALL THROUGH */
1515
1516         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1517                                are no glitches with fold-length differences
1518                                between the target string and pattern */
1519
1520             /* The idea in the non-utf8 EXACTF* cases is to first find the
1521              * first character of the EXACTF* node and then, if necessary,
1522              * case-insensitively compare the full text of the node.  c1 is the
1523              * first character.  c2 is its fold.  This logic will not work for
1524              * Unicode semantics and the german sharp ss, which hence should
1525              * not be compiled into a node that gets here. */
1526             pat_string = STRING(c);
1527             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1528
1529             /* We know that we have to match at least 'ln' bytes (which is the
1530              * same as characters, since not utf8).  If we have to match 3
1531              * characters, and there are only 2 availabe, we know without
1532              * trying that it will fail; so don't start a match past the
1533              * required minimum number from the far end */
1534             e = HOP3c(strend, -((I32)ln), s);
1535
1536             if (!reginfo && e < s) {
1537                 e = s;                  /* Due to minlen logic of intuit() */
1538             }
1539
1540             c1 = *pat_string;
1541             c2 = fold_array[c1];
1542             if (c1 == c2) { /* If char and fold are the same */
1543                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1544             }
1545             else {
1546                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1547             }
1548             break;
1549
1550         do_exactf_utf8:
1551         {
1552             unsigned expansion;
1553
1554
1555             /* If one of the operands is in utf8, we can't use the simpler
1556              * folding above, due to the fact that many different characters
1557              * can have the same fold, or portion of a fold, or different-
1558              * length fold */
1559             pat_string = STRING(c);
1560             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1561             pat_end = pat_string + ln;
1562             lnc = (UTF_PATTERN) /* length to match in characters */
1563                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1564                     : ln;
1565
1566             /* We have 'lnc' characters to match in the pattern, but because of
1567              * multi-character folding, each character in the target can match
1568              * up to 3 characters (Unicode guarantees it will never exceed
1569              * this) if it is utf8-encoded; and up to 2 if not (based on the
1570              * fact that the Latin 1 folds are already determined, and the
1571              * only multi-char fold in that range is the sharp-s folding to
1572              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1573              * string character.  Adjust lnc accordingly, rounding up, so that
1574              * if we need to match at least 4+1/3 chars, that really is 5. */
1575             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1576             lnc = (lnc + expansion - 1) / expansion;
1577
1578             /* As in the non-UTF8 case, if we have to match 3 characters, and
1579              * only 2 are left, it's guaranteed to fail, so don't start a
1580              * match that would require us to go beyond the end of the string
1581              */
1582             e = HOP3c(strend, -((I32)lnc), s);
1583
1584             if (!reginfo && e < s) {
1585                 e = s;                  /* Due to minlen logic of intuit() */
1586             }
1587
1588             /* XXX Note that we could recalculate e to stop the loop earlier,
1589              * as the worst case expansion above will rarely be met, and as we
1590              * go along we would usually find that e moves further to the left.
1591              * This would happen only after we reached the point in the loop
1592              * where if there were no expansion we should fail.  Unclear if
1593              * worth the expense */
1594
1595             while (s <= e) {
1596                 char *my_strend= (char *)strend;
1597                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1598                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1599                     && (!reginfo || regtry(reginfo, &s)) )
1600                 {
1601                     goto got_it;
1602                 }
1603                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1604             }
1605             break;
1606         }
1607         case BOUNDL:
1608             PL_reg_flags |= RF_tainted;
1609             FBC_BOUND(isALNUM_LC,
1610                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1611                       isALNUM_LC_utf8((U8*)s));
1612             break;
1613         case NBOUNDL:
1614             PL_reg_flags |= RF_tainted;
1615             FBC_NBOUND(isALNUM_LC,
1616                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1617                        isALNUM_LC_utf8((U8*)s));
1618             break;
1619         case BOUND:
1620             FBC_BOUND(isWORDCHAR,
1621                       isALNUM_uni(tmp),
1622                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1623             break;
1624         case BOUNDA:
1625             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1626                              isWORDCHAR_A(tmp),
1627                              isWORDCHAR_A((U8*)s));
1628             break;
1629         case NBOUND:
1630             FBC_NBOUND(isWORDCHAR,
1631                        isALNUM_uni(tmp),
1632                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1633             break;
1634         case NBOUNDA:
1635             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1636                               isWORDCHAR_A(tmp),
1637                               isWORDCHAR_A((U8*)s));
1638             break;
1639         case BOUNDU:
1640             FBC_BOUND(isWORDCHAR_L1,
1641                       isALNUM_uni(tmp),
1642                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1643             break;
1644         case NBOUNDU:
1645             FBC_NBOUND(isWORDCHAR_L1,
1646                        isALNUM_uni(tmp),
1647                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1648             break;
1649         case ALNUML:
1650             REXEC_FBC_CSCAN_TAINT(
1651                 isALNUM_LC_utf8((U8*)s),
1652                 isALNUM_LC(*s)
1653             );
1654             break;
1655         case ALNUMU:
1656             REXEC_FBC_CSCAN_PRELOAD(
1657                 LOAD_UTF8_CHARCLASS_ALNUM(),
1658                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1659                 isWORDCHAR_L1((U8) *s)
1660             );
1661             break;
1662         case ALNUM:
1663             REXEC_FBC_CSCAN_PRELOAD(
1664                 LOAD_UTF8_CHARCLASS_ALNUM(),
1665                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1666                 isWORDCHAR((U8) *s)
1667             );
1668             break;
1669         case ALNUMA:
1670             /* Don't need to worry about utf8, as it can match only a single
1671              * byte invariant character */
1672             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1673             break;
1674         case NALNUMU:
1675             REXEC_FBC_CSCAN_PRELOAD(
1676                 LOAD_UTF8_CHARCLASS_ALNUM(),
1677                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1678                 ! isWORDCHAR_L1((U8) *s)
1679             );
1680             break;
1681         case NALNUM:
1682             REXEC_FBC_CSCAN_PRELOAD(
1683                 LOAD_UTF8_CHARCLASS_ALNUM(),
1684                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1685                 ! isALNUM(*s)
1686             );
1687             break;
1688         case NALNUMA:
1689             REXEC_FBC_CSCAN(
1690                 !isWORDCHAR_A(*s),
1691                 !isWORDCHAR_A(*s)
1692             );
1693             break;
1694         case NALNUML:
1695             REXEC_FBC_CSCAN_TAINT(
1696                 !isALNUM_LC_utf8((U8*)s),
1697                 !isALNUM_LC(*s)
1698             );
1699             break;
1700         case SPACEU:
1701             REXEC_FBC_CSCAN_PRELOAD(
1702                 LOAD_UTF8_CHARCLASS_SPACE(),
1703                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1704                 isSPACE_L1((U8) *s)
1705             );
1706             break;
1707         case SPACE:
1708             REXEC_FBC_CSCAN_PRELOAD(
1709                 LOAD_UTF8_CHARCLASS_SPACE(),
1710                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1711                 isSPACE((U8) *s)
1712             );
1713             break;
1714         case SPACEA:
1715             /* Don't need to worry about utf8, as it can match only a single
1716              * byte invariant character */
1717             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1718             break;
1719         case SPACEL:
1720             REXEC_FBC_CSCAN_TAINT(
1721                 isSPACE_LC_utf8((U8*)s),
1722                 isSPACE_LC(*s)
1723             );
1724             break;
1725         case NSPACEU:
1726             REXEC_FBC_CSCAN_PRELOAD(
1727                 LOAD_UTF8_CHARCLASS_SPACE(),
1728                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1729                 ! isSPACE_L1((U8) *s)
1730             );
1731             break;
1732         case NSPACE:
1733             REXEC_FBC_CSCAN_PRELOAD(
1734                 LOAD_UTF8_CHARCLASS_SPACE(),
1735                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1736                 ! isSPACE((U8) *s)
1737             );
1738             break;
1739         case NSPACEA:
1740             REXEC_FBC_CSCAN(
1741                 !isSPACE_A(*s),
1742                 !isSPACE_A(*s)
1743             );
1744             break;
1745         case NSPACEL:
1746             REXEC_FBC_CSCAN_TAINT(
1747                 !isSPACE_LC_utf8((U8*)s),
1748                 !isSPACE_LC(*s)
1749             );
1750             break;
1751         case DIGIT:
1752             REXEC_FBC_CSCAN_PRELOAD(
1753                 LOAD_UTF8_CHARCLASS_DIGIT(),
1754                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1755                 isDIGIT(*s)
1756             );
1757             break;
1758         case DIGITA:
1759             /* Don't need to worry about utf8, as it can match only a single
1760              * byte invariant character */
1761             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1762             break;
1763         case DIGITL:
1764             REXEC_FBC_CSCAN_TAINT(
1765                 isDIGIT_LC_utf8((U8*)s),
1766                 isDIGIT_LC(*s)
1767             );
1768             break;
1769         case NDIGIT:
1770             REXEC_FBC_CSCAN_PRELOAD(
1771                 LOAD_UTF8_CHARCLASS_DIGIT(),
1772                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1773                 !isDIGIT(*s)
1774             );
1775             break;
1776         case NDIGITA:
1777             REXEC_FBC_CSCAN(
1778                 !isDIGIT_A(*s),
1779                 !isDIGIT_A(*s)
1780             );
1781             break;
1782         case NDIGITL:
1783             REXEC_FBC_CSCAN_TAINT(
1784                 !isDIGIT_LC_utf8((U8*)s),
1785                 !isDIGIT_LC(*s)
1786             );
1787             break;
1788         case LNBREAK:
1789             REXEC_FBC_CSCAN(
1790                 is_LNBREAK_utf8(s),
1791                 is_LNBREAK_latin1(s)
1792             );
1793             break;
1794         case VERTWS:
1795             REXEC_FBC_CSCAN(
1796                 is_VERTWS_utf8(s),
1797                 is_VERTWS_latin1(s)
1798             );
1799             break;
1800         case NVERTWS:
1801             REXEC_FBC_CSCAN(
1802                 !is_VERTWS_utf8(s),
1803                 !is_VERTWS_latin1(s)
1804             );
1805             break;
1806         case HORIZWS:
1807             REXEC_FBC_CSCAN(
1808                 is_HORIZWS_utf8(s),
1809                 is_HORIZWS_latin1(s)
1810             );
1811             break;
1812         case NHORIZWS:
1813             REXEC_FBC_CSCAN(
1814                 !is_HORIZWS_utf8(s),
1815                 !is_HORIZWS_latin1(s)
1816             );
1817             break;
1818         case AHOCORASICKC:
1819         case AHOCORASICK:
1820             {
1821                 DECL_TRIE_TYPE(c);
1822                 /* what trie are we using right now */
1823                 reg_ac_data *aho
1824                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1825                 reg_trie_data *trie
1826                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1827                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1828
1829                 const char *last_start = strend - trie->minlen;
1830 #ifdef DEBUGGING
1831                 const char *real_start = s;
1832 #endif
1833                 STRLEN maxlen = trie->maxlen;
1834                 SV *sv_points;
1835                 U8 **points; /* map of where we were in the input string
1836                                 when reading a given char. For ASCII this
1837                                 is unnecessary overhead as the relationship
1838                                 is always 1:1, but for Unicode, especially
1839                                 case folded Unicode this is not true. */
1840                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1841                 U8 *bitmap=NULL;
1842
1843
1844                 GET_RE_DEBUG_FLAGS_DECL;
1845
1846                 /* We can't just allocate points here. We need to wrap it in
1847                  * an SV so it gets freed properly if there is a croak while
1848                  * running the match */
1849                 ENTER;
1850                 SAVETMPS;
1851                 sv_points=newSV(maxlen * sizeof(U8 *));
1852                 SvCUR_set(sv_points,
1853                     maxlen * sizeof(U8 *));
1854                 SvPOK_on(sv_points);
1855                 sv_2mortal(sv_points);
1856                 points=(U8**)SvPV_nolen(sv_points );
1857                 if ( trie_type != trie_utf8_fold
1858                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1859                 {
1860                     if (trie->bitmap)
1861                         bitmap=(U8*)trie->bitmap;
1862                     else
1863                         bitmap=(U8*)ANYOF_BITMAP(c);
1864                 }
1865                 /* this is the Aho-Corasick algorithm modified a touch
1866                    to include special handling for long "unknown char"
1867                    sequences. The basic idea being that we use AC as long
1868                    as we are dealing with a possible matching char, when
1869                    we encounter an unknown char (and we have not encountered
1870                    an accepting state) we scan forward until we find a legal
1871                    starting char.
1872                    AC matching is basically that of trie matching, except
1873                    that when we encounter a failing transition, we fall back
1874                    to the current states "fail state", and try the current char
1875                    again, a process we repeat until we reach the root state,
1876                    state 1, or a legal transition. If we fail on the root state
1877                    then we can either terminate if we have reached an accepting
1878                    state previously, or restart the entire process from the beginning
1879                    if we have not.
1880
1881                  */
1882                 while (s <= last_start) {
1883                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1884                     U8 *uc = (U8*)s;
1885                     U16 charid = 0;
1886                     U32 base = 1;
1887                     U32 state = 1;
1888                     UV uvc = 0;
1889                     STRLEN len = 0;
1890                     STRLEN foldlen = 0;
1891                     U8 *uscan = (U8*)NULL;
1892                     U8 *leftmost = NULL;
1893 #ifdef DEBUGGING
1894                     U32 accepted_word= 0;
1895 #endif
1896                     U32 pointpos = 0;
1897
1898                     while ( state && uc <= (U8*)strend ) {
1899                         int failed=0;
1900                         U32 word = aho->states[ state ].wordnum;
1901
1902                         if( state==1 ) {
1903                             if ( bitmap ) {
1904                                 DEBUG_TRIE_EXECUTE_r(
1905                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1906                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1907                                             (char *)uc, utf8_target );
1908                                         PerlIO_printf( Perl_debug_log,
1909                                             " Scanning for legal start char...\n");
1910                                     }
1911                                 );
1912                                 if (utf8_target) {
1913                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1914                                         uc += UTF8SKIP(uc);
1915                                     }
1916                                 } else {
1917                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1918                                         uc++;
1919                                     }
1920                                 }
1921                                 s= (char *)uc;
1922                             }
1923                             if (uc >(U8*)last_start) break;
1924                         }
1925
1926                         if ( word ) {
1927                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1928                             if (!leftmost || lpos < leftmost) {
1929                                 DEBUG_r(accepted_word=word);
1930                                 leftmost= lpos;
1931                             }
1932                             if (base==0) break;
1933
1934                         }
1935                         points[pointpos++ % maxlen]= uc;
1936                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1937                                              uscan, len, uvc, charid, foldlen,
1938                                              foldbuf, uniflags);
1939                         DEBUG_TRIE_EXECUTE_r({
1940                             dump_exec_pos( (char *)uc, c, strend, real_start,
1941                                 s,   utf8_target );
1942                             PerlIO_printf(Perl_debug_log,
1943                                 " Charid:%3u CP:%4"UVxf" ",
1944                                  charid, uvc);
1945                         });
1946
1947                         do {
1948 #ifdef DEBUGGING
1949                             word = aho->states[ state ].wordnum;
1950 #endif
1951                             base = aho->states[ state ].trans.base;
1952
1953                             DEBUG_TRIE_EXECUTE_r({
1954                                 if (failed)
1955                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1956                                         s,   utf8_target );
1957                                 PerlIO_printf( Perl_debug_log,
1958                                     "%sState: %4"UVxf", word=%"UVxf,
1959                                     failed ? " Fail transition to " : "",
1960                                     (UV)state, (UV)word);
1961                             });
1962                             if ( base ) {
1963                                 U32 tmp;
1964                                 I32 offset;
1965                                 if (charid &&
1966                                      ( ((offset = base + charid
1967                                         - 1 - trie->uniquecharcount)) >= 0)
1968                                      && ((U32)offset < trie->lasttrans)
1969                                      && trie->trans[offset].check == state
1970                                      && (tmp=trie->trans[offset].next))
1971                                 {
1972                                     DEBUG_TRIE_EXECUTE_r(
1973                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1974                                     state = tmp;
1975                                     break;
1976                                 }
1977                                 else {
1978                                     DEBUG_TRIE_EXECUTE_r(
1979                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1980                                     failed = 1;
1981                                     state = aho->fail[state];
1982                                 }
1983                             }
1984                             else {
1985                                 /* we must be accepting here */
1986                                 DEBUG_TRIE_EXECUTE_r(
1987                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1988                                 failed = 1;
1989                                 break;
1990                             }
1991                         } while(state);
1992                         uc += len;
1993                         if (failed) {
1994                             if (leftmost)
1995                                 break;
1996                             if (!state) state = 1;
1997                         }
1998                     }
1999                     if ( aho->states[ state ].wordnum ) {
2000                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2001                         if (!leftmost || lpos < leftmost) {
2002                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2003                             leftmost = lpos;
2004                         }
2005                     }
2006                     if (leftmost) {
2007                         s = (char*)leftmost;
2008                         DEBUG_TRIE_EXECUTE_r({
2009                             PerlIO_printf(
2010                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2011                                 (UV)accepted_word, (IV)(s - real_start)
2012                             );
2013                         });
2014                         if (!reginfo || regtry(reginfo, &s)) {
2015                             FREETMPS;
2016                             LEAVE;
2017                             goto got_it;
2018                         }
2019                         s = HOPc(s,1);
2020                         DEBUG_TRIE_EXECUTE_r({
2021                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2022                         });
2023                     } else {
2024                         DEBUG_TRIE_EXECUTE_r(
2025                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2026                         break;
2027                     }
2028                 }
2029                 FREETMPS;
2030                 LEAVE;
2031             }
2032             break;
2033         default:
2034             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2035             break;
2036         }
2037         return 0;
2038       got_it:
2039         return s;
2040 }
2041
2042
2043 /*
2044  - regexec_flags - match a regexp against a string
2045  */
2046 I32
2047 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2048               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2049 /* strend: pointer to null at end of string */
2050 /* strbeg: real beginning of string */
2051 /* minend: end of match must be >=minend after stringarg. */
2052 /* data: May be used for some additional optimizations.
2053          Currently its only used, with a U32 cast, for transmitting
2054          the ganch offset when doing a /g match. This will change */
2055 /* nosave: For optimizations. */
2056 {
2057     dVAR;
2058     struct regexp *const prog = (struct regexp *)SvANY(rx);
2059     /*register*/ char *s;
2060     register regnode *c;
2061     /*register*/ char *startpos = stringarg;
2062     I32 minlen;         /* must match at least this many chars */
2063     I32 dontbother = 0; /* how many characters not to try at end */
2064     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2065     I32 scream_pos = -1;                /* Internal iterator of scream. */
2066     char *scream_olds = NULL;
2067     const bool utf8_target = cBOOL(DO_UTF8(sv));
2068     I32 multiline;
2069     RXi_GET_DECL(prog,progi);
2070     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2071     regexp_paren_pair *swap = NULL;
2072     GET_RE_DEBUG_FLAGS_DECL;
2073
2074     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2075     PERL_UNUSED_ARG(data);
2076
2077     /* Be paranoid... */
2078     if (prog == NULL || startpos == NULL) {
2079         Perl_croak(aTHX_ "NULL regexp parameter");
2080         return 0;
2081     }
2082
2083     multiline = prog->extflags & RXf_PMf_MULTILINE;
2084     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2085
2086     RX_MATCH_UTF8_set(rx, utf8_target);
2087     DEBUG_EXECUTE_r(
2088         debug_start_match(rx, utf8_target, startpos, strend,
2089         "Matching");
2090     );
2091
2092     minlen = prog->minlen;
2093
2094     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2095         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2096                               "String too short [regexec_flags]...\n"));
2097         goto phooey;
2098     }
2099
2100
2101     /* Check validity of program. */
2102     if (UCHARAT(progi->program) != REG_MAGIC) {
2103         Perl_croak(aTHX_ "corrupted regexp program");
2104     }
2105
2106     PL_reg_flags = 0;
2107     PL_reg_state.re_state_eval_setup_done = FALSE;
2108     PL_reg_maxiter = 0;
2109
2110     if (RX_UTF8(rx))
2111         PL_reg_flags |= RF_utf8;
2112
2113     /* Mark beginning of line for ^ and lookbehind. */
2114     reginfo.bol = startpos; /* XXX not used ??? */
2115     PL_bostr  = strbeg;
2116     reginfo.sv = sv;
2117
2118     /* Mark end of line for $ (and such) */
2119     PL_regeol = strend;
2120
2121     /* see how far we have to get to not match where we matched before */
2122     reginfo.till = startpos+minend;
2123
2124     /* If there is a "must appear" string, look for it. */
2125     s = startpos;
2126
2127     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2128         MAGIC *mg;
2129         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2130             reginfo.ganch = startpos + prog->gofs;
2131             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2132               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2133         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2134                   && SvMAGIC(sv)
2135                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2136                   && mg->mg_len >= 0) {
2137             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2138             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2139                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2140
2141             if (prog->extflags & RXf_ANCH_GPOS) {
2142                 if (s > reginfo.ganch)
2143                     goto phooey;
2144                 s = reginfo.ganch - prog->gofs;
2145                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2146                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2147                 if (s < strbeg)
2148                     goto phooey;
2149             }
2150         }
2151         else if (data) {
2152             reginfo.ganch = strbeg + PTR2UV(data);
2153             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2154                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2155
2156         } else {                                /* pos() not defined */
2157             reginfo.ganch = strbeg;
2158             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2159                  "GPOS: reginfo.ganch = strbeg\n"));
2160         }
2161     }
2162     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2163         /* We have to be careful. If the previous successful match
2164            was from this regex we don't want a subsequent partially
2165            successful match to clobber the old results.
2166            So when we detect this possibility we add a swap buffer
2167            to the re, and switch the buffer each match. If we fail
2168            we switch it back, otherwise we leave it swapped.
2169         */
2170         swap = prog->offs;
2171         /* do we need a save destructor here for eval dies? */
2172         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2173         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2174             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2175             PTR2UV(prog),
2176             PTR2UV(swap),
2177             PTR2UV(prog->offs)
2178         ));
2179     }
2180     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2181         re_scream_pos_data d;
2182
2183         d.scream_olds = &scream_olds;
2184         d.scream_pos = &scream_pos;
2185         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2186         if (!s) {
2187             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2188             goto phooey;        /* not present */
2189         }
2190     }
2191
2192
2193
2194     /* Simplest case:  anchored match need be tried only once. */
2195     /*  [unless only anchor is BOL and multiline is set] */
2196     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2197         if (s == startpos && regtry(&reginfo, &startpos))
2198             goto got_it;
2199         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2200                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2201         {
2202             char *end;
2203
2204             if (minlen)
2205                 dontbother = minlen - 1;
2206             end = HOP3c(strend, -dontbother, strbeg) - 1;
2207             /* for multiline we only have to try after newlines */
2208             if (prog->check_substr || prog->check_utf8) {
2209                 /* because of the goto we can not easily reuse the macros for bifurcating the
2210                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2211                 if (utf8_target) {
2212                     if (s == startpos)
2213                         goto after_try_utf8;
2214                     while (1) {
2215                         if (regtry(&reginfo, &s)) {
2216                             goto got_it;
2217                         }
2218                       after_try_utf8:
2219                         if (s > end) {
2220                             goto phooey;
2221                         }
2222                         if (prog->extflags & RXf_USE_INTUIT) {
2223                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2224                             if (!s) {
2225                                 goto phooey;
2226                             }
2227                         }
2228                         else {
2229                             s += UTF8SKIP(s);
2230                         }
2231                     }
2232                 } /* end search for check string in unicode */
2233                 else {
2234                     if (s == startpos) {
2235                         goto after_try_latin;
2236                     }
2237                     while (1) {
2238                         if (regtry(&reginfo, &s)) {
2239                             goto got_it;
2240                         }
2241                       after_try_latin:
2242                         if (s > end) {
2243                             goto phooey;
2244                         }
2245                         if (prog->extflags & RXf_USE_INTUIT) {
2246                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2247                             if (!s) {
2248                                 goto phooey;
2249                             }
2250                         }
2251                         else {
2252                             s++;
2253                         }
2254                     }
2255                 } /* end search for check string in latin*/
2256             } /* end search for check string */
2257             else { /* search for newline */
2258                 if (s > startpos) {
2259                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2260                     s--;
2261                 }
2262                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2263                 while (s <= end) { /* note it could be possible to match at the end of the string */
2264                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2265                         if (regtry(&reginfo, &s))
2266                             goto got_it;
2267                     }
2268                 }
2269             } /* end search for newline */
2270         } /* end anchored/multiline check string search */
2271         goto phooey;
2272     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2273     {
2274         /* the warning about reginfo.ganch being used without initialization
2275            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2276            and we only enter this block when the same bit is set. */
2277         char *tmp_s = reginfo.ganch - prog->gofs;
2278
2279         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2280             goto got_it;
2281         goto phooey;
2282     }
2283
2284     /* Messy cases:  unanchored match. */
2285     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2286         /* we have /x+whatever/ */
2287         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2288         char ch;
2289 #ifdef DEBUGGING
2290         int did_match = 0;
2291 #endif
2292         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2293             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2294         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2295
2296         if (utf8_target) {
2297             REXEC_FBC_SCAN(
2298                 if (*s == ch) {
2299                     DEBUG_EXECUTE_r( did_match = 1 );
2300                     if (regtry(&reginfo, &s)) goto got_it;
2301                     s += UTF8SKIP(s);
2302                     while (s < strend && *s == ch)
2303                         s += UTF8SKIP(s);
2304                 }
2305             );
2306         }
2307         else {
2308             REXEC_FBC_SCAN(
2309                 if (*s == ch) {
2310                     DEBUG_EXECUTE_r( did_match = 1 );
2311                     if (regtry(&reginfo, &s)) goto got_it;
2312                     s++;
2313                     while (s < strend && *s == ch)
2314                         s++;
2315                 }
2316             );
2317         }
2318         DEBUG_EXECUTE_r(if (!did_match)
2319                 PerlIO_printf(Perl_debug_log,
2320                                   "Did not find anchored character...\n")
2321                );
2322     }
2323     else if (prog->anchored_substr != NULL
2324               || prog->anchored_utf8 != NULL
2325               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2326                   && prog->float_max_offset < strend - s)) {
2327         SV *must;
2328         I32 back_max;
2329         I32 back_min;
2330         char *last;
2331         char *last1;            /* Last position checked before */
2332 #ifdef DEBUGGING
2333         int did_match = 0;
2334 #endif
2335         if (prog->anchored_substr || prog->anchored_utf8) {
2336             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2337                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2338             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2339             back_max = back_min = prog->anchored_offset;
2340         } else {
2341             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2342                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2343             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2344             back_max = prog->float_max_offset;
2345             back_min = prog->float_min_offset;
2346         }
2347
2348
2349         if (must == &PL_sv_undef)
2350             /* could not downgrade utf8 check substring, so must fail */
2351             goto phooey;
2352
2353         if (back_min<0) {
2354             last = strend;
2355         } else {
2356             last = HOP3c(strend,        /* Cannot start after this */
2357                   -(I32)(CHR_SVLEN(must)
2358                          - (SvTAIL(must) != 0) + back_min), strbeg);
2359         }
2360         if (s > PL_bostr)
2361             last1 = HOPc(s, -1);
2362         else
2363             last1 = s - 1;      /* bogus */
2364
2365         /* XXXX check_substr already used to find "s", can optimize if
2366            check_substr==must. */
2367         scream_pos = -1;
2368         dontbother = end_shift;
2369         strend = HOPc(strend, -dontbother);
2370         while ( (s <= last) &&
2371                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2372                                   (unsigned char*)strend, must,
2373                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2374             DEBUG_EXECUTE_r( did_match = 1 );
2375             if (HOPc(s, -back_max) > last1) {
2376                 last1 = HOPc(s, -back_min);
2377                 s = HOPc(s, -back_max);
2378             }
2379             else {
2380                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2381
2382                 last1 = HOPc(s, -back_min);
2383                 s = t;
2384             }
2385             if (utf8_target) {
2386                 while (s <= last1) {
2387                     if (regtry(&reginfo, &s))
2388                         goto got_it;
2389                     s += UTF8SKIP(s);
2390                 }
2391             }
2392             else {
2393                 while (s <= last1) {
2394                     if (regtry(&reginfo, &s))
2395                         goto got_it;
2396                     s++;
2397                 }
2398             }
2399         }
2400         DEBUG_EXECUTE_r(if (!did_match) {
2401             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2402                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2403             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2404                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2405                                ? "anchored" : "floating"),
2406                 quoted, RE_SV_TAIL(must));
2407         });
2408         goto phooey;
2409     }
2410     else if ( (c = progi->regstclass) ) {
2411         if (minlen) {
2412             const OPCODE op = OP(progi->regstclass);
2413             /* don't bother with what can't match */
2414             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2415                 strend = HOPc(strend, -(minlen - 1));
2416         }
2417         DEBUG_EXECUTE_r({
2418             SV * const prop = sv_newmortal();
2419             regprop(prog, prop, c);
2420             {
2421                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2422                     s,strend-s,60);
2423                 PerlIO_printf(Perl_debug_log,
2424                     "Matching stclass %.*s against %s (%d bytes)\n",
2425                     (int)SvCUR(prop), SvPVX_const(prop),
2426                      quoted, (int)(strend - s));
2427             }
2428         });
2429         if (find_byclass(prog, c, s, strend, &reginfo))
2430             goto got_it;
2431         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2432     }
2433     else {
2434         dontbother = 0;
2435         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2436             /* Trim the end. */
2437             char *last= NULL;
2438             SV* float_real;
2439             STRLEN len;
2440             const char *little;
2441
2442             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2443                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2444             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2445
2446             little = SvPV_const(float_real, len);
2447             if (SvTAIL(float_real)) {
2448                     /* This means that float_real contains an artificial \n on the end
2449                      * due to the presence of something like this: /foo$/
2450                      * where we can match both "foo" and "foo\n" at the end of the string.
2451                      * So we have to compare the end of the string first against the float_real
2452                      * without the \n and then against the full float_real with the string.
2453                      * We have to watch out for cases where the string might be smaller
2454                      * than the float_real or the float_real without the \n.
2455                      */
2456                     char *checkpos= strend - len;
2457                     DEBUG_OPTIMISE_r(
2458                         PerlIO_printf(Perl_debug_log,
2459                             "%sChecking for float_real.%s\n",
2460                             PL_colors[4], PL_colors[5]));
2461                     if (checkpos + 1 < strbeg) {
2462                         /* can't match, even if we remove the trailing \n string is too short to match */
2463                         DEBUG_EXECUTE_r(
2464                             PerlIO_printf(Perl_debug_log,
2465                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2466                                 PL_colors[4], PL_colors[5]));
2467                         goto phooey;
2468                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2469                         /* can match, the end of the string matches without the "\n" */
2470                         last = checkpos + 1;
2471                     } else if (checkpos < strbeg) {
2472                         /* cant match, string is too short when the "\n" is included */
2473                         DEBUG_EXECUTE_r(
2474                             PerlIO_printf(Perl_debug_log,
2475                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2476                                 PL_colors[4], PL_colors[5]));
2477                         goto phooey;
2478                     } else if (!multiline) {
2479                         /* non multiline match, so compare with the "\n" at the end of the string */
2480                         if (memEQ(checkpos, little, len)) {
2481                             last= checkpos;
2482                         } else {
2483                             DEBUG_EXECUTE_r(
2484                                 PerlIO_printf(Perl_debug_log,
2485                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2486                                     PL_colors[4], PL_colors[5]));
2487                             goto phooey;
2488                         }
2489                     } else {
2490                         /* multiline match, so we have to search for a place where the full string is located */
2491                         goto find_last;
2492                     }
2493             } else {
2494                   find_last:
2495                     if (len)
2496                         last = rninstr(s, strend, little, little + len);
2497                     else
2498                         last = strend;  /* matching "$" */
2499             }
2500             if (!last) {
2501                 /* at one point this block contained a comment which was probably
2502                  * incorrect, which said that this was a "should not happen" case.
2503                  * Even if it was true when it was written I am pretty sure it is
2504                  * not anymore, so I have removed the comment and replaced it with
2505                  * this one. Yves */
2506                 DEBUG_EXECUTE_r(
2507                     PerlIO_printf(Perl_debug_log,
2508                         "String does not contain required substring, cannot match.\n"
2509                     ));
2510                 goto phooey;
2511             }
2512             dontbother = strend - last + prog->float_min_offset;
2513         }
2514         if (minlen && (dontbother < minlen))
2515             dontbother = minlen - 1;
2516         strend -= dontbother;              /* this one's always in bytes! */
2517         /* We don't know much -- general case. */
2518         if (utf8_target) {
2519             for (;;) {
2520                 if (regtry(&reginfo, &s))
2521                     goto got_it;
2522                 if (s >= strend)
2523                     break;
2524                 s += UTF8SKIP(s);
2525             };
2526         }
2527         else {
2528             do {
2529                 if (regtry(&reginfo, &s))
2530                     goto got_it;
2531             } while (s++ < strend);
2532         }
2533     }
2534
2535     /* Failure. */
2536     goto phooey;
2537
2538 got_it:
2539     DEBUG_BUFFERS_r(
2540         if (swap)
2541             PerlIO_printf(Perl_debug_log,
2542                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2543                 PTR2UV(prog),
2544                 PTR2UV(swap)
2545             );
2546     );
2547     Safefree(swap);
2548     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2549
2550     if (PL_reg_state.re_state_eval_setup_done)
2551         restore_pos(aTHX_ prog);
2552     if (RXp_PAREN_NAMES(prog))
2553         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2554
2555     /* make sure $`, $&, $', and $digit will work later */
2556     if ( !(flags & REXEC_NOT_FIRST) ) {
2557         RX_MATCH_COPY_FREE(rx);
2558         if (flags & REXEC_COPY_STR) {
2559             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2560 #ifdef PERL_OLD_COPY_ON_WRITE
2561             if ((SvIsCOW(sv)
2562                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2563                 if (DEBUG_C_TEST) {
2564                     PerlIO_printf(Perl_debug_log,
2565                                   "Copy on write: regexp capture, type %d\n",
2566                                   (int) SvTYPE(sv));
2567                 }
2568                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2569                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2570                 assert (SvPOKp(prog->saved_copy));
2571             } else
2572 #endif
2573             {
2574                 RX_MATCH_COPIED_on(rx);
2575                 s = savepvn(strbeg, i);
2576                 prog->subbeg = s;
2577             }
2578             prog->sublen = i;
2579         }
2580         else {
2581             prog->subbeg = strbeg;
2582             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2583         }
2584     }
2585
2586     return 1;
2587
2588 phooey:
2589     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2590                           PL_colors[4], PL_colors[5]));
2591     if (PL_reg_state.re_state_eval_setup_done)
2592         restore_pos(aTHX_ prog);
2593     if (swap) {
2594         /* we failed :-( roll it back */
2595         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2596             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2597             PTR2UV(prog),
2598             PTR2UV(prog->offs),
2599             PTR2UV(swap)
2600         ));
2601         Safefree(prog->offs);
2602         prog->offs = swap;
2603     }
2604
2605     return 0;
2606 }
2607
2608
2609 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2610  * Do inc before dec, in case old and new rex are the same */
2611 #define SET_reg_curpm(Re2) \
2612     if (PL_reg_state.re_state_eval_setup_done) {    \
2613         (void)ReREFCNT_inc(Re2);                    \
2614         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2615         PM_SETRE((PL_reg_curpm), (Re2));            \
2616     }
2617
2618
2619 /*
2620  - regtry - try match at specific point
2621  */
2622 STATIC I32                      /* 0 failure, 1 success */
2623 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2624 {
2625     dVAR;
2626     CHECKPOINT lastcp;
2627     REGEXP *const rx = reginfo->prog;
2628     regexp *const prog = (struct regexp *)SvANY(rx);
2629     RXi_GET_DECL(prog,progi);
2630     GET_RE_DEBUG_FLAGS_DECL;
2631
2632     PERL_ARGS_ASSERT_REGTRY;
2633
2634     reginfo->cutpoint=NULL;
2635
2636     if ((prog->extflags & RXf_EVAL_SEEN)
2637         && !PL_reg_state.re_state_eval_setup_done)
2638     {
2639         MAGIC *mg;
2640
2641         PL_reg_state.re_state_eval_setup_done = TRUE;
2642         if (reginfo->sv) {
2643             /* Make $_ available to executed code. */
2644             if (reginfo->sv != DEFSV) {
2645                 SAVE_DEFSV;
2646                 DEFSV_set(reginfo->sv);
2647             }
2648
2649             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2650                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2651                 /* prepare for quick setting of pos */
2652 #ifdef PERL_OLD_COPY_ON_WRITE
2653                 if (SvIsCOW(reginfo->sv))
2654                     sv_force_normal_flags(reginfo->sv, 0);
2655 #endif
2656                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2657                                  &PL_vtbl_mglob, NULL, 0);
2658                 mg->mg_len = -1;
2659             }
2660             PL_reg_magic    = mg;
2661             PL_reg_oldpos   = mg->mg_len;
2662             SAVEDESTRUCTOR_X(restore_pos, prog);
2663         }
2664         if (!PL_reg_curpm) {
2665             Newxz(PL_reg_curpm, 1, PMOP);
2666 #ifdef USE_ITHREADS
2667             {
2668                 SV* const repointer = &PL_sv_undef;
2669                 /* this regexp is also owned by the new PL_reg_curpm, which
2670                    will try to free it.  */
2671                 av_push(PL_regex_padav, repointer);
2672                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2673                 PL_regex_pad = AvARRAY(PL_regex_padav);
2674             }
2675 #endif
2676         }
2677         SET_reg_curpm(rx);
2678         PL_reg_oldcurpm = PL_curpm;
2679         PL_curpm = PL_reg_curpm;
2680         if (RXp_MATCH_COPIED(prog)) {
2681             /*  Here is a serious problem: we cannot rewrite subbeg,
2682                 since it may be needed if this match fails.  Thus
2683                 $` inside (?{}) could fail... */
2684             PL_reg_oldsaved = prog->subbeg;
2685             PL_reg_oldsavedlen = prog->sublen;
2686 #ifdef PERL_OLD_COPY_ON_WRITE
2687             PL_nrs = prog->saved_copy;
2688 #endif
2689             RXp_MATCH_COPIED_off(prog);
2690         }
2691         else
2692             PL_reg_oldsaved = NULL;
2693         prog->subbeg = PL_bostr;
2694         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2695     }
2696 #ifdef DEBUGGING
2697     PL_reg_starttry = *startpos;
2698 #endif
2699     prog->offs[0].start = *startpos - PL_bostr;
2700     PL_reginput = *startpos;
2701     prog->lastparen = 0;
2702     prog->lastcloseparen = 0;
2703     PL_regsize = 0;
2704
2705     /* XXXX What this code is doing here?!!!  There should be no need
2706        to do this again and again, prog->lastparen should take care of
2707        this!  --ilya*/
2708
2709     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2710      * Actually, the code in regcppop() (which Ilya may be meaning by
2711      * prog->lastparen), is not needed at all by the test suite
2712      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2713      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2714      * Meanwhile, this code *is* needed for the
2715      * above-mentioned test suite tests to succeed.  The common theme
2716      * on those tests seems to be returning null fields from matches.
2717      * --jhi updated by dapm */
2718 #if 1
2719     if (prog->nparens) {
2720         regexp_paren_pair *pp = prog->offs;
2721         register I32 i;
2722         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2723             ++pp;
2724             pp->start = -1;
2725             pp->end = -1;
2726         }
2727     }
2728 #endif
2729     REGCP_SET(lastcp);
2730     if (regmatch(reginfo, progi->program + 1)) {
2731         prog->offs[0].end = PL_reginput - PL_bostr;
2732         return 1;
2733     }
2734     if (reginfo->cutpoint)
2735         *startpos= reginfo->cutpoint;
2736     REGCP_UNWIND(lastcp);
2737     return 0;
2738 }
2739
2740
2741 #define sayYES goto yes
2742 #define sayNO goto no
2743 #define sayNO_SILENT goto no_silent
2744
2745 /* we dont use STMT_START/END here because it leads to
2746    "unreachable code" warnings, which are bogus, but distracting. */
2747 #define CACHEsayNO \
2748     if (ST.cache_mask) \
2749        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2750     sayNO
2751
2752 /* this is used to determine how far from the left messages like
2753    'failed...' are printed. It should be set such that messages
2754    are inline with the regop output that created them.
2755 */
2756 #define REPORT_CODE_OFF 32
2757
2758
2759 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2760 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2761
2762 #define SLAB_FIRST(s) (&(s)->states[0])
2763 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2764
2765 /* grab a new slab and return the first slot in it */
2766
2767 STATIC regmatch_state *
2768 S_push_slab(pTHX)
2769 {
2770 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2771     dMY_CXT;
2772 #endif
2773     regmatch_slab *s = PL_regmatch_slab->next;
2774     if (!s) {
2775         Newx(s, 1, regmatch_slab);
2776         s->prev = PL_regmatch_slab;
2777         s->next = NULL;
2778         PL_regmatch_slab->next = s;
2779     }
2780     PL_regmatch_slab = s;
2781     return SLAB_FIRST(s);
2782 }
2783
2784
2785 /* push a new state then goto it */
2786
2787 #define PUSH_STATE_GOTO(state, node) \
2788     scan = node; \
2789     st->resume_state = state; \
2790     goto push_state;
2791
2792 /* push a new state with success backtracking, then goto it */
2793
2794 #define PUSH_YES_STATE_GOTO(state, node) \
2795     scan = node; \
2796     st->resume_state = state; \
2797     goto push_yes_state;
2798
2799
2800
2801 /*
2802
2803 regmatch() - main matching routine
2804
2805 This is basically one big switch statement in a loop. We execute an op,
2806 set 'next' to point the next op, and continue. If we come to a point which
2807 we may need to backtrack to on failure such as (A|B|C), we push a
2808 backtrack state onto the backtrack stack. On failure, we pop the top
2809 state, and re-enter the loop at the state indicated. If there are no more
2810 states to pop, we return failure.
2811
2812 Sometimes we also need to backtrack on success; for example /A+/, where
2813 after successfully matching one A, we need to go back and try to
2814 match another one; similarly for lookahead assertions: if the assertion
2815 completes successfully, we backtrack to the state just before the assertion
2816 and then carry on.  In these cases, the pushed state is marked as
2817 'backtrack on success too'. This marking is in fact done by a chain of
2818 pointers, each pointing to the previous 'yes' state. On success, we pop to
2819 the nearest yes state, discarding any intermediate failure-only states.
2820 Sometimes a yes state is pushed just to force some cleanup code to be
2821 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2822 it to free the inner regex.
2823
2824 Note that failure backtracking rewinds the cursor position, while
2825 success backtracking leaves it alone.
2826
2827 A pattern is complete when the END op is executed, while a subpattern
2828 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2829 ops trigger the "pop to last yes state if any, otherwise return true"
2830 behaviour.
2831
2832 A common convention in this function is to use A and B to refer to the two
2833 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2834 the subpattern to be matched possibly multiple times, while B is the entire
2835 rest of the pattern. Variable and state names reflect this convention.
2836
2837 The states in the main switch are the union of ops and failure/success of
2838 substates associated with with that op.  For example, IFMATCH is the op
2839 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2840 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2841 successfully matched A and IFMATCH_A_fail is a state saying that we have
2842 just failed to match A. Resume states always come in pairs. The backtrack
2843 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2844 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2845 on success or failure.
2846
2847 The struct that holds a backtracking state is actually a big union, with
2848 one variant for each major type of op. The variable st points to the
2849 top-most backtrack struct. To make the code clearer, within each
2850 block of code we #define ST to alias the relevant union.
2851
2852 Here's a concrete example of a (vastly oversimplified) IFMATCH
2853 implementation:
2854
2855     switch (state) {
2856     ....
2857
2858 #define ST st->u.ifmatch
2859
2860     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2861         ST.foo = ...; // some state we wish to save
2862         ...
2863         // push a yes backtrack state with a resume value of
2864         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2865         // first node of A:
2866         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2867         // NOTREACHED
2868
2869     case IFMATCH_A: // we have successfully executed A; now continue with B
2870         next = B;
2871         bar = ST.foo; // do something with the preserved value
2872         break;
2873
2874     case IFMATCH_A_fail: // A failed, so the assertion failed
2875         ...;   // do some housekeeping, then ...
2876         sayNO; // propagate the failure
2877
2878 #undef ST
2879
2880     ...
2881     }
2882
2883 For any old-timers reading this who are familiar with the old recursive
2884 approach, the code above is equivalent to:
2885
2886     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2887     {
2888         int foo = ...
2889         ...
2890         if (regmatch(A)) {
2891             next = B;
2892             bar = foo;
2893             break;
2894         }
2895         ...;   // do some housekeeping, then ...
2896         sayNO; // propagate the failure
2897     }
2898
2899 The topmost backtrack state, pointed to by st, is usually free. If you
2900 want to claim it, populate any ST.foo fields in it with values you wish to
2901 save, then do one of
2902
2903         PUSH_STATE_GOTO(resume_state, node);
2904         PUSH_YES_STATE_GOTO(resume_state, node);
2905
2906 which sets that backtrack state's resume value to 'resume_state', pushes a
2907 new free entry to the top of the backtrack stack, then goes to 'node'.
2908 On backtracking, the free slot is popped, and the saved state becomes the
2909 new free state. An ST.foo field in this new top state can be temporarily
2910 accessed to retrieve values, but once the main loop is re-entered, it
2911 becomes available for reuse.
2912
2913 Note that the depth of the backtrack stack constantly increases during the
2914 left-to-right execution of the pattern, rather than going up and down with
2915 the pattern nesting. For example the stack is at its maximum at Z at the
2916 end of the pattern, rather than at X in the following:
2917
2918     /(((X)+)+)+....(Y)+....Z/
2919
2920 The only exceptions to this are lookahead/behind assertions and the cut,
2921 (?>A), which pop all the backtrack states associated with A before
2922 continuing.
2923
2924 Backtrack state structs are allocated in slabs of about 4K in size.
2925 PL_regmatch_state and st always point to the currently active state,
2926 and PL_regmatch_slab points to the slab currently containing
2927 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2928 allocated, and is never freed until interpreter destruction. When the slab
2929 is full, a new one is allocated and chained to the end. At exit from
2930 regmatch(), slabs allocated since entry are freed.
2931
2932 */
2933
2934
2935 #define DEBUG_STATE_pp(pp)                                  \
2936     DEBUG_STATE_r({                                         \
2937         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2938         PerlIO_printf(Perl_debug_log,                       \
2939             "    %*s"pp" %s%s%s%s%s\n",                     \
2940             depth*2, "",                                    \
2941             PL_reg_name[st->resume_state],                     \
2942             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2943             ((st==yes_state) ? "Y" : ""),                   \
2944             ((st==mark_state) ? "M" : ""),                  \
2945             ((st==yes_state||st==mark_state) ? "]" : "")    \
2946         );                                                  \
2947     });
2948
2949
2950 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2951
2952 #ifdef DEBUGGING
2953
2954 STATIC void
2955 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2956     const char *start, const char *end, const char *blurb)
2957 {
2958     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2959
2960     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2961
2962     if (!PL_colorset)
2963             reginitcolors();
2964     {
2965         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2966             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2967
2968         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2969             start, end - start, 60);
2970
2971         PerlIO_printf(Perl_debug_log,
2972             "%s%s REx%s %s against %s\n",
2973                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2974
2975         if (utf8_target||utf8_pat)
2976             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2977                 utf8_pat ? "pattern" : "",
2978                 utf8_pat && utf8_target ? " and " : "",
2979                 utf8_target ? "string" : ""
2980             );
2981     }
2982 }
2983
2984 STATIC void
2985 S_dump_exec_pos(pTHX_ const char *locinput,
2986                       const regnode *scan,
2987                       const char *loc_regeol,
2988                       const char *loc_bostr,
2989                       const char *loc_reg_starttry,
2990                       const bool utf8_target)
2991 {
2992     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2993     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2994     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2995     /* The part of the string before starttry has one color
2996        (pref0_len chars), between starttry and current
2997        position another one (pref_len - pref0_len chars),
2998        after the current position the third one.
2999        We assume that pref0_len <= pref_len, otherwise we
3000        decrease pref0_len.  */
3001     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3002         ? (5 + taill) - l : locinput - loc_bostr;
3003     int pref0_len;
3004
3005     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3006
3007     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3008         pref_len++;
3009     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3010     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3011         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3012               ? (5 + taill) - pref_len : loc_regeol - locinput);
3013     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3014         l--;
3015     if (pref0_len < 0)
3016         pref0_len = 0;
3017     if (pref0_len > pref_len)
3018         pref0_len = pref_len;
3019     {
3020         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3021
3022         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3023             (locinput - pref_len),pref0_len, 60, 4, 5);
3024
3025         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3026                     (locinput - pref_len + pref0_len),
3027                     pref_len - pref0_len, 60, 2, 3);
3028
3029         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3030                     locinput, loc_regeol - locinput, 10, 0, 1);
3031
3032         const STRLEN tlen=len0+len1+len2;
3033         PerlIO_printf(Perl_debug_log,
3034                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3035                     (IV)(locinput - loc_bostr),
3036                     len0, s0,
3037                     len1, s1,
3038                     (docolor ? "" : "> <"),
3039                     len2, s2,
3040                     (int)(tlen > 19 ? 0 :  19 - tlen),
3041                     "");
3042     }
3043 }
3044
3045 #endif
3046
3047 /* reg_check_named_buff_matched()
3048  * Checks to see if a named buffer has matched. The data array of
3049  * buffer numbers corresponding to the buffer is expected to reside
3050  * in the regexp->data->data array in the slot stored in the ARG() of
3051  * node involved. Note that this routine doesn't actually care about the
3052  * name, that information is not preserved from compilation to execution.
3053  * Returns the index of the leftmost defined buffer with the given name
3054  * or 0 if non of the buffers matched.
3055  */
3056 STATIC I32
3057 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3058 {
3059     I32 n;
3060     RXi_GET_DECL(rex,rexi);
3061     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3062     I32 *nums=(I32*)SvPVX(sv_dat);
3063
3064     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3065
3066     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3067         if ((I32)rex->lastparen >= nums[n] &&
3068             rex->offs[nums[n]].end != -1)
3069         {
3070             return nums[n];
3071         }
3072     }
3073     return 0;
3074 }
3075
3076
3077 /* free all slabs above current one  - called during LEAVE_SCOPE */
3078
3079 STATIC void
3080 S_clear_backtrack_stack(pTHX_ void *p)
3081 {
3082     regmatch_slab *s = PL_regmatch_slab->next;
3083     PERL_UNUSED_ARG(p);
3084
3085     if (!s)
3086         return;
3087     PL_regmatch_slab->next = NULL;
3088     while (s) {
3089         regmatch_slab * const osl = s;
3090         s = s->next;
3091         Safefree(osl);
3092     }
3093 }
3094
3095
3096 STATIC I32                      /* 0 failure, 1 success */
3097 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3098 {
3099 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3100     dMY_CXT;
3101 #endif
3102     dVAR;
3103     register const bool utf8_target = PL_reg_match_utf8;
3104     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3105     REGEXP *rex_sv = reginfo->prog;
3106     regexp *rex = (struct regexp *)SvANY(rex_sv);
3107     RXi_GET_DECL(rex,rexi);
3108     I32 oldsave;
3109     /* the current state. This is a cached copy of PL_regmatch_state */
3110     register regmatch_state *st;
3111     /* cache heavy used fields of st in registers */
3112     register regnode *scan;
3113     register regnode *next;
3114     register U32 n = 0; /* general value; init to avoid compiler warning */
3115     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3116     register char *locinput = PL_reginput;
3117     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3118
3119     bool result = 0;        /* return value of S_regmatch */
3120     int depth = 0;          /* depth of backtrack stack */
3121     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3122     const U32 max_nochange_depth =
3123         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3124         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3125     regmatch_state *yes_state = NULL; /* state to pop to on success of
3126                                                             subpattern */
3127     /* mark_state piggy backs on the yes_state logic so that when we unwind
3128        the stack on success we can update the mark_state as we go */
3129     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3130     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3131     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3132     U32 state_num;
3133     bool no_final = 0;      /* prevent failure from backtracking? */
3134     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3135     char *startpoint = PL_reginput;
3136     SV *popmark = NULL;     /* are we looking for a mark? */
3137     SV *sv_commit = NULL;   /* last mark name seen in failure */
3138     SV *sv_yes_mark = NULL; /* last mark name we have seen
3139                                during a successful match */
3140     U32 lastopen = 0;       /* last open we saw */
3141     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3142     SV* const oreplsv = GvSV(PL_replgv);
3143     /* these three flags are set by various ops to signal information to
3144      * the very next op. They have a useful lifetime of exactly one loop
3145      * iteration, and are not preserved or restored by state pushes/pops
3146      */
3147     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3148     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3149     int logical = 0;        /* the following EVAL is:
3150                                 0: (?{...})
3151                                 1: (?(?{...})X|Y)
3152                                 2: (??{...})
3153                                or the following IFMATCH/UNLESSM is:
3154                                 false: plain (?=foo)
3155                                 true:  used as a condition: (?(?=foo))
3156                             */
3157     PAD* last_pad = NULL;
3158     dMULTICALL;
3159     I32 gimme = G_SCALAR;
3160     CV *caller_cv = NULL;       /* who called us */
3161     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3162     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3163
3164 #ifdef DEBUGGING
3165     GET_RE_DEBUG_FLAGS_DECL;
3166 #endif
3167
3168     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3169     multicall_oldcatch = 0;
3170     multicall_cv = NULL;
3171     cx = NULL;
3172     PERL_UNUSED_VAR(multicall_cop);
3173     PERL_UNUSED_VAR(newsp);
3174
3175
3176     PERL_ARGS_ASSERT_REGMATCH;
3177
3178     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3179             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3180     }));
3181     /* on first ever call to regmatch, allocate first slab */
3182     if (!PL_regmatch_slab) {
3183         Newx(PL_regmatch_slab, 1, regmatch_slab);
3184         PL_regmatch_slab->prev = NULL;
3185         PL_regmatch_slab->next = NULL;
3186         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3187     }
3188
3189     oldsave = PL_savestack_ix;
3190     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3191     SAVEVPTR(PL_regmatch_slab);
3192     SAVEVPTR(PL_regmatch_state);
3193
3194     /* grab next free state slot */
3195     st = ++PL_regmatch_state;
3196     if (st >  SLAB_LAST(PL_regmatch_slab))
3197         st = PL_regmatch_state = S_push_slab(aTHX);
3198
3199     /* Note that nextchr is a byte even in UTF */
3200     nextchr = UCHARAT(locinput);
3201     scan = prog;
3202     while (scan != NULL) {
3203
3204         DEBUG_EXECUTE_r( {
3205             SV * const prop = sv_newmortal();
3206             regnode *rnext=regnext(scan);
3207             DUMP_EXEC_POS( locinput, scan, utf8_target );
3208             regprop(rex, prop, scan);
3209
3210             PerlIO_printf(Perl_debug_log,
3211                     "%3"IVdf":%*s%s(%"IVdf")\n",
3212                     (IV)(scan - rexi->program), depth*2, "",
3213                     SvPVX_const(prop),
3214                     (PL_regkind[OP(scan)] == END || !rnext) ?
3215                         0 : (IV)(rnext - rexi->program));
3216         });
3217
3218         next = scan + NEXT_OFF(scan);
3219         if (next == scan)
3220             next = NULL;
3221         state_num = OP(scan);
3222
3223         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3224       reenter_switch:
3225
3226         switch (state_num) {
3227         case BOL:
3228             if (locinput == PL_bostr)
3229             {
3230                 /* reginfo->till = reginfo->bol; */
3231                 break;
3232             }
3233             sayNO;
3234         case MBOL:
3235             if (locinput == PL_bostr ||
3236                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3237             {
3238                 break;
3239             }
3240             sayNO;
3241         case SBOL:
3242             if (locinput == PL_bostr)
3243                 break;
3244             sayNO;
3245         case GPOS:
3246             if (locinput == reginfo->ganch)
3247                 break;
3248             sayNO;
3249
3250         case KEEPS:
3251             /* update the startpoint */
3252             st->u.keeper.val = rex->offs[0].start;
3253             PL_reginput = locinput;
3254             rex->offs[0].start = locinput - PL_bostr;
3255             PUSH_STATE_GOTO(KEEPS_next, next);
3256             /*NOT-REACHED*/
3257         case KEEPS_next_fail:
3258             /* rollback the start point change */
3259             rex->offs[0].start = st->u.keeper.val;
3260             sayNO_SILENT;
3261             /*NOT-REACHED*/
3262         case EOL:
3263                 goto seol;
3264         case MEOL:
3265             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3266                 sayNO;
3267             break;
3268         case SEOL:
3269           seol:
3270             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3271                 sayNO;
3272             if (PL_regeol - locinput > 1)
3273                 sayNO;
3274             break;
3275         case EOS:
3276             if (PL_regeol != locinput)
3277                 sayNO;
3278             break;
3279         case SANY:
3280             if (!nextchr && locinput >= PL_regeol)
3281                 sayNO;
3282             if (utf8_target) {
3283                 locinput += PL_utf8skip[nextchr];
3284                 if (locinput > PL_regeol)
3285                     sayNO;
3286                 nextchr = UCHARAT(locinput);
3287             }
3288             else
3289                 nextchr = UCHARAT(++locinput);
3290             break;
3291         case CANY:
3292             if (!nextchr && locinput >= PL_regeol)
3293                 sayNO;
3294             nextchr = UCHARAT(++locinput);
3295             break;
3296         case REG_ANY:
3297             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3298                 sayNO;
3299             if (utf8_target) {
3300                 locinput += PL_utf8skip[nextchr];
3301                 if (locinput > PL_regeol)
3302                     sayNO;
3303                 nextchr = UCHARAT(locinput);
3304             }
3305             else
3306                 nextchr = UCHARAT(++locinput);
3307             break;
3308
3309 #undef  ST
3310 #define ST st->u.trie
3311         case TRIEC:
3312             /* In this case the charclass data is available inline so
3313                we can fail fast without a lot of extra overhead.
3314              */
3315             if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3316                 DEBUG_EXECUTE_r(
3317                     PerlIO_printf(Perl_debug_log,
3318                               "%*s  %sfailed to match trie start class...%s\n",
3319                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3320                 );
3321                 sayNO_SILENT;
3322                 assert(0); /* NOTREACHED */
3323             }
3324             /* FALL THROUGH */
3325         case TRIE:
3326             /* the basic plan of execution of the trie is:
3327              * At the beginning, run though all the states, and
3328              * find the longest-matching word. Also remember the position
3329              * of the shortest matching word. For example, this pattern:
3330              *    1  2 3 4    5
3331              *    ab|a|x|abcd|abc
3332              * when matched against the string "abcde", will generate
3333              * accept states for all words except 3, with the longest
3334              * matching word being 4, and the shortest being 1 (with
3335              * the position being after char 1 of the string).
3336              *
3337              * Then for each matching word, in word order (i.e. 1,2,4,5),
3338              * we run the remainder of the pattern; on each try setting
3339              * the current position to the character following the word,
3340              * returning to try the next word on failure.
3341              *
3342              * We avoid having to build a list of words at runtime by
3343              * using a compile-time structure, wordinfo[].prev, which
3344              * gives, for each word, the previous accepting word (if any).
3345              * In the case above it would contain the mappings 1->2, 2->0,
3346              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3347              * the longest word (4 above), a list of all words, by
3348              * following the list of prev pointers; this gives us the
3349              * unordered list 4,5,1,2. Then given the current word we have
3350              * just tried, we can go through the list and find the
3351              * next-biggest word to try (so if we just failed on word 2,
3352              * the next in the list is 4).
3353              *
3354              * Since at runtime we don't record the matching position in
3355              * the string for each word, we have to work that out for
3356              * each word we're about to process. The wordinfo table holds
3357              * the character length of each word; given that we recorded
3358              * at the start: the position of the shortest word and its
3359              * length in chars, we just need to move the pointer the
3360              * difference between the two char lengths. Depending on
3361              * Unicode status and folding, that's cheap or expensive.
3362              *
3363              * This algorithm is optimised for the case where are only a
3364              * small number of accept states, i.e. 0,1, or maybe 2.
3365              * With lots of accepts states, and having to try all of them,
3366              * it becomes quadratic on number of accept states to find all
3367              * the next words.
3368              */
3369
3370             {
3371                 /* what type of TRIE am I? (utf8 makes this contextual) */
3372                 DECL_TRIE_TYPE(scan);
3373
3374                 /* what trie are we using right now */
3375                 reg_trie_data * const trie
3376                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3377                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3378                 U32 state = trie->startstate;
3379
3380                 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3381                     if (trie->states[ state ].wordnum) {
3382                          DEBUG_EXECUTE_r(
3383                             PerlIO_printf(Perl_debug_log,
3384                                           "%*s  %smatched empty string...%s\n",
3385                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3386                         );
3387                         if (!trie->jump)
3388                             break;
3389                     } else {
3390                         DEBUG_EXECUTE_r(
3391                             PerlIO_printf(Perl_debug_log,
3392                                           "%*s  %sfailed to match trie start class...%s\n",
3393                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3394                         );
3395                         sayNO_SILENT;
3396                    }
3397                 }
3398
3399             {
3400                 U8 *uc = ( U8* )locinput;
3401
3402                 STRLEN len = 0;
3403                 STRLEN foldlen = 0;
3404                 U8 *uscan = (U8*)NULL;
3405                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3406                 U32 charcount = 0; /* how many input chars we have matched */
3407                 U32 accepted = 0; /* have we seen any accepting states? */
3408
3409                 ST.jump = trie->jump;
3410                 ST.me = scan;
3411                 ST.firstpos = NULL;
3412                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3413                 ST.nextword = 0;
3414
3415                 /* fully traverse the TRIE; note the position of the
3416                    shortest accept state and the wordnum of the longest
3417                    accept state */
3418
3419                 while ( state && uc <= (U8*)PL_regeol ) {
3420                     U32 base = trie->states[ state ].trans.base;
3421                     UV uvc = 0;
3422                     U16 charid = 0;
3423                     U16 wordnum;
3424                     wordnum = trie->states[ state ].wordnum;
3425
3426                     if (wordnum) { /* it's an accept state */
3427                         if (!accepted) {
3428                             accepted = 1;
3429                             /* record first match position */
3430                             if (ST.longfold) {
3431                                 ST.firstpos = (U8*)locinput;
3432                                 ST.firstchars = 0;
3433                             }
3434                             else {
3435                                 ST.firstpos = uc;
3436                                 ST.firstchars = charcount;
3437                             }
3438                         }
3439                         if (!ST.nextword || wordnum < ST.nextword)
3440                             ST.nextword = wordnum;
3441                         ST.topword = wordnum;
3442                     }
3443
3444                     DEBUG_TRIE_EXECUTE_r({
3445                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3446                                 PerlIO_printf( Perl_debug_log,
3447                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3448                                     2+depth * 2, "", PL_colors[4],
3449                                     (UV)state, (accepted ? 'Y' : 'N'));
3450                     });
3451
3452                     /* read a char and goto next state */
3453                     if ( base ) {
3454                         I32 offset;
3455                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3456                                              uscan, len, uvc, charid, foldlen,
3457                                              foldbuf, uniflags);
3458                         charcount++;
3459                         if (foldlen>0)
3460                             ST.longfold = TRUE;
3461                         if (charid &&
3462                              ( ((offset =
3463                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3464
3465                              && ((U32)offset < trie->lasttrans)
3466                              && trie->trans[offset].check == state)
3467                         {
3468                             state = trie->trans[offset].next;
3469                         }
3470                         else {
3471                             state = 0;
3472                         }
3473                         uc += len;
3474
3475                     }
3476                     else {
3477                         state = 0;
3478                     }
3479                     DEBUG_TRIE_EXECUTE_r(
3480                         PerlIO_printf( Perl_debug_log,
3481                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3482                             charid, uvc, (UV)state, PL_colors[5] );
3483                     );
3484                 }
3485                 if (!accepted)
3486                    sayNO;
3487
3488                 /* calculate total number of accept states */
3489                 {
3490                     U16 w = ST.topword;
3491                     accepted = 0;
3492                     while (w) {
3493                         w = trie->wordinfo[w].prev;
3494                         accepted++;
3495                     }
3496                     ST.accepted = accepted;
3497                 }
3498
3499                 DEBUG_EXECUTE_r(
3500                     PerlIO_printf( Perl_debug_log,
3501                         "%*s  %sgot %"IVdf" possible matches%s\n",
3502                         REPORT_CODE_OFF + depth * 2, "",
3503                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3504                 );
3505                 goto trie_first_try; /* jump into the fail handler */
3506             }}
3507             assert(0); /* NOTREACHED */
3508
3509         case TRIE_next_fail: /* we failed - try next alternative */
3510             if ( ST.jump) {
3511                 REGCP_UNWIND(ST.cp);
3512                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3513             }
3514             if (!--ST.accepted) {
3515                 DEBUG_EXECUTE_r({
3516                     PerlIO_printf( Perl_debug_log,
3517                         "%*s  %sTRIE failed...%s\n",
3518                         REPORT_CODE_OFF+depth*2, "",
3519                         PL_colors[4],
3520                         PL_colors[5] );
3521                 });
3522                 sayNO_SILENT;
3523             }
3524             {
3525                 /* Find next-highest word to process.  Note that this code
3526                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3527                 register U16 min = 0;
3528                 register U16 word;
3529                 register U16 const nextword = ST.nextword;
3530                 register reg_trie_wordinfo * const wordinfo
3531                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3532                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3533                     if (word > nextword && (!min || word < min))
3534                         min = word;
3535                 }
3536                 ST.nextword = min;
3537             }
3538
3539           trie_first_try:
3540             if (do_cutgroup) {
3541                 do_cutgroup = 0;
3542                 no_final = 0;
3543             }
3544
3545             if ( ST.jump) {
3546                 ST.lastparen = rex->lastparen;
3547                 ST.lastcloseparen = rex->lastcloseparen;
3548                 REGCP_SET(ST.cp);
3549             }
3550
3551             /* find start char of end of current word */
3552             {
3553                 U32 chars; /* how many chars to skip */
3554                 U8 *uc = ST.firstpos;
3555                 reg_trie_data * const trie
3556                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3557
3558                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3559                             >=  ST.firstchars);
3560                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3561                             - ST.firstchars;
3562
3563                 if (ST.longfold) {
3564                     /* the hard option - fold each char in turn and find
3565                      * its folded length (which may be different */
3566                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3567                     STRLEN foldlen;
3568                     STRLEN len;
3569                     UV uvc;
3570                     U8 *uscan;
3571
3572                     while (chars) {
3573                         if (utf8_target) {
3574                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3575                                                     uniflags);
3576                             uc += len;
3577                         }
3578                         else {
3579                             uvc = *uc;
3580                             uc++;
3581                         }
3582                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3583                         uscan = foldbuf;
3584                         while (foldlen) {
3585                             if (!--chars)
3586                                 break;
3587                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3588                                             uniflags);
3589                             uscan += len;
3590                             foldlen -= len;
3591                         }
3592                     }
3593                 }
3594                 else {
3595                     if (utf8_target)
3596                         while (chars--)
3597                             uc += UTF8SKIP(uc);
3598                     else
3599                         uc += chars;
3600                 }
3601                 PL_reginput = (char *)uc;
3602             }
3603
3604             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3605                             ? ST.jump[ST.nextword]
3606                             : NEXT_OFF(ST.me));
3607
3608             DEBUG_EXECUTE_r({
3609                 PerlIO_printf( Perl_debug_log,
3610                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3611                     REPORT_CODE_OFF+depth*2, "",
3612                     PL_colors[4],
3613                     ST.nextword,
3614                     PL_colors[5]
3615                     );
3616             });
3617
3618             if (ST.accepted > 1 || has_cutgroup) {
3619                 PUSH_STATE_GOTO(TRIE_next, scan);
3620                 assert(0); /* NOTREACHED */
3621             }
3622             /* only one choice left - just continue */
3623             DEBUG_EXECUTE_r({
3624                 AV *const trie_words
3625                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3626                 SV ** const tmp = av_fetch( trie_words,
3627                     ST.nextword-1, 0 );
3628                 SV *sv= tmp ? sv_newmortal() : NULL;
3629
3630                 PerlIO_printf( Perl_debug_log,
3631                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3632                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3633                     ST.nextword,
3634                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3635                             PL_colors[0], PL_colors[1],
3636                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3637                         )
3638                     : "not compiled under -Dr",
3639                     PL_colors[5] );
3640             });
3641
3642             locinput = PL_reginput;
3643             nextchr = UCHARAT(locinput);
3644             continue; /* execute rest of RE */
3645             assert(0); /* NOTREACHED */
3646 #undef  ST
3647
3648         case EXACT: {
3649             char *s = STRING(scan);
3650             ln = STR_LEN(scan);
3651             if (utf8_target != UTF_PATTERN) {
3652                 /* The target and the pattern have differing utf8ness. */
3653                 char *l = locinput;
3654                 const char * const e = s + ln;
3655
3656                 if (utf8_target) {
3657                     /* The target is utf8, the pattern is not utf8. */
3658                     while (s < e) {
3659                         STRLEN ulen;
3660                         if (l >= PL_regeol)
3661                              sayNO;
3662                         if (NATIVE_TO_UNI(*(U8*)s) !=
3663                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3664                                             uniflags))
3665                              sayNO;
3666                         l += ulen;
3667                         s ++;
3668                     }
3669                 }
3670                 else {
3671                     /* The target is not utf8, the pattern is utf8. */
3672                     while (s < e) {
3673                         STRLEN ulen;
3674                         if (l >= PL_regeol)
3675                             sayNO;
3676                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3677                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3678                                            uniflags))
3679                             sayNO;
3680                         s += ulen;
3681                         l ++;
3682                     }
3683                 }
3684                 locinput = l;
3685                 nextchr = UCHARAT(locinput);
3686                 break;
3687             }
3688             /* The target and the pattern have the same utf8ness. */
3689             /* Inline the first character, for speed. */
3690             if (UCHARAT(s) != nextchr)
3691                 sayNO;
3692             if (PL_regeol - locinput < ln)
3693                 sayNO;
3694             if (ln > 1 && memNE(s, locinput, ln))
3695                 sayNO;
3696             locinput += ln;
3697             nextchr = UCHARAT(locinput);
3698             break;
3699             }
3700         case EXACTFL: {
3701             re_fold_t folder;
3702             const U8 * fold_array;
3703             const char * s;
3704             U32 fold_utf8_flags;
3705
3706             PL_reg_flags |= RF_tainted;
3707             folder = foldEQ_locale;
3708             fold_array = PL_fold_locale;
3709             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3710             goto do_exactf;
3711
3712         case EXACTFU_SS:
3713         case EXACTFU_TRICKYFOLD:
3714         case EXACTFU:
3715             folder = foldEQ_latin1;
3716             fold_array = PL_fold_latin1;
3717             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3718             goto do_exactf;
3719
3720         case EXACTFA:
3721             folder = foldEQ_latin1;
3722             fold_array = PL_fold_latin1;
3723             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3724             goto do_exactf;
3725
3726         case EXACTF:
3727             folder = foldEQ;
3728             fold_array = PL_fold;
3729             fold_utf8_flags = 0;
3730
3731           do_exactf:
3732             s = STRING(scan);
3733             ln = STR_LEN(scan);
3734
3735             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3736               /* Either target or the pattern are utf8, or has the issue where
3737                * the fold lengths may differ. */
3738                 const char * const l = locinput;
3739                 char *e = PL_regeol;
3740
3741                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3742                                         l, &e, 0,  utf8_target, fold_utf8_flags))
3743                 {
3744                     sayNO;
3745                 }
3746                 locinput = e;
3747                 nextchr = UCHARAT(locinput);
3748                 break;
3749             }
3750
3751             /* Neither the target nor the pattern are utf8 */
3752             if (UCHARAT(s) != nextchr &&
3753                 UCHARAT(s) != fold_array[nextchr])
3754             {
3755                 sayNO;
3756             }
3757             if (PL_regeol - locinput < ln)
3758                 sayNO;
3759             if (ln > 1 && ! folder(s, locinput, ln))
3760                 sayNO;
3761             locinput += ln;
3762             nextchr = UCHARAT(locinput);
3763             break;
3764         }
3765
3766         /* XXX Could improve efficiency by separating these all out using a
3767          * macro or in-line function.  At that point regcomp.c would no longer
3768          * have to set the FLAGS fields of these */
3769         case BOUNDL:
3770         case NBOUNDL:
3771             PL_reg_flags |= RF_tainted;
3772             /* FALL THROUGH */
3773         case BOUND:
3774         case BOUNDU:
3775         case BOUNDA:
3776         case NBOUND:
3777         case NBOUNDU:
3778         case NBOUNDA:
3779             /* was last char in word? */
3780             if (utf8_target
3781                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3782                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3783             {
3784                 if (locinput == PL_bostr)
3785                     ln = '\n';
3786                 else {
3787                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3788
3789                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3790                 }
3791                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3792                     ln = isALNUM_uni(ln);
3793                     LOAD_UTF8_CHARCLASS_ALNUM();
3794                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3795                 }
3796                 else {
3797                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3798                     n = isALNUM_LC_utf8((U8*)locinput);
3799                 }
3800             }
3801             else {
3802
3803                 /* Here the string isn't utf8, or is utf8 and only ascii
3804                  * characters are to match \w.  In the latter case looking at
3805                  * the byte just prior to the current one may be just the final
3806                  * byte of a multi-byte character.  This is ok.  There are two
3807                  * cases:
3808                  * 1) it is a single byte character, and then the test is doing
3809                  *      just what it's supposed to.
3810                  * 2) it is a multi-byte character, in which case the final
3811                  *      byte is never mistakable for ASCII, and so the test
3812                  *      will say it is not a word character, which is the
3813                  *      correct answer. */
3814                 ln = (locinput != PL_bostr) ?
3815                     UCHARAT(locinput - 1) : '\n';
3816                 switch (FLAGS(scan)) {
3817                     case REGEX_UNICODE_CHARSET:
3818                         ln = isWORDCHAR_L1(ln);
3819                         n = isWORDCHAR_L1(nextchr);
3820                         break;
3821                     case REGEX_LOCALE_CHARSET:
3822                         ln = isALNUM_LC(ln);
3823                         n = isALNUM_LC(nextchr);
3824                         break;
3825                     case REGEX_DEPENDS_CHARSET:
3826                         ln = isALNUM(ln);
3827                         n = isALNUM(nextchr);
3828                         break;
3829                     case REGEX_ASCII_RESTRICTED_CHARSET:
3830                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3831                         ln = isWORDCHAR_A(ln);
3832                         n = isWORDCHAR_A(nextchr);
3833                         break;
3834                     default:
3835                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3836                         break;
3837                 }
3838             }
3839             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3840              * regcomp.sym */
3841             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3842                     sayNO;
3843             break;
3844         case ANYOFV:
3845         case ANYOF:
3846             if (utf8_target || state_num == ANYOFV) {
3847                 STRLEN inclasslen = PL_regeol - locinput;
3848                 if (locinput >= PL_regeol)
3849                     sayNO;
3850
3851                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3852                     sayNO;
3853                 locinput += inclasslen;
3854                 nextchr = UCHARAT(locinput);
3855                 break;
3856             }
3857             else {
3858                 if (nextchr < 0)
3859                     nextchr = UCHARAT(locinput);
3860                 if (!nextchr && locinput >= PL_regeol)
3861                     sayNO;
3862                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3863                     sayNO;
3864                 nextchr = UCHARAT(++locinput);
3865                 break;
3866             }
3867             break;
3868         /* Special char classes - The defines start on line 129 or so */
3869         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3870                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3871                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3872                   ALNUMA, NALNUMA, isWORDCHAR_A,
3873                   alnum, "a");
3874
3875         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3876                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3877                   SPACEU, NSPACEU, isSPACE_L1,
3878                   SPACEA, NSPACEA, isSPACE_A,
3879                   space, " ");
3880
3881         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3882                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3883                 DIGITA, NDIGITA, isDIGIT_A,
3884                 digit, "0");
3885
3886         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3887                        a Unicode extended Grapheme Cluster */
3888             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3889               extended Grapheme Cluster is:
3890
3891                CR LF
3892                | Prepend* Begin Extend*
3893                | .
3894
3895                Begin is (Hangul-syllable | ! Control)
3896                Extend is (Grapheme_Extend | Spacing_Mark)
3897                Control is [ GCB_Control CR LF ]
3898
3899                The discussion below shows how the code for CLUMP is derived
3900                from this regex.  Note that most of these concepts are from
3901                property values of the Grapheme Cluster Boundary (GCB) property.
3902                No code point can have multiple property values for a given
3903                property.  Thus a code point in Prepend can't be in Control, but
3904                it must be in !Control.  This is why Control above includes
3905                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3906                property separately, and so can't be in GCB_Control, even though
3907                they logically are controls.  Control is not the same as gc=cc,
3908                but includes format and other characters as well.
3909
3910                The Unicode definition of Hangul-syllable is:
3911                    L+
3912                    | (L* ( ( V | LV ) V* | LVT ) T*)
3913                    | T+
3914                   )
3915                Each of these is a value for the GCB property, and hence must be
3916                disjoint, so the order they are tested is immaterial, so the
3917                above can safely be changed to
3918                    T+
3919                    | L+
3920                    | (L* ( LVT | ( V | LV ) V*) T*)
3921
3922                The last two terms can be combined like this:
3923                    L* ( L
3924                         | (( LVT | ( V | LV ) V*) T*))
3925
3926                And refactored into this:
3927                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3928
3929                That means that if we have seen any L's at all we can quit
3930                there, but if the next character is an LVT, a V, or an LV we
3931                should keep going.
3932
3933                There is a subtlety with Prepend* which showed up in testing.
3934                Note that the Begin, and only the Begin is required in:
3935                 | Prepend* Begin Extend*
3936                Also, Begin contains '! Control'.  A Prepend must be a
3937                '!  Control', which means it must also be a Begin.  What it
3938                comes down to is that if we match Prepend* and then find no
3939                suitable Begin afterwards, that if we backtrack the last
3940                Prepend, that one will be a suitable Begin.
3941             */
3942
3943             if (locinput >= PL_regeol)
3944                 sayNO;
3945             if  (! utf8_target) {
3946
3947                 /* Match either CR LF  or '.', as all the other possibilities
3948                  * require utf8 */
3949                 locinput++;         /* Match the . or CR */
3950                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3951                                        match the LF */
3952                     && locinput < PL_regeol
3953                     && UCHARAT(locinput) == '\n') locinput++;
3954             }
3955             else {
3956
3957                 /* Utf8: See if is ( CR LF ); already know that locinput <
3958                  * PL_regeol, so locinput+1 is in bounds */
3959                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3960                     locinput += 2;
3961                 }
3962                 else {
3963                     /* In case have to backtrack to beginning, then match '.' */
3964                     char *starting = locinput;
3965
3966                     /* In case have to backtrack the last prepend */
3967                     char *previous_prepend = 0;
3968
3969                     LOAD_UTF8_CHARCLASS_GCB();
3970
3971                     /* Match (prepend)* */
3972                     while (locinput < PL_regeol
3973                            && swash_fetch(PL_utf8_X_prepend,
3974                                           (U8*)locinput, utf8_target))
3975                     {
3976                         previous_prepend = locinput;
3977                         locinput += UTF8SKIP(locinput);
3978                     }
3979
3980                     /* As noted above, if we matched a prepend character, but
3981                      * the next thing won't match, back off the last prepend we
3982                      * matched, as it is guaranteed to match the begin */
3983                     if (previous_prepend
3984                         && (locinput >=  PL_regeol
3985                             || ! swash_fetch(PL_utf8_X_begin,
3986                                              (U8*)locinput, utf8_target)))
3987                     {
3988                         locinput = previous_prepend;
3989                     }
3990
3991                     /* Note that here we know PL_regeol > locinput, as we
3992                      * tested that upon input to this switch case, and if we
3993                      * moved locinput forward, we tested the result just above
3994                      * and it either passed, or we backed off so that it will
3995                      * now pass */
3996                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3997
3998                         /* Here did not match the required 'Begin' in the
3999                          * second term.  So just match the very first
4000                          * character, the '.' of the final term of the regex */
4001                         locinput = starting + UTF8SKIP(starting);
4002                     } else {
4003
4004                         /* Here is the beginning of a character that can have
4005                          * an extender.  It is either a hangul syllable, or a
4006                          * non-control */
4007                         if (swash_fetch(PL_utf8_X_non_hangul,
4008                                         (U8*)locinput, utf8_target))
4009                         {
4010
4011                             /* Here not a Hangul syllable, must be a
4012                              * ('!  * Control') */
4013                             locinput += UTF8SKIP(locinput);
4014                         } else {
4015
4016                             /* Here is a Hangul syllable.  It can be composed
4017                              * of several individual characters.  One
4018                              * possibility is T+ */
4019                             if (swash_fetch(PL_utf8_X_T,
4020                                             (U8*)locinput, utf8_target))
4021                             {
4022                                 while (locinput < PL_regeol
4023                                         && swash_fetch(PL_utf8_X_T,
4024                                                         (U8*)locinput, utf8_target))
4025                                 {
4026                                     locinput += UTF8SKIP(locinput);
4027                                 }
4028                             } else {
4029
4030                                 /* Here, not T+, but is a Hangul.  That means
4031                                  * it is one of the others: L, LV, LVT or V,
4032                                  * and matches:
4033                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
4034
4035                                 /* Match L*           */
4036                                 while (locinput < PL_regeol
4037                                         && swash_fetch(PL_utf8_X_L,
4038                                                         (U8*)locinput, utf8_target))
4039                                 {
4040                                     locinput += UTF8SKIP(locinput);
4041                                 }
4042
4043                                 /* Here, have exhausted L*.  If the next
4044                                  * character is not an LV, LVT nor V, it means
4045                                  * we had to have at least one L, so matches L+
4046                                  * in the original equation, we have a complete
4047                                  * hangul syllable.  Are done. */
4048
4049                                 if (locinput < PL_regeol
4050                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
4051                                                     (U8*)locinput, utf8_target))
4052                                 {
4053
4054                                     /* Otherwise keep going.  Must be LV, LVT
4055                                      * or V.  See if LVT */
4056                                     if (swash_fetch(PL_utf8_X_LVT,
4057                                                     (U8*)locinput, utf8_target))
4058                                     {
4059                                         locinput += UTF8SKIP(locinput);
4060                                     } else {
4061
4062                                         /* Must be  V or LV.  Take it, then
4063                                          * match V*     */
4064                                         locinput += UTF8SKIP(locinput);
4065                                         while (locinput < PL_regeol
4066                                                 && swash_fetch(PL_utf8_X_V,
4067                                                          (U8*)locinput, utf8_target))
4068                                         {
4069                                             locinput += UTF8SKIP(locinput);
4070                                         }
4071                                     }
4072
4073                                     /* And any of LV, LVT, or V can be followed
4074                                      * by T*            */
4075                                     while (locinput < PL_regeol
4076                                            && swash_fetch(PL_utf8_X_T,
4077                                                            (U8*)locinput,
4078                                                            utf8_target))
4079                                     {
4080                                         locinput += UTF8SKIP(locinput);
4081                                     }
4082                                 }
4083                             }
4084                         }
4085
4086                         /* Match any extender */
4087                         while (locinput < PL_regeol
4088                                 && swash_fetch(PL_utf8_X_extend,
4089                                                 (U8*)locinput, utf8_target))
4090                         {
4091                             locinput += UTF8SKIP(locinput);
4092                         }
4093                     }
4094                 }
4095                 if (locinput > PL_regeol) sayNO;
4096             }
4097             nextchr = UCHARAT(locinput);
4098             break;
4099
4100         case NREFFL:
4101         {   /* The capture buffer cases.  The ones beginning with N for the
4102                named buffers just convert to the equivalent numbered and
4103                pretend they were called as the corresponding numbered buffer
4104                op.  */
4105             /* don't initialize these in the declaration, it makes C++
4106                unhappy */
4107             char *s;
4108             char type;
4109             re_fold_t folder;
4110             const U8 *fold_array;
4111             UV utf8_fold_flags;
4112
4113             PL_reg_flags |= RF_tainted;
4114             folder = foldEQ_locale;
4115             fold_array = PL_fold_locale;
4116             type = REFFL;
4117             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4118             goto do_nref;
4119
4120         case NREFFA:
4121             folder = foldEQ_latin1;
4122             fold_array = PL_fold_latin1;
4123             type = REFFA;
4124             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4125             goto do_nref;
4126
4127         case NREFFU:
4128             folder = foldEQ_latin1;
4129             fold_array = PL_fold_latin1;
4130             type = REFFU;
4131             utf8_fold_flags = 0;
4132             goto do_nref;
4133
4134         case NREFF:
4135             folder = foldEQ;
4136             fold_array = PL_fold;
4137             type = REFF;
4138             utf8_fold_flags = 0;
4139             goto do_nref;
4140
4141         case NREF:
4142             type = REF;
4143             folder = NULL;
4144             fold_array = NULL;
4145             utf8_fold_flags = 0;
4146           do_nref:
4147
4148             /* For the named back references, find the corresponding buffer
4149              * number */
4150             n = reg_check_named_buff_matched(rex,scan);
4151
4152             if ( ! n ) {
4153                 sayNO;
4154             }
4155             goto do_nref_ref_common;
4156
4157         case REFFL:
4158             PL_reg_flags |= RF_tainted;
4159             folder = foldEQ_locale;
4160             fold_array = PL_fold_locale;
4161             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4162             goto do_ref;
4163
4164         case REFFA:
4165             folder = foldEQ_latin1;
4166             fold_array = PL_fold_latin1;
4167             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4168             goto do_ref;
4169
4170         case REFFU:
4171             folder = foldEQ_latin1;
4172             fold_array = PL_fold_latin1;
4173             utf8_fold_flags = 0;
4174             goto do_ref;
4175
4176         case REFF:
4177             folder = foldEQ;
4178             fold_array = PL_fold;
4179             utf8_fold_flags = 0;
4180             goto do_ref;
4181
4182         case REF:
4183             folder = NULL;
4184             fold_array = NULL;
4185             utf8_fold_flags = 0;
4186
4187           do_ref:
4188             type = OP(scan);
4189             n = ARG(scan);  /* which paren pair */
4190
4191           do_nref_ref_common:
4192             ln = rex->offs[n].start;
4193             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4194             if (rex->lastparen < n || ln == -1)
4195                 sayNO;                  /* Do not match unless seen CLOSEn. */
4196             if (ln == rex->offs[n].end)
4197                 break;
4198
4199             s = PL_bostr + ln;
4200             if (type != REF     /* REF can do byte comparison */
4201                 && (utf8_target || type == REFFU))
4202             { /* XXX handle REFFL better */
4203                 char * limit = PL_regeol;
4204
4205                 /* This call case insensitively compares the entire buffer
4206                     * at s, with the current input starting at locinput, but
4207                     * not going off the end given by PL_regeol, and returns in
4208                     * limit upon success, how much of the current input was
4209                     * matched */
4210                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4211                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4212                 {
4213                     sayNO;
4214                 }
4215                 locinput = limit;
4216                 nextchr = UCHARAT(locinput);
4217                 break;
4218             }
4219
4220             /* Not utf8:  Inline the first character, for speed. */
4221             if (UCHARAT(s) != nextchr &&
4222                 (type == REF ||
4223                  UCHARAT(s) != fold_array[nextchr]))
4224                 sayNO;
4225             ln = rex->offs[n].end - ln;
4226             if (locinput + ln > PL_regeol)
4227                 sayNO;
4228             if (ln > 1 && (type == REF
4229                            ? memNE(s, locinput, ln)
4230                            : ! folder(s, locinput, ln)))
4231                 sayNO;
4232             locinput += ln;
4233             nextchr = UCHARAT(locinput);
4234             break;
4235         }
4236         case NOTHING:
4237         case TAIL:
4238             break;
4239         case BACK:
4240             break;
4241
4242 #undef  ST
4243 #define ST st->u.eval
4244         {
4245             SV *ret;
4246             REGEXP *re_sv;
4247             regexp *re;
4248             regexp_internal *rei;
4249             regnode *startpoint;
4250
4251         case GOSTART:
4252         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4253             if (cur_eval && cur_eval->locinput==locinput) {
4254                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4255                     Perl_croak(aTHX_ "Infinite recursion in regex");
4256                 if ( ++nochange_depth > max_nochange_depth )
4257                     Perl_croak(aTHX_
4258                         "Pattern subroutine nesting without pos change"
4259                         " exceeded limit in regex");
4260             } else {
4261                 nochange_depth = 0;
4262             }
4263             re_sv = rex_sv;
4264             re = rex;
4265             rei = rexi;
4266             if (OP(scan)==GOSUB) {
4267                 startpoint = scan + ARG2L(scan);
4268                 ST.close_paren = ARG(scan);
4269             } else {
4270                 startpoint = rei->program+1;
4271                 ST.close_paren = 0;
4272             }
4273             goto eval_recurse_doit;
4274             assert(0); /* NOTREACHED */
4275         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4276             if (cur_eval && cur_eval->locinput==locinput) {
4277                 if ( ++nochange_depth > max_nochange_depth )
4278                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4279             } else {
4280                 nochange_depth = 0;
4281             }
4282             {
4283                 /* execute the code in the {...} */
4284
4285                 dSP;
4286                 SV ** before;
4287                 OP * const oop = PL_op;
4288                 COP * const ocurcop = PL_curcop;
4289                 OP *nop;
4290                 char *saved_regeol = PL_regeol;
4291                 struct re_save_state saved_state;
4292                 CV *newcv;
4293
4294                 /* save *all* paren positions */
4295                 regcppush(rex, 0);
4296                 REGCP_SET(runops_cp);
4297
4298                 /* To not corrupt the existing regex state while executing the
4299                  * eval we would normally put it on the save stack, like with
4300                  * save_re_context. However, re-evals have a weird scoping so we
4301                  * can't just add ENTER/LEAVE here. With that, things like
4302                  *
4303                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4304                  *
4305                  * would break, as they expect the localisation to be unwound
4306                  * only when the re-engine backtracks through the bit that
4307                  * localised it.
4308                  *
4309                  * What we do instead is just saving the state in a local c
4310                  * variable.
4311                  */
4312                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4313
4314                 PL_reg_state.re_reparsing = FALSE;
4315
4316                 if (!caller_cv)
4317                     caller_cv = find_runcv(NULL);
4318
4319                 n = ARG(scan);
4320
4321                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4322                     newcv = ((struct regexp *)SvANY(
4323                                                 (REGEXP*)(rexi->data->data[n])
4324                                             ))->qr_anoncv
4325                                         ;
4326                     nop = (OP*)rexi->data->data[n+1];
4327                 }
4328                 else if (rexi->data->what[n] == 'l') { /* literal code */
4329                     newcv = caller_cv;
4330                     nop = (OP*)rexi->data->data[n];
4331                     assert(CvDEPTH(newcv));
4332                 }
4333                 else {
4334                     /* literal with own CV */
4335                     assert(rexi->data->what[n] == 'L');
4336                     newcv = rex->qr_anoncv;
4337                     nop = (OP*)rexi->data->data[n];
4338                 }
4339
4340                 /* normally if we're about to execute code from the same
4341                  * CV that we used previously, we just use the existing
4342                  * CX stack entry. However, its possible that in the
4343                  * meantime we may have backtracked, popped from the save
4344                  * stack, and undone the SAVECOMPPAD(s) associated with
4345                  * PUSH_MULTICALL; in which case PL_comppad no longer
4346                  * points to newcv's pad. */
4347                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4348                 {
4349                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4350                     if (last_pushed_cv) {
4351                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4352                     }
4353                     else {
4354                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4355                     }
4356                     last_pushed_cv = newcv;
4357                 }
4358                 last_pad = PL_comppad;
4359
4360                 /* the initial nextstate you would normally execute
4361                  * at the start of an eval (which would cause error
4362                  * messages to come from the eval), may be optimised
4363                  * away from the execution path in the regex code blocks;
4364                  * so manually set PL_curcop to it initially */
4365                 {
4366                     OP *o = cUNOPx(nop)->op_first;
4367                     assert(o->op_type == OP_NULL);
4368                     if (o->op_targ == OP_SCOPE) {
4369                         o = cUNOPo->op_first;
4370                     }
4371                     else {
4372                         assert(o->op_targ == OP_LEAVE);
4373                         o = cUNOPo->op_first;
4374                         assert(o->op_type == OP_ENTER);
4375                         o = o->op_sibling;
4376                     }
4377
4378                     if (o->op_type != OP_STUB) {
4379                         assert(    o->op_type == OP_NEXTSTATE
4380                                 || o->op_type == OP_DBSTATE
4381                                 || (o->op_type == OP_NULL
4382                                     &&  (  o->op_targ == OP_NEXTSTATE
4383                                         || o->op_targ == OP_DBSTATE
4384                                         )
4385                                     )
4386                         );
4387                         PL_curcop = (COP*)o;
4388                     }
4389                 }
4390                 nop = nop->op_next;
4391
4392                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4393                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4394
4395                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4396
4397                 if (sv_yes_mark) {
4398                     SV *sv_mrk = get_sv("REGMARK", 1);
4399                     sv_setsv(sv_mrk, sv_yes_mark);
4400                 }
4401
4402                 /* we don't use MULTICALL here as we want to call the
4403                  * first op of the block of interest, rather than the
4404                  * first op of the sub */
4405                 before = SP;
4406                 PL_op = nop;
4407                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4408                 SPAGAIN;
4409                 if (SP == before)
4410                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4411                 else {
4412                     ret = POPs;
4413                     PUTBACK;
4414                 }
4415
4416                 /* before restoring everything, evaluate the returned
4417                  * value, so that 'uninit' warnings don't use the wrong
4418                  * PL_op or pad. Also need to process any magic vars
4419                  * (e.g. $1) *before* parentheses are restored */
4420
4421                 PL_op = NULL;
4422
4423                 re_sv = NULL;
4424                 if (logical == 0)        /*   (?{})/   */
4425                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4426                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4427                     sw = cBOOL(SvTRUE(ret));
4428                     logical = 0;
4429                 }
4430                 else {                   /*  /(??{})  */
4431                     /*  if its overloaded, let the regex compiler handle
4432                      *  it; otherwise extract regex, or stringify  */
4433                     if (!SvAMAGIC(ret)) {
4434                         SV *sv = ret;
4435                         if (SvROK(sv))
4436                             sv = SvRV(sv);
4437                         if (SvTYPE(sv) == SVt_REGEXP)
4438                             re_sv = (REGEXP*) sv;
4439                         else if (SvSMAGICAL(sv)) {
4440                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4441                             if (mg)
4442                                 re_sv = (REGEXP *) mg->mg_obj;
4443                         }
4444
4445                         /* force any magic, undef warnings here */
4446                         if (!re_sv) {
4447                             ret = sv_mortalcopy(ret);
4448                             (void) SvPV_force_nolen(ret);
4449                         }
4450                     }
4451
4452                 }
4453
4454                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4455
4456                 /* *** Note that at this point we don't restore
4457                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4458                  * be used again soon. This is safe as long as nothing
4459                  * in the regexp code uses the pad ! */
4460                 PL_op = oop;
4461                 PL_curcop = ocurcop;
4462                 PL_regeol = saved_regeol;
4463                 S_regcp_restore(aTHX_ rex, runops_cp);
4464
4465                 if (logical != 2)
4466                     break;
4467             }
4468
4469                 /* only /(??{})/  from now on */
4470                 logical = 0;
4471                 {
4472                     /* extract RE object from returned value; compiling if
4473                      * necessary */
4474
4475                     if (re_sv) {
4476                         re_sv = reg_temp_copy(NULL, re_sv);
4477                     }
4478                     else {
4479                         U32 pm_flags = 0;
4480                         const I32 osize = PL_regsize;
4481
4482                         if (SvUTF8(ret) && IN_BYTES) {
4483                             /* In use 'bytes': make a copy of the octet
4484                              * sequence, but without the flag on */
4485                             STRLEN len;
4486                             const char *const p = SvPV(ret, len);
4487                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4488                         }
4489                         if (rex->intflags & PREGf_USE_RE_EVAL)
4490                             pm_flags |= PMf_USE_RE_EVAL;
4491
4492                         /* if we got here, it should be an engine which
4493                          * supports compiling code blocks and stuff */
4494                         assert(rex->engine && rex->engine->op_comp);
4495                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4496                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4497                                     rex->engine, NULL, NULL,
4498                                     /* copy /msix etc to inner pattern */
4499                                     scan->flags,
4500                                     pm_flags);
4501
4502                         if (!(SvFLAGS(ret)
4503                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4504                                  | SVs_GMG))) {
4505                             /* This isn't a first class regexp. Instead, it's
4506                                caching a regexp onto an existing, Perl visible
4507                                scalar.  */
4508                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4509                         }
4510                         PL_regsize = osize;
4511                         /* safe to do now that any $1 etc has been
4512                          * interpolated into the new pattern string and
4513                          * compiled */
4514                         S_regcp_restore(aTHX_ rex, runops_cp);
4515                     }
4516                     re = (struct regexp *)SvANY(re_sv);
4517                 }
4518                 RXp_MATCH_COPIED_off(re);
4519                 re->subbeg = rex->subbeg;
4520                 re->sublen = rex->sublen;
4521                 rei = RXi_GET(re);
4522                 DEBUG_EXECUTE_r(
4523                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4524                         "Matching embedded");
4525                 );
4526                 startpoint = rei->program + 1;
4527                 ST.close_paren = 0; /* only used for GOSUB */
4528
4529         eval_recurse_doit: /* Share code with GOSUB below this line */
4530                 /* run the pattern returned from (??{...}) */
4531                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4532                 REGCP_SET(ST.lastcp);
4533
4534                 re->lastparen = 0;
4535                 re->lastcloseparen = 0;
4536
4537                 PL_reginput = locinput;
4538                 PL_regsize = 0;
4539
4540                 /* XXXX This is too dramatic a measure... */
4541                 PL_reg_maxiter = 0;
4542
4543                 ST.toggle_reg_flags = PL_reg_flags;
4544                 if (RX_UTF8(re_sv))
4545                     PL_reg_flags |= RF_utf8;
4546                 else
4547                     PL_reg_flags &= ~RF_utf8;
4548                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4549
4550                 ST.prev_rex = rex_sv;
4551                 ST.prev_curlyx = cur_curlyx;
4552                 rex_sv = re_sv;
4553                 SET_reg_curpm(rex_sv);
4554                 rex = re;
4555                 rexi = rei;
4556                 cur_curlyx = NULL;
4557                 ST.B = next;
4558                 ST.prev_eval = cur_eval;
4559                 cur_eval = st;
4560                 /* now continue from first node in postoned RE */
4561                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4562                 assert(0); /* NOTREACHED */
4563         }
4564
4565         case EVAL_AB: /* cleanup after a successful (??{A})B */
4566             /* note: this is called twice; first after popping B, then A */
4567             PL_reg_flags ^= ST.toggle_reg_flags;
4568             rex_sv = ST.prev_rex;
4569             SET_reg_curpm(rex_sv);
4570             rex = (struct regexp *)SvANY(rex_sv);
4571             rexi = RXi_GET(rex);
4572             regcpblow(ST.cp);
4573             cur_eval = ST.prev_eval;
4574             cur_curlyx = ST.prev_curlyx;
4575
4576             /* XXXX This is too dramatic a measure... */
4577             PL_reg_maxiter = 0;
4578             if ( nochange_depth )
4579                 nochange_depth--;
4580             sayYES;
4581
4582
4583         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4584             /* note: this is called twice; first after popping B, then A */
4585             PL_reg_flags ^= ST.toggle_reg_flags;
4586             rex_sv = ST.prev_rex;
4587             SET_reg_curpm(rex_sv);
4588             rex = (struct regexp *)SvANY(rex_sv);
4589             rexi = RXi_GET(rex);
4590
4591             PL_reginput = locinput;
4592             REGCP_UNWIND(ST.lastcp);
4593             regcppop(rex);
4594             cur_eval = ST.prev_eval;
4595             cur_curlyx = ST.prev_curlyx;
4596             /* XXXX This is too dramatic a measure... */
4597             PL_reg_maxiter = 0;
4598             if ( nochange_depth )
4599                 nochange_depth--;
4600             sayNO_SILENT;
4601 #undef ST
4602
4603         case OPEN:
4604             n = ARG(scan);  /* which paren pair */
4605             rex->offs[n].start_tmp = locinput - PL_bostr;
4606             if (n > PL_regsize)
4607                 PL_regsize = n;
4608             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4609                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4610                 PTR2UV(rex),
4611                 PTR2UV(rex->offs),
4612                 (UV)n,
4613                 (IV)rex->offs[n].start_tmp,
4614                 (UV)PL_regsize
4615             ));
4616             lastopen = n;
4617             break;
4618
4619 /* XXX really need to log other places start/end are set too */
4620 #define CLOSE_CAPTURE \
4621     rex->offs[n].start = rex->offs[n].start_tmp; \
4622     rex->offs[n].end = locinput - PL_bostr; \
4623     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4624         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4625         PTR2UV(rex), \
4626         PTR2UV(rex->offs), \
4627         (UV)n, \
4628         (IV)rex->offs[n].start, \
4629         (IV)rex->offs[n].end \
4630     ))
4631
4632         case CLOSE:
4633             n = ARG(scan);  /* which paren pair */
4634             CLOSE_CAPTURE;
4635             /*if (n > PL_regsize)
4636                 PL_regsize = n;*/
4637             if (n > rex->lastparen)
4638                 rex->lastparen = n;
4639             rex->lastcloseparen = n;
4640             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4641                 goto fake_end;
4642             }
4643             break;
4644         case ACCEPT:
4645             if (ARG(scan)){
4646                 regnode *cursor;
4647                 for (cursor=scan;
4648                      cursor && OP(cursor)!=END;
4649                      cursor=regnext(cursor))
4650                 {
4651                     if ( OP(cursor)==CLOSE ){
4652                         n = ARG(cursor);
4653                         if ( n <= lastopen ) {
4654                             CLOSE_CAPTURE;
4655                             /*if (n > PL_regsize)
4656                             PL_regsize = n;*/
4657                             if (n > rex->lastparen)
4658                                 rex->lastparen = n;
4659                             rex->lastcloseparen = n;
4660                             if ( n == ARG(scan) || (cur_eval &&
4661                                 cur_eval->u.eval.close_paren == n))
4662                                 break;
4663                         }
4664                     }
4665                 }
4666             }
4667             goto fake_end;
4668             /*NOTREACHED*/
4669         case GROUPP:
4670             n = ARG(scan);  /* which paren pair */
4671             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
4672             break;
4673         case NGROUPP:
4674             /* reg_check_named_buff_matched returns 0 for no match */
4675             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4676             break;
4677         case INSUBP:
4678             n = ARG(scan);
4679             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4680             break;
4681         case DEFINEP:
4682             sw = 0;
4683             break;
4684         case IFTHEN:
4685             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4686             if (sw)
4687                 next = NEXTOPER(NEXTOPER(scan));
4688             else {
4689                 next = scan + ARG(scan);
4690                 if (OP(next) == IFTHEN) /* Fake one. */
4691                     next = NEXTOPER(NEXTOPER(next));
4692             }
4693             break;
4694         case LOGICAL:
4695             logical = scan->flags;
4696             break;
4697
4698 /*******************************************************************
4699
4700 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4701 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4702 STAR/PLUS/CURLY/CURLYN are used instead.)
4703
4704 A*B is compiled as <CURLYX><A><WHILEM><B>
4705
4706 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4707 state, which contains the current count, initialised to -1. It also sets
4708 cur_curlyx to point to this state, with any previous value saved in the
4709 state block.
4710
4711 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4712 since the pattern may possibly match zero times (i.e. it's a while {} loop
4713 rather than a do {} while loop).
4714
4715 Each entry to WHILEM represents a successful match of A. The count in the
4716 CURLYX block is incremented, another WHILEM state is pushed, and execution
4717 passes to A or B depending on greediness and the current count.
4718
4719 For example, if matching against the string a1a2a3b (where the aN are
4720 substrings that match /A/), then the match progresses as follows: (the
4721 pushed states are interspersed with the bits of strings matched so far):
4722
4723     <CURLYX cnt=-1>
4724     <CURLYX cnt=0><WHILEM>
4725     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4726     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4727     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4728     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4729
4730 (Contrast this with something like CURLYM, which maintains only a single
4731 backtrack state:
4732
4733     <CURLYM cnt=0> a1
4734     a1 <CURLYM cnt=1> a2
4735     a1 a2 <CURLYM cnt=2> a3
4736     a1 a2 a3 <CURLYM cnt=3> b
4737 )
4738
4739 Each WHILEM state block marks a point to backtrack to upon partial failure
4740 of A or B, and also contains some minor state data related to that
4741 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4742 overall state, such as the count, and pointers to the A and B ops.
4743
4744 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4745 must always point to the *current* CURLYX block, the rules are:
4746
4747 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4748 and set cur_curlyx to point the new block.
4749
4750 When popping the CURLYX block after a successful or unsuccessful match,
4751 restore the previous cur_curlyx.
4752
4753 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4754 to the outer one saved in the CURLYX block.
4755
4756 When popping the WHILEM block after a successful or unsuccessful B match,
4757 restore the previous cur_curlyx.
4758
4759 Here's an example for the pattern (AI* BI)*BO
4760 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4761
4762 cur_
4763 curlyx backtrack stack
4764 ------ ---------------
4765 NULL
4766 CO     <CO prev=NULL> <WO>
4767 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4768 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4769 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4770
4771 At this point the pattern succeeds, and we work back down the stack to
4772 clean up, restoring as we go:
4773
4774 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4775 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4776 CO     <CO prev=NULL> <WO>
4777 NULL
4778
4779 *******************************************************************/
4780
4781 #define ST st->u.curlyx
4782
4783         case CURLYX:    /* start of /A*B/  (for complex A) */
4784         {
4785             /* No need to save/restore up to this paren */
4786             I32 parenfloor = scan->flags;
4787
4788             assert(next); /* keep Coverity happy */
4789             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4790                 next += ARG(next);
4791
4792             /* XXXX Probably it is better to teach regpush to support
4793                parenfloor > PL_regsize... */
4794             if (parenfloor > (I32)rex->lastparen)
4795                 parenfloor = rex->lastparen; /* Pessimization... */
4796
4797             ST.prev_curlyx= cur_curlyx;
4798             cur_curlyx = st;
4799             ST.cp = PL_savestack_ix;
4800
4801             /* these fields contain the state of the current curly.
4802              * they are accessed by subsequent WHILEMs */
4803             ST.parenfloor = parenfloor;
4804             ST.me = scan;
4805             ST.B = next;
4806             ST.minmod = minmod;
4807             minmod = 0;
4808             ST.count = -1;      /* this will be updated by WHILEM */
4809             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4810
4811             PL_reginput = locinput;
4812             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4813             assert(0); /* NOTREACHED */
4814         }
4815
4816         case CURLYX_end: /* just finished matching all of A*B */
4817             cur_curlyx = ST.prev_curlyx;
4818             sayYES;
4819             assert(0); /* NOTREACHED */
4820
4821         case CURLYX_end_fail: /* just failed to match all of A*B */
4822             regcpblow(ST.cp);
4823             cur_curlyx = ST.prev_curlyx;
4824             sayNO;
4825             assert(0); /* NOTREACHED */
4826
4827
4828 #undef ST
4829 #define ST st->u.whilem
4830
4831         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4832         {
4833             /* see the discussion above about CURLYX/WHILEM */
4834             I32 n;
4835             int min = ARG1(cur_curlyx->u.curlyx.me);
4836             int max = ARG2(cur_curlyx->u.curlyx.me);
4837             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4838
4839             assert(cur_curlyx); /* keep Coverity happy */
4840             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4841             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4842             ST.cache_offset = 0;
4843             ST.cache_mask = 0;
4844
4845             PL_reginput = locinput;
4846
4847             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4848                   "%*s  whilem: matched %ld out of %d..%d\n",
4849                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4850             );
4851
4852             /* First just match a string of min A's. */
4853
4854             if (n < min) {
4855                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4856                 cur_curlyx->u.curlyx.lastloc = locinput;
4857                 REGCP_SET(ST.lastcp);
4858
4859                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4860                 assert(0); /* NOTREACHED */
4861             }
4862
4863             /* If degenerate A matches "", assume A done. */
4864
4865             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4866                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4867                    "%*s  whilem: empty match detected, trying continuation...\n",
4868                    REPORT_CODE_OFF+depth*2, "")
4869                 );
4870                 goto do_whilem_B_max;
4871             }
4872
4873             /* super-linear cache processing */
4874
4875             if (scan->flags) {
4876
4877                 if (!PL_reg_maxiter) {
4878                     /* start the countdown: Postpone detection until we
4879                      * know the match is not *that* much linear. */
4880                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4881                     /* possible overflow for long strings and many CURLYX's */
4882                     if (PL_reg_maxiter < 0)
4883                         PL_reg_maxiter = I32_MAX;
4884                     PL_reg_leftiter = PL_reg_maxiter;
4885                 }
4886
4887                 if (PL_reg_leftiter-- == 0) {
4888                     /* initialise cache */
4889                     const I32 size = (PL_reg_maxiter + 7)/8;
4890                     if (PL_reg_poscache) {
4891                         if ((I32)PL_reg_poscache_size < size) {
4892                             Renew(PL_reg_poscache, size, char);
4893                             PL_reg_poscache_size = size;
4894                         }
4895                         Zero(PL_reg_poscache, size, char);
4896                     }
4897                     else {
4898                         PL_reg_poscache_size = size;
4899                         Newxz(PL_reg_poscache, size, char);
4900                     }
4901                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4902       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4903                               PL_colors[4], PL_colors[5])
4904                     );
4905                 }
4906
4907                 if (PL_reg_leftiter < 0) {
4908                     /* have we already failed at this position? */
4909                     I32 offset, mask;
4910                     offset  = (scan->flags & 0xf) - 1
4911                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4912                     mask    = 1 << (offset % 8);
4913                     offset /= 8;
4914                     if (PL_reg_poscache[offset] & mask) {
4915                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4916                             "%*s  whilem: (cache) already tried at this position...\n",
4917                             REPORT_CODE_OFF+depth*2, "")
4918                         );
4919                         sayNO; /* cache records failure */
4920                     }
4921                     ST.cache_offset = offset;
4922                     ST.cache_mask   = mask;
4923                 }
4924             }
4925
4926             /* Prefer B over A for minimal matching. */
4927
4928             if (cur_curlyx->u.curlyx.minmod) {
4929                 ST.save_curlyx = cur_curlyx;
4930                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4931                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
4932                 REGCP_SET(ST.lastcp);
4933                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4934                 assert(0); /* NOTREACHED */
4935             }
4936
4937             /* Prefer A over B for maximal matching. */
4938
4939             if (n < max) { /* More greed allowed? */
4940                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4941                 cur_curlyx->u.curlyx.lastloc = locinput;
4942                 REGCP_SET(ST.lastcp);
4943                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4944                 assert(0); /* NOTREACHED */
4945             }
4946             goto do_whilem_B_max;
4947         }
4948         assert(0); /* NOTREACHED */
4949
4950         case WHILEM_B_min: /* just matched B in a minimal match */
4951         case WHILEM_B_max: /* just matched B in a maximal match */
4952             cur_curlyx = ST.save_curlyx;
4953             sayYES;
4954             assert(0); /* NOTREACHED */
4955
4956         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4957             cur_curlyx = ST.save_curlyx;
4958             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4959             cur_curlyx->u.curlyx.count--;
4960             CACHEsayNO;
4961             assert(0); /* NOTREACHED */
4962
4963         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4964             /* FALL THROUGH */
4965         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4966             REGCP_UNWIND(ST.lastcp);
4967             regcppop(rex);
4968             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4969             cur_curlyx->u.curlyx.count--;
4970             CACHEsayNO;
4971             assert(0); /* NOTREACHED */
4972
4973         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4974             REGCP_UNWIND(ST.lastcp);
4975             regcppop(rex);      /* Restore some previous $<digit>s? */
4976             PL_reginput = locinput;
4977             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4978                 "%*s  whilem: failed, trying continuation...\n",
4979                 REPORT_CODE_OFF+depth*2, "")
4980             );
4981           do_whilem_B_max:
4982             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4983                 && ckWARN(WARN_REGEXP)
4984                 && !(PL_reg_flags & RF_warned))
4985             {
4986                 PL_reg_flags |= RF_warned;
4987                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4988                      "Complex regular subexpression recursion limit (%d) "
4989                      "exceeded",
4990                      REG_INFTY - 1);
4991             }
4992
4993             /* now try B */
4994             ST.save_curlyx = cur_curlyx;
4995             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4996             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4997             assert(0); /* NOTREACHED */
4998
4999         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5000             cur_curlyx = ST.save_curlyx;
5001             REGCP_UNWIND(ST.lastcp);
5002             regcppop(rex);
5003
5004             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5005                 /* Maximum greed exceeded */
5006                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5007                     && ckWARN(WARN_REGEXP)
5008                     && !(PL_reg_flags & RF_warned))
5009                 {
5010                     PL_reg_flags |= RF_warned;
5011                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5012                         "Complex regular subexpression recursion "
5013                         "limit (%d) exceeded",
5014                         REG_INFTY - 1);
5015                 }
5016                 cur_curlyx->u.curlyx.count--;
5017                 CACHEsayNO;
5018             }
5019
5020             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5021                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5022             );
5023             /* Try grabbing another A and see if it helps. */
5024             PL_reginput = locinput;
5025             cur_curlyx->u.curlyx.lastloc = locinput;
5026             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5027             REGCP_SET(ST.lastcp);
5028             PUSH_STATE_GOTO(WHILEM_A_min,
5029                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
5030             assert(0); /* NOTREACHED */
5031
5032 #undef  ST
5033 #define ST st->u.branch
5034
5035         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5036             next = scan + ARG(scan);
5037             if (next == scan)
5038                 next = NULL;
5039             scan = NEXTOPER(scan);
5040             /* FALL THROUGH */
5041
5042         case BRANCH:        /*  /(...|A|...)/ */
5043             scan = NEXTOPER(scan); /* scan now points to inner node */
5044             ST.lastparen = rex->lastparen;
5045             ST.lastcloseparen = rex->lastcloseparen;
5046             ST.next_branch = next;
5047             REGCP_SET(ST.cp);
5048             PL_reginput = locinput;
5049
5050             /* Now go into the branch */
5051             if (has_cutgroup) {
5052                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
5053             } else {
5054                 PUSH_STATE_GOTO(BRANCH_next, scan);
5055             }
5056             assert(0); /* NOTREACHED */
5057         case CUTGROUP:
5058             PL_reginput = locinput;
5059             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5060                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5061             PUSH_STATE_GOTO(CUTGROUP_next,next);
5062             assert(0); /* NOTREACHED */
5063         case CUTGROUP_next_fail:
5064             do_cutgroup = 1;
5065             no_final = 1;
5066             if (st->u.mark.mark_name)
5067                 sv_commit = st->u.mark.mark_name;
5068             sayNO;
5069             assert(0); /* NOTREACHED */
5070         case BRANCH_next:
5071             sayYES;
5072             assert(0); /* NOTREACHED */
5073         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5074             if (do_cutgroup) {
5075                 do_cutgroup = 0;
5076                 no_final = 0;
5077             }
5078             REGCP_UNWIND(ST.cp);
5079             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5080             scan = ST.next_branch;
5081             /* no more branches? */
5082             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5083                 DEBUG_EXECUTE_r({
5084                     PerlIO_printf( Perl_debug_log,
5085                         "%*s  %sBRANCH failed...%s\n",
5086                         REPORT_CODE_OFF+depth*2, "",
5087                         PL_colors[4],
5088                         PL_colors[5] );
5089                 });
5090                 sayNO_SILENT;
5091             }
5092             continue; /* execute next BRANCH[J] op */
5093             assert(0); /* NOTREACHED */
5094
5095         case MINMOD:
5096             minmod = 1;
5097             break;
5098
5099 #undef  ST
5100 #define ST st->u.curlym
5101
5102         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5103
5104             /* This is an optimisation of CURLYX that enables us to push
5105              * only a single backtracking state, no matter how many matches
5106              * there are in {m,n}. It relies on the pattern being constant
5107              * length, with no parens to influence future backrefs
5108              */
5109
5110             ST.me = scan;
5111             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5112
5113             ST.lastparen      = rex->lastparen;
5114             ST.lastcloseparen = rex->lastcloseparen;
5115
5116             /* if paren positive, emulate an OPEN/CLOSE around A */
5117             if (ST.me->flags) {
5118                 U32 paren = ST.me->flags;
5119                 if (paren > PL_regsize)
5120                     PL_regsize = paren;
5121                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5122             }
5123             ST.A = scan;
5124             ST.B = next;
5125             ST.alen = 0;
5126             ST.count = 0;
5127             ST.minmod = minmod;
5128             minmod = 0;
5129             ST.c1 = CHRTEST_UNINIT;
5130             REGCP_SET(ST.cp);
5131
5132             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5133                 goto curlym_do_B;
5134
5135           curlym_do_A: /* execute the A in /A{m,n}B/  */
5136             PL_reginput = locinput;
5137             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5138             assert(0); /* NOTREACHED */
5139
5140         case CURLYM_A: /* we've just matched an A */
5141             locinput = st->locinput;
5142             nextchr = UCHARAT(locinput);
5143
5144             ST.count++;
5145             /* after first match, determine A's length: u.curlym.alen */
5146             if (ST.count == 1) {
5147                 if (PL_reg_match_utf8) {
5148                     char *s = locinput;
5149                     while (s < PL_reginput) {
5150                         ST.alen++;
5151                         s += UTF8SKIP(s);
5152                     }
5153                 }
5154                 else {
5155                     ST.alen = PL_reginput - locinput;
5156                 }
5157                 if (ST.alen == 0)
5158                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5159             }
5160             DEBUG_EXECUTE_r(
5161                 PerlIO_printf(Perl_debug_log,
5162                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5163                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5164                           (IV) ST.count, (IV)ST.alen)
5165             );
5166
5167             locinput = PL_reginput;
5168
5169             if (cur_eval && cur_eval->u.eval.close_paren &&
5170                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5171                 goto fake_end;
5172
5173             {
5174                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5175                 if ( max == REG_INFTY || ST.count < max )
5176                     goto curlym_do_A; /* try to match another A */
5177             }
5178             goto curlym_do_B; /* try to match B */
5179
5180         case CURLYM_A_fail: /* just failed to match an A */
5181             REGCP_UNWIND(ST.cp);
5182
5183             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5184                 || (cur_eval && cur_eval->u.eval.close_paren &&
5185                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5186                 sayNO;
5187
5188           curlym_do_B: /* execute the B in /A{m,n}B/  */
5189             PL_reginput = locinput;
5190             if (ST.c1 == CHRTEST_UNINIT) {
5191                 /* calculate c1 and c2 for possible match of 1st char
5192                  * following curly */
5193                 ST.c1 = ST.c2 = CHRTEST_VOID;
5194                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5195                     regnode *text_node = ST.B;
5196                     if (! HAS_TEXT(text_node))
5197                         FIND_NEXT_IMPT(text_node);
5198                     /* this used to be
5199
5200                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5201
5202                         But the former is redundant in light of the latter.
5203
5204                         if this changes back then the macro for
5205                         IS_TEXT and friends need to change.
5206                      */
5207                     if (PL_regkind[OP(text_node)] == EXACT)
5208                     {
5209
5210                         ST.c1 = (U8)*STRING(text_node);
5211                         switch (OP(text_node)) {
5212                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5213                             case EXACTFA:
5214                             case EXACTFU_SS:
5215                             case EXACTFU_TRICKYFOLD:
5216                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5217                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5218                             default: ST.c2 = ST.c1;
5219                         }
5220                     }
5221                 }
5222             }
5223
5224             DEBUG_EXECUTE_r(
5225                 PerlIO_printf(Perl_debug_log,
5226                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5227                     (int)(REPORT_CODE_OFF+(depth*2)),
5228                     "", (IV)ST.count)
5229                 );
5230             if (ST.c1 != CHRTEST_VOID
5231                     && UCHARAT(PL_reginput) != ST.c1
5232                     && UCHARAT(PL_reginput) != ST.c2)
5233             {
5234                 /* simulate B failing */
5235                 DEBUG_OPTIMISE_r(
5236                     PerlIO_printf(Perl_debug_log,
5237                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5238                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5239                         (IV)ST.c1,(IV)ST.c2
5240                 ));
5241                 state_num = CURLYM_B_fail;
5242                 goto reenter_switch;
5243             }
5244
5245             if (ST.me->flags) {
5246                 /* emulate CLOSE: mark current A as captured */
5247                 I32 paren = ST.me->flags;
5248                 if (ST.count) {
5249                     rex->offs[paren].start
5250                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5251                     rex->offs[paren].end = PL_reginput - PL_bostr;
5252                     if ((U32)paren > rex->lastparen)
5253                         rex->lastparen = paren;
5254                     rex->lastcloseparen = paren;
5255                 }
5256                 else
5257                     rex->offs[paren].end = -1;
5258                 if (cur_eval && cur_eval->u.eval.close_paren &&
5259                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5260                 {
5261                     if (ST.count)
5262                         goto fake_end;
5263                     else
5264                         sayNO;
5265                 }
5266             }
5267
5268             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5269             assert(0); /* NOTREACHED */
5270
5271         case CURLYM_B_fail: /* just failed to match a B */
5272             REGCP_UNWIND(ST.cp);
5273             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5274             if (ST.minmod) {
5275                 I32 max = ARG2(ST.me);
5276                 if (max != REG_INFTY && ST.count == max)
5277                     sayNO;
5278                 goto curlym_do_A; /* try to match a further A */
5279             }
5280             /* backtrack one A */
5281             if (ST.count == ARG1(ST.me) /* min */)
5282                 sayNO;
5283             ST.count--;
5284             locinput = HOPc(locinput, -ST.alen);
5285             goto curlym_do_B; /* try to match B */
5286
5287 #undef ST
5288 #define ST st->u.curly
5289
5290 #define CURLY_SETPAREN(paren, success) \
5291     if (paren) { \
5292         if (success) { \
5293             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5294             rex->offs[paren].end = locinput - PL_bostr; \
5295             if (paren > rex->lastparen) \
5296                 rex->lastparen = paren; \
5297             rex->lastcloseparen = paren; \
5298         } \
5299         else { \
5300             rex->offs[paren].end = -1; \
5301             rex->lastparen      = ST.lastparen; \
5302             rex->lastcloseparen = ST.lastcloseparen; \
5303         } \
5304     }
5305
5306         case STAR:              /*  /A*B/ where A is width 1 */
5307             ST.paren = 0;
5308             ST.min = 0;
5309             ST.max = REG_INFTY;
5310             scan = NEXTOPER(scan);
5311             goto repeat;
5312         case PLUS:              /*  /A+B/ where A is width 1 */
5313             ST.paren = 0;
5314             ST.min = 1;
5315             ST.max = REG_INFTY;
5316             scan = NEXTOPER(scan);
5317             goto repeat;
5318         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5319             ST.paren = scan->flags;     /* Which paren to set */
5320             ST.lastparen      = rex->lastparen;
5321             ST.lastcloseparen = rex->lastcloseparen;
5322             if (ST.paren > PL_regsize)
5323                 PL_regsize = ST.paren;
5324             ST.min = ARG1(scan);  /* min to match */
5325             ST.max = ARG2(scan);  /* max to match */
5326             if (cur_eval && cur_eval->u.eval.close_paren &&
5327                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5328                 ST.min=1;
5329                 ST.max=1;
5330             }
5331             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5332             goto repeat;
5333         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5334             ST.paren = 0;
5335             ST.min = ARG1(scan);  /* min to match */
5336             ST.max = ARG2(scan);  /* max to match */
5337             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5338           repeat:
5339             /*
5340             * Lookahead to avoid useless match attempts
5341             * when we know what character comes next.
5342             *
5343             * Used to only do .*x and .*?x, but now it allows
5344             * for )'s, ('s and (?{ ... })'s to be in the way
5345             * of the quantifier and the EXACT-like node.  -- japhy
5346             */
5347
5348             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5349                 sayNO;
5350             if (HAS_TEXT(next) || JUMPABLE(next)) {
5351                 U8 *s;
5352                 regnode *text_node = next;
5353
5354                 if (! HAS_TEXT(text_node))
5355                     FIND_NEXT_IMPT(text_node);
5356
5357                 if (! HAS_TEXT(text_node))
5358                     ST.c1 = ST.c2 = CHRTEST_VOID;
5359                 else {
5360                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5361                         ST.c1 = ST.c2 = CHRTEST_VOID;
5362                         goto assume_ok_easy;
5363                     }
5364                     else
5365                         s = (U8*)STRING(text_node);
5366
5367                     /*  Currently we only get here when
5368
5369                         PL_rekind[OP(text_node)] == EXACT
5370
5371                         if this changes back then the macro for IS_TEXT and
5372                         friends need to change. */
5373                     if (!UTF_PATTERN) {
5374                         ST.c1 = *s;
5375                         switch (OP(text_node)) {
5376                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5377                             case EXACTFA:
5378                             case EXACTFU_SS:
5379                             case EXACTFU_TRICKYFOLD:
5380                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5381                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5382                             default: ST.c2 = ST.c1; break;
5383                         }
5384                     }
5385                     else { /* UTF_PATTERN */
5386                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5387                              STRLEN ulen;
5388                              U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5389
5390                              to_utf8_fold((U8*)s, tmpbuf, &ulen);
5391                              ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
5392                                                     uniflags);
5393                         }
5394                         else {
5395                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5396                                                      uniflags);
5397                         }
5398                     }
5399                 }
5400             }
5401             else
5402                 ST.c1 = ST.c2 = CHRTEST_VOID;
5403         assume_ok_easy:
5404
5405             ST.A = scan;
5406             ST.B = next;
5407             PL_reginput = locinput;
5408             if (minmod) {
5409                 minmod = 0;
5410                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5411                     sayNO;
5412                 ST.count = ST.min;
5413                 locinput = PL_reginput;
5414                 REGCP_SET(ST.cp);
5415                 if (ST.c1 == CHRTEST_VOID)
5416                     goto curly_try_B_min;
5417
5418                 ST.oldloc = locinput;
5419
5420                 /* set ST.maxpos to the furthest point along the
5421                  * string that could possibly match */
5422                 if  (ST.max == REG_INFTY) {
5423                     ST.maxpos = PL_regeol - 1;
5424                     if (utf8_target)
5425                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5426                             ST.maxpos--;
5427                 }
5428                 else if (utf8_target) {
5429                     int m = ST.max - ST.min;
5430                     for (ST.maxpos = locinput;
5431                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5432                         ST.maxpos += UTF8SKIP(ST.maxpos);
5433                 }
5434                 else {
5435                     ST.maxpos = locinput + ST.max - ST.min;
5436                     if (ST.maxpos >= PL_regeol)
5437                         ST.maxpos = PL_regeol - 1;
5438                 }
5439                 goto curly_try_B_min_known;
5440
5441             }
5442             else {
5443                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5444                 locinput = PL_reginput;
5445                 if (ST.count < ST.min)
5446                     sayNO;
5447                 if ((ST.count > ST.min)
5448                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5449                 {
5450                     /* A{m,n} must come at the end of the string, there's
5451                      * no point in backing off ... */
5452                     ST.min = ST.count;
5453                     /* ...except that $ and \Z can match before *and* after
5454                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5455                        We may back off by one in this case. */
5456                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5457                         ST.min--;
5458                 }
5459                 REGCP_SET(ST.cp);
5460                 goto curly_try_B_max;
5461             }
5462             assert(0); /* NOTREACHED */
5463
5464
5465         case CURLY_B_min_known_fail:
5466             /* failed to find B in a non-greedy match where c1,c2 valid */
5467
5468             PL_reginput = locinput;     /* Could be reset... */
5469             REGCP_UNWIND(ST.cp);
5470             if (ST.paren) {
5471                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5472             }
5473             /* Couldn't or didn't -- move forward. */
5474             ST.oldloc = locinput;
5475             if (utf8_target)
5476                 locinput += UTF8SKIP(locinput);
5477             else
5478                 locinput++;
5479             ST.count++;
5480           curly_try_B_min_known:
5481              /* find the next place where 'B' could work, then call B */
5482             {
5483                 int n;
5484                 if (utf8_target) {
5485                     n = (ST.oldloc == locinput) ? 0 : 1;
5486                     if (ST.c1 == ST.c2) {
5487                         STRLEN len;
5488                         /* set n to utf8_distance(oldloc, locinput) */
5489                         while (locinput <= ST.maxpos &&
5490                                utf8n_to_uvchr((U8*)locinput,
5491                                               UTF8_MAXBYTES, &len,
5492                                               uniflags) != (UV)ST.c1) {
5493                             locinput += len;
5494                             n++;
5495                         }
5496                     }
5497                     else {
5498                         /* set n to utf8_distance(oldloc, locinput) */
5499                         while (locinput <= ST.maxpos) {
5500                             STRLEN len;
5501                             const UV c = utf8n_to_uvchr((U8*)locinput,
5502                                                   UTF8_MAXBYTES, &len,
5503                                                   uniflags);
5504                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5505                                 break;
5506                             locinput += len;
5507                             n++;
5508                         }
5509                     }
5510                 }
5511                 else {
5512                     if (ST.c1 == ST.c2) {
5513                         while (locinput <= ST.maxpos &&
5514                                UCHARAT(locinput) != ST.c1)
5515                             locinput++;
5516                     }
5517                     else {
5518                         while (locinput <= ST.maxpos
5519                                && UCHARAT(locinput) != ST.c1
5520                                && UCHARAT(locinput) != ST.c2)
5521                             locinput++;
5522                     }
5523                     n = locinput - ST.oldloc;
5524                 }
5525                 if (locinput > ST.maxpos)
5526                     sayNO;
5527                 /* PL_reginput == oldloc now */
5528                 if (n) {
5529                     ST.count += n;
5530                     if (regrepeat(rex, ST.A, n, depth) < n)
5531                         sayNO;
5532                 }
5533                 PL_reginput = locinput;
5534                 CURLY_SETPAREN(ST.paren, ST.count);
5535                 if (cur_eval && cur_eval->u.eval.close_paren &&
5536                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5537                     goto fake_end;
5538                 }
5539                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5540             }
5541             assert(0); /* NOTREACHED */
5542
5543
5544         case CURLY_B_min_fail:
5545             /* failed to find B in a non-greedy match where c1,c2 invalid */
5546
5547             REGCP_UNWIND(ST.cp);
5548             if (ST.paren) {
5549                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5550             }
5551             /* failed -- move forward one */
5552             PL_reginput = locinput;
5553             if (regrepeat(rex, ST.A, 1, depth)) {
5554                 ST.count++;
5555                 locinput = PL_reginput;
5556                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5557                         ST.count > 0)) /* count overflow ? */
5558                 {
5559                   curly_try_B_min:
5560                     CURLY_SETPAREN(ST.paren, ST.count);
5561                     if (cur_eval && cur_eval->u.eval.close_paren &&
5562                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5563                         goto fake_end;
5564                     }
5565                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5566                 }
5567             }
5568             sayNO;
5569             assert(0); /* NOTREACHED */
5570
5571
5572         curly_try_B_max:
5573             /* a successful greedy match: now try to match B */
5574             if (cur_eval && cur_eval->u.eval.close_paren &&
5575                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5576                 goto fake_end;
5577             }
5578             {
5579                 UV c = 0;
5580                 if (ST.c1 != CHRTEST_VOID)
5581                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5582                                            UTF8_MAXBYTES, 0, uniflags)
5583                                 : (UV) UCHARAT(PL_reginput);
5584                 /* If it could work, try it. */
5585                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5586                     CURLY_SETPAREN(ST.paren, ST.count);
5587                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5588                     assert(0); /* NOTREACHED */
5589                 }
5590             }
5591             /* FALL THROUGH */
5592         case CURLY_B_max_fail:
5593             /* failed to find B in a greedy match */
5594
5595             REGCP_UNWIND(ST.cp);
5596             if (ST.paren) {
5597                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5598             }
5599             /*  back up. */
5600             if (--ST.count < ST.min)
5601                 sayNO;
5602             PL_reginput = locinput = HOPc(locinput, -1);
5603             goto curly_try_B_max;
5604
5605 #undef ST
5606
5607         case END:
5608             fake_end:
5609             if (cur_eval) {
5610                 /* we've just finished A in /(??{A})B/; now continue with B */
5611                 st->u.eval.toggle_reg_flags
5612                             = cur_eval->u.eval.toggle_reg_flags;
5613                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5614
5615                 st->u.eval.prev_rex = rex_sv;           /* inner */
5616                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
5617                 rex_sv = cur_eval->u.eval.prev_rex;
5618                 SET_reg_curpm(rex_sv);
5619                 rex = (struct regexp *)SvANY(rex_sv);
5620                 rexi = RXi_GET(rex);
5621                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5622
5623                 REGCP_SET(st->u.eval.lastcp);
5624                 PL_reginput = locinput;
5625
5626                 /* Restore parens of the outer rex without popping the
5627                  * savestack */
5628                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
5629
5630                 st->u.eval.prev_eval = cur_eval;
5631                 cur_eval = cur_eval->u.eval.prev_eval;
5632                 DEBUG_EXECUTE_r(
5633                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5634                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5635                 if ( nochange_depth )
5636                     nochange_depth--;
5637
5638                 PUSH_YES_STATE_GOTO(EVAL_AB,
5639                         st->u.eval.prev_eval->u.eval.B); /* match B */
5640             }
5641
5642             if (locinput < reginfo->till) {
5643                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5644                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5645                                       PL_colors[4],
5646                                       (long)(locinput - PL_reg_starttry),
5647                                       (long)(reginfo->till - PL_reg_starttry),
5648                                       PL_colors[5]));
5649
5650                 sayNO_SILENT;           /* Cannot match: too short. */
5651             }
5652             PL_reginput = locinput;     /* put where regtry can find it */
5653             sayYES;                     /* Success! */
5654
5655         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5656             DEBUG_EXECUTE_r(
5657             PerlIO_printf(Perl_debug_log,
5658                 "%*s  %ssubpattern success...%s\n",
5659                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5660             PL_reginput = locinput;     /* put where regtry can find it */
5661             sayYES;                     /* Success! */
5662
5663 #undef  ST
5664 #define ST st->u.ifmatch
5665
5666         case SUSPEND:   /* (?>A) */
5667             ST.wanted = 1;
5668             PL_reginput = locinput;
5669             goto do_ifmatch;
5670
5671         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5672             ST.wanted = 0;
5673             goto ifmatch_trivial_fail_test;
5674
5675         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5676             ST.wanted = 1;
5677           ifmatch_trivial_fail_test:
5678             if (scan->flags) {
5679                 char * const s = HOPBACKc(locinput, scan->flags);
5680                 if (!s) {
5681                     /* trivial fail */
5682                     if (logical) {
5683                         logical = 0;
5684                         sw = 1 - cBOOL(ST.wanted);
5685                     }
5686                     else if (ST.wanted)
5687                         sayNO;
5688                     next = scan + ARG(scan);
5689                     if (next == scan)
5690                         next = NULL;
5691                     break;
5692                 }
5693                 PL_reginput = s;
5694             }
5695             else
5696                 PL_reginput = locinput;
5697
5698           do_ifmatch:
5699             ST.me = scan;
5700             ST.logical = logical;
5701             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5702
5703             /* execute body of (?...A) */
5704             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5705             assert(0); /* NOTREACHED */
5706
5707         case IFMATCH_A_fail: /* body of (?...A) failed */
5708             ST.wanted = !ST.wanted;
5709             /* FALL THROUGH */
5710
5711         case IFMATCH_A: /* body of (?...A) succeeded */
5712             if (ST.logical) {
5713                 sw = cBOOL(ST.wanted);
5714             }
5715             else if (!ST.wanted)
5716                 sayNO;
5717
5718             if (OP(ST.me) == SUSPEND)
5719                 locinput = PL_reginput;
5720             else {
5721                 locinput = PL_reginput = st->locinput;
5722                 nextchr = UCHARAT(locinput);
5723             }
5724             scan = ST.me + ARG(ST.me);
5725             if (scan == ST.me)
5726                 scan = NULL;
5727             continue; /* execute B */
5728
5729 #undef ST
5730
5731         case LONGJMP:
5732             next = scan + ARG(scan);
5733             if (next == scan)
5734                 next = NULL;
5735             break;
5736         case COMMIT:
5737             reginfo->cutpoint = PL_regeol;
5738             /* FALLTHROUGH */
5739         case PRUNE:
5740             PL_reginput = locinput;
5741             if (!scan->flags)
5742                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5743             PUSH_STATE_GOTO(COMMIT_next,next);
5744             assert(0); /* NOTREACHED */
5745         case COMMIT_next_fail:
5746             no_final = 1;
5747             /* FALLTHROUGH */
5748         case OPFAIL:
5749             sayNO;
5750             assert(0); /* NOTREACHED */
5751
5752 #define ST st->u.mark
5753         case MARKPOINT:
5754             ST.prev_mark = mark_state;
5755             ST.mark_name = sv_commit = sv_yes_mark
5756                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5757             mark_state = st;
5758             ST.mark_loc = PL_reginput = locinput;
5759             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5760             assert(0); /* NOTREACHED */
5761         case MARKPOINT_next:
5762             mark_state = ST.prev_mark;
5763             sayYES;
5764             assert(0); /* NOTREACHED */
5765         case MARKPOINT_next_fail:
5766             if (popmark && sv_eq(ST.mark_name,popmark))
5767             {
5768                 if (ST.mark_loc > startpoint)
5769                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5770                 popmark = NULL; /* we found our mark */
5771                 sv_commit = ST.mark_name;
5772
5773                 DEBUG_EXECUTE_r({
5774                         PerlIO_printf(Perl_debug_log,
5775                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5776                             REPORT_CODE_OFF+depth*2, "",
5777                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5778                 });
5779             }
5780             mark_state = ST.prev_mark;
5781             sv_yes_mark = mark_state ?
5782                 mark_state->u.mark.mark_name : NULL;
5783             sayNO;
5784             assert(0); /* NOTREACHED */
5785         case SKIP:
5786             PL_reginput = locinput;
5787             if (scan->flags) {
5788                 /* (*SKIP) : if we fail we cut here*/
5789                 ST.mark_name = NULL;
5790                 ST.mark_loc = locinput;
5791                 PUSH_STATE_GOTO(SKIP_next,next);
5792             } else {
5793                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5794                    otherwise do nothing.  Meaning we need to scan
5795                  */
5796                 regmatch_state *cur = mark_state;
5797                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5798
5799                 while (cur) {
5800                     if ( sv_eq( cur->u.mark.mark_name,
5801                                 find ) )
5802                     {
5803                         ST.mark_name = find;
5804                         PUSH_STATE_GOTO( SKIP_next, next );
5805                     }
5806                     cur = cur->u.mark.prev_mark;
5807                 }
5808             }
5809             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5810             break;
5811         case SKIP_next_fail:
5812             if (ST.mark_name) {
5813                 /* (*CUT:NAME) - Set up to search for the name as we
5814                    collapse the stack*/
5815                 popmark = ST.mark_name;
5816             } else {
5817                 /* (*CUT) - No name, we cut here.*/
5818                 if (ST.mark_loc > startpoint)
5819                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5820                 /* but we set sv_commit to latest mark_name if there
5821                    is one so they can test to see how things lead to this
5822                    cut */
5823                 if (mark_state)
5824                     sv_commit=mark_state->u.mark.mark_name;
5825             }
5826             no_final = 1;
5827             sayNO;
5828             assert(0); /* NOTREACHED */
5829 #undef ST
5830         case LNBREAK:
5831             if ((n=is_LNBREAK(locinput,utf8_target))) {
5832                 locinput += n;
5833                 nextchr = UCHARAT(locinput);
5834             } else
5835                 sayNO;
5836             break;
5837
5838 #define CASE_CLASS(nAmE)                              \
5839         case nAmE:                                    \
5840             if (locinput >= PL_regeol)                \
5841                 sayNO;                                \
5842             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5843                 locinput += n;                        \
5844                 nextchr = UCHARAT(locinput);          \
5845             } else                                    \
5846                 sayNO;                                \
5847             break;                                    \
5848         case N##nAmE:                                 \
5849             if (locinput >= PL_regeol)                \
5850                 sayNO;                                \
5851             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5852                 sayNO;                                \
5853             } else {                                  \
5854                 locinput += UTF8SKIP(locinput);       \
5855                 nextchr = UCHARAT(locinput);          \
5856             }                                         \
5857             break
5858
5859         CASE_CLASS(VERTWS);
5860         CASE_CLASS(HORIZWS);
5861 #undef CASE_CLASS
5862
5863         default:
5864             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5865                           PTR2UV(scan), OP(scan));
5866             Perl_croak(aTHX_ "regexp memory corruption");
5867
5868         } /* end switch */
5869
5870         /* switch break jumps here */
5871         scan = next; /* prepare to execute the next op and ... */
5872         continue;    /* ... jump back to the top, reusing st */
5873         assert(0); /* NOTREACHED */
5874
5875       push_yes_state:
5876         /* push a state that backtracks on success */
5877         st->u.yes.prev_yes_state = yes_state;
5878         yes_state = st;
5879         /* FALL THROUGH */
5880       push_state:
5881         /* push a new regex state, then continue at scan  */
5882         {
5883             regmatch_state *newst;
5884
5885             DEBUG_STACK_r({
5886                 regmatch_state *cur = st;
5887                 regmatch_state *curyes = yes_state;
5888                 int curd = depth;
5889                 regmatch_slab *slab = PL_regmatch_slab;
5890                 for (;curd > -1;cur--,curd--) {
5891                     if (cur < SLAB_FIRST(slab)) {
5892                         slab = slab->prev;
5893                         cur = SLAB_LAST(slab);
5894                     }
5895                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5896                         REPORT_CODE_OFF + 2 + depth * 2,"",
5897                         curd, PL_reg_name[cur->resume_state],
5898                         (curyes == cur) ? "yes" : ""
5899                     );
5900                     if (curyes == cur)
5901                         curyes = cur->u.yes.prev_yes_state;
5902                 }
5903             } else
5904                 DEBUG_STATE_pp("push")
5905             );
5906             depth++;
5907             st->locinput = locinput;
5908             newst = st+1;
5909             if (newst >  SLAB_LAST(PL_regmatch_slab))
5910                 newst = S_push_slab(aTHX);
5911             PL_regmatch_state = newst;
5912
5913             locinput = PL_reginput;
5914             nextchr = UCHARAT(locinput);
5915             st = newst;
5916             continue;
5917             assert(0); /* NOTREACHED */
5918         }
5919     }
5920
5921     /*
5922     * We get here only if there's trouble -- normally "case END" is
5923     * the terminating point.
5924     */
5925     Perl_croak(aTHX_ "corrupted regexp pointers");
5926     /*NOTREACHED*/
5927     sayNO;
5928
5929 yes:
5930     if (yes_state) {
5931         /* we have successfully completed a subexpression, but we must now
5932          * pop to the state marked by yes_state and continue from there */
5933         assert(st != yes_state);
5934 #ifdef DEBUGGING
5935         while (st != yes_state) {
5936             st--;
5937             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5938                 PL_regmatch_slab = PL_regmatch_slab->prev;
5939                 st = SLAB_LAST(PL_regmatch_slab);
5940             }
5941             DEBUG_STATE_r({
5942                 if (no_final) {
5943                     DEBUG_STATE_pp("pop (no final)");
5944                 } else {
5945                     DEBUG_STATE_pp("pop (yes)");
5946                 }
5947             });
5948             depth--;
5949         }
5950 #else
5951         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5952             || yes_state > SLAB_LAST(PL_regmatch_slab))
5953         {
5954             /* not in this slab, pop slab */
5955             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5956             PL_regmatch_slab = PL_regmatch_slab->prev;
5957             st = SLAB_LAST(PL_regmatch_slab);
5958         }
5959         depth -= (st - yes_state);
5960 #endif
5961         st = yes_state;
5962         yes_state = st->u.yes.prev_yes_state;
5963         PL_regmatch_state = st;
5964
5965         if (no_final) {
5966             locinput= st->locinput;
5967             nextchr = UCHARAT(locinput);
5968         }
5969         state_num = st->resume_state + no_final;
5970         goto reenter_switch;
5971     }
5972
5973     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5974                           PL_colors[4], PL_colors[5]));
5975
5976     if (PL_reg_state.re_state_eval_setup_done) {
5977         /* each successfully executed (?{...}) block does the equivalent of
5978          *   local $^R = do {...}
5979          * When popping the save stack, all these locals would be undone;
5980          * bypass this by setting the outermost saved $^R to the latest
5981          * value */
5982         if (oreplsv != GvSV(PL_replgv))
5983             sv_setsv(oreplsv, GvSV(PL_replgv));
5984     }
5985     result = 1;
5986     goto final_exit;
5987
5988 no:
5989     DEBUG_EXECUTE_r(
5990         PerlIO_printf(Perl_debug_log,
5991             "%*s  %sfailed...%s\n",
5992             REPORT_CODE_OFF+depth*2, "",
5993             PL_colors[4], PL_colors[5])
5994         );
5995
5996 no_silent:
5997     if (no_final) {
5998         if (yes_state) {
5999             goto yes;
6000         } else {
6001             goto final_exit;
6002         }
6003     }
6004     if (depth) {
6005         /* there's a previous state to backtrack to */
6006         st--;
6007         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6008             PL_regmatch_slab = PL_regmatch_slab->prev;
6009             st = SLAB_LAST(PL_regmatch_slab);
6010         }
6011         PL_regmatch_state = st;
6012         locinput= st->locinput;
6013         nextchr = UCHARAT(locinput);
6014
6015         DEBUG_STATE_pp("pop");
6016         depth--;
6017         if (yes_state == st)
6018             yes_state = st->u.yes.prev_yes_state;
6019
6020         state_num = st->resume_state + 1; /* failure = success + 1 */
6021         goto reenter_switch;
6022     }
6023     result = 0;
6024
6025   final_exit:
6026     if (rex->intflags & PREGf_VERBARG_SEEN) {
6027         SV *sv_err = get_sv("REGERROR", 1);
6028         SV *sv_mrk = get_sv("REGMARK", 1);
6029         if (result) {
6030             sv_commit = &PL_sv_no;
6031             if (!sv_yes_mark)
6032                 sv_yes_mark = &PL_sv_yes;
6033         } else {
6034             if (!sv_commit)
6035                 sv_commit = &PL_sv_yes;
6036             sv_yes_mark = &PL_sv_no;
6037         }
6038         sv_setsv(sv_err, sv_commit);
6039         sv_setsv(sv_mrk, sv_yes_mark);
6040     }
6041
6042
6043     if (last_pushed_cv) {
6044         dSP;
6045         POP_MULTICALL;
6046         PERL_UNUSED_VAR(SP);
6047     }
6048
6049     /* clean up; in particular, free all slabs above current one */
6050     LEAVE_SCOPE(oldsave);
6051
6052     return result;
6053 }
6054
6055 /*
6056  - regrepeat - repeatedly match something simple, report how many
6057  */
6058 /*
6059  * [This routine now assumes that it will only match on things of length 1.
6060  * That was true before, but now we assume scan - reginput is the count,
6061  * rather than incrementing count on every character.  [Er, except utf8.]]
6062  */
6063 STATIC I32
6064 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
6065 {
6066     dVAR;
6067     register char *scan;
6068     register I32 c;
6069     register char *loceol = PL_regeol;
6070     register I32 hardcount = 0;
6071     register bool utf8_target = PL_reg_match_utf8;
6072     UV utf8_flags;
6073 #ifndef DEBUGGING
6074     PERL_UNUSED_ARG(depth);
6075 #endif
6076
6077     PERL_ARGS_ASSERT_REGREPEAT;
6078
6079     scan = PL_reginput;
6080     if (max == REG_INFTY)
6081         max = I32_MAX;
6082     else if (max < loceol - scan)
6083         loceol = scan + max;
6084     switch (OP(p)) {
6085     case REG_ANY:
6086         if (utf8_target) {
6087             loceol = PL_regeol;
6088             while (scan < loceol && hardcount < max && *scan != '\n') {
6089                 scan += UTF8SKIP(scan);
6090                 hardcount++;
6091             }
6092         } else {
6093             while (scan < loceol && *scan != '\n')
6094                 scan++;
6095         }
6096         break;
6097     case SANY:
6098         if (utf8_target) {
6099             loceol = PL_regeol;
6100             while (scan < loceol && hardcount < max) {
6101                 scan += UTF8SKIP(scan);
6102                 hardcount++;
6103             }
6104         }
6105         else
6106             scan = loceol;
6107         break;
6108     case CANY:
6109         scan = loceol;
6110         break;
6111     case EXACT:
6112         /* To get here, EXACTish nodes must have *byte* length == 1.  That
6113          * means they match only characters in the string that can be expressed
6114          * as a single byte.  For non-utf8 strings, that means a simple match.
6115          * For utf8 strings, the character matched must be an invariant, or
6116          * downgradable to a single byte.  The pattern's utf8ness is
6117          * irrelevant, as since it's a single byte, it either isn't utf8, or if
6118          * it is, it's an invariant */
6119
6120         c = (U8)*STRING(p);
6121         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6122
6123         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6124             while (scan < loceol && UCHARAT(scan) == c) {
6125                 scan++;
6126             }
6127         }
6128         else {
6129
6130             /* Here, the string is utf8, and the pattern char is different
6131              * in utf8 than not, so can't compare them directly.  Outside the
6132              * loop, find the two utf8 bytes that represent c, and then
6133              * look for those in sequence in the utf8 string */
6134             U8 high = UTF8_TWO_BYTE_HI(c);
6135             U8 low = UTF8_TWO_BYTE_LO(c);
6136             loceol = PL_regeol;
6137
6138             while (hardcount < max
6139                     && scan + 1 < loceol
6140                     && UCHARAT(scan) == high
6141                     && UCHARAT(scan + 1) == low)
6142             {
6143                 scan += 2;
6144                 hardcount++;
6145             }
6146         }
6147         break;
6148     case EXACTFA:
6149         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6150         goto do_exactf;
6151
6152     case EXACTFL:
6153         PL_reg_flags |= RF_tainted;
6154         utf8_flags = FOLDEQ_UTF8_LOCALE;
6155         goto do_exactf;
6156
6157     case EXACTF:
6158             utf8_flags = 0;
6159             goto do_exactf;
6160
6161     case EXACTFU_SS:
6162     case EXACTFU_TRICKYFOLD:
6163     case EXACTFU:
6164         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6165
6166         /* The comments for the EXACT case above apply as well to these fold
6167          * ones */
6168
6169     do_exactf:
6170         c = (U8)*STRING(p);
6171         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6172
6173         if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6174             char *tmpeol = loceol;
6175             while (hardcount < max
6176                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6177                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6178             {
6179                 scan = tmpeol;
6180                 tmpeol = loceol;
6181                 hardcount++;
6182             }
6183
6184             /* XXX Note that the above handles properly the German sharp s in
6185              * the pattern matching ss in the string.  But it doesn't handle
6186              * properly cases where the string contains say 'LIGATURE ff' and
6187              * the pattern is 'f+'.  This would require, say, a new function or
6188              * revised interface to foldEQ_utf8(), in which the maximum number
6189              * of characters to match could be passed and it would return how
6190              * many actually did.  This is just one of many cases where
6191              * multi-char folds don't work properly, and so the fix is being
6192              * deferred */
6193         }
6194         else {
6195             U8 folded;
6196
6197             /* Here, the string isn't utf8 and c is a single byte; and either
6198              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6199              * doesn't affect c.  Can just do simple comparisons for exact or
6200              * fold matching. */
6201             switch (OP(p)) {
6202                 case EXACTF: folded = PL_fold[c]; break;
6203                 case EXACTFA:
6204                 case EXACTFU_TRICKYFOLD:
6205                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6206                 case EXACTFL: folded = PL_fold_locale[c]; break;
6207                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6208             }
6209             while (scan < loceol &&
6210                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6211             {
6212                 scan++;
6213             }
6214         }
6215         break;
6216     case ANYOFV:
6217     case ANYOF:
6218         if (utf8_target || OP(p) == ANYOFV) {
6219             STRLEN inclasslen;
6220             loceol = PL_regeol;
6221             inclasslen = loceol - scan;
6222             while (hardcount < max
6223                    && ((inclasslen = loceol - scan) > 0)
6224                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6225             {
6226                 scan += inclasslen;
6227                 hardcount++;
6228             }
6229         } else {
6230             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6231                 scan++;
6232         }
6233         break;
6234     case ALNUMU:
6235         if (utf8_target) {
6236     utf8_wordchar:
6237             loceol = PL_regeol;
6238             LOAD_UTF8_CHARCLASS_ALNUM();
6239             while (hardcount < max && scan < loceol &&
6240                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6241             {
6242                 scan += UTF8SKIP(scan);
6243                 hardcount++;
6244             }
6245         } else {
6246             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6247                 scan++;
6248             }
6249         }
6250         break;
6251     case ALNUM:
6252         if (utf8_target)
6253             goto utf8_wordchar;
6254         while (scan < loceol && isALNUM((U8) *scan)) {
6255             scan++;
6256         }
6257         break;
6258     case ALNUMA:
6259         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6260             scan++;
6261         }
6262         break;
6263     case ALNUML:
6264         PL_reg_flags |= RF_tainted;
6265         if (utf8_target) {
6266             loceol = PL_regeol;
6267             while (hardcount < max && scan < loceol &&
6268                    isALNUM_LC_utf8((U8*)scan)) {
6269                 scan += UTF8SKIP(scan);
6270                 hardcount++;
6271             }
6272         } else {
6273             while (scan < loceol && isALNUM_LC(*scan))
6274                 scan++;
6275         }
6276         break;
6277     case NALNUMU:
6278         if (utf8_target) {
6279
6280     utf8_Nwordchar:
6281
6282             loceol = PL_regeol;
6283             LOAD_UTF8_CHARCLASS_ALNUM();
6284             while (hardcount < max && scan < loceol &&
6285                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6286             {
6287                 scan += UTF8SKIP(scan);
6288                 hardcount++;
6289             }
6290         } else {
6291             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6292                 scan++;
6293             }
6294         }
6295         break;
6296     case NALNUM:
6297         if (utf8_target)
6298             goto utf8_Nwordchar;
6299         while (scan < loceol && ! isALNUM((U8) *scan)) {
6300             scan++;
6301         }
6302         break;
6303     case NALNUMA:
6304         if (utf8_target) {
6305             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6306                 scan += UTF8SKIP(scan);
6307             }
6308         }
6309         else {
6310             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6311                 scan++;
6312             }
6313         }
6314         break;
6315     case NALNUML:
6316         PL_reg_flags |= RF_tainted;
6317         if (utf8_target) {
6318             loceol = PL_regeol;
6319             while (hardcount < max && scan < loceol &&
6320                    !isALNUM_LC_utf8((U8*)scan)) {
6321                 scan += UTF8SKIP(scan);
6322                 hardcount++;
6323             }
6324         } else {
6325             while (scan < loceol && !isALNUM_LC(*scan))
6326                 scan++;
6327         }
6328         break;
6329     case SPACEU:
6330         if (utf8_target) {
6331
6332     utf8_space:
6333
6334             loceol = PL_regeol;
6335             LOAD_UTF8_CHARCLASS_SPACE();
6336             while (hardcount < max && scan < loceol &&
6337                    (*scan == ' ' ||
6338                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6339             {
6340                 scan += UTF8SKIP(scan);
6341                 hardcount++;
6342             }
6343             break;
6344         }
6345         else {
6346             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6347                 scan++;
6348             }
6349             break;
6350         }
6351     case SPACE:
6352         if (utf8_target)
6353             goto utf8_space;
6354
6355         while (scan < loceol && isSPACE((U8) *scan)) {
6356             scan++;
6357         }
6358         break;
6359     case SPACEA:
6360         while (scan < loceol && isSPACE_A((U8) *scan)) {
6361             scan++;
6362         }
6363         break;
6364     case SPACEL:
6365         PL_reg_flags |= RF_tainted;
6366         if (utf8_target) {
6367             loceol = PL_regeol;
6368             while (hardcount < max && scan < loceol &&
6369                    isSPACE_LC_utf8((U8*)scan)) {
6370                 scan += UTF8SKIP(scan);
6371                 hardcount++;
6372             }
6373         } else {
6374             while (scan < loceol && isSPACE_LC(*scan))
6375                 scan++;
6376         }
6377         break;
6378     case NSPACEU:
6379         if (utf8_target) {
6380
6381     utf8_Nspace:
6382
6383             loceol = PL_regeol;
6384             LOAD_UTF8_CHARCLASS_SPACE();
6385             while (hardcount < max && scan < loceol &&
6386                    ! (*scan == ' ' ||
6387                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6388             {
6389                 scan += UTF8SKIP(scan);
6390                 hardcount++;
6391             }
6392             break;
6393         }
6394         else {
6395             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6396                 scan++;
6397             }
6398         }
6399         break;
6400     case NSPACE:
6401         if (utf8_target)
6402             goto utf8_Nspace;
6403
6404         while (scan < loceol && ! isSPACE((U8) *scan)) {
6405             scan++;
6406         }
6407         break;
6408     case NSPACEA:
6409         if (utf8_target) {
6410             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6411                 scan += UTF8SKIP(scan);
6412             }
6413         }
6414         else {
6415             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6416                 scan++;
6417             }
6418         }
6419         break;
6420     case NSPACEL:
6421         PL_reg_flags |= RF_tainted;
6422         if (utf8_target) {
6423             loceol = PL_regeol;
6424             while (hardcount < max && scan < loceol &&
6425                    !isSPACE_LC_utf8((U8*)scan)) {
6426                 scan += UTF8SKIP(scan);
6427                 hardcount++;
6428             }
6429         } else {
6430             while (scan < loceol && !isSPACE_LC(*scan))
6431                 scan++;
6432         }
6433         break;
6434     case DIGIT:
6435         if (utf8_target) {
6436             loceol = PL_regeol;
6437             LOAD_UTF8_CHARCLASS_DIGIT();
6438             while (hardcount < max && scan < loceol &&
6439                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6440                 scan += UTF8SKIP(scan);
6441                 hardcount++;
6442             }
6443         } else {
6444             while (scan < loceol && isDIGIT(*scan))
6445                 scan++;
6446         }
6447         break;
6448     case DIGITA:
6449         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6450             scan++;
6451         }
6452         break;
6453     case DIGITL:
6454         PL_reg_flags |= RF_tainted;
6455         if (utf8_target) {
6456             loceol = PL_regeol;
6457             while (hardcount < max && scan < loceol &&
6458                    isDIGIT_LC_utf8((U8*)scan)) {
6459                 scan += UTF8SKIP(scan);
6460                 hardcount++;
6461             }
6462         } else {
6463             while (scan < loceol && isDIGIT_LC(*scan))
6464                 scan++;
6465         }
6466         break;
6467     case NDIGIT:
6468         if (utf8_target) {
6469             loceol = PL_regeol;
6470             LOAD_UTF8_CHARCLASS_DIGIT();
6471             while (hardcount < max && scan < loceol &&
6472                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6473                 scan += UTF8SKIP(scan);
6474                 hardcount++;
6475             }
6476         } else {
6477             while (scan < loceol && !isDIGIT(*scan))
6478                 scan++;
6479         }
6480         break;
6481     case NDIGITA:
6482         if (utf8_target) {
6483             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6484                 scan += UTF8SKIP(scan);
6485             }
6486         }
6487         else {
6488             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6489                 scan++;
6490             }
6491         }
6492         break;
6493     case NDIGITL:
6494         PL_reg_flags |= RF_tainted;
6495         if (utf8_target) {
6496             loceol = PL_regeol;
6497             while (hardcount < max && scan < loceol &&
6498                    !isDIGIT_LC_utf8((U8*)scan)) {
6499                 scan += UTF8SKIP(scan);
6500                 hardcount++;
6501             }
6502         } else {
6503             while (scan < loceol && !isDIGIT_LC(*scan))
6504                 scan++;
6505         }
6506         break;
6507     case LNBREAK:
6508         if (utf8_target) {
6509             loceol = PL_regeol;
6510             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6511                 scan += c;
6512                 hardcount++;
6513             }
6514         } else {
6515             /*
6516               LNBREAK can match two latin chars, which is ok,
6517               because we have a null terminated string, but we
6518               have to use hardcount in this situation
6519             */
6520             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6521                 scan+=c;
6522                 hardcount++;
6523             }
6524         }
6525         break;
6526     case HORIZWS:
6527         if (utf8_target) {
6528             loceol = PL_regeol;
6529             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6530                 scan += c;
6531                 hardcount++;
6532             }
6533         } else {
6534             while (scan < loceol && is_HORIZWS_latin1(scan))
6535                 scan++;
6536         }
6537         break;
6538     case NHORIZWS:
6539         if (utf8_target) {
6540             loceol = PL_regeol;
6541             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6542                 scan += UTF8SKIP(scan);
6543                 hardcount++;
6544             }
6545         } else {
6546             while (scan < loceol && !is_HORIZWS_latin1(scan))
6547                 scan++;
6548
6549         }
6550         break;
6551     case VERTWS:
6552         if (utf8_target) {
6553             loceol = PL_regeol;
6554             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6555                 scan += c;
6556                 hardcount++;
6557             }
6558         } else {
6559             while (scan < loceol && is_VERTWS_latin1(scan))
6560                 scan++;
6561
6562         }
6563         break;
6564     case NVERTWS:
6565         if (utf8_target) {
6566             loceol = PL_regeol;
6567             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6568                 scan += UTF8SKIP(scan);
6569                 hardcount++;
6570             }
6571         } else {
6572             while (scan < loceol && !is_VERTWS_latin1(scan))
6573                 scan++;
6574
6575         }
6576         break;
6577
6578     default:            /* Called on something of 0 width. */
6579         break;          /* So match right here or not at all. */
6580     }
6581
6582     if (hardcount)
6583         c = hardcount;
6584     else
6585         c = scan - PL_reginput;
6586     PL_reginput = scan;
6587
6588     DEBUG_r({
6589         GET_RE_DEBUG_FLAGS_DECL;
6590         DEBUG_EXECUTE_r({
6591             SV * const prop = sv_newmortal();
6592             regprop(prog, prop, p);
6593             PerlIO_printf(Perl_debug_log,
6594                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6595                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6596         });
6597     });
6598
6599     return(c);
6600 }
6601
6602
6603 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6604 /*
6605 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
6606 create a copy so that changes the caller makes won't change the shared one
6607  */
6608 SV *
6609 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6610 {
6611     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6612     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6613 }
6614 #endif
6615
6616 STATIC SV *
6617 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6618 {
6619     /* Returns the swash for the input 'node' in the regex 'prog'.
6620      * If <doinit> is true, will attempt to create the swash if not already
6621      *    done.
6622      * If <listsvp> is non-null, will return the swash initialization string in
6623      *    it.
6624      * If <altsvp> is non-null, will return the alternates to the regular swash
6625      *    in it
6626      * Tied intimately to how regcomp.c sets up the data structure */
6627
6628     dVAR;
6629     SV *sw  = NULL;
6630     SV *si  = NULL;
6631     SV *alt = NULL;
6632     SV*  invlist = NULL;
6633
6634     RXi_GET_DECL(prog,progi);
6635     const struct reg_data * const data = prog ? progi->data : NULL;
6636
6637     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6638
6639     assert(ANYOF_NONBITMAP(node));
6640
6641     if (data && data->count) {
6642         const U32 n = ARG(node);
6643
6644         if (data->what[n] == 's') {
6645             SV * const rv = MUTABLE_SV(data->data[n]);
6646             AV * const av = MUTABLE_AV(SvRV(rv));
6647             SV **const ary = AvARRAY(av);
6648             bool invlist_has_user_defined_property;
6649
6650             si = *ary;  /* ary[0] = the string to initialize the swash with */
6651
6652             /* Elements 3 and 4 are either both present or both absent. [3] is
6653              * any inversion list generated at compile time; [4] indicates if
6654              * that inversion list has any user-defined properties in it. */
6655             if (av_len(av) >= 3) {
6656                 invlist = ary[3];
6657                 invlist_has_user_defined_property = cBOOL(SvUV(ary[4]));
6658             }
6659             else {
6660                 invlist = NULL;
6661                 invlist_has_user_defined_property = FALSE;
6662             }
6663
6664             /* Element [1] is reserved for the set-up swash.  If already there,
6665              * return it; if not, create it and store it there */
6666             if (SvROK(ary[1])) {
6667                 sw = ary[1];
6668             }
6669             else if (si && doinit) {
6670
6671                 sw = _core_swash_init("utf8", /* the utf8 package */
6672                                       "", /* nameless */
6673                                       si,
6674                                       1, /* binary */
6675                                       0, /* not from tr/// */
6676                                       FALSE, /* is error if can't find
6677                                                 property */
6678                                       invlist,
6679                                       invlist_has_user_defined_property);
6680                 (void)av_store(av, 1, sw);
6681             }
6682
6683             /* Element [2] is for any multi-char folds.  Note that is a
6684              * fundamentally flawed design, because can't backtrack and try
6685              * again.  See [perl #89774] */
6686             if (SvTYPE(ary[2]) == SVt_PVAV) {
6687                 alt = ary[2];
6688             }
6689         }
6690     }
6691
6692     if (listsvp) {
6693         SV* matches_string = newSVpvn("", 0);
6694         SV** invlistsvp;
6695
6696         /* Use the swash, if any, which has to have incorporated into it all
6697          * possibilities */
6698         if (   sw
6699             && SvROK(sw)
6700             && SvTYPE(SvRV(sw)) == SVt_PVHV
6701             && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE)))
6702         {
6703             invlist = *invlistsvp;
6704         }
6705         else if (si && si != &PL_sv_undef) {
6706
6707             /* If no swash, use the input nitialization string, if available */
6708             sv_catsv(matches_string, si);
6709         }
6710
6711         /* Add the inversion list to whatever we have.  This may have come from
6712          * the swash, or from an input parameter */
6713         if (invlist) {
6714             sv_catsv(matches_string, _invlist_contents(invlist));
6715         }
6716         *listsvp = matches_string;
6717     }
6718
6719     if (altsvp)
6720         *altsvp  = alt;
6721
6722     return sw;
6723 }
6724
6725 /*
6726  - reginclass - determine if a character falls into a character class
6727
6728   n is the ANYOF regnode
6729   p is the target string
6730   lenp is pointer to the maximum number of bytes of how far to go in p
6731     (This is assumed wthout checking to always be at least the current
6732     character's size)
6733   utf8_target tells whether p is in UTF-8.
6734
6735   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6736   from a successful match, the value it points to will be updated to how many
6737   bytes in p were matched.  If there was no match, the value is undefined,
6738   possibly changed from the input.
6739
6740   Note that this can be a synthetic start class, a combination of various
6741   nodes, so things you think might be mutually exclusive, such as locale,
6742   aren't.  It can match both locale and non-locale
6743
6744  */
6745
6746 STATIC bool
6747 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6748 {
6749     dVAR;
6750     const char flags = ANYOF_FLAGS(n);
6751     bool match = FALSE;
6752     UV c = *p;
6753     STRLEN c_len = 0;
6754     STRLEN maxlen;
6755
6756     PERL_ARGS_ASSERT_REGINCLASS;
6757
6758     /* If c is not already the code point, get it */
6759     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6760         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6761                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6762                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6763                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6764                  * UTF8_ALLOW_FFFF */
6765         if (c_len == (STRLEN)-1)
6766             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6767     }
6768     else {
6769         c_len = 1;
6770     }
6771
6772     /* Use passed in max length, or one character if none passed in or less
6773      * than one character.  And assume will match just one character.  This is
6774      * overwritten later if matched more. */
6775     if (lenp) {
6776         maxlen = (*lenp > c_len) ? *lenp : c_len;
6777         *lenp = c_len;
6778
6779     }
6780     else {
6781         maxlen = c_len;
6782     }
6783
6784     /* If this character is potentially in the bitmap, check it */
6785     if (c < 256) {
6786         if (ANYOF_BITMAP_TEST(n, c))
6787             match = TRUE;
6788         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6789                 && ! utf8_target
6790                 && ! isASCII(c))
6791         {
6792             match = TRUE;
6793         }
6794
6795         else if (flags & ANYOF_LOCALE) {
6796             PL_reg_flags |= RF_tainted;
6797
6798             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6799                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6800             {
6801                 match = TRUE;
6802             }
6803             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6804                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6805                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6806                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6807                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6808                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6809                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6810                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6811                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6812                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6813                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6814                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
6815                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
6816                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6817                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6818                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6819                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6820                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6821                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6822                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6823                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6824                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6825                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6826                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6827                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6828                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6829                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6830                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6831                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6832                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
6833                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
6834                      ) /* How's that for a conditional? */
6835             ) {
6836                 match = TRUE;
6837             }
6838         }
6839     }
6840
6841     /* If the bitmap didn't (or couldn't) match, and something outside the
6842      * bitmap could match, try that.  Locale nodes specifiy completely the
6843      * behavior of code points in the bit map (otherwise, a utf8 target would
6844      * cause them to be treated as Unicode and not locale), except in
6845      * the very unlikely event when this node is a synthetic start class, which
6846      * could be a combination of locale and non-locale nodes.  So allow locale
6847      * to match for the synthetic start class, which will give a false
6848      * positive that will be resolved when the match is done again as not part
6849      * of the synthetic start class */
6850     if (!match) {
6851         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6852             match = TRUE;       /* Everything above 255 matches */
6853         }
6854         else if (ANYOF_NONBITMAP(n)
6855                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6856                      || (utf8_target
6857                          && (c >=256
6858                              || (! (flags & ANYOF_LOCALE))
6859                              || (flags & ANYOF_IS_SYNTHETIC)))))
6860         {
6861             AV *av;
6862             SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6863
6864             if (sw) {
6865                 U8 * utf8_p;
6866                 if (utf8_target) {
6867                     utf8_p = (U8 *) p;
6868                 } else {
6869
6870                     /* Not utf8.  Convert as much of the string as available up
6871                      * to the limit of how far the (single) character in the
6872                      * pattern can possibly match (no need to go further).  If
6873                      * the node is a straight ANYOF or not folding, it can't
6874                      * match more than one.  Otherwise, It can match up to how
6875                      * far a single char can fold to.  Since not utf8, each
6876                      * character is a single byte, so the max it can be in
6877                      * bytes is the same as the max it can be in characters */
6878                     STRLEN len = (OP(n) == ANYOF
6879                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6880                                   ? 1
6881                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6882                                     ? maxlen
6883                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6884                     utf8_p = bytes_to_utf8(p, &len);
6885                 }
6886
6887                 if (swash_fetch(sw, utf8_p, TRUE))
6888                     match = TRUE;
6889                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6890
6891                     /* Here, we need to test if the fold of the target string
6892                      * matches.  The non-multi char folds have all been moved to
6893                      * the compilation phase, and the multi-char folds have
6894                      * been stored by regcomp into 'av'; we linearly check to
6895                      * see if any match the target string (folded).   We know
6896                      * that the originals were each one character, but we don't
6897                      * currently know how many characters/bytes each folded to,
6898                      * except we do know that there are small limits imposed by
6899                      * Unicode.  XXX A performance enhancement would be to have
6900                      * regcomp.c store the max number of chars/bytes that are
6901                      * in an av entry, as, say the 0th element.  Even better
6902                      * would be to have a hash of the few characters that can
6903                      * start a multi-char fold to the max number of chars of
6904                      * those folds.
6905                      *
6906                      * If there is a match, we will need to advance (if lenp is
6907                      * specified) the match pointer in the target string.  But
6908                      * what we are comparing here isn't that string directly,
6909                      * but its fold, whose length may differ from the original.
6910                      * As we go along in constructing the fold, therefore, we
6911                      * create a map so that we know how many bytes in the
6912                      * source to advance given that we have matched a certain
6913                      * number of bytes in the fold.  This map is stored in
6914                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6915                      * the fold of the first character that we are folding.
6916                      * Then map_fold_len_back[n] is set to the number of bytes
6917                      * in that first character.  Similarly let m be the
6918                      * corresponding number for the second character to be
6919                      * folded.  Then map_fold_len_back[n+m] is set to the
6920                      * number of bytes occupied by the first two source
6921                      * characters. ... */
6922                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6923                     U8 folded[UTF8_MAXBYTES_CASE+1];
6924                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6925                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6926                                                   chars */
6927
6928                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6929
6930                         /* Here, only need to fold the first char of the target
6931                          * string.  It the source wasn't utf8, is 1 byte long */
6932                         to_utf8_fold(utf8_p, folded, &foldlen);
6933                         total_foldlen = foldlen;
6934                         map_fold_len_back[foldlen] = (utf8_target)
6935                                                      ? UTF8SKIP(utf8_p)
6936                                                      : 1;
6937                     }
6938                     else {
6939
6940                         /* Here, need to fold more than the first char.  Do so
6941                          * up to the limits */
6942                         U8* source_ptr = utf8_p;    /* The source for the fold
6943                                                        is the regex target
6944                                                        string */
6945                         U8* folded_ptr = folded;
6946                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6947                                                        available byte in the
6948                                                        target string */
6949                         U8 i;
6950                         for (i = 0;
6951                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6952                              i++)
6953                         {
6954
6955                             /* Fold the next character */
6956                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6957                             STRLEN this_char_foldlen;
6958                             to_utf8_fold(source_ptr,
6959                                          this_char_folded,
6960                                          &this_char_foldlen);
6961
6962                             /* Bail if it would exceed the byte limit for
6963                              * folding a single char. */
6964                             if (this_char_foldlen + folded_ptr - folded >
6965                                                             UTF8_MAXBYTES_CASE)
6966                             {
6967                                 break;
6968                             }
6969
6970                             /* Add the fold of this character */
6971                             Copy(this_char_folded,
6972                                  folded_ptr,
6973                                  this_char_foldlen,
6974                                  U8);
6975                             source_ptr += UTF8SKIP(source_ptr);
6976                             folded_ptr += this_char_foldlen;
6977                             total_foldlen = folded_ptr - folded;
6978
6979                             /* Create map from the number of bytes in the fold
6980                              * back to the number of bytes in the source.  If
6981                              * the source isn't utf8, the byte count is just
6982                              * the number of characters so far */
6983                             map_fold_len_back[total_foldlen]
6984                                                       = (utf8_target)
6985                                                         ? source_ptr - utf8_p
6986                                                         : i + 1;
6987                         }
6988                         *folded_ptr = '\0';
6989                     }
6990
6991
6992                     /* Do the linear search to see if the fold is in the list
6993                      * of multi-char folds. */
6994                     if (av) {
6995                         I32 i;
6996                         for (i = 0; i <= av_len(av); i++) {
6997                             SV* const sv = *av_fetch(av, i, FALSE);
6998                             STRLEN len;
6999                             const char * const s = SvPV_const(sv, len);
7000
7001                             if (len <= total_foldlen
7002                                 && memEQ(s, (char*)folded, len)
7003
7004                                    /* If 0, means matched a partial char. See
7005                                     * [perl #90536] */
7006                                 && map_fold_len_back[len])
7007                             {
7008
7009                                 /* Advance the target string ptr to account for
7010                                  * this fold, but have to translate from the
7011                                  * folded length to the corresponding source
7012                                  * length. */
7013                                 if (lenp) {
7014                                     *lenp = map_fold_len_back[len];
7015                                 }
7016                                 match = TRUE;
7017                                 break;
7018                             }
7019                         }
7020                     }
7021                 }
7022
7023                 /* If we allocated a string above, free it */
7024                 if (! utf8_target) Safefree(utf8_p);
7025             }
7026         }
7027
7028         if (UNICODE_IS_SUPER(c)
7029             && (flags & ANYOF_WARN_SUPER)
7030             && ckWARN_d(WARN_NON_UNICODE))
7031         {
7032             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7033                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7034         }
7035     }
7036
7037     return (flags & ANYOF_INVERT) ? !match : match;
7038 }
7039
7040 STATIC U8 *
7041 S_reghop3(U8 *s, I32 off, const U8* lim)
7042 {
7043     /* return the position 'off' UTF-8 characters away from 's', forward if
7044      * 'off' >= 0, backwards if negative.  But don't go outside of position
7045      * 'lim', which better be < s  if off < 0 */
7046
7047     dVAR;
7048
7049     PERL_ARGS_ASSERT_REGHOP3;
7050
7051     if (off >= 0) {
7052         while (off-- && s < lim) {
7053             /* XXX could check well-formedness here */
7054             s += UTF8SKIP(s);
7055         }
7056     }
7057     else {
7058         while (off++ && s > lim) {
7059             s--;
7060             if (UTF8_IS_CONTINUED(*s)) {
7061                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7062                     s--;
7063             }
7064             /* XXX could check well-formedness here */
7065         }
7066     }
7067     return s;
7068 }
7069
7070 #ifdef XXX_dmq
7071 /* there are a bunch of places where we use two reghop3's that should
7072    be replaced with this routine. but since thats not done yet
7073    we ifdef it out - dmq
7074 */
7075 STATIC U8 *
7076 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7077 {
7078     dVAR;
7079
7080     PERL_ARGS_ASSERT_REGHOP4;
7081
7082     if (off >= 0) {
7083         while (off-- && s < rlim) {
7084             /* XXX could check well-formedness here */
7085             s += UTF8SKIP(s);
7086         }
7087     }
7088     else {
7089         while (off++ && s > llim) {
7090             s--;
7091             if (UTF8_IS_CONTINUED(*s)) {
7092                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7093                     s--;
7094             }
7095             /* XXX could check well-formedness here */
7096         }
7097     }
7098     return s;
7099 }
7100 #endif
7101
7102 STATIC U8 *
7103 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7104 {
7105     dVAR;
7106
7107     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7108
7109     if (off >= 0) {
7110         while (off-- && s < lim) {
7111             /* XXX could check well-formedness here */
7112             s += UTF8SKIP(s);
7113         }
7114         if (off >= 0)
7115             return NULL;
7116     }
7117     else {
7118         while (off++ && s > lim) {
7119             s--;
7120             if (UTF8_IS_CONTINUED(*s)) {
7121                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7122                     s--;
7123             }
7124             /* XXX could check well-formedness here */
7125         }
7126         if (off <= 0)
7127             return NULL;
7128     }
7129     return s;
7130 }
7131
7132 static void
7133 restore_pos(pTHX_ void *arg)
7134 {
7135     dVAR;
7136     regexp * const rex = (regexp *)arg;
7137     if (PL_reg_state.re_state_eval_setup_done) {
7138         if (PL_reg_oldsaved) {
7139             rex->subbeg = PL_reg_oldsaved;
7140             rex->sublen = PL_reg_oldsavedlen;
7141 #ifdef PERL_OLD_COPY_ON_WRITE
7142             rex->saved_copy = PL_nrs;
7143 #endif
7144             RXp_MATCH_COPIED_on(rex);
7145         }
7146         PL_reg_magic->mg_len = PL_reg_oldpos;
7147         PL_reg_state.re_state_eval_setup_done = FALSE;
7148         PL_curpm = PL_reg_oldcurpm;
7149     }
7150 }
7151
7152 STATIC void
7153 S_to_utf8_substr(pTHX_ register regexp *prog)
7154 {
7155     int i = 1;
7156
7157     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7158
7159     do {
7160         if (prog->substrs->data[i].substr
7161             && !prog->substrs->data[i].utf8_substr) {
7162             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7163             prog->substrs->data[i].utf8_substr = sv;
7164             sv_utf8_upgrade(sv);
7165             if (SvVALID(prog->substrs->data[i].substr)) {
7166                 if (SvTAIL(prog->substrs->data[i].substr)) {
7167                     /* Trim the trailing \n that fbm_compile added last
7168                        time.  */
7169                     SvCUR_set(sv, SvCUR(sv) - 1);
7170                     /* Whilst this makes the SV technically "invalid" (as its
7171                        buffer is no longer followed by "\0") when fbm_compile()
7172                        adds the "\n" back, a "\0" is restored.  */
7173                     fbm_compile(sv, FBMcf_TAIL);
7174                 } else
7175                     fbm_compile(sv, 0);
7176             }
7177             if (prog->substrs->data[i].substr == prog->check_substr)
7178                 prog->check_utf8 = sv;
7179         }
7180     } while (i--);
7181 }
7182
7183 STATIC void
7184 S_to_byte_substr(pTHX_ register regexp *prog)
7185 {
7186     dVAR;
7187     int i = 1;
7188
7189     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7190
7191     do {
7192         if (prog->substrs->data[i].utf8_substr
7193             && !prog->substrs->data[i].substr) {
7194             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7195             if (sv_utf8_downgrade(sv, TRUE)) {
7196                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7197                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7198                         /* Trim the trailing \n that fbm_compile added last
7199                            time.  */
7200                         SvCUR_set(sv, SvCUR(sv) - 1);
7201                         fbm_compile(sv, FBMcf_TAIL);
7202                     } else
7203                         fbm_compile(sv, 0);
7204                 }
7205             } else {
7206                 SvREFCNT_dec(sv);
7207                 sv = &PL_sv_undef;
7208             }
7209             prog->substrs->data[i].substr = sv;
7210             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7211                 prog->check_substr = sv;
7212         }
7213     } while (i--);
7214 }
7215
7216 /*
7217  * Local variables:
7218  * c-indentation-style: bsd
7219  * c-basic-offset: 4
7220  * indent-tabs-mode: nil
7221  * End:
7222  *
7223  * ex: set ts=8 sts=4 sw=4 et:
7224  */