src/5013009/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1               /* tainted information used? */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 /* Valid for non-utf8 strings only: avoids the reginclass call if there are no
  99  * complications: i.e., if everything matchable is straight forward in the
 100  * bitmap */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102                                               : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 110
 111 #define HOPc(pos,off) \
 112         (char *)(PL_reg_match_utf8 \
 113             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 114             : (U8*)(pos + off))
 115 #define HOPBACKc(pos, off) \
 116         (char*)(PL_reg_match_utf8\
 117             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 118             : (pos - off >= PL_bostr)           \
 119                 ? (U8*)pos - off                \
 120                 : NULL)
 121
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defined */
 126 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 127     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 128
 129 /* Doesn't do an assert to verify that is correct */
 130 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 131     if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 132
 133 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 134 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 135 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 136
 137 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 138         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 139         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 140         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 141             * assert should likely and hopefully fail on an EBCDIC machine */ \
 142         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 143                                                                             \
 144         /* No asserts are done for these, in case called on an early        \
 145             * Unicode version in which they map to nothing */               \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 152         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 153
 154 /*
 155    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 156    so that it is possible to override the option here without having to
 157    rebuild the entire core. as we are required to do if we change regcomp.h
 158    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 159 */
 160 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 161 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 162 #endif
 163
 164 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 165 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 166 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 167 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 168 #define RE_utf8_perl_word   PL_utf8_alnum
 169 #define RE_utf8_perl_space  PL_utf8_space
 170 #define RE_utf8_posix_digit PL_utf8_digit
 171 #define perl_word  alnum
 172 #define perl_space space
 173 #define posix_digit digit
 174 #else
 175 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 176 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 177 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 178 #define RE_utf8_perl_word   PL_utf8_perl_word
 179 #define RE_utf8_perl_space  PL_utf8_perl_space
 180 #define RE_utf8_posix_digit PL_utf8_posix_digit
 181 #endif
 182
 183 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 184
 185 /* The actual code for CCC_TRY, which uses several variables from the routine
 186  * it's callable from.  It is designed to be the bulk of a case statement.
 187  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 188  *      nextchr matches the class.
 189  * UTF8_TEST is the whole test string to use for utf8 targets
 190  * LOAD is what to use to test, and if not present to load in the swash for the
 191  *      class
 192  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 193  *      UTF8_TEST test.
 194  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 195  * utf8 and a variant, load the swash if necessary and test using the utf8
 196  * test.  Advance to the next character if test is ok, otherwise fail; If not
 197  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 198  * fails, or advance to the next character */
 199
 200 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 201     if (locinput >= PL_regeol) {                                              \
 202         sayNO;                                                                \
 203     }                                                                         \
 204     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 205         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 206         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 207             sayNO;                                                            \
 208         }                                                                     \
 209         locinput += PL_utf8skip[nextchr];                                     \
 210         nextchr = UCHARAT(locinput);                                          \
 211         break;                                                                \
 212     }                                                                         \
 213     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 214         sayNO;                                                                \
 215     }                                                                         \
 216     nextchr = UCHARAT(++locinput);                                            \
 217     break;
 218
 219 /* Handle the non-locale cases for a character class and its complement.  It
 220  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 221  * This is because that code fails when the test succeeds, so we want to have
 222  * the test fail so that the code succeeds.  The swash is stored in a
 223  * predictable PL_ place */
 224 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 225                            CLASS, STR)                                        \
 226     case NAME:                                                                \
 227         _CCC_TRY_CODE( !, FUNC,                                               \
 228                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 229                                             (U8*)locinput, TRUE)),            \
 230                           CLASS, STR)                                         \
 231     case NNAME:                                                               \
 232         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 233                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 234                                             (U8*)locinput, TRUE)),            \
 235                           CLASS, STR)                                         \
 236
 237 /* Generate the case statements for both locale and non-locale character
 238  * classes in regmatch for classes that don't have special unicode semantics.
 239  * Locales don't use an immediate swash, but an intermediary special locale
 240  * function that is called on the pointer to the current place in the input
 241  * string.  That function will resolve to needing the same swash.  One might
 242  * think that because we don't know what the locale will match, we shouldn't
 243  * check with the swash loading function that it loaded properly; ie, that we
 244  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 245  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 246  * irrelevant here */
 247 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 248                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 249                 NAMEA, NNAMEA, FUNCA,                                         \
 250                 CLASS, STR)                                                   \
 251     case NAMEL:                                                               \
 252         PL_reg_flags |= RF_tainted;                                           \
 253         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 254     case NNAMEL:                                                              \
 255         PL_reg_flags |= RF_tainted;                                           \
 256         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 257                        CLASS, STR)                                            \
 258     case NAMEA:                                                               \
 259         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 260             sayNO;                                                            \
 261         }                                                                     \
 262         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 263         nextchr = UCHARAT(++locinput);                                        \
 264         break;                                                                \
 265     case NNAMEA:                                                              \
 266         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 267             sayNO;                                                            \
 268         }                                                                     \
 269         if (utf8_target) {                                                    \
 270             locinput += PL_utf8skip[nextchr];                                 \
 271             nextchr = UCHARAT(locinput);                                      \
 272         }                                                                     \
 273         else {                                                                \
 274             nextchr = UCHARAT(++locinput);                                    \
 275         }                                                                     \
 276         break;                                                                \
 277     /* Generate the non-locale cases */                                       \
 278     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 279
 280 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 281  * statements to handle separate Unicode semantics nodes */
 282 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 283                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 284                   NAMEU, NNAMEU, FUNCU,                                        \
 285                   NAMEA, NNAMEA, FUNCA,                                        \
 286                   CLASS, STR)                                                  \
 287     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 288             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 289             NAMEA, NNAMEA, FUNCA,                                              \
 290             CLASS, STR)                                                        \
 291     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 292
 293 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 294
 295 /* for use after a quantifier and before an EXACT-like node -- japhy */
 296 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 297  *
 298  * NOTE that *nothing* that affects backtracking should be in here, specifically
 299  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 300  * node that is in between two EXACT like nodes when ascertaining what the required
 301  * "follow" character is. This should probably be moved to regex compile time
 302  * although it may be done at run time beause of the REF possibility - more
 303  * investigation required. -- demerphq
 304 */
 305 #define JUMPABLE(rn) (      \
 306     OP(rn) == OPEN ||       \
 307     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 308     OP(rn) == EVAL ||   \
 309     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 310     OP(rn) == PLUS || OP(rn) == MINMOD || \
 311     OP(rn) == KEEPS || \
 312     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 313 )
 314 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 315
 316 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 317
 318 #if 0
 319 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 320    we don't need this definition. */
 321 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 322 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 323 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 324
 325 #else
 326 /* ... so we use this as its faster. */
 327 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 328 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU )
 329 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 330 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 331
 332 #endif
 333
 334 /*
 335   Search for mandatory following text node; for lookahead, the text must
 336   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 337 */
 338 #define FIND_NEXT_IMPT(rn) STMT_START { \
 339     while (JUMPABLE(rn)) { \
 340         const OPCODE type = OP(rn); \
 341         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 342             rn = NEXTOPER(NEXTOPER(rn)); \
 343         else if (type == PLUS) \
 344             rn = NEXTOPER(rn); \
 345         else if (type == IFMATCH) \
 346             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 347         else rn += NEXT_OFF(rn); \
 348     } \
 349 } STMT_END
 350
 351
 352 static void restore_pos(pTHX_ void *arg);
 353
 354 #define REGCP_PAREN_ELEMS 4
 355 #define REGCP_OTHER_ELEMS 5
 356 #define REGCP_FRAME_ELEMS 1
 357 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 358  * are needed for the regexp context stack bookkeeping. */
 359
 360 STATIC CHECKPOINT
 361 S_regcppush(pTHX_ I32 parenfloor)
 362 {
 363     dVAR;
 364     const int retval = PL_savestack_ix;
 365     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 366     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 367     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 368     int p;
 369     GET_RE_DEBUG_FLAGS_DECL;
 370
 371     if (paren_elems_to_push < 0)
 372         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 373
 374     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 375         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 376                    " out of range (%lu-%ld)",
 377                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 378
 379     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 380
 381     for (p = PL_regsize; p > parenfloor; p--) {
 382 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 383         SSPUSHINT(PL_regoffs[p].end);
 384         SSPUSHINT(PL_regoffs[p].start);
 385         SSPUSHPTR(PL_reg_start_tmp[p]);
 386         SSPUSHINT(p);
 387         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 388           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 389                       (UV)p, (IV)PL_regoffs[p].start,
 390                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 391                       (IV)PL_regoffs[p].end
 392         ));
 393     }
 394 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 395     SSPUSHPTR(PL_regoffs);
 396     SSPUSHINT(PL_regsize);
 397     SSPUSHINT(*PL_reglastparen);
 398     SSPUSHINT(*PL_reglastcloseparen);
 399     SSPUSHPTR(PL_reginput);
 400     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 401
 402     return retval;
 403 }
 404
 405 /* These are needed since we do not localize EVAL nodes: */
 406 #define REGCP_SET(cp)                                           \
 407     DEBUG_STATE_r(                                              \
 408             PerlIO_printf(Perl_debug_log,                       \
 409                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 410                 (IV)PL_savestack_ix));                          \
 411     cp = PL_savestack_ix
 412
 413 #define REGCP_UNWIND(cp)                                        \
 414     DEBUG_STATE_r(                                              \
 415         if (cp != PL_savestack_ix)                              \
 416             PerlIO_printf(Perl_debug_log,                       \
 417                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 418                 (IV)(cp), (IV)PL_savestack_ix));                \
 419     regcpblow(cp)
 420
 421 STATIC char *
 422 S_regcppop(pTHX_ const regexp *rex)
 423 {
 424     dVAR;
 425     UV i;
 426     char *input;
 427     GET_RE_DEBUG_FLAGS_DECL;
 428
 429     PERL_ARGS_ASSERT_REGCPPOP;
 430
 431     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 432     i = SSPOPUV;
 433     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 434     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 435     input = (char *) SSPOPPTR;
 436     *PL_reglastcloseparen = SSPOPINT;
 437     *PL_reglastparen = SSPOPINT;
 438     PL_regsize = SSPOPINT;
 439     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 440
 441     i -= REGCP_OTHER_ELEMS;
 442     /* Now restore the parentheses context. */
 443     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 444         I32 tmps;
 445         U32 paren = (U32)SSPOPINT;
 446         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 447         PL_regoffs[paren].start = SSPOPINT;
 448         tmps = SSPOPINT;
 449         if (paren <= *PL_reglastparen)
 450             PL_regoffs[paren].end = tmps;
 451         DEBUG_BUFFERS_r(
 452             PerlIO_printf(Perl_debug_log,
 453                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 454                           (UV)paren, (IV)PL_regoffs[paren].start,
 455                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 456                           (IV)PL_regoffs[paren].end,
 457                           (paren > *PL_reglastparen ? "(no)" : ""));
 458         );
 459     }
 460     DEBUG_BUFFERS_r(
 461         if (*PL_reglastparen + 1 <= rex->nparens) {
 462             PerlIO_printf(Perl_debug_log,
 463                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 464                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 465         }
 466     );
 467 #if 1
 468     /* It would seem that the similar code in regtry()
 469      * already takes care of this, and in fact it is in
 470      * a better location to since this code can #if 0-ed out
 471      * but the code in regtry() is needed or otherwise tests
 472      * requiring null fields (pat.t#187 and split.t#{13,14}
 473      * (as of patchlevel 7877)  will fail.  Then again,
 474      * this code seems to be necessary or otherwise
 475      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 476      * --jhi updated by dapm */
 477     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 478         if (i > PL_regsize)
 479             PL_regoffs[i].start = -1;
 480         PL_regoffs[i].end = -1;
 481     }
 482 #endif
 483     return input;
 484 }
 485
 486 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 487
 488 /*
 489  * pregexec and friends
 490  */
 491
 492 #ifndef PERL_IN_XSUB_RE
 493 /*
 494  - pregexec - match a regexp against a string
 495  */
 496 I32
 497 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 498          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 499 /* strend: pointer to null at end of string */
 500 /* strbeg: real beginning of string */
 501 /* minend: end of match must be >=minend after stringarg. */
 502 /* nosave: For optimizations. */
 503 {
 504     PERL_ARGS_ASSERT_PREGEXEC;
 505
 506     return
 507         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 508                       nosave ? 0 : REXEC_COPY_STR);
 509 }
 510 #endif
 511
 512 /*
 513  * Need to implement the following flags for reg_anch:
 514  *
 515  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 516  * USE_INTUIT_ML
 517  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 518  * INTUIT_AUTORITATIVE_ML
 519  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 520  * INTUIT_ONCE_ML
 521  *
 522  * Another flag for this function: SECOND_TIME (so that float substrs
 523  * with giant delta may be not rechecked).
 524  */
 525
 526 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 527
 528 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 529    Otherwise, only SvCUR(sv) is used to get strbeg. */
 530
 531 /* XXXX We assume that strpos is strbeg unless sv. */
 532
 533 /* XXXX Some places assume that there is a fixed substring.
 534         An update may be needed if optimizer marks as "INTUITable"
 535         RExen without fixed substrings.  Similarly, it is assumed that
 536         lengths of all the strings are no more than minlen, thus they
 537         cannot come from lookahead.
 538         (Or minlen should take into account lookahead.)
 539   NOTE: Some of this comment is not correct. minlen does now take account
 540   of lookahead/behind. Further research is required. -- demerphq
 541
 542 */
 543
 544 /* A failure to find a constant substring means that there is no need to make
 545    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 546    finding a substring too deep into the string means that less calls to
 547    regtry() should be needed.
 548
 549    REx compiler's optimizer found 4 possible hints:
 550         a) Anchored substring;
 551         b) Fixed substring;
 552         c) Whether we are anchored (beginning-of-line or \G);
 553         d) First node (of those at offset 0) which may distinguish positions;
 554    We use a)b)d) and multiline-part of c), and try to find a position in the
 555    string which does not contradict any of them.
 556  */
 557
 558 /* Most of decisions we do here should have been done at compile time.
 559    The nodes of the REx which we used for the search should have been
 560    deleted from the finite automaton. */
 561
 562 char *
 563 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 564                      char *strend, const U32 flags, re_scream_pos_data *data)
 565 {
 566     dVAR;
 567     struct regexp *const prog = (struct regexp *)SvANY(rx);
 568     register I32 start_shift = 0;
 569     /* Should be nonnegative! */
 570     register I32 end_shift   = 0;
 571     register char *s;
 572     register SV *check;
 573     char *strbeg;
 574     char *t;
 575     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 576     I32 ml_anch;
 577     register char *other_last = NULL;   /* other substr checked before this */
 578     char *check_at = NULL;              /* check substr found at this pos */
 579     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 580     RXi_GET_DECL(prog,progi);
 581 #ifdef DEBUGGING
 582     const char * const i_strpos = strpos;
 583 #endif
 584     GET_RE_DEBUG_FLAGS_DECL;
 585
 586     PERL_ARGS_ASSERT_RE_INTUIT_START;
 587
 588     RX_MATCH_UTF8_set(rx,utf8_target);
 589
 590     if (RX_UTF8(rx)) {
 591         PL_reg_flags |= RF_utf8;
 592     }
 593     DEBUG_EXECUTE_r(
 594         debug_start_match(rx, utf8_target, strpos, strend,
 595             sv ? "Guessing start of match in sv for"
 596                : "Guessing start of match in string for");
 597               );
 598
 599     /* CHR_DIST() would be more correct here but it makes things slow. */
 600     if (prog->minlen > strend - strpos) {
 601         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 602                               "String too short... [re_intuit_start]\n"));
 603         goto fail;
 604     }
 605
 606     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 607     PL_regeol = strend;
 608     if (utf8_target) {
 609         if (!prog->check_utf8 && prog->check_substr)
 610             to_utf8_substr(prog);
 611         check = prog->check_utf8;
 612     } else {
 613         if (!prog->check_substr && prog->check_utf8)
 614             to_byte_substr(prog);
 615         check = prog->check_substr;
 616     }
 617     if (check == &PL_sv_undef) {
 618         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 619                 "Non-utf8 string cannot match utf8 check string\n"));
 620         goto fail;
 621     }
 622     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 623         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 624                      || ( (prog->extflags & RXf_ANCH_BOL)
 625                           && !multiline ) );    /* Check after \n? */
 626
 627         if (!ml_anch) {
 628           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 629                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 630                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 631                && sv && !SvROK(sv)
 632                && (strpos != strbeg)) {
 633               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 634               goto fail;
 635           }
 636           if (prog->check_offset_min == prog->check_offset_max &&
 637               !(prog->extflags & RXf_CANY_SEEN)) {
 638             /* Substring at constant offset from beg-of-str... */
 639             I32 slen;
 640
 641             s = HOP3c(strpos, prog->check_offset_min, strend);
 642
 643             if (SvTAIL(check)) {
 644                 slen = SvCUR(check);    /* >= 1 */
 645
 646                 if ( strend - s > slen || strend - s < slen - 1
 647                      || (strend - s == slen && strend[-1] != '\n')) {
 648                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 649                     goto fail_finish;
 650                 }
 651                 /* Now should match s[0..slen-2] */
 652                 slen--;
 653                 if (slen && (*SvPVX_const(check) != *s
 654                              || (slen > 1
 655                                  && memNE(SvPVX_const(check), s, slen)))) {
 656                   report_neq:
 657                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 658                     goto fail_finish;
 659                 }
 660             }
 661             else if (*SvPVX_const(check) != *s
 662                      || ((slen = SvCUR(check)) > 1
 663                          && memNE(SvPVX_const(check), s, slen)))
 664                 goto report_neq;
 665             check_at = s;
 666             goto success_at_start;
 667           }
 668         }
 669         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 670         s = strpos;
 671         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 672         end_shift = prog->check_end_shift;
 673
 674         if (!ml_anch) {
 675             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 676                                          - (SvTAIL(check) != 0);
 677             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 678
 679             if (end_shift < eshift)
 680                 end_shift = eshift;
 681         }
 682     }
 683     else {                              /* Can match at random position */
 684         ml_anch = 0;
 685         s = strpos;
 686         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 687         end_shift = prog->check_end_shift;
 688
 689         /* end shift should be non negative here */
 690     }
 691
 692 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 693     if (end_shift < 0)
 694         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 695                    (IV)end_shift, RX_PRECOMP(prog));
 696 #endif
 697
 698   restart:
 699     /* Find a possible match in the region s..strend by looking for
 700        the "check" substring in the region corrected by start/end_shift. */
 701
 702     {
 703         I32 srch_start_shift = start_shift;
 704         I32 srch_end_shift = end_shift;
 705         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 706             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 707             srch_start_shift = strbeg - s;
 708         }
 709     DEBUG_OPTIMISE_MORE_r({
 710         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 711             (IV)prog->check_offset_min,
 712             (IV)srch_start_shift,
 713             (IV)srch_end_shift,
 714             (IV)prog->check_end_shift);
 715     });
 716
 717     if (flags & REXEC_SCREAM) {
 718         I32 p = -1;                     /* Internal iterator of scream. */
 719         I32 * const pp = data ? data->scream_pos : &p;
 720
 721         if (PL_screamfirst[BmRARE(check)] >= 0
 722             || ( BmRARE(check) == '\n'
 723                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 724                  && SvTAIL(check) ))
 725             s = screaminstr(sv, check,
 726                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 727         else
 728             goto fail_finish;
 729         /* we may be pointing at the wrong string */
 730         if (s && RXp_MATCH_COPIED(prog))
 731             s = strbeg + (s - SvPVX_const(sv));
 732         if (data)
 733             *data->scream_olds = s;
 734     }
 735     else {
 736         U8* start_point;
 737         U8* end_point;
 738         if (prog->extflags & RXf_CANY_SEEN) {
 739             start_point= (U8*)(s + srch_start_shift);
 740             end_point= (U8*)(strend - srch_end_shift);
 741         } else {
 742             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 743             end_point= HOP3(strend, -srch_end_shift, strbeg);
 744         }
 745         DEBUG_OPTIMISE_MORE_r({
 746             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 747                 (int)(end_point - start_point),
 748                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 749                 start_point);
 750         });
 751
 752         s = fbm_instr( start_point, end_point,
 753                       check, multiline ? FBMrf_MULTILINE : 0);
 754     }
 755     }
 756     /* Update the count-of-usability, remove useless subpatterns,
 757         unshift s.  */
 758
 759     DEBUG_EXECUTE_r({
 760         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 761             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 762         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 763                           (s ? "Found" : "Did not find"),
 764             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 765                 ? "anchored" : "floating"),
 766             quoted,
 767             RE_SV_TAIL(check),
 768             (s ? " at offset " : "...\n") );
 769     });
 770
 771     if (!s)
 772         goto fail_finish;
 773     /* Finish the diagnostic message */
 774     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 775
 776     /* XXX dmq: first branch is for positive lookbehind...
 777        Our check string is offset from the beginning of the pattern.
 778        So we need to do any stclass tests offset forward from that
 779        point. I think. :-(
 780      */
 781
 782
 783
 784     check_at=s;
 785
 786
 787     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 788        Start with the other substr.
 789        XXXX no SCREAM optimization yet - and a very coarse implementation
 790        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 791                 *always* match.  Probably should be marked during compile...
 792        Probably it is right to do no SCREAM here...
 793      */
 794
 795     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 796                 : (prog->float_substr && prog->anchored_substr))
 797     {
 798         /* Take into account the "other" substring. */
 799         /* XXXX May be hopelessly wrong for UTF... */
 800         if (!other_last)
 801             other_last = strpos;
 802         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 803           do_other_anchored:
 804             {
 805                 char * const last = HOP3c(s, -start_shift, strbeg);
 806                 char *last1, *last2;
 807                 char * const saved_s = s;
 808                 SV* must;
 809
 810                 t = s - prog->check_offset_max;
 811                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 812                     && (!utf8_target
 813                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 814                             && t > strpos)))
 815                     NOOP;
 816                 else
 817                     t = strpos;
 818                 t = HOP3c(t, prog->anchored_offset, strend);
 819                 if (t < other_last)     /* These positions already checked */
 820                     t = other_last;
 821                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 822                 if (last < last1)
 823                     last1 = last;
 824                 /* XXXX It is not documented what units *_offsets are in.
 825                    We assume bytes, but this is clearly wrong.
 826                    Meaning this code needs to be carefully reviewed for errors.
 827                    dmq.
 828                   */
 829
 830                 /* On end-of-str: see comment below. */
 831                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 832                 if (must == &PL_sv_undef) {
 833                     s = (char*)NULL;
 834                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 835                 }
 836                 else
 837                     s = fbm_instr(
 838                         (unsigned char*)t,
 839                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 840                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 841                         must,
 842                         multiline ? FBMrf_MULTILINE : 0
 843                     );
 844                 DEBUG_EXECUTE_r({
 845                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 846                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 847                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 848                         (s ? "Found" : "Contradicts"),
 849                         quoted, RE_SV_TAIL(must));
 850                 });
 851
 852
 853                 if (!s) {
 854                     if (last1 >= last2) {
 855                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 856                                                 ", giving up...\n"));
 857                         goto fail_finish;
 858                     }
 859                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 860                         ", trying floating at offset %ld...\n",
 861                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 862                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 863                     s = HOP3c(last, 1, strend);
 864                     goto restart;
 865                 }
 866                 else {
 867                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 868                           (long)(s - i_strpos)));
 869                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 870                     other_last = HOP3c(s, 1, strend);
 871                     s = saved_s;
 872                     if (t == strpos)
 873                         goto try_at_start;
 874                     goto try_at_offset;
 875                 }
 876             }
 877         }
 878         else {          /* Take into account the floating substring. */
 879             char *last, *last1;
 880             char * const saved_s = s;
 881             SV* must;
 882
 883             t = HOP3c(s, -start_shift, strbeg);
 884             last1 = last =
 885                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 886             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 887                 last = HOP3c(t, prog->float_max_offset, strend);
 888             s = HOP3c(t, prog->float_min_offset, strend);
 889             if (s < other_last)
 890                 s = other_last;
 891  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 892             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 893             /* fbm_instr() takes into account exact value of end-of-str
 894                if the check is SvTAIL(ed).  Since false positives are OK,
 895                and end-of-str is not later than strend we are OK. */
 896             if (must == &PL_sv_undef) {
 897                 s = (char*)NULL;
 898                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 899             }
 900             else
 901                 s = fbm_instr((unsigned char*)s,
 902                               (unsigned char*)last + SvCUR(must)
 903                                   - (SvTAIL(must)!=0),
 904                               must, multiline ? FBMrf_MULTILINE : 0);
 905             DEBUG_EXECUTE_r({
 906                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 907                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 908                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 909                     (s ? "Found" : "Contradicts"),
 910                     quoted, RE_SV_TAIL(must));
 911             });
 912             if (!s) {
 913                 if (last1 == last) {
 914                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 915                                             ", giving up...\n"));
 916                     goto fail_finish;
 917                 }
 918                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 919                     ", trying anchored starting at offset %ld...\n",
 920                     (long)(saved_s + 1 - i_strpos)));
 921                 other_last = last;
 922                 s = HOP3c(t, 1, strend);
 923                 goto restart;
 924             }
 925             else {
 926                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 927                       (long)(s - i_strpos)));
 928                 other_last = s; /* Fix this later. --Hugo */
 929                 s = saved_s;
 930                 if (t == strpos)
 931                     goto try_at_start;
 932                 goto try_at_offset;
 933             }
 934         }
 935     }
 936
 937
 938     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 939
 940     DEBUG_OPTIMISE_MORE_r(
 941         PerlIO_printf(Perl_debug_log,
 942             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 943             (IV)prog->check_offset_min,
 944             (IV)prog->check_offset_max,
 945             (IV)(s-strpos),
 946             (IV)(t-strpos),
 947             (IV)(t-s),
 948             (IV)(strend-strpos)
 949         )
 950     );
 951
 952     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 953         && (!utf8_target
 954             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 955                  && t > strpos)))
 956     {
 957         /* Fixed substring is found far enough so that the match
 958            cannot start at strpos. */
 959       try_at_offset:
 960         if (ml_anch && t[-1] != '\n') {
 961             /* Eventually fbm_*() should handle this, but often
 962                anchored_offset is not 0, so this check will not be wasted. */
 963             /* XXXX In the code below we prefer to look for "^" even in
 964                presence of anchored substrings.  And we search even
 965                beyond the found float position.  These pessimizations
 966                are historical artefacts only.  */
 967           find_anchor:
 968             while (t < strend - prog->minlen) {
 969                 if (*t == '\n') {
 970                     if (t < check_at - prog->check_offset_min) {
 971                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 972                             /* Since we moved from the found position,
 973                                we definitely contradict the found anchored
 974                                substr.  Due to the above check we do not
 975                                contradict "check" substr.
 976                                Thus we can arrive here only if check substr
 977                                is float.  Redo checking for "other"=="fixed".
 978                              */
 979                             strpos = t + 1;
 980                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 981                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 982                             goto do_other_anchored;
 983                         }
 984                         /* We don't contradict the found floating substring. */
 985                         /* XXXX Why not check for STCLASS? */
 986                         s = t + 1;
 987                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 988                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 989                         goto set_useful;
 990                     }
 991                     /* Position contradicts check-string */
 992                     /* XXXX probably better to look for check-string
 993                        than for "\n", so one should lower the limit for t? */
 994                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 995                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 996                     other_last = strpos = s = t + 1;
 997                     goto restart;
 998                 }
 999                 t++;
1000             }
1001             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1002                         PL_colors[0], PL_colors[1]));
1003             goto fail_finish;
1004         }
1005         else {
1006             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1007                         PL_colors[0], PL_colors[1]));
1008         }
1009         s = t;
1010       set_useful:
1011         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1012     }
1013     else {
1014         /* The found string does not prohibit matching at strpos,
1015            - no optimization of calling REx engine can be performed,
1016            unless it was an MBOL and we are not after MBOL,
1017            or a future STCLASS check will fail this. */
1018       try_at_start:
1019         /* Even in this situation we may use MBOL flag if strpos is offset
1020            wrt the start of the string. */
1021         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1022             && (strpos != strbeg) && strpos[-1] != '\n'
1023             /* May be due to an implicit anchor of m{.*foo}  */
1024             && !(prog->intflags & PREGf_IMPLICIT))
1025         {
1026             t = strpos;
1027             goto find_anchor;
1028         }
1029         DEBUG_EXECUTE_r( if (ml_anch)
1030             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1031                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1032         );
1033       success_at_start:
1034         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1035             && (utf8_target ? (
1036                 prog->check_utf8                /* Could be deleted already */
1037                 && --BmUSEFUL(prog->check_utf8) < 0
1038                 && (prog->check_utf8 == prog->float_utf8)
1039             ) : (
1040                 prog->check_substr              /* Could be deleted already */
1041                 && --BmUSEFUL(prog->check_substr) < 0
1042                 && (prog->check_substr == prog->float_substr)
1043             )))
1044         {
1045             /* If flags & SOMETHING - do not do it many times on the same match */
1046             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1047             /* XXX Does the destruction order has to change with utf8_target? */
1048             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1049             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1050             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1051             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1052             check = NULL;                       /* abort */
1053             s = strpos;
1054             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1055                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1056             if (prog->intflags & PREGf_IMPLICIT)
1057                 prog->extflags &= ~RXf_ANCH_MBOL;
1058             /* XXXX This is a remnant of the old implementation.  It
1059                     looks wasteful, since now INTUIT can use many
1060                     other heuristics. */
1061             prog->extflags &= ~RXf_USE_INTUIT;
1062             /* XXXX What other flags might need to be cleared in this branch? */
1063         }
1064         else
1065             s = strpos;
1066     }
1067
1068     /* Last resort... */
1069     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1070     /* trie stclasses are too expensive to use here, we are better off to
1071        leave it to regmatch itself */
1072     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1073         /* minlen == 0 is possible if regstclass is \b or \B,
1074            and the fixed substr is ''$.
1075            Since minlen is already taken into account, s+1 is before strend;
1076            accidentally, minlen >= 1 guaranties no false positives at s + 1
1077            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1078            regstclass does not come from lookahead...  */
1079         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1080            This leaves EXACTF, EXACTFU only, which are dealt with in find_byclass().  */
1081         const U8* const str = (U8*)STRING(progi->regstclass);
1082         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1083                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1084                     : 1);
1085         char * endpos;
1086         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1087             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1088         else if (prog->float_substr || prog->float_utf8)
1089             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1090         else
1091             endpos= strend;
1092
1093         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1094                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1095
1096         t = s;
1097         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1098         if (!s) {
1099 #ifdef DEBUGGING
1100             const char *what = NULL;
1101 #endif
1102             if (endpos == strend) {
1103                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1104                                 "Could not match STCLASS...\n") );
1105                 goto fail;
1106             }
1107             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1108                                    "This position contradicts STCLASS...\n") );
1109             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1110                 goto fail;
1111             /* Contradict one of substrings */
1112             if (prog->anchored_substr || prog->anchored_utf8) {
1113                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1114                     DEBUG_EXECUTE_r( what = "anchored" );
1115                   hop_and_restart:
1116                     s = HOP3c(t, 1, strend);
1117                     if (s + start_shift + end_shift > strend) {
1118                         /* XXXX Should be taken into account earlier? */
1119                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1120                                                "Could not match STCLASS...\n") );
1121                         goto fail;
1122                     }
1123                     if (!check)
1124                         goto giveup;
1125                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1126                                 "Looking for %s substr starting at offset %ld...\n",
1127                                  what, (long)(s + start_shift - i_strpos)) );
1128                     goto restart;
1129                 }
1130                 /* Have both, check_string is floating */
1131                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1132                     goto retry_floating_check;
1133                 /* Recheck anchored substring, but not floating... */
1134                 s = check_at;
1135                 if (!check)
1136                     goto giveup;
1137                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1138                           "Looking for anchored substr starting at offset %ld...\n",
1139                           (long)(other_last - i_strpos)) );
1140                 goto do_other_anchored;
1141             }
1142             /* Another way we could have checked stclass at the
1143                current position only: */
1144             if (ml_anch) {
1145                 s = t = t + 1;
1146                 if (!check)
1147                     goto giveup;
1148                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1149                           "Looking for /%s^%s/m starting at offset %ld...\n",
1150                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1151                 goto try_at_offset;
1152             }
1153             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1154                 goto fail;
1155             /* Check is floating substring. */
1156           retry_floating_check:
1157             t = check_at - start_shift;
1158             DEBUG_EXECUTE_r( what = "floating" );
1159             goto hop_and_restart;
1160         }
1161         if (t != s) {
1162             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1163                         "By STCLASS: moving %ld --> %ld\n",
1164                                   (long)(t - i_strpos), (long)(s - i_strpos))
1165                    );
1166         }
1167         else {
1168             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1169                                   "Does not contradict STCLASS...\n");
1170                    );
1171         }
1172     }
1173   giveup:
1174     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1175                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1176                           PL_colors[5], (long)(s - i_strpos)) );
1177     return s;
1178
1179   fail_finish:                          /* Substring not found */
1180     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1181         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1182   fail:
1183     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1184                           PL_colors[4], PL_colors[5]));
1185     return NULL;
1186 }
1187
1188 #define DECL_TRIE_TYPE(scan) \
1189     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1190                     trie_type = (scan->flags != EXACT) \
1191                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1192                               : (utf8_target ? trie_utf8 : trie_plain)
1193
1194 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1195 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1196     switch (trie_type) {                                                    \
1197     case trie_utf8_fold:                                                    \
1198         if ( foldlen>0 ) {                                                  \
1199             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1200             foldlen -= len;                                                 \
1201             uscan += len;                                                   \
1202             len=0;                                                          \
1203         } else {                                                            \
1204             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1205             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1206             foldlen -= UNISKIP( uvc );                                      \
1207             uscan = foldbuf + UNISKIP( uvc );                               \
1208         }                                                                   \
1209         break;                                                              \
1210     case trie_latin_utf8_fold:                                              \
1211         if ( foldlen>0 ) {                                                  \
1212             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1213             foldlen -= len;                                                 \
1214             uscan += len;                                                   \
1215             len=0;                                                          \
1216         } else {                                                            \
1217             len = 1;                                                        \
1218             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1219             foldlen -= UNISKIP( uvc );                                      \
1220             uscan = foldbuf + UNISKIP( uvc );                               \
1221         }                                                                   \
1222         break;                                                              \
1223     case trie_utf8:                                                         \
1224         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1225         break;                                                              \
1226     case trie_plain:                                                        \
1227         uvc = (UV)*uc;                                                      \
1228         len = 1;                                                            \
1229     }                                                                       \
1230     if (uvc < 256) {                                                        \
1231         charid = trie->charmap[ uvc ];                                      \
1232     }                                                                       \
1233     else {                                                                  \
1234         charid = 0;                                                         \
1235         if (widecharmap) {                                                  \
1236             SV** const svpp = hv_fetch(widecharmap,                         \
1237                         (char*)&uvc, sizeof(UV), 0);                        \
1238             if (svpp)                                                       \
1239                 charid = (U16)SvIV(*svpp);                                  \
1240         }                                                                   \
1241     }                                                                       \
1242 } STMT_END
1243
1244 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1245 {                                                      \
1246     char *my_strend= (char *)strend;                   \
1247     if ( (CoNd)                                        \
1248          && (ln == len ||                              \
1249              foldEQ_utf8(s, &my_strend, 0,  utf8_target,   \
1250                         m, NULL, ln, cBOOL(UTF_PATTERN)))      \
1251          && (!reginfo || regtry(reginfo, &s)) )        \
1252         goto got_it;                                   \
1253     else {                                             \
1254          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1255          uvchr_to_utf8(tmpbuf, c);                     \
1256          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1257          if ( f != c                                   \
1258               && (f == c1 || f == c2)                  \
1259               && (ln == len ||                         \
1260                 foldEQ_utf8(s, &my_strend, 0,  utf8_target,\
1261                               m, NULL, ln, cBOOL(UTF_PATTERN)))\
1262               && (!reginfo || regtry(reginfo, &s)) )   \
1263               goto got_it;                             \
1264     }                                                  \
1265 }                                                      \
1266 s += len
1267
1268 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1269 STMT_START {                                              \
1270     re_fold_t folder;                                   \
1271     switch (OP(c)) {                                      \
1272         case EXACTFU: folder = foldEQ_latin1; break;      \
1273         case EXACTFL: folder = foldEQ_locale; break;      \
1274         case EXACTF:  folder = foldEQ; break;             \
1275         default:                                          \
1276             Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c)); \
1277     }                                                     \
1278     while (s <= e) {                                      \
1279         if ( (CoNd)                                       \
1280              && (ln == 1 || folder(s, m, ln))             \
1281              && (!reginfo || regtry(reginfo, &s)) )       \
1282             goto got_it;                                  \
1283         s++;                                              \
1284     }                                                     \
1285 } STMT_END
1286
1287 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1288 STMT_START {                                          \
1289     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1290         CoDe                                          \
1291         s += uskip;                                   \
1292     }                                                 \
1293 } STMT_END
1294
1295 #define REXEC_FBC_SCAN(CoDe)                          \
1296 STMT_START {                                          \
1297     while (s < strend) {                              \
1298         CoDe                                          \
1299         s++;                                          \
1300     }                                                 \
1301 } STMT_END
1302
1303 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1304 REXEC_FBC_UTF8_SCAN(                                  \
1305     if (CoNd) {                                       \
1306         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1307             goto got_it;                              \
1308         else                                          \
1309             tmp = doevery;                            \
1310     }                                                 \
1311     else                                              \
1312         tmp = 1;                                      \
1313 )
1314
1315 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1316 REXEC_FBC_SCAN(                                       \
1317     if (CoNd) {                                       \
1318         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1319             goto got_it;                              \
1320         else                                          \
1321             tmp = doevery;                            \
1322     }                                                 \
1323     else                                              \
1324         tmp = 1;                                      \
1325 )
1326
1327 #define REXEC_FBC_TRYIT               \
1328 if ((!reginfo || regtry(reginfo, &s))) \
1329     goto got_it
1330
1331 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1332     if (utf8_target) {                                             \
1333         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1334     }                                                          \
1335     else {                                                     \
1336         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1337     }
1338
1339 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1340     if (utf8_target) {                                             \
1341         UtFpReLoAd;                                            \
1342         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1343     }                                                          \
1344     else {                                                     \
1345         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1346     }
1347
1348 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1349     PL_reg_flags |= RF_tainted;                                \
1350     if (utf8_target) {                                             \
1351         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1352     }                                                          \
1353     else {                                                     \
1354         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1355     }
1356
1357 #define DUMP_EXEC_POS(li,s,doutf8) \
1358     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1359
1360
1361 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1362         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1363         tmp = TEST_NON_UTF8(tmp);                                              \
1364         REXEC_FBC_UTF8_SCAN(                                                   \
1365             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1366                 tmp = !tmp;                                                    \
1367                 IF_SUCCESS;                                                    \
1368             }                                                                  \
1369             else {                                                             \
1370                 IF_FAIL;                                                       \
1371             }                                                                  \
1372         );                                                                     \
1373
1374 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1375         if (s == PL_bostr) {                                                   \
1376             tmp = '\n';                                                        \
1377         }                                                                      \
1378         else {                                                                 \
1379             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1380             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1381         }                                                                      \
1382         tmp = TeSt1_UtF8;                                                      \
1383         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1384         REXEC_FBC_UTF8_SCAN(                                                   \
1385             if (tmp == ! (TeSt2_UtF8)) { \
1386                 tmp = !tmp;                                                    \
1387                 IF_SUCCESS;                                                    \
1388             }                                                                  \
1389             else {                                                             \
1390                 IF_FAIL;                                                       \
1391             }                                                                  \
1392         );                                                                     \
1393
1394 /* The only difference between the BOUND and NBOUND cases is that
1395  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1396  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1397  * with the other one being empty */
1398 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1399     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1400
1401 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1402     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1403
1404 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1405     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1406
1407 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1408     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1409
1410
1411 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1412  * be passed in completely with the variable name being tested, which isn't
1413  * such a clean interface, but this is easier to read than it was before.  We
1414  * are looking for the boundary (or non-boundary between a word and non-word
1415  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1416  * must be different.  Find the "wordness" of the character just prior to this
1417  * one, and compare it with the wordness of this one.  If they differ, we have
1418  * a boundary.  At the beginning of the string, pretend that the previous
1419  * character was a new-line */
1420 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1421     if (utf8_target) {                                                         \
1422                 UTF8_CODE \
1423     }                                                                          \
1424     else {  /* Not utf8 */                                                     \
1425         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1426         tmp = TEST_NON_UTF8(tmp);                                              \
1427         REXEC_FBC_SCAN(                                                        \
1428             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1429                 tmp = !tmp;                                                    \
1430                 IF_SUCCESS;                                                    \
1431             }                                                                  \
1432             else {                                                             \
1433                 IF_FAIL;                                                       \
1434             }                                                                  \
1435         );                                                                     \
1436     }                                                                          \
1437     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1438         goto got_it;
1439
1440 /* We know what class REx starts with.  Try to find this position... */
1441 /* if reginfo is NULL, its a dryrun */
1442 /* annoyingly all the vars in this routine have different names from their counterparts
1443    in regmatch. /grrr */
1444
1445 STATIC char *
1446 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1447     const char *strend, regmatch_info *reginfo)
1448 {
1449         dVAR;
1450         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1451         char *m;
1452         STRLEN ln;
1453         STRLEN lnc;
1454         register STRLEN uskip;
1455         unsigned int c1;
1456         unsigned int c2;
1457         char *e;
1458         register I32 tmp = 1;   /* Scratch variable? */
1459         register const bool utf8_target = PL_reg_match_utf8;
1460         RXi_GET_DECL(prog,progi);
1461
1462         PERL_ARGS_ASSERT_FIND_BYCLASS;
1463
1464         /* We know what class it must start with. */
1465         switch (OP(c)) {
1466         case ANYOFV:
1467         case ANYOF:
1468             if (utf8_target || OP(c) == ANYOFV) {
1469                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
1470                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1471                           reginclass(prog, c, (U8*)s, 0, utf8_target) :
1472                           REGINCLASS(prog, c, (U8*)s));
1473             }
1474             else {
1475                  while (s < strend) {
1476                       STRLEN skip = 1;
1477
1478                       if (REGINCLASS(prog, c, (U8*)s) ||
1479                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1480                            /* The assignment of 2 is intentional:
1481                             * for the folded sharp s, the skip is 2. */
1482                            (skip = SHARP_S_SKIP))) {
1483                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1484                                 goto got_it;
1485                            else
1486                                 tmp = doevery;
1487                       }
1488                       else
1489                            tmp = 1;
1490                       s += skip;
1491                  }
1492             }
1493             break;
1494         case CANY:
1495             REXEC_FBC_SCAN(
1496                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1497                     goto got_it;
1498                 else
1499                     tmp = doevery;
1500             );
1501             break;
1502         case EXACTFU:
1503         case EXACTF:
1504             m   = STRING(c);
1505             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1506             lnc = (I32) ln;     /* length to match in characters */
1507             if (UTF_PATTERN) {
1508                 STRLEN ulen1, ulen2;
1509                 U8 *sm = (U8 *) m;
1510                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1511                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1512                 /* used by commented-out code below */
1513                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1514
1515                 /* XXX: Since the node will be case folded at compile
1516                    time this logic is a little odd, although im not
1517                    sure that its actually wrong. --dmq */
1518
1519                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1520                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1521
1522                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1523                    codepoint of the first character in the converted
1524                    form, yet originally we did the extra step.
1525                    No tests fail by commenting this code out however
1526                    so Ive left it out. -- dmq.
1527
1528                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1529                                     0, uniflags);
1530                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1531                                     0, uniflags);
1532                 */
1533
1534                 lnc = 0;
1535                 while (sm < ((U8 *) m + ln)) {
1536                     lnc++;
1537                     sm += UTF8SKIP(sm);
1538                 }
1539             }
1540             else {
1541                 c1 = *(U8*)m;
1542                 if (utf8_target || OP(c) == EXACTFU) {
1543
1544                     /* Micro sign folds to GREEK SMALL LETTER MU;
1545                        LATIN_SMALL_LETTER_SHARP_S folds to 'ss', and this sets
1546                        c2 to the first 's' of the pair, and the code below will
1547                        look for others */
1548                     c2 = (c1 == MICRO_SIGN)
1549                         ? GREEK_SMALL_LETTER_MU
1550                         : (c1 == LATIN_SMALL_LETTER_SHARP_S)
1551                            ? 's'
1552                            : PL_fold_latin1[c1];
1553                 } else c2 = PL_fold[c1];
1554             }
1555             goto do_exactf;
1556         case EXACTFL:
1557             m   = STRING(c);
1558             ln  = STR_LEN(c);
1559             lnc = (I32) ln;
1560             c1 = *(U8*)m;
1561             c2 = PL_fold_locale[c1];
1562           do_exactf:
1563             e = HOP3c(strend, -((I32)lnc), s);
1564
1565             if (!reginfo && e < s)
1566                 e = s;                  /* Due to minlen logic of intuit() */
1567
1568             /* The idea in the EXACTF* cases is to first find the
1569              * first character of the EXACTF* node and then, if
1570              * necessary, case-insensitively compare the full
1571              * text of the node.  The c1 and c2 are the first
1572              * characters (though in Unicode it gets a bit
1573              * more complicated because there are more cases
1574              * than just upper and lower: one needs to use
1575              * the so-called folding case for case-insensitive
1576              * matching (called "loose matching" in Unicode).
1577              * foldEQ_utf8() will do just that. */
1578
1579             if (utf8_target || UTF_PATTERN) {
1580                 UV c, f;
1581                 U8 tmpbuf [UTF8_MAXBYTES+1];
1582                 STRLEN len = 1;
1583                 STRLEN foldlen;
1584                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1585                 if (c1 == c2) {
1586                     /* Upper and lower of 1st char are equal -
1587                      * probably not a "letter". */
1588                     while (s <= e) {
1589                         if (utf8_target) {
1590                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1591                                            uniflags);
1592                         } else {
1593                             c = *((U8*)s);
1594                         }
1595                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1596                     }
1597                 }
1598                 else {
1599                     while (s <= e) {
1600                         if (utf8_target) {
1601                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1602                                            uniflags);
1603                         } else {
1604                             c = *((U8*)s);
1605                         }
1606
1607                         /* Handle some of the three Greek sigmas cases.
1608                          * Note that not all the possible combinations
1609                          * are handled here: some of them are handled
1610                          * by the standard folding rules, and some of
1611                          * them (the character class or ANYOF cases)
1612                          * are handled during compiletime in
1613                          * regexec.c:S_regclass(). */
1614                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1615                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1616                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1617
1618                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1619                     }
1620                 }
1621             }
1622             else {
1623                 /* Neither pattern nor string are UTF8 */
1624                 if (c1 == c2)
1625                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1626                 else
1627                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1628             }
1629             break;
1630         case BOUNDL:
1631             PL_reg_flags |= RF_tainted;
1632             FBC_BOUND(isALNUM_LC,
1633                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1634                       isALNUM_LC_utf8((U8*)s));
1635             break;
1636         case NBOUNDL:
1637             PL_reg_flags |= RF_tainted;
1638             FBC_NBOUND(isALNUM_LC,
1639                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1640                        isALNUM_LC_utf8((U8*)s));
1641             break;
1642         case BOUND:
1643             FBC_BOUND(isWORDCHAR,
1644                       isALNUM_uni(tmp),
1645                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1646             break;
1647         case BOUNDA:
1648             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1649                              isWORDCHAR_A(tmp),
1650                              isWORDCHAR_A((U8*)s));
1651             break;
1652         case NBOUND:
1653             FBC_NBOUND(isWORDCHAR,
1654                        isALNUM_uni(tmp),
1655                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1656             break;
1657         case NBOUNDA:
1658             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1659                               isWORDCHAR_A(tmp),
1660                               isWORDCHAR_A((U8*)s));
1661             break;
1662         case BOUNDU:
1663             FBC_BOUND(isWORDCHAR_L1,
1664                       isALNUM_uni(tmp),
1665                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1666             break;
1667         case NBOUNDU:
1668             FBC_NBOUND(isWORDCHAR_L1,
1669                        isALNUM_uni(tmp),
1670                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1671             break;
1672         case ALNUML:
1673             REXEC_FBC_CSCAN_TAINT(
1674                 isALNUM_LC_utf8((U8*)s),
1675                 isALNUM_LC(*s)
1676             );
1677             break;
1678         case ALNUMU:
1679             REXEC_FBC_CSCAN_PRELOAD(
1680                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1681                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1682                 isWORDCHAR_L1((U8) *s)
1683             );
1684             break;
1685         case ALNUM:
1686             REXEC_FBC_CSCAN_PRELOAD(
1687                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1688                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1689                 isWORDCHAR((U8) *s)
1690             );
1691             break;
1692         case ALNUMA:
1693             /* Don't need to worry about utf8, as it can match only a single
1694              * byte invariant character */
1695             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1696             break;
1697         case NALNUMU:
1698             REXEC_FBC_CSCAN_PRELOAD(
1699                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1700                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1701                 ! isWORDCHAR_L1((U8) *s)
1702             );
1703             break;
1704         case NALNUM:
1705             REXEC_FBC_CSCAN_PRELOAD(
1706                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1707                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1708                 ! isALNUM(*s)
1709             );
1710             break;
1711         case NALNUMA:
1712             REXEC_FBC_CSCAN(
1713                 !isWORDCHAR_A(*s),
1714                 !isWORDCHAR_A(*s)
1715             );
1716             break;
1717         case NALNUML:
1718             REXEC_FBC_CSCAN_TAINT(
1719                 !isALNUM_LC_utf8((U8*)s),
1720                 !isALNUM_LC(*s)
1721             );
1722             break;
1723         case SPACEU:
1724             REXEC_FBC_CSCAN_PRELOAD(
1725                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1726                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1727                 isSPACE_L1((U8) *s)
1728             );
1729             break;
1730         case SPACE:
1731             REXEC_FBC_CSCAN_PRELOAD(
1732                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1733                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1734                 isSPACE((U8) *s)
1735             );
1736             break;
1737         case SPACEA:
1738             /* Don't need to worry about utf8, as it can match only a single
1739              * byte invariant character */
1740             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1741             break;
1742         case SPACEL:
1743             REXEC_FBC_CSCAN_TAINT(
1744                 isSPACE_LC_utf8((U8*)s),
1745                 isSPACE_LC(*s)
1746             );
1747             break;
1748         case NSPACEU:
1749             REXEC_FBC_CSCAN_PRELOAD(
1750                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1751                 !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1752                 ! isSPACE_L1((U8) *s)
1753             );
1754             break;
1755         case NSPACE:
1756             REXEC_FBC_CSCAN_PRELOAD(
1757                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1758                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1759                 ! isSPACE((U8) *s)
1760             );
1761             break;
1762         case NSPACEA:
1763             REXEC_FBC_CSCAN(
1764                 !isSPACE_A(*s),
1765                 !isSPACE_A(*s)
1766             );
1767             break;
1768         case NSPACEL:
1769             REXEC_FBC_CSCAN_TAINT(
1770                 !isSPACE_LC_utf8((U8*)s),
1771                 !isSPACE_LC(*s)
1772             );
1773             break;
1774         case DIGIT:
1775             REXEC_FBC_CSCAN_PRELOAD(
1776                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1777                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1778                 isDIGIT(*s)
1779             );
1780             break;
1781         case DIGITA:
1782             /* Don't need to worry about utf8, as it can match only a single
1783              * byte invariant character */
1784             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1785             break;
1786         case DIGITL:
1787             REXEC_FBC_CSCAN_TAINT(
1788                 isDIGIT_LC_utf8((U8*)s),
1789                 isDIGIT_LC(*s)
1790             );
1791             break;
1792         case NDIGIT:
1793             REXEC_FBC_CSCAN_PRELOAD(
1794                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1795                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1796                 !isDIGIT(*s)
1797             );
1798             break;
1799         case NDIGITA:
1800             REXEC_FBC_CSCAN(
1801                 !isDIGIT_A(*s),
1802                 !isDIGIT_A(*s)
1803             );
1804             break;
1805         case NDIGITL:
1806             REXEC_FBC_CSCAN_TAINT(
1807                 !isDIGIT_LC_utf8((U8*)s),
1808                 !isDIGIT_LC(*s)
1809             );
1810             break;
1811         case LNBREAK:
1812             REXEC_FBC_CSCAN(
1813                 is_LNBREAK_utf8(s),
1814                 is_LNBREAK_latin1(s)
1815             );
1816             break;
1817         case VERTWS:
1818             REXEC_FBC_CSCAN(
1819                 is_VERTWS_utf8(s),
1820                 is_VERTWS_latin1(s)
1821             );
1822             break;
1823         case NVERTWS:
1824             REXEC_FBC_CSCAN(
1825                 !is_VERTWS_utf8(s),
1826                 !is_VERTWS_latin1(s)
1827             );
1828             break;
1829         case HORIZWS:
1830             REXEC_FBC_CSCAN(
1831                 is_HORIZWS_utf8(s),
1832                 is_HORIZWS_latin1(s)
1833             );
1834             break;
1835         case NHORIZWS:
1836             REXEC_FBC_CSCAN(
1837                 !is_HORIZWS_utf8(s),
1838                 !is_HORIZWS_latin1(s)
1839             );
1840             break;
1841         case AHOCORASICKC:
1842         case AHOCORASICK:
1843             {
1844                 DECL_TRIE_TYPE(c);
1845                 /* what trie are we using right now */
1846                 reg_ac_data *aho
1847                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1848                 reg_trie_data *trie
1849                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1850                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1851
1852                 const char *last_start = strend - trie->minlen;
1853 #ifdef DEBUGGING
1854                 const char *real_start = s;
1855 #endif
1856                 STRLEN maxlen = trie->maxlen;
1857                 SV *sv_points;
1858                 U8 **points; /* map of where we were in the input string
1859                                 when reading a given char. For ASCII this
1860                                 is unnecessary overhead as the relationship
1861                                 is always 1:1, but for Unicode, especially
1862                                 case folded Unicode this is not true. */
1863                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1864                 U8 *bitmap=NULL;
1865
1866
1867                 GET_RE_DEBUG_FLAGS_DECL;
1868
1869                 /* We can't just allocate points here. We need to wrap it in
1870                  * an SV so it gets freed properly if there is a croak while
1871                  * running the match */
1872                 ENTER;
1873                 SAVETMPS;
1874                 sv_points=newSV(maxlen * sizeof(U8 *));
1875                 SvCUR_set(sv_points,
1876                     maxlen * sizeof(U8 *));
1877                 SvPOK_on(sv_points);
1878                 sv_2mortal(sv_points);
1879                 points=(U8**)SvPV_nolen(sv_points );
1880                 if ( trie_type != trie_utf8_fold
1881                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1882                 {
1883                     if (trie->bitmap)
1884                         bitmap=(U8*)trie->bitmap;
1885                     else
1886                         bitmap=(U8*)ANYOF_BITMAP(c);
1887                 }
1888                 /* this is the Aho-Corasick algorithm modified a touch
1889                    to include special handling for long "unknown char"
1890                    sequences. The basic idea being that we use AC as long
1891                    as we are dealing with a possible matching char, when
1892                    we encounter an unknown char (and we have not encountered
1893                    an accepting state) we scan forward until we find a legal
1894                    starting char.
1895                    AC matching is basically that of trie matching, except
1896                    that when we encounter a failing transition, we fall back
1897                    to the current states "fail state", and try the current char
1898                    again, a process we repeat until we reach the root state,
1899                    state 1, or a legal transition. If we fail on the root state
1900                    then we can either terminate if we have reached an accepting
1901                    state previously, or restart the entire process from the beginning
1902                    if we have not.
1903
1904                  */
1905                 while (s <= last_start) {
1906                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1907                     U8 *uc = (U8*)s;
1908                     U16 charid = 0;
1909                     U32 base = 1;
1910                     U32 state = 1;
1911                     UV uvc = 0;
1912                     STRLEN len = 0;
1913                     STRLEN foldlen = 0;
1914                     U8 *uscan = (U8*)NULL;
1915                     U8 *leftmost = NULL;
1916 #ifdef DEBUGGING
1917                     U32 accepted_word= 0;
1918 #endif
1919                     U32 pointpos = 0;
1920
1921                     while ( state && uc <= (U8*)strend ) {
1922                         int failed=0;
1923                         U32 word = aho->states[ state ].wordnum;
1924
1925                         if( state==1 ) {
1926                             if ( bitmap ) {
1927                                 DEBUG_TRIE_EXECUTE_r(
1928                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1929                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1930                                             (char *)uc, utf8_target );
1931                                         PerlIO_printf( Perl_debug_log,
1932                                             " Scanning for legal start char...\n");
1933                                     }
1934                                 );
1935                                 if (utf8_target) {
1936                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1937                                         uc += UTF8SKIP(uc);
1938                                     }
1939                                 } else {
1940                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1941                                         uc++;
1942                                     }
1943                                 }
1944                                 s= (char *)uc;
1945                             }
1946                             if (uc >(U8*)last_start) break;
1947                         }
1948
1949                         if ( word ) {
1950                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1951                             if (!leftmost || lpos < leftmost) {
1952                                 DEBUG_r(accepted_word=word);
1953                                 leftmost= lpos;
1954                             }
1955                             if (base==0) break;
1956
1957                         }
1958                         points[pointpos++ % maxlen]= uc;
1959                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1960                                              uscan, len, uvc, charid, foldlen,
1961                                              foldbuf, uniflags);
1962                         DEBUG_TRIE_EXECUTE_r({
1963                             dump_exec_pos( (char *)uc, c, strend, real_start,
1964                                 s,   utf8_target );
1965                             PerlIO_printf(Perl_debug_log,
1966                                 " Charid:%3u CP:%4"UVxf" ",
1967                                  charid, uvc);
1968                         });
1969
1970                         do {
1971 #ifdef DEBUGGING
1972                             word = aho->states[ state ].wordnum;
1973 #endif
1974                             base = aho->states[ state ].trans.base;
1975
1976                             DEBUG_TRIE_EXECUTE_r({
1977                                 if (failed)
1978                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1979                                         s,   utf8_target );
1980                                 PerlIO_printf( Perl_debug_log,
1981                                     "%sState: %4"UVxf", word=%"UVxf,
1982                                     failed ? " Fail transition to " : "",
1983                                     (UV)state, (UV)word);
1984                             });
1985                             if ( base ) {
1986                                 U32 tmp;
1987                                 I32 offset;
1988                                 if (charid &&
1989                                      ( ((offset = base + charid
1990                                         - 1 - trie->uniquecharcount)) >= 0)
1991                                      && ((U32)offset < trie->lasttrans)
1992                                      && trie->trans[offset].check == state
1993                                      && (tmp=trie->trans[offset].next))
1994                                 {
1995                                     DEBUG_TRIE_EXECUTE_r(
1996                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1997                                     state = tmp;
1998                                     break;
1999                                 }
2000                                 else {
2001                                     DEBUG_TRIE_EXECUTE_r(
2002                                         PerlIO_printf( Perl_debug_log," - fail\n"));
2003                                     failed = 1;
2004                                     state = aho->fail[state];
2005                                 }
2006                             }
2007                             else {
2008                                 /* we must be accepting here */
2009                                 DEBUG_TRIE_EXECUTE_r(
2010                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
2011                                 failed = 1;
2012                                 break;
2013                             }
2014                         } while(state);
2015                         uc += len;
2016                         if (failed) {
2017                             if (leftmost)
2018                                 break;
2019                             if (!state) state = 1;
2020                         }
2021                     }
2022                     if ( aho->states[ state ].wordnum ) {
2023                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2024                         if (!leftmost || lpos < leftmost) {
2025                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2026                             leftmost = lpos;
2027                         }
2028                     }
2029                     if (leftmost) {
2030                         s = (char*)leftmost;
2031                         DEBUG_TRIE_EXECUTE_r({
2032                             PerlIO_printf(
2033                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2034                                 (UV)accepted_word, (IV)(s - real_start)
2035                             );
2036                         });
2037                         if (!reginfo || regtry(reginfo, &s)) {
2038                             FREETMPS;
2039                             LEAVE;
2040                             goto got_it;
2041                         }
2042                         s = HOPc(s,1);
2043                         DEBUG_TRIE_EXECUTE_r({
2044                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2045                         });
2046                     } else {
2047                         DEBUG_TRIE_EXECUTE_r(
2048                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2049                         break;
2050                     }
2051                 }
2052                 FREETMPS;
2053                 LEAVE;
2054             }
2055             break;
2056         default:
2057             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2058             break;
2059         }
2060         return 0;
2061       got_it:
2062         return s;
2063 }
2064
2065
2066 /*
2067  - regexec_flags - match a regexp against a string
2068  */
2069 I32
2070 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2071               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2072 /* strend: pointer to null at end of string */
2073 /* strbeg: real beginning of string */
2074 /* minend: end of match must be >=minend after stringarg. */
2075 /* data: May be used for some additional optimizations.
2076          Currently its only used, with a U32 cast, for transmitting
2077          the ganch offset when doing a /g match. This will change */
2078 /* nosave: For optimizations. */
2079 {
2080     dVAR;
2081     struct regexp *const prog = (struct regexp *)SvANY(rx);
2082     /*register*/ char *s;
2083     register regnode *c;
2084     /*register*/ char *startpos = stringarg;
2085     I32 minlen;         /* must match at least this many chars */
2086     I32 dontbother = 0; /* how many characters not to try at end */
2087     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2088     I32 scream_pos = -1;                /* Internal iterator of scream. */
2089     char *scream_olds = NULL;
2090     const bool utf8_target = cBOOL(DO_UTF8(sv));
2091     I32 multiline;
2092     RXi_GET_DECL(prog,progi);
2093     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2094     regexp_paren_pair *swap = NULL;
2095     GET_RE_DEBUG_FLAGS_DECL;
2096
2097     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2098     PERL_UNUSED_ARG(data);
2099
2100     /* Be paranoid... */
2101     if (prog == NULL || startpos == NULL) {
2102         Perl_croak(aTHX_ "NULL regexp parameter");
2103         return 0;
2104     }
2105
2106     multiline = prog->extflags & RXf_PMf_MULTILINE;
2107     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2108
2109     RX_MATCH_UTF8_set(rx, utf8_target);
2110     DEBUG_EXECUTE_r(
2111         debug_start_match(rx, utf8_target, startpos, strend,
2112         "Matching");
2113     );
2114
2115     minlen = prog->minlen;
2116
2117     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2118         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2119                               "String too short [regexec_flags]...\n"));
2120         goto phooey;
2121     }
2122
2123
2124     /* Check validity of program. */
2125     if (UCHARAT(progi->program) != REG_MAGIC) {
2126         Perl_croak(aTHX_ "corrupted regexp program");
2127     }
2128
2129     PL_reg_flags = 0;
2130     PL_reg_eval_set = 0;
2131     PL_reg_maxiter = 0;
2132
2133     if (RX_UTF8(rx))
2134         PL_reg_flags |= RF_utf8;
2135
2136     /* Mark beginning of line for ^ and lookbehind. */
2137     reginfo.bol = startpos; /* XXX not used ??? */
2138     PL_bostr  = strbeg;
2139     reginfo.sv = sv;
2140
2141     /* Mark end of line for $ (and such) */
2142     PL_regeol = strend;
2143
2144     /* see how far we have to get to not match where we matched before */
2145     reginfo.till = startpos+minend;
2146
2147     /* If there is a "must appear" string, look for it. */
2148     s = startpos;
2149
2150     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2151         MAGIC *mg;
2152         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2153             reginfo.ganch = startpos + prog->gofs;
2154             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2155               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2156         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2157                   && SvMAGIC(sv)
2158                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2159                   && mg->mg_len >= 0) {
2160             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2161             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2162                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2163
2164             if (prog->extflags & RXf_ANCH_GPOS) {
2165                 if (s > reginfo.ganch)
2166                     goto phooey;
2167                 s = reginfo.ganch - prog->gofs;
2168                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2169                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2170                 if (s < strbeg)
2171                     goto phooey;
2172             }
2173         }
2174         else if (data) {
2175             reginfo.ganch = strbeg + PTR2UV(data);
2176             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2177                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2178
2179         } else {                                /* pos() not defined */
2180             reginfo.ganch = strbeg;
2181             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2182                  "GPOS: reginfo.ganch = strbeg\n"));
2183         }
2184     }
2185     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2186         /* We have to be careful. If the previous successful match
2187            was from this regex we don't want a subsequent partially
2188            successful match to clobber the old results.
2189            So when we detect this possibility we add a swap buffer
2190            to the re, and switch the buffer each match. If we fail
2191            we switch it back, otherwise we leave it swapped.
2192         */
2193         swap = prog->offs;
2194         /* do we need a save destructor here for eval dies? */
2195         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2196     }
2197     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2198         re_scream_pos_data d;
2199
2200         d.scream_olds = &scream_olds;
2201         d.scream_pos = &scream_pos;
2202         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2203         if (!s) {
2204             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2205             goto phooey;        /* not present */
2206         }
2207     }
2208
2209
2210
2211     /* Simplest case:  anchored match need be tried only once. */
2212     /*  [unless only anchor is BOL and multiline is set] */
2213     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2214         if (s == startpos && regtry(&reginfo, &startpos))
2215             goto got_it;
2216         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2217                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2218         {
2219             char *end;
2220
2221             if (minlen)
2222                 dontbother = minlen - 1;
2223             end = HOP3c(strend, -dontbother, strbeg) - 1;
2224             /* for multiline we only have to try after newlines */
2225             if (prog->check_substr || prog->check_utf8) {
2226                 /* because of the goto we can not easily reuse the macros for bifurcating the
2227                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2228                 if (utf8_target) {
2229                     if (s == startpos)
2230                         goto after_try_utf8;
2231                     while (1) {
2232                         if (regtry(&reginfo, &s)) {
2233                             goto got_it;
2234                         }
2235                       after_try_utf8:
2236                         if (s > end) {
2237                             goto phooey;
2238                         }
2239                         if (prog->extflags & RXf_USE_INTUIT) {
2240                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2241                             if (!s) {
2242                                 goto phooey;
2243                             }
2244                         }
2245                         else {
2246                             s += UTF8SKIP(s);
2247                         }
2248                     }
2249                 } /* end search for check string in unicode */
2250                 else {
2251                     if (s == startpos) {
2252                         goto after_try_latin;
2253                     }
2254                     while (1) {
2255                         if (regtry(&reginfo, &s)) {
2256                             goto got_it;
2257                         }
2258                       after_try_latin:
2259                         if (s > end) {
2260                             goto phooey;
2261                         }
2262                         if (prog->extflags & RXf_USE_INTUIT) {
2263                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2264                             if (!s) {
2265                                 goto phooey;
2266                             }
2267                         }
2268                         else {
2269                             s++;
2270                         }
2271                     }
2272                 } /* end search for check string in latin*/
2273             } /* end search for check string */
2274             else { /* search for newline */
2275                 if (s > startpos) {
2276                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2277                     s--;
2278                 }
2279                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2280                 while (s < end) {
2281                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2282                         if (regtry(&reginfo, &s))
2283                             goto got_it;
2284                     }
2285                 }
2286             } /* end search for newline */
2287         } /* end anchored/multiline check string search */
2288         goto phooey;
2289     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2290     {
2291         /* the warning about reginfo.ganch being used without initialization
2292            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2293            and we only enter this block when the same bit is set. */
2294         char *tmp_s = reginfo.ganch - prog->gofs;
2295
2296         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2297             goto got_it;
2298         goto phooey;
2299     }
2300
2301     /* Messy cases:  unanchored match. */
2302     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2303         /* we have /x+whatever/ */
2304         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2305         char ch;
2306 #ifdef DEBUGGING
2307         int did_match = 0;
2308 #endif
2309         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2310             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2311         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2312
2313         if (utf8_target) {
2314             REXEC_FBC_SCAN(
2315                 if (*s == ch) {
2316                     DEBUG_EXECUTE_r( did_match = 1 );
2317                     if (regtry(&reginfo, &s)) goto got_it;
2318                     s += UTF8SKIP(s);
2319                     while (s < strend && *s == ch)
2320                         s += UTF8SKIP(s);
2321                 }
2322             );
2323         }
2324         else {
2325             REXEC_FBC_SCAN(
2326                 if (*s == ch) {
2327                     DEBUG_EXECUTE_r( did_match = 1 );
2328                     if (regtry(&reginfo, &s)) goto got_it;
2329                     s++;
2330                     while (s < strend && *s == ch)
2331                         s++;
2332                 }
2333             );
2334         }
2335         DEBUG_EXECUTE_r(if (!did_match)
2336                 PerlIO_printf(Perl_debug_log,
2337                                   "Did not find anchored character...\n")
2338                );
2339     }
2340     else if (prog->anchored_substr != NULL
2341               || prog->anchored_utf8 != NULL
2342               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2343                   && prog->float_max_offset < strend - s)) {
2344         SV *must;
2345         I32 back_max;
2346         I32 back_min;
2347         char *last;
2348         char *last1;            /* Last position checked before */
2349 #ifdef DEBUGGING
2350         int did_match = 0;
2351 #endif
2352         if (prog->anchored_substr || prog->anchored_utf8) {
2353             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2354                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2355             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2356             back_max = back_min = prog->anchored_offset;
2357         } else {
2358             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2359                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2360             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2361             back_max = prog->float_max_offset;
2362             back_min = prog->float_min_offset;
2363         }
2364
2365
2366         if (must == &PL_sv_undef)
2367             /* could not downgrade utf8 check substring, so must fail */
2368             goto phooey;
2369
2370         if (back_min<0) {
2371             last = strend;
2372         } else {
2373             last = HOP3c(strend,        /* Cannot start after this */
2374                   -(I32)(CHR_SVLEN(must)
2375                          - (SvTAIL(must) != 0) + back_min), strbeg);
2376         }
2377         if (s > PL_bostr)
2378             last1 = HOPc(s, -1);
2379         else
2380             last1 = s - 1;      /* bogus */
2381
2382         /* XXXX check_substr already used to find "s", can optimize if
2383            check_substr==must. */
2384         scream_pos = -1;
2385         dontbother = end_shift;
2386         strend = HOPc(strend, -dontbother);
2387         while ( (s <= last) &&
2388                 ((flags & REXEC_SCREAM)
2389                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2390                                     end_shift, &scream_pos, 0))
2391                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2392                                   (unsigned char*)strend, must,
2393                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2394             /* we may be pointing at the wrong string */
2395             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2396                 s = strbeg + (s - SvPVX_const(sv));
2397             DEBUG_EXECUTE_r( did_match = 1 );
2398             if (HOPc(s, -back_max) > last1) {
2399                 last1 = HOPc(s, -back_min);
2400                 s = HOPc(s, -back_max);
2401             }
2402             else {
2403                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2404
2405                 last1 = HOPc(s, -back_min);
2406                 s = t;
2407             }
2408             if (utf8_target) {
2409                 while (s <= last1) {
2410                     if (regtry(&reginfo, &s))
2411                         goto got_it;
2412                     s += UTF8SKIP(s);
2413                 }
2414             }
2415             else {
2416                 while (s <= last1) {
2417                     if (regtry(&reginfo, &s))
2418                         goto got_it;
2419                     s++;
2420                 }
2421             }
2422         }
2423         DEBUG_EXECUTE_r(if (!did_match) {
2424             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2425                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2426             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2427                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2428                                ? "anchored" : "floating"),
2429                 quoted, RE_SV_TAIL(must));
2430         });
2431         goto phooey;
2432     }
2433     else if ( (c = progi->regstclass) ) {
2434         if (minlen) {
2435             const OPCODE op = OP(progi->regstclass);
2436             /* don't bother with what can't match */
2437             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2438                 strend = HOPc(strend, -(minlen - 1));
2439         }
2440         DEBUG_EXECUTE_r({
2441             SV * const prop = sv_newmortal();
2442             regprop(prog, prop, c);
2443             {
2444                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2445                     s,strend-s,60);
2446                 PerlIO_printf(Perl_debug_log,
2447                     "Matching stclass %.*s against %s (%d bytes)\n",
2448                     (int)SvCUR(prop), SvPVX_const(prop),
2449                      quoted, (int)(strend - s));
2450             }
2451         });
2452         if (find_byclass(prog, c, s, strend, &reginfo))
2453             goto got_it;
2454         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2455     }
2456     else {
2457         dontbother = 0;
2458         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2459             /* Trim the end. */
2460             char *last;
2461             SV* float_real;
2462
2463             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2464                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2465             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2466
2467             if (flags & REXEC_SCREAM) {
2468                 last = screaminstr(sv, float_real, s - strbeg,
2469                                    end_shift, &scream_pos, 1); /* last one */
2470                 if (!last)
2471                     last = scream_olds; /* Only one occurrence. */
2472                 /* we may be pointing at the wrong string */
2473                 else if (RXp_MATCH_COPIED(prog))
2474                     s = strbeg + (s - SvPVX_const(sv));
2475             }
2476             else {
2477                 STRLEN len;
2478                 const char * const little = SvPV_const(float_real, len);
2479
2480                 if (SvTAIL(float_real)) {
2481                     if (memEQ(strend - len + 1, little, len - 1))
2482                         last = strend - len + 1;
2483                     else if (!multiline)
2484                         last = memEQ(strend - len, little, len)
2485                             ? strend - len : NULL;
2486                     else
2487                         goto find_last;
2488                 } else {
2489                   find_last:
2490                     if (len)
2491                         last = rninstr(s, strend, little, little + len);
2492                     else
2493                         last = strend;  /* matching "$" */
2494                 }
2495             }
2496             if (last == NULL) {
2497                 DEBUG_EXECUTE_r(
2498                     PerlIO_printf(Perl_debug_log,
2499                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2500                         PL_colors[4], PL_colors[5]));
2501                 goto phooey; /* Should not happen! */
2502             }
2503             dontbother = strend - last + prog->float_min_offset;
2504         }
2505         if (minlen && (dontbother < minlen))
2506             dontbother = minlen - 1;
2507         strend -= dontbother;              /* this one's always in bytes! */
2508         /* We don't know much -- general case. */
2509         if (utf8_target) {
2510             for (;;) {
2511                 if (regtry(&reginfo, &s))
2512                     goto got_it;
2513                 if (s >= strend)
2514                     break;
2515                 s += UTF8SKIP(s);
2516             };
2517         }
2518         else {
2519             do {
2520                 if (regtry(&reginfo, &s))
2521                     goto got_it;
2522             } while (s++ < strend);
2523         }
2524     }
2525
2526     /* Failure. */
2527     goto phooey;
2528
2529 got_it:
2530     Safefree(swap);
2531     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2532
2533     if (PL_reg_eval_set)
2534         restore_pos(aTHX_ prog);
2535     if (RXp_PAREN_NAMES(prog))
2536         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2537
2538     /* make sure $`, $&, $', and $digit will work later */
2539     if ( !(flags & REXEC_NOT_FIRST) ) {
2540         RX_MATCH_COPY_FREE(rx);
2541         if (flags & REXEC_COPY_STR) {
2542             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2543 #ifdef PERL_OLD_COPY_ON_WRITE
2544             if ((SvIsCOW(sv)
2545                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2546                 if (DEBUG_C_TEST) {
2547                     PerlIO_printf(Perl_debug_log,
2548                                   "Copy on write: regexp capture, type %d\n",
2549                                   (int) SvTYPE(sv));
2550                 }
2551                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2552                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2553                 assert (SvPOKp(prog->saved_copy));
2554             } else
2555 #endif
2556             {
2557                 RX_MATCH_COPIED_on(rx);
2558                 s = savepvn(strbeg, i);
2559                 prog->subbeg = s;
2560             }
2561             prog->sublen = i;
2562         }
2563         else {
2564             prog->subbeg = strbeg;
2565             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2566         }
2567     }
2568
2569     return 1;
2570
2571 phooey:
2572     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2573                           PL_colors[4], PL_colors[5]));
2574     if (PL_reg_eval_set)
2575         restore_pos(aTHX_ prog);
2576     if (swap) {
2577         /* we failed :-( roll it back */
2578         Safefree(prog->offs);
2579         prog->offs = swap;
2580     }
2581
2582     return 0;
2583 }
2584
2585
2586 /*
2587  - regtry - try match at specific point
2588  */
2589 STATIC I32                      /* 0 failure, 1 success */
2590 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2591 {
2592     dVAR;
2593     CHECKPOINT lastcp;
2594     REGEXP *const rx = reginfo->prog;
2595     regexp *const prog = (struct regexp *)SvANY(rx);
2596     RXi_GET_DECL(prog,progi);
2597     GET_RE_DEBUG_FLAGS_DECL;
2598
2599     PERL_ARGS_ASSERT_REGTRY;
2600
2601     reginfo->cutpoint=NULL;
2602
2603     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2604         MAGIC *mg;
2605
2606         PL_reg_eval_set = RS_init;
2607         DEBUG_EXECUTE_r(DEBUG_s(
2608             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2609                           (IV)(PL_stack_sp - PL_stack_base));
2610             ));
2611         SAVESTACK_CXPOS();
2612         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2613         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2614         SAVETMPS;
2615         /* Apparently this is not needed, judging by wantarray. */
2616         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2617            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2618
2619         if (reginfo->sv) {
2620             /* Make $_ available to executed code. */
2621             if (reginfo->sv != DEFSV) {
2622                 SAVE_DEFSV;
2623                 DEFSV_set(reginfo->sv);
2624             }
2625
2626             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2627                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2628                 /* prepare for quick setting of pos */
2629 #ifdef PERL_OLD_COPY_ON_WRITE
2630                 if (SvIsCOW(reginfo->sv))
2631                     sv_force_normal_flags(reginfo->sv, 0);
2632 #endif
2633                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2634                                  &PL_vtbl_mglob, NULL, 0);
2635                 mg->mg_len = -1;
2636             }
2637             PL_reg_magic    = mg;
2638             PL_reg_oldpos   = mg->mg_len;
2639             SAVEDESTRUCTOR_X(restore_pos, prog);
2640         }
2641         if (!PL_reg_curpm) {
2642             Newxz(PL_reg_curpm, 1, PMOP);
2643 #ifdef USE_ITHREADS
2644             {
2645                 SV* const repointer = &PL_sv_undef;
2646                 /* this regexp is also owned by the new PL_reg_curpm, which
2647                    will try to free it.  */
2648                 av_push(PL_regex_padav, repointer);
2649                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2650                 PL_regex_pad = AvARRAY(PL_regex_padav);
2651             }
2652 #endif
2653         }
2654 #ifdef USE_ITHREADS
2655         /* It seems that non-ithreads works both with and without this code.
2656            So for efficiency reasons it seems best not to have the code
2657            compiled when it is not needed.  */
2658         /* This is safe against NULLs: */
2659         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2660         /* PM_reg_curpm owns a reference to this regexp.  */
2661         ReREFCNT_inc(rx);
2662 #endif
2663         PM_SETRE(PL_reg_curpm, rx);
2664         PL_reg_oldcurpm = PL_curpm;
2665         PL_curpm = PL_reg_curpm;
2666         if (RXp_MATCH_COPIED(prog)) {
2667             /*  Here is a serious problem: we cannot rewrite subbeg,
2668                 since it may be needed if this match fails.  Thus
2669                 $` inside (?{}) could fail... */
2670             PL_reg_oldsaved = prog->subbeg;
2671             PL_reg_oldsavedlen = prog->sublen;
2672 #ifdef PERL_OLD_COPY_ON_WRITE
2673             PL_nrs = prog->saved_copy;
2674 #endif
2675             RXp_MATCH_COPIED_off(prog);
2676         }
2677         else
2678             PL_reg_oldsaved = NULL;
2679         prog->subbeg = PL_bostr;
2680         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2681     }
2682     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2683     prog->offs[0].start = *startpos - PL_bostr;
2684     PL_reginput = *startpos;
2685     PL_reglastparen = &prog->lastparen;
2686     PL_reglastcloseparen = &prog->lastcloseparen;
2687     prog->lastparen = 0;
2688     prog->lastcloseparen = 0;
2689     PL_regsize = 0;
2690     PL_regoffs = prog->offs;
2691     if (PL_reg_start_tmpl <= prog->nparens) {
2692         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2693         if(PL_reg_start_tmp)
2694             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2695         else
2696             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2697     }
2698
2699     /* XXXX What this code is doing here?!!!  There should be no need
2700        to do this again and again, PL_reglastparen should take care of
2701        this!  --ilya*/
2702
2703     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2704      * Actually, the code in regcppop() (which Ilya may be meaning by
2705      * PL_reglastparen), is not needed at all by the test suite
2706      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2707      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2708      * Meanwhile, this code *is* needed for the
2709      * above-mentioned test suite tests to succeed.  The common theme
2710      * on those tests seems to be returning null fields from matches.
2711      * --jhi updated by dapm */
2712 #if 1
2713     if (prog->nparens) {
2714         regexp_paren_pair *pp = PL_regoffs;
2715         register I32 i;
2716         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2717             ++pp;
2718             pp->start = -1;
2719             pp->end = -1;
2720         }
2721     }
2722 #endif
2723     REGCP_SET(lastcp);
2724     if (regmatch(reginfo, progi->program + 1)) {
2725         PL_regoffs[0].end = PL_reginput - PL_bostr;
2726         return 1;
2727     }
2728     if (reginfo->cutpoint)
2729         *startpos= reginfo->cutpoint;
2730     REGCP_UNWIND(lastcp);
2731     return 0;
2732 }
2733
2734
2735 #define sayYES goto yes
2736 #define sayNO goto no
2737 #define sayNO_SILENT goto no_silent
2738
2739 /* we dont use STMT_START/END here because it leads to
2740    "unreachable code" warnings, which are bogus, but distracting. */
2741 #define CACHEsayNO \
2742     if (ST.cache_mask) \
2743        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2744     sayNO
2745
2746 /* this is used to determine how far from the left messages like
2747    'failed...' are printed. It should be set such that messages
2748    are inline with the regop output that created them.
2749 */
2750 #define REPORT_CODE_OFF 32
2751
2752
2753 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2754 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2755
2756 #define SLAB_FIRST(s) (&(s)->states[0])
2757 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2758
2759 /* grab a new slab and return the first slot in it */
2760
2761 STATIC regmatch_state *
2762 S_push_slab(pTHX)
2763 {
2764 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2765     dMY_CXT;
2766 #endif
2767     regmatch_slab *s = PL_regmatch_slab->next;
2768     if (!s) {
2769         Newx(s, 1, regmatch_slab);
2770         s->prev = PL_regmatch_slab;
2771         s->next = NULL;
2772         PL_regmatch_slab->next = s;
2773     }
2774     PL_regmatch_slab = s;
2775     return SLAB_FIRST(s);
2776 }
2777
2778
2779 /* push a new state then goto it */
2780
2781 #define PUSH_STATE_GOTO(state, node) \
2782     scan = node; \
2783     st->resume_state = state; \
2784     goto push_state;
2785
2786 /* push a new state with success backtracking, then goto it */
2787
2788 #define PUSH_YES_STATE_GOTO(state, node) \
2789     scan = node; \
2790     st->resume_state = state; \
2791     goto push_yes_state;
2792
2793
2794
2795 /*
2796
2797 regmatch() - main matching routine
2798
2799 This is basically one big switch statement in a loop. We execute an op,
2800 set 'next' to point the next op, and continue. If we come to a point which
2801 we may need to backtrack to on failure such as (A|B|C), we push a
2802 backtrack state onto the backtrack stack. On failure, we pop the top
2803 state, and re-enter the loop at the state indicated. If there are no more
2804 states to pop, we return failure.
2805
2806 Sometimes we also need to backtrack on success; for example /A+/, where
2807 after successfully matching one A, we need to go back and try to
2808 match another one; similarly for lookahead assertions: if the assertion
2809 completes successfully, we backtrack to the state just before the assertion
2810 and then carry on.  In these cases, the pushed state is marked as
2811 'backtrack on success too'. This marking is in fact done by a chain of
2812 pointers, each pointing to the previous 'yes' state. On success, we pop to
2813 the nearest yes state, discarding any intermediate failure-only states.
2814 Sometimes a yes state is pushed just to force some cleanup code to be
2815 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2816 it to free the inner regex.
2817
2818 Note that failure backtracking rewinds the cursor position, while
2819 success backtracking leaves it alone.
2820
2821 A pattern is complete when the END op is executed, while a subpattern
2822 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2823 ops trigger the "pop to last yes state if any, otherwise return true"
2824 behaviour.
2825
2826 A common convention in this function is to use A and B to refer to the two
2827 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2828 the subpattern to be matched possibly multiple times, while B is the entire
2829 rest of the pattern. Variable and state names reflect this convention.
2830
2831 The states in the main switch are the union of ops and failure/success of
2832 substates associated with with that op.  For example, IFMATCH is the op
2833 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2834 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2835 successfully matched A and IFMATCH_A_fail is a state saying that we have
2836 just failed to match A. Resume states always come in pairs. The backtrack
2837 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2838 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2839 on success or failure.
2840
2841 The struct that holds a backtracking state is actually a big union, with
2842 one variant for each major type of op. The variable st points to the
2843 top-most backtrack struct. To make the code clearer, within each
2844 block of code we #define ST to alias the relevant union.
2845
2846 Here's a concrete example of a (vastly oversimplified) IFMATCH
2847 implementation:
2848
2849     switch (state) {
2850     ....
2851
2852 #define ST st->u.ifmatch
2853
2854     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2855         ST.foo = ...; // some state we wish to save
2856         ...
2857         // push a yes backtrack state with a resume value of
2858         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2859         // first node of A:
2860         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2861         // NOTREACHED
2862
2863     case IFMATCH_A: // we have successfully executed A; now continue with B
2864         next = B;
2865         bar = ST.foo; // do something with the preserved value
2866         break;
2867
2868     case IFMATCH_A_fail: // A failed, so the assertion failed
2869         ...;   // do some housekeeping, then ...
2870         sayNO; // propagate the failure
2871
2872 #undef ST
2873
2874     ...
2875     }
2876
2877 For any old-timers reading this who are familiar with the old recursive
2878 approach, the code above is equivalent to:
2879
2880     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2881     {
2882         int foo = ...
2883         ...
2884         if (regmatch(A)) {
2885             next = B;
2886             bar = foo;
2887             break;
2888         }
2889         ...;   // do some housekeeping, then ...
2890         sayNO; // propagate the failure
2891     }
2892
2893 The topmost backtrack state, pointed to by st, is usually free. If you
2894 want to claim it, populate any ST.foo fields in it with values you wish to
2895 save, then do one of
2896
2897         PUSH_STATE_GOTO(resume_state, node);
2898         PUSH_YES_STATE_GOTO(resume_state, node);
2899
2900 which sets that backtrack state's resume value to 'resume_state', pushes a
2901 new free entry to the top of the backtrack stack, then goes to 'node'.
2902 On backtracking, the free slot is popped, and the saved state becomes the
2903 new free state. An ST.foo field in this new top state can be temporarily
2904 accessed to retrieve values, but once the main loop is re-entered, it
2905 becomes available for reuse.
2906
2907 Note that the depth of the backtrack stack constantly increases during the
2908 left-to-right execution of the pattern, rather than going up and down with
2909 the pattern nesting. For example the stack is at its maximum at Z at the
2910 end of the pattern, rather than at X in the following:
2911
2912     /(((X)+)+)+....(Y)+....Z/
2913
2914 The only exceptions to this are lookahead/behind assertions and the cut,
2915 (?>A), which pop all the backtrack states associated with A before
2916 continuing.
2917
2918 Backtrack state structs are allocated in slabs of about 4K in size.
2919 PL_regmatch_state and st always point to the currently active state,
2920 and PL_regmatch_slab points to the slab currently containing
2921 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2922 allocated, and is never freed until interpreter destruction. When the slab
2923 is full, a new one is allocated and chained to the end. At exit from
2924 regmatch(), slabs allocated since entry are freed.
2925
2926 */
2927
2928
2929 #define DEBUG_STATE_pp(pp)                                  \
2930     DEBUG_STATE_r({                                         \
2931         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2932         PerlIO_printf(Perl_debug_log,                       \
2933             "    %*s"pp" %s%s%s%s%s\n",                     \
2934             depth*2, "",                                    \
2935             PL_reg_name[st->resume_state],                     \
2936             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2937             ((st==yes_state) ? "Y" : ""),                   \
2938             ((st==mark_state) ? "M" : ""),                  \
2939             ((st==yes_state||st==mark_state) ? "]" : "")    \
2940         );                                                  \
2941     });
2942
2943
2944 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2945
2946 #ifdef DEBUGGING
2947
2948 STATIC void
2949 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2950     const char *start, const char *end, const char *blurb)
2951 {
2952     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2953
2954     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2955
2956     if (!PL_colorset)
2957             reginitcolors();
2958     {
2959         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2960             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2961
2962         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2963             start, end - start, 60);
2964
2965         PerlIO_printf(Perl_debug_log,
2966             "%s%s REx%s %s against %s\n",
2967                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2968
2969         if (utf8_target||utf8_pat)
2970             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2971                 utf8_pat ? "pattern" : "",
2972                 utf8_pat && utf8_target ? " and " : "",
2973                 utf8_target ? "string" : ""
2974             );
2975     }
2976 }
2977
2978 STATIC void
2979 S_dump_exec_pos(pTHX_ const char *locinput,
2980                       const regnode *scan,
2981                       const char *loc_regeol,
2982                       const char *loc_bostr,
2983                       const char *loc_reg_starttry,
2984                       const bool utf8_target)
2985 {
2986     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2987     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2988     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2989     /* The part of the string before starttry has one color
2990        (pref0_len chars), between starttry and current
2991        position another one (pref_len - pref0_len chars),
2992        after the current position the third one.
2993        We assume that pref0_len <= pref_len, otherwise we
2994        decrease pref0_len.  */
2995     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2996         ? (5 + taill) - l : locinput - loc_bostr;
2997     int pref0_len;
2998
2999     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3000
3001     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3002         pref_len++;
3003     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3004     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3005         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3006               ? (5 + taill) - pref_len : loc_regeol - locinput);
3007     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3008         l--;
3009     if (pref0_len < 0)
3010         pref0_len = 0;
3011     if (pref0_len > pref_len)
3012         pref0_len = pref_len;
3013     {
3014         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3015
3016         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3017             (locinput - pref_len),pref0_len, 60, 4, 5);
3018
3019         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3020                     (locinput - pref_len + pref0_len),
3021                     pref_len - pref0_len, 60, 2, 3);
3022
3023         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3024                     locinput, loc_regeol - locinput, 10, 0, 1);
3025
3026         const STRLEN tlen=len0+len1+len2;
3027         PerlIO_printf(Perl_debug_log,
3028                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3029                     (IV)(locinput - loc_bostr),
3030                     len0, s0,
3031                     len1, s1,
3032                     (docolor ? "" : "> <"),
3033                     len2, s2,
3034                     (int)(tlen > 19 ? 0 :  19 - tlen),
3035                     "");
3036     }
3037 }
3038
3039 #endif
3040
3041 /* reg_check_named_buff_matched()
3042  * Checks to see if a named buffer has matched. The data array of
3043  * buffer numbers corresponding to the buffer is expected to reside
3044  * in the regexp->data->data array in the slot stored in the ARG() of
3045  * node involved. Note that this routine doesn't actually care about the
3046  * name, that information is not preserved from compilation to execution.
3047  * Returns the index of the leftmost defined buffer with the given name
3048  * or 0 if non of the buffers matched.
3049  */
3050 STATIC I32
3051 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3052 {
3053     I32 n;
3054     RXi_GET_DECL(rex,rexi);
3055     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3056     I32 *nums=(I32*)SvPVX(sv_dat);
3057
3058     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3059
3060     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3061         if ((I32)*PL_reglastparen >= nums[n] &&
3062             PL_regoffs[nums[n]].end != -1)
3063         {
3064             return nums[n];
3065         }
3066     }
3067     return 0;
3068 }
3069
3070
3071 /* free all slabs above current one  - called during LEAVE_SCOPE */
3072
3073 STATIC void
3074 S_clear_backtrack_stack(pTHX_ void *p)
3075 {
3076     regmatch_slab *s = PL_regmatch_slab->next;
3077     PERL_UNUSED_ARG(p);
3078
3079     if (!s)
3080         return;
3081     PL_regmatch_slab->next = NULL;
3082     while (s) {
3083         regmatch_slab * const osl = s;
3084         s = s->next;
3085         Safefree(osl);
3086     }
3087 }
3088
3089
3090 #define SETREX(Re1,Re2) \
3091     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3092     Re1 = (Re2)
3093
3094 STATIC I32                      /* 0 failure, 1 success */
3095 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3096 {
3097 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3098     dMY_CXT;
3099 #endif
3100     dVAR;
3101     register const bool utf8_target = PL_reg_match_utf8;
3102     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3103     REGEXP *rex_sv = reginfo->prog;
3104     regexp *rex = (struct regexp *)SvANY(rex_sv);
3105     RXi_GET_DECL(rex,rexi);
3106     I32 oldsave;
3107     /* the current state. This is a cached copy of PL_regmatch_state */
3108     register regmatch_state *st;
3109     /* cache heavy used fields of st in registers */
3110     register regnode *scan;
3111     register regnode *next;
3112     register U32 n = 0; /* general value; init to avoid compiler warning */
3113     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3114     register char *locinput = PL_reginput;
3115     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3116
3117     bool result = 0;        /* return value of S_regmatch */
3118     int depth = 0;          /* depth of backtrack stack */
3119     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3120     const U32 max_nochange_depth =
3121         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3122         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3123     regmatch_state *yes_state = NULL; /* state to pop to on success of
3124                                                             subpattern */
3125     /* mark_state piggy backs on the yes_state logic so that when we unwind
3126        the stack on success we can update the mark_state as we go */
3127     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3128     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3129     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3130     U32 state_num;
3131     bool no_final = 0;      /* prevent failure from backtracking? */
3132     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3133     char *startpoint = PL_reginput;
3134     SV *popmark = NULL;     /* are we looking for a mark? */
3135     SV *sv_commit = NULL;   /* last mark name seen in failure */
3136     SV *sv_yes_mark = NULL; /* last mark name we have seen
3137                                during a successful match */
3138     U32 lastopen = 0;       /* last open we saw */
3139     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3140     SV* const oreplsv = GvSV(PL_replgv);
3141     /* these three flags are set by various ops to signal information to
3142      * the very next op. They have a useful lifetime of exactly one loop
3143      * iteration, and are not preserved or restored by state pushes/pops
3144      */
3145     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3146     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3147     int logical = 0;        /* the following EVAL is:
3148                                 0: (?{...})
3149                                 1: (?(?{...})X|Y)
3150                                 2: (??{...})
3151                                or the following IFMATCH/UNLESSM is:
3152                                 false: plain (?=foo)
3153                                 true:  used as a condition: (?(?=foo))
3154                             */
3155 #ifdef DEBUGGING
3156     GET_RE_DEBUG_FLAGS_DECL;
3157 #endif
3158
3159     PERL_ARGS_ASSERT_REGMATCH;
3160
3161     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3162             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3163     }));
3164     /* on first ever call to regmatch, allocate first slab */
3165     if (!PL_regmatch_slab) {
3166         Newx(PL_regmatch_slab, 1, regmatch_slab);
3167         PL_regmatch_slab->prev = NULL;
3168         PL_regmatch_slab->next = NULL;
3169         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3170     }
3171
3172     oldsave = PL_savestack_ix;
3173     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3174     SAVEVPTR(PL_regmatch_slab);
3175     SAVEVPTR(PL_regmatch_state);
3176
3177     /* grab next free state slot */
3178     st = ++PL_regmatch_state;
3179     if (st >  SLAB_LAST(PL_regmatch_slab))
3180         st = PL_regmatch_state = S_push_slab(aTHX);
3181
3182     /* Note that nextchr is a byte even in UTF */
3183     nextchr = UCHARAT(locinput);
3184     scan = prog;
3185     while (scan != NULL) {
3186
3187         DEBUG_EXECUTE_r( {
3188             SV * const prop = sv_newmortal();
3189             regnode *rnext=regnext(scan);
3190             DUMP_EXEC_POS( locinput, scan, utf8_target );
3191             regprop(rex, prop, scan);
3192
3193             PerlIO_printf(Perl_debug_log,
3194                     "%3"IVdf":%*s%s(%"IVdf")\n",
3195                     (IV)(scan - rexi->program), depth*2, "",
3196                     SvPVX_const(prop),
3197                     (PL_regkind[OP(scan)] == END || !rnext) ?
3198                         0 : (IV)(rnext - rexi->program));
3199         });
3200
3201         next = scan + NEXT_OFF(scan);
3202         if (next == scan)
3203             next = NULL;
3204         state_num = OP(scan);
3205
3206         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3207       reenter_switch:
3208
3209         assert(PL_reglastparen == &rex->lastparen);
3210         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3211         assert(PL_regoffs == rex->offs);
3212
3213         switch (state_num) {
3214         case BOL:
3215             if (locinput == PL_bostr)
3216             {
3217                 /* reginfo->till = reginfo->bol; */
3218                 break;
3219             }
3220             sayNO;
3221         case MBOL:
3222             if (locinput == PL_bostr ||
3223                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3224             {
3225                 break;
3226             }
3227             sayNO;
3228         case SBOL:
3229             if (locinput == PL_bostr)
3230                 break;
3231             sayNO;
3232         case GPOS:
3233             if (locinput == reginfo->ganch)
3234                 break;
3235             sayNO;
3236
3237         case KEEPS:
3238             /* update the startpoint */
3239             st->u.keeper.val = PL_regoffs[0].start;
3240             PL_reginput = locinput;
3241             PL_regoffs[0].start = locinput - PL_bostr;
3242             PUSH_STATE_GOTO(KEEPS_next, next);
3243             /*NOT-REACHED*/
3244         case KEEPS_next_fail:
3245             /* rollback the start point change */
3246             PL_regoffs[0].start = st->u.keeper.val;
3247             sayNO_SILENT;
3248             /*NOT-REACHED*/
3249         case EOL:
3250                 goto seol;
3251         case MEOL:
3252             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3253                 sayNO;
3254             break;
3255         case SEOL:
3256           seol:
3257             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3258                 sayNO;
3259             if (PL_regeol - locinput > 1)
3260                 sayNO;
3261             break;
3262         case EOS:
3263             if (PL_regeol != locinput)
3264                 sayNO;
3265             break;
3266         case SANY:
3267             if (!nextchr && locinput >= PL_regeol)
3268                 sayNO;
3269             if (utf8_target) {
3270                 locinput += PL_utf8skip[nextchr];
3271                 if (locinput > PL_regeol)
3272                     sayNO;
3273                 nextchr = UCHARAT(locinput);
3274             }
3275             else
3276                 nextchr = UCHARAT(++locinput);
3277             break;
3278         case CANY:
3279             if (!nextchr && locinput >= PL_regeol)
3280                 sayNO;
3281             nextchr = UCHARAT(++locinput);
3282             break;
3283         case REG_ANY:
3284             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3285                 sayNO;
3286             if (utf8_target) {
3287                 locinput += PL_utf8skip[nextchr];
3288                 if (locinput > PL_regeol)
3289                     sayNO;
3290                 nextchr = UCHARAT(locinput);
3291             }
3292             else
3293                 nextchr = UCHARAT(++locinput);
3294             break;
3295
3296 #undef  ST
3297 #define ST st->u.trie
3298         case TRIEC:
3299             /* In this case the charclass data is available inline so
3300                we can fail fast without a lot of extra overhead.
3301              */
3302             if (scan->flags == EXACT || !utf8_target) {
3303                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3304                     DEBUG_EXECUTE_r(
3305                         PerlIO_printf(Perl_debug_log,
3306                                   "%*s  %sfailed to match trie start class...%s\n",
3307                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3308                     );
3309                     sayNO_SILENT;
3310                     /* NOTREACHED */
3311                 }
3312             }
3313             /* FALL THROUGH */
3314         case TRIE:
3315             /* the basic plan of execution of the trie is:
3316              * At the beginning, run though all the states, and
3317              * find the longest-matching word. Also remember the position
3318              * of the shortest matching word. For example, this pattern:
3319              *    1  2 3 4    5
3320              *    ab|a|x|abcd|abc
3321              * when matched against the string "abcde", will generate
3322              * accept states for all words except 3, with the longest
3323              * matching word being 4, and the shortest being 1 (with
3324              * the position being after char 1 of the string).
3325              *
3326              * Then for each matching word, in word order (i.e. 1,2,4,5),
3327              * we run the remainder of the pattern; on each try setting
3328              * the current position to the character following the word,
3329              * returning to try the next word on failure.
3330              *
3331              * We avoid having to build a list of words at runtime by
3332              * using a compile-time structure, wordinfo[].prev, which
3333              * gives, for each word, the previous accepting word (if any).
3334              * In the case above it would contain the mappings 1->2, 2->0,
3335              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3336              * the longest word (4 above), a list of all words, by
3337              * following the list of prev pointers; this gives us the
3338              * unordered list 4,5,1,2. Then given the current word we have
3339              * just tried, we can go through the list and find the
3340              * next-biggest word to try (so if we just failed on word 2,
3341              * the next in the list is 4).
3342              *
3343              * Since at runtime we don't record the matching position in
3344              * the string for each word, we have to work that out for
3345              * each word we're about to process. The wordinfo table holds
3346              * the character length of each word; given that we recorded
3347              * at the start: the position of the shortest word and its
3348              * length in chars, we just need to move the pointer the
3349              * difference between the two char lengths. Depending on
3350              * Unicode status and folding, that's cheap or expensive.
3351              *
3352              * This algorithm is optimised for the case where are only a
3353              * small number of accept states, i.e. 0,1, or maybe 2.
3354              * With lots of accepts states, and having to try all of them,
3355              * it becomes quadratic on number of accept states to find all
3356              * the next words.
3357              */
3358
3359             {
3360                 /* what type of TRIE am I? (utf8 makes this contextual) */
3361                 DECL_TRIE_TYPE(scan);
3362
3363                 /* what trie are we using right now */
3364                 reg_trie_data * const trie
3365                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3366                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3367                 U32 state = trie->startstate;
3368
3369                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3370                     !TRIE_BITMAP_TEST(trie,*locinput)
3371                 ) {
3372                     if (trie->states[ state ].wordnum) {
3373                          DEBUG_EXECUTE_r(
3374                             PerlIO_printf(Perl_debug_log,
3375                                           "%*s  %smatched empty string...%s\n",
3376                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3377                         );
3378                         if (!trie->jump)
3379                             break;
3380                     } else {
3381                         DEBUG_EXECUTE_r(
3382                             PerlIO_printf(Perl_debug_log,
3383                                           "%*s  %sfailed to match trie start class...%s\n",
3384                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3385                         );
3386                         sayNO_SILENT;
3387                    }
3388                 }
3389
3390             {
3391                 U8 *uc = ( U8* )locinput;
3392
3393                 STRLEN len = 0;
3394                 STRLEN foldlen = 0;
3395                 U8 *uscan = (U8*)NULL;
3396                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3397                 U32 charcount = 0; /* how many input chars we have matched */
3398                 U32 accepted = 0; /* have we seen any accepting states? */
3399
3400                 ST.B = next;
3401                 ST.jump = trie->jump;
3402                 ST.me = scan;
3403                 ST.firstpos = NULL;
3404                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3405                 ST.nextword = 0;
3406
3407                 /* fully traverse the TRIE; note the position of the
3408                    shortest accept state and the wordnum of the longest
3409                    accept state */
3410
3411                 while ( state && uc <= (U8*)PL_regeol ) {
3412                     U32 base = trie->states[ state ].trans.base;
3413                     UV uvc = 0;
3414                     U16 charid = 0;
3415                     U16 wordnum;
3416                     wordnum = trie->states[ state ].wordnum;
3417
3418                     if (wordnum) { /* it's an accept state */
3419                         if (!accepted) {
3420                             accepted = 1;
3421                             /* record first match position */
3422                             if (ST.longfold) {
3423                                 ST.firstpos = (U8*)locinput;
3424                                 ST.firstchars = 0;
3425                             }
3426                             else {
3427                                 ST.firstpos = uc;
3428                                 ST.firstchars = charcount;
3429                             }
3430                         }
3431                         if (!ST.nextword || wordnum < ST.nextword)
3432                             ST.nextword = wordnum;
3433                         ST.topword = wordnum;
3434                     }
3435
3436                     DEBUG_TRIE_EXECUTE_r({
3437                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3438                                 PerlIO_printf( Perl_debug_log,
3439                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3440                                     2+depth * 2, "", PL_colors[4],
3441                                     (UV)state, (accepted ? 'Y' : 'N'));
3442                     });
3443
3444                     /* read a char and goto next state */
3445                     if ( base ) {
3446                         I32 offset;
3447                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3448                                              uscan, len, uvc, charid, foldlen,
3449                                              foldbuf, uniflags);
3450                         charcount++;
3451                         if (foldlen>0)
3452                             ST.longfold = TRUE;
3453                         if (charid &&
3454                              ( ((offset =
3455                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3456
3457                              && ((U32)offset < trie->lasttrans)
3458                              && trie->trans[offset].check == state)
3459                         {
3460                             state = trie->trans[offset].next;
3461                         }
3462                         else {
3463                             state = 0;
3464                         }
3465                         uc += len;
3466
3467                     }
3468                     else {
3469                         state = 0;
3470                     }
3471                     DEBUG_TRIE_EXECUTE_r(
3472                         PerlIO_printf( Perl_debug_log,
3473                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3474                             charid, uvc, (UV)state, PL_colors[5] );
3475                     );
3476                 }
3477                 if (!accepted)
3478                    sayNO;
3479
3480                 /* calculate total number of accept states */
3481                 {
3482                     U16 w = ST.topword;
3483                     accepted = 0;
3484                     while (w) {
3485                         w = trie->wordinfo[w].prev;
3486                         accepted++;
3487                     }
3488                     ST.accepted = accepted;
3489                 }
3490
3491                 DEBUG_EXECUTE_r(
3492                     PerlIO_printf( Perl_debug_log,
3493                         "%*s  %sgot %"IVdf" possible matches%s\n",
3494                         REPORT_CODE_OFF + depth * 2, "",
3495                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3496                 );
3497                 goto trie_first_try; /* jump into the fail handler */
3498             }}
3499             /* NOTREACHED */
3500
3501         case TRIE_next_fail: /* we failed - try next alternative */
3502             if ( ST.jump) {
3503                 REGCP_UNWIND(ST.cp);
3504                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3505                     PL_regoffs[n].end = -1;
3506                 *PL_reglastparen = n;
3507             }
3508             if (!--ST.accepted) {
3509                 DEBUG_EXECUTE_r({
3510                     PerlIO_printf( Perl_debug_log,
3511                         "%*s  %sTRIE failed...%s\n",
3512                         REPORT_CODE_OFF+depth*2, "",
3513                         PL_colors[4],
3514                         PL_colors[5] );
3515                 });
3516                 sayNO_SILENT;
3517             }
3518             {
3519                 /* Find next-highest word to process.  Note that this code
3520                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3521                 register U16 min = 0;
3522                 register U16 word;
3523                 register U16 const nextword = ST.nextword;
3524                 register reg_trie_wordinfo * const wordinfo
3525                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3526                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3527                     if (word > nextword && (!min || word < min))
3528                         min = word;
3529                 }
3530                 ST.nextword = min;
3531             }
3532
3533           trie_first_try:
3534             if (do_cutgroup) {
3535                 do_cutgroup = 0;
3536                 no_final = 0;
3537             }
3538
3539             if ( ST.jump) {
3540                 ST.lastparen = *PL_reglastparen;
3541                 REGCP_SET(ST.cp);
3542             }
3543
3544             /* find start char of end of current word */
3545             {
3546                 U32 chars; /* how many chars to skip */
3547                 U8 *uc = ST.firstpos;
3548                 reg_trie_data * const trie
3549                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3550
3551                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3552                             >=  ST.firstchars);
3553                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3554                             - ST.firstchars;
3555
3556                 if (ST.longfold) {
3557                     /* the hard option - fold each char in turn and find
3558                      * its folded length (which may be different */
3559                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3560                     STRLEN foldlen;
3561                     STRLEN len;
3562                     UV uvc;
3563                     U8 *uscan;
3564
3565                     while (chars) {
3566                         if (utf8_target) {
3567                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3568                                                     uniflags);
3569                             uc += len;
3570                         }
3571                         else {
3572                             uvc = *uc;
3573                             uc++;
3574                         }
3575                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3576                         uscan = foldbuf;
3577                         while (foldlen) {
3578                             if (!--chars)
3579                                 break;
3580                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3581                                             uniflags);
3582                             uscan += len;
3583                             foldlen -= len;
3584                         }
3585                     }
3586                 }
3587                 else {
3588                     if (utf8_target)
3589                         while (chars--)
3590                             uc += UTF8SKIP(uc);
3591                     else
3592                         uc += chars;
3593                 }
3594                 PL_reginput = (char *)uc;
3595             }
3596
3597             scan = (ST.jump && ST.jump[ST.nextword])
3598                         ? ST.me + ST.jump[ST.nextword]
3599                         : ST.B;
3600
3601             DEBUG_EXECUTE_r({
3602                 PerlIO_printf( Perl_debug_log,
3603                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3604                     REPORT_CODE_OFF+depth*2, "",
3605                     PL_colors[4],
3606                     ST.nextword,
3607                     PL_colors[5]
3608                     );
3609             });
3610
3611             if (ST.accepted > 1 || has_cutgroup) {
3612                 PUSH_STATE_GOTO(TRIE_next, scan);
3613                 /* NOTREACHED */
3614             }
3615             /* only one choice left - just continue */
3616             DEBUG_EXECUTE_r({
3617                 AV *const trie_words
3618                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3619                 SV ** const tmp = av_fetch( trie_words,
3620                     ST.nextword-1, 0 );
3621                 SV *sv= tmp ? sv_newmortal() : NULL;
3622
3623                 PerlIO_printf( Perl_debug_log,
3624                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3625                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3626                     ST.nextword,
3627                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3628                             PL_colors[0], PL_colors[1],
3629                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3630                         )
3631                     : "not compiled under -Dr",
3632                     PL_colors[5] );
3633             });
3634
3635             locinput = PL_reginput;
3636             nextchr = UCHARAT(locinput);
3637             continue; /* execute rest of RE */
3638             /* NOTREACHED */
3639 #undef  ST
3640
3641         case EXACT: {
3642             char *s = STRING(scan);
3643             ln = STR_LEN(scan);
3644             if (utf8_target != UTF_PATTERN) {
3645                 /* The target and the pattern have differing utf8ness. */
3646                 char *l = locinput;
3647                 const char * const e = s + ln;
3648
3649                 if (utf8_target) {
3650                     /* The target is utf8, the pattern is not utf8. */
3651                     while (s < e) {
3652                         STRLEN ulen;
3653                         if (l >= PL_regeol)
3654                              sayNO;
3655                         if (NATIVE_TO_UNI(*(U8*)s) !=
3656                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3657                                             uniflags))
3658                              sayNO;
3659                         l += ulen;
3660                         s ++;
3661                     }
3662                 }
3663                 else {
3664                     /* The target is not utf8, the pattern is utf8. */
3665                     while (s < e) {
3666                         STRLEN ulen;
3667                         if (l >= PL_regeol)
3668                             sayNO;
3669                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3670                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3671                                            uniflags))
3672                             sayNO;
3673                         s += ulen;
3674                         l ++;
3675                     }
3676                 }
3677                 locinput = l;
3678                 nextchr = UCHARAT(locinput);
3679                 break;
3680             }
3681             /* The target and the pattern have the same utf8ness. */
3682             /* Inline the first character, for speed. */
3683             if (UCHARAT(s) != nextchr)
3684                 sayNO;
3685             if (PL_regeol - locinput < ln)
3686                 sayNO;
3687             if (ln > 1 && memNE(s, locinput, ln))
3688                 sayNO;
3689             locinput += ln;
3690             nextchr = UCHARAT(locinput);
3691             break;
3692             }
3693         case EXACTFL: {
3694             re_fold_t folder;
3695             const U8 * fold_array;
3696             const char * s;
3697
3698             PL_reg_flags |= RF_tainted;
3699             folder = foldEQ_locale;
3700             fold_array = PL_fold_locale;
3701             goto do_exactf;
3702
3703         case EXACTFU:
3704             folder = foldEQ_latin1;
3705             fold_array = PL_fold_latin1;
3706             goto do_exactf;
3707
3708         case EXACTF:
3709             folder = foldEQ;
3710             fold_array = PL_fold;
3711
3712           do_exactf:
3713             s = STRING(scan);
3714             ln = STR_LEN(scan);
3715
3716             if (utf8_target || UTF_PATTERN) {
3717               /* Either target or the pattern are utf8. */
3718                 const char * const l = locinput;
3719                 char *e = PL_regeol;
3720
3721                 if (! foldEQ_utf8(s, 0,  ln, cBOOL(UTF_PATTERN),
3722                                l, &e, 0,  utf8_target)) {
3723                      /* One more case for the sharp s:
3724                       * pack("U0U*", 0xDF) =~ /ss/i,
3725                       * the 0xC3 0x9F are the UTF-8
3726                       * byte sequence for the U+00DF. */
3727
3728                      if (!(utf8_target &&
3729                            toLOWER(s[0]) == 's' &&
3730                            ln >= 2 &&
3731                            toLOWER(s[1]) == 's' &&
3732                            (U8)l[0] == 0xC3 &&
3733                            e - l >= 2 &&
3734                            (U8)l[1] == 0x9F))
3735                           sayNO;
3736                 }
3737                 locinput = e;
3738                 nextchr = UCHARAT(locinput);
3739                 break;
3740             }
3741
3742             /* Neither the target and the pattern are utf8. */
3743
3744             /* Inline the first character, for speed. */
3745             if (UCHARAT(s) != nextchr &&
3746                 UCHARAT(s) != fold_array[nextchr])
3747             {
3748                 sayNO;
3749             }
3750             if (PL_regeol - locinput < ln)
3751                 sayNO;
3752             if (ln > 1 && ! folder(s, locinput, ln))
3753                 sayNO;
3754             locinput += ln;
3755             nextchr = UCHARAT(locinput);
3756             break;
3757         }
3758
3759         /* XXX Could improve efficiency by separating these all out using a
3760          * macro or in-line function.  At that point regcomp.c would no longer
3761          * have to set the FLAGS fields of these */
3762         case BOUNDL:
3763         case NBOUNDL:
3764             PL_reg_flags |= RF_tainted;
3765             /* FALL THROUGH */
3766         case BOUND:
3767         case BOUNDU:
3768         case BOUNDA:
3769         case NBOUND:
3770         case NBOUNDU:
3771         case NBOUNDA:
3772             /* was last char in word? */
3773             if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3774                 if (locinput == PL_bostr)
3775                     ln = '\n';
3776                 else {
3777                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3778
3779                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3780                 }
3781                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3782                     ln = isALNUM_uni(ln);
3783                     LOAD_UTF8_CHARCLASS_ALNUM();
3784                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3785                 }
3786                 else {
3787                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3788                     n = isALNUM_LC_utf8((U8*)locinput);
3789                 }
3790             }
3791             else {
3792
3793                 /* Here the string isn't utf8, or is utf8 and only ascii
3794                  * characters are to match \w.  In the latter case looking at
3795                  * the byte just prior to the current one may be just the final
3796                  * byte of a multi-byte character.  This is ok.  There are two
3797                  * cases:
3798                  * 1) it is a single byte character, and then the test is doing
3799                  *      just what it's supposed to.
3800                  * 2) it is a multi-byte character, in which case the final
3801                  *      byte is never mistakable for ASCII, and so the test
3802                  *      will say it is not a word character, which is the
3803                  *      correct answer. */
3804                 ln = (locinput != PL_bostr) ?
3805                     UCHARAT(locinput - 1) : '\n';
3806                 switch (FLAGS(scan)) {
3807                     case REGEX_UNICODE_CHARSET:
3808                         ln = isWORDCHAR_L1(ln);
3809                         n = isWORDCHAR_L1(nextchr);
3810                         break;
3811                     case REGEX_LOCALE_CHARSET:
3812                         ln = isALNUM_LC(ln);
3813                         n = isALNUM_LC(nextchr);
3814                         break;
3815                     case REGEX_DEPENDS_CHARSET:
3816                         ln = isALNUM(ln);
3817                         n = isALNUM(nextchr);
3818                         break;
3819                     case REGEX_ASCII_RESTRICTED_CHARSET:
3820                         ln = isWORDCHAR_A(ln);
3821                         n = isWORDCHAR_A(nextchr);
3822                         break;
3823                     default:
3824                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3825                         break;
3826                 }
3827             }
3828             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3829              * regcomp.sym */
3830             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3831                     sayNO;
3832             break;
3833         case ANYOFV:
3834         case ANYOF:
3835             if (utf8_target || state_num == ANYOFV) {
3836                 STRLEN inclasslen = PL_regeol - locinput;
3837                 if (locinput >= PL_regeol)
3838                     sayNO;
3839
3840                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3841                     sayNO;
3842                 locinput += inclasslen;
3843                 nextchr = UCHARAT(locinput);
3844                 break;
3845             }
3846             else {
3847                 if (nextchr < 0)
3848                     nextchr = UCHARAT(locinput);
3849                 if (!nextchr && locinput >= PL_regeol)
3850                     sayNO;
3851                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3852                     sayNO;
3853                 nextchr = UCHARAT(++locinput);
3854                 break;
3855             }
3856             break;
3857         /* Special char classes - The defines start on line 129 or so */
3858         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3859                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3860                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3861                   ALNUMA, NALNUMA, isWORDCHAR_A,
3862                   perl_word, "a");
3863
3864         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3865                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3866                   SPACEU, NSPACEU, isSPACE_L1,
3867                   SPACEA, NSPACEA, isSPACE_A,
3868                   perl_space, " ");
3869
3870         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3871                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3872                 DIGITA, NDIGITA, isDIGIT_A,
3873                 posix_digit, "0");
3874
3875         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3876                        a Unicode extended Grapheme Cluster */
3877             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3878               extended Grapheme Cluster is:
3879
3880                CR LF
3881                | Prepend* Begin Extend*
3882                | .
3883
3884                Begin is (Hangul-syllable | ! Control)
3885                Extend is (Grapheme_Extend | Spacing_Mark)
3886                Control is [ GCB_Control CR LF ]
3887
3888                The discussion below shows how the code for CLUMP is derived
3889                from this regex.  Note that most of these concepts are from
3890                property values of the Grapheme Cluster Boundary (GCB) property.
3891                No code point can have multiple property values for a given
3892                property.  Thus a code point in Prepend can't be in Control, but
3893                it must be in !Control.  This is why Control above includes
3894                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3895                property separately, and so can't be in GCB_Control, even though
3896                they logically are controls.  Control is not the same as gc=cc,
3897                but includes format and other characters as well.
3898
3899                The Unicode definition of Hangul-syllable is:
3900                    L+
3901                    | (L* ( ( V | LV ) V* | LVT ) T*)
3902                    | T+
3903                   )
3904                Each of these is a value for the GCB property, and hence must be
3905                disjoint, so the order they are tested is immaterial, so the
3906                above can safely be changed to
3907                    T+
3908                    | L+
3909                    | (L* ( LVT | ( V | LV ) V*) T*)
3910
3911                The last two terms can be combined like this:
3912                    L* ( L
3913                         | (( LVT | ( V | LV ) V*) T*))
3914
3915                And refactored into this:
3916                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3917
3918                That means that if we have seen any L's at all we can quit
3919                there, but if the next character is a LVT, a V or and LV we
3920                should keep going.
3921
3922                There is a subtlety with Prepend* which showed up in testing.
3923                Note that the Begin, and only the Begin is required in:
3924                 | Prepend* Begin Extend*
3925                Also, Begin contains '! Control'.  A Prepend must be a '!
3926                Control', which means it must be a Begin.  What it comes down to
3927                is that if we match Prepend* and then find no suitable Begin
3928                afterwards, that if we backtrack the last Prepend, that one will
3929                be a suitable Begin.
3930             */
3931
3932             if (locinput >= PL_regeol)
3933                 sayNO;
3934             if  (! utf8_target) {
3935
3936                 /* Match either CR LF  or '.', as all the other possibilities
3937                  * require utf8 */
3938                 locinput++;         /* Match the . or CR */
3939                 if (nextchr == '\r'
3940                     && locinput < PL_regeol
3941                     && UCHARAT(locinput) == '\n') locinput++;
3942             }
3943             else {
3944
3945                 /* Utf8: See if is ( CR LF ); already know that locinput <
3946                  * PL_regeol, so locinput+1 is in bounds */
3947                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3948                     locinput += 2;
3949                 }
3950                 else {
3951                     /* In case have to backtrack to beginning, then match '.' */
3952                     char *starting = locinput;
3953
3954                     /* In case have to backtrack the last prepend */
3955                     char *previous_prepend = 0;
3956
3957                     LOAD_UTF8_CHARCLASS_GCB();
3958
3959                     /* Match (prepend)* */
3960                     while (locinput < PL_regeol
3961                            && swash_fetch(PL_utf8_X_prepend,
3962                                           (U8*)locinput, utf8_target))
3963                     {
3964                         previous_prepend = locinput;
3965                         locinput += UTF8SKIP(locinput);
3966                     }
3967
3968                     /* As noted above, if we matched a prepend character, but
3969                      * the next thing won't match, back off the last prepend we
3970                      * matched, as it is guaranteed to match the begin */
3971                     if (previous_prepend
3972                         && (locinput >=  PL_regeol
3973                             || ! swash_fetch(PL_utf8_X_begin,
3974                                              (U8*)locinput, utf8_target)))
3975                     {
3976                         locinput = previous_prepend;
3977                     }
3978
3979                     /* Note that here we know PL_regeol > locinput, as we
3980                      * tested that upon input to this switch case, and if we
3981                      * moved locinput forward, we tested the result just above
3982                      * and it either passed, or we backed off so that it will
3983                      * now pass */
3984                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3985
3986                         /* Here did not match the required 'Begin' in the
3987                          * second term.  So just match the very first
3988                          * character, the '.' of the final term of the regex */
3989                         locinput = starting + UTF8SKIP(starting);
3990                     } else {
3991
3992                         /* Here is the beginning of a character that can have
3993                          * an extender.  It is either a hangul syllable, or a
3994                          * non-control */
3995                         if (swash_fetch(PL_utf8_X_non_hangul,
3996                                         (U8*)locinput, utf8_target))
3997                         {
3998
3999                             /* Here not a Hangul syllable, must be a
4000                              * ('!  * Control') */
4001                             locinput += UTF8SKIP(locinput);
4002                         } else {
4003
4004                             /* Here is a Hangul syllable.  It can be composed
4005                              * of several individual characters.  One
4006                              * possibility is T+ */
4007                             if (swash_fetch(PL_utf8_X_T,
4008                                             (U8*)locinput, utf8_target))
4009                             {
4010                                 while (locinput < PL_regeol
4011                                         && swash_fetch(PL_utf8_X_T,
4012                                                         (U8*)locinput, utf8_target))
4013                                 {
4014                                     locinput += UTF8SKIP(locinput);
4015                                 }
4016                             } else {
4017
4018                                 /* Here, not T+, but is a Hangul.  That means
4019                                  * it is one of the others: L, LV, LVT or V,
4020                                  * and matches:
4021                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
4022
4023                                 /* Match L*           */
4024                                 while (locinput < PL_regeol
4025                                         && swash_fetch(PL_utf8_X_L,
4026                                                         (U8*)locinput, utf8_target))
4027                                 {
4028                                     locinput += UTF8SKIP(locinput);
4029                                 }
4030
4031                                 /* Here, have exhausted L*.  If the next
4032                                  * character is not an LV, LVT nor V, it means
4033                                  * we had to have at least one L, so matches L+
4034                                  * in the original equation, we have a complete
4035                                  * hangul syllable.  Are done. */
4036
4037                                 if (locinput < PL_regeol
4038                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
4039                                                     (U8*)locinput, utf8_target))
4040                                 {
4041
4042                                     /* Otherwise keep going.  Must be LV, LVT
4043                                      * or V.  See if LVT */
4044                                     if (swash_fetch(PL_utf8_X_LVT,
4045                                                     (U8*)locinput, utf8_target))
4046                                     {
4047                                         locinput += UTF8SKIP(locinput);
4048                                     } else {
4049
4050                                         /* Must be  V or LV.  Take it, then
4051                                          * match V*     */
4052                                         locinput += UTF8SKIP(locinput);
4053                                         while (locinput < PL_regeol
4054                                                 && swash_fetch(PL_utf8_X_V,
4055                                                          (U8*)locinput, utf8_target))
4056                                         {
4057                                             locinput += UTF8SKIP(locinput);
4058                                         }
4059                                     }
4060
4061                                     /* And any of LV, LVT, or V can be followed
4062                                      * by T*            */
4063                                     while (locinput < PL_regeol
4064                                            && swash_fetch(PL_utf8_X_T,
4065                                                            (U8*)locinput,
4066                                                            utf8_target))
4067                                     {
4068                                         locinput += UTF8SKIP(locinput);
4069                                     }
4070                                 }
4071                             }
4072                         }
4073
4074                         /* Match any extender */
4075                         while (locinput < PL_regeol
4076                                 && swash_fetch(PL_utf8_X_extend,
4077                                                 (U8*)locinput, utf8_target))
4078                         {
4079                             locinput += UTF8SKIP(locinput);
4080                         }
4081                     }
4082                 }
4083                 if (locinput > PL_regeol) sayNO;
4084             }
4085             nextchr = UCHARAT(locinput);
4086             break;
4087
4088         case NREFFL:
4089         {   /* The capture buffer cases.  The ones beginning with N for the
4090                named buffers just convert to the equivalent numbered and
4091                pretend they were called as the corresponding numbered buffer
4092                op.  */
4093             /* don't initialize these, it makes C++ unhappy */
4094             char *s;
4095             char type;
4096             re_fold_t folder;
4097             const U8 *fold_array;
4098
4099             PL_reg_flags |= RF_tainted;
4100             folder = foldEQ_locale;
4101             fold_array = PL_fold_locale;
4102             type = REFFL;
4103             goto do_nref;
4104
4105         case NREFFU:
4106             folder = foldEQ_latin1;
4107             fold_array = PL_fold_latin1;
4108             type = REFFU;
4109             goto do_nref;
4110
4111         case NREFF:
4112             folder = foldEQ;
4113             fold_array = PL_fold;
4114             type = REFF;
4115             goto do_nref;
4116
4117         case NREF:
4118             type = REF;
4119             folder = NULL;
4120             fold_array = NULL;
4121           do_nref:
4122
4123             /* For the named back references, find the corresponding buffer
4124              * number */
4125             n = reg_check_named_buff_matched(rex,scan);
4126
4127             if ( ! n ) {
4128                 sayNO;
4129             }
4130             goto do_nref_ref_common;
4131
4132         case REFFL:
4133             PL_reg_flags |= RF_tainted;
4134             folder = foldEQ_locale;
4135             fold_array = PL_fold_locale;
4136             goto do_ref;
4137
4138         case REFFU:
4139             folder = foldEQ_latin1;
4140             fold_array = PL_fold_latin1;
4141             goto do_ref;
4142
4143         case REFF:
4144             folder = foldEQ;
4145             fold_array = PL_fold;
4146             goto do_ref;
4147
4148         case REF:
4149             folder = NULL;
4150             fold_array = NULL;
4151
4152           do_ref:
4153             type = OP(scan);
4154             n = ARG(scan);  /* which paren pair */
4155
4156           do_nref_ref_common:
4157             ln = PL_regoffs[n].start;
4158             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4159             if (*PL_reglastparen < n || ln == -1)
4160                 sayNO;                  /* Do not match unless seen CLOSEn. */
4161             if (ln == PL_regoffs[n].end)
4162                 break;
4163
4164             s = PL_bostr + ln;
4165             if (type != REF     /* REF can do byte comparison */
4166                 && (utf8_target
4167                     || (type == REFFU
4168                         && (*s == (char) LATIN_SMALL_LETTER_SHARP_S
4169                             || *locinput == (char) LATIN_SMALL_LETTER_SHARP_S))))
4170             { /* XXX handle REFFL better */
4171                 char * limit = PL_regeol;
4172
4173                 /* This call case insensitively compares the entire buffer
4174                     * at s, with the current input starting at locinput, but
4175                     * not going off the end given by PL_regeol, and returns in
4176                     * limit upon success, how much of the current input was
4177                     * matched */
4178                 if (! foldEQ_utf8(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4179                                     locinput, &limit, 0, utf8_target))
4180                 {
4181                     sayNO;
4182                 }
4183                 locinput = limit;
4184                 nextchr = UCHARAT(locinput);
4185                 break;
4186             }
4187
4188             /* Not utf8:  Inline the first character, for speed. */
4189             if (UCHARAT(s) != nextchr &&
4190                 (type == REF ||
4191                  UCHARAT(s) != fold_array[nextchr]))
4192                 sayNO;
4193             ln = PL_regoffs[n].end - ln;
4194             if (locinput + ln > PL_regeol)
4195                 sayNO;
4196             if (ln > 1 && (type == REF
4197                            ? memNE(s, locinput, ln)
4198                            : ! folder(s, locinput, ln)))
4199                 sayNO;
4200             locinput += ln;
4201             nextchr = UCHARAT(locinput);
4202             break;
4203         }
4204         case NOTHING:
4205         case TAIL:
4206             break;
4207         case BACK:
4208             break;
4209
4210 #undef  ST
4211 #define ST st->u.eval
4212         {
4213             SV *ret;
4214             REGEXP *re_sv;
4215             regexp *re;
4216             regexp_internal *rei;
4217             regnode *startpoint;
4218
4219         case GOSTART:
4220         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4221             if (cur_eval && cur_eval->locinput==locinput) {
4222                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4223                     Perl_croak(aTHX_ "Infinite recursion in regex");
4224                 if ( ++nochange_depth > max_nochange_depth )
4225                     Perl_croak(aTHX_
4226                         "Pattern subroutine nesting without pos change"
4227                         " exceeded limit in regex");
4228             } else {
4229                 nochange_depth = 0;
4230             }
4231             re_sv = rex_sv;
4232             re = rex;
4233             rei = rexi;
4234             (void)ReREFCNT_inc(rex_sv);
4235             if (OP(scan)==GOSUB) {
4236                 startpoint = scan + ARG2L(scan);
4237                 ST.close_paren = ARG(scan);
4238             } else {
4239                 startpoint = rei->program+1;
4240                 ST.close_paren = 0;
4241             }
4242             goto eval_recurse_doit;
4243             /* NOTREACHED */
4244         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4245             if (cur_eval && cur_eval->locinput==locinput) {
4246                 if ( ++nochange_depth > max_nochange_depth )
4247                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4248             } else {
4249                 nochange_depth = 0;
4250             }
4251             {
4252                 /* execute the code in the {...} */
4253                 dSP;
4254                 SV ** const before = SP;
4255                 OP_4tree * const oop = PL_op;
4256                 COP * const ocurcop = PL_curcop;
4257                 PAD *old_comppad;
4258                 char *saved_regeol = PL_regeol;
4259                 struct re_save_state saved_state;
4260
4261                 /* To not corrupt the existing regex state while executing the
4262                  * eval we would normally put it on the save stack, like with
4263                  * save_re_context. However, re-evals have a weird scoping so we
4264                  * can't just add ENTER/LEAVE here. With that, things like
4265                  *
4266                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4267                  *
4268                  * would break, as they expect the localisation to be unwound
4269                  * only when the re-engine backtracks through the bit that
4270                  * localised it.
4271                  *
4272                  * What we do instead is just saving the state in a local c
4273                  * variable.
4274                  */
4275                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4276
4277                 n = ARG(scan);
4278                 PL_op = (OP_4tree*)rexi->data->data[n];
4279                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4280                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4281                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4282                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4283
4284                 if (sv_yes_mark) {
4285                     SV *sv_mrk = get_sv("REGMARK", 1);
4286                     sv_setsv(sv_mrk, sv_yes_mark);
4287                 }
4288
4289                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4290                 SPAGAIN;
4291                 if (SP == before)
4292                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4293                 else {
4294                     ret = POPs;
4295                     PUTBACK;
4296                 }
4297
4298                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4299
4300                 PL_op = oop;
4301                 PAD_RESTORE_LOCAL(old_comppad);
4302                 PL_curcop = ocurcop;
4303                 PL_regeol = saved_regeol;
4304                 if (!logical) {
4305                     /* /(?{...})/ */
4306                     sv_setsv(save_scalar(PL_replgv), ret);
4307                     break;
4308                 }
4309             }
4310             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4311                 logical = 0;
4312                 {
4313                     /* extract RE object from returned value; compiling if
4314                      * necessary */
4315                     MAGIC *mg = NULL;
4316                     REGEXP *rx = NULL;
4317
4318                     if (SvROK(ret)) {
4319                         SV *const sv = SvRV(ret);
4320
4321                         if (SvTYPE(sv) == SVt_REGEXP) {
4322                             rx = (REGEXP*) sv;
4323                         } else if (SvSMAGICAL(sv)) {
4324                             mg = mg_find(sv, PERL_MAGIC_qr);
4325                             assert(mg);
4326                         }
4327                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4328                         rx = (REGEXP*) ret;
4329                     } else if (SvSMAGICAL(ret)) {
4330                         if (SvGMAGICAL(ret)) {
4331                             /* I don't believe that there is ever qr magic
4332                                here.  */
4333                             assert(!mg_find(ret, PERL_MAGIC_qr));
4334                             sv_unmagic(ret, PERL_MAGIC_qr);
4335                         }
4336                         else {
4337                             mg = mg_find(ret, PERL_MAGIC_qr);
4338                             /* testing suggests mg only ends up non-NULL for
4339                                scalars who were upgraded and compiled in the
4340                                else block below. In turn, this is only
4341                                triggered in the "postponed utf8 string" tests
4342                                in t/op/pat.t  */
4343                         }
4344                     }
4345
4346                     if (mg) {
4347                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4348                         assert(rx);
4349                     }
4350                     if (rx) {
4351                         rx = reg_temp_copy(NULL, rx);
4352                     }
4353                     else {
4354                         U32 pm_flags = 0;
4355                         const I32 osize = PL_regsize;
4356
4357                         if (DO_UTF8(ret)) {
4358                             assert (SvUTF8(ret));
4359                         } else if (SvUTF8(ret)) {
4360                             /* Not doing UTF-8, despite what the SV says. Is
4361                                this only if we're trapped in use 'bytes'?  */
4362                             /* Make a copy of the octet sequence, but without
4363                                the flag on, as the compiler now honours the
4364                                SvUTF8 flag on ret.  */
4365                             STRLEN len;
4366                             const char *const p = SvPV(ret, len);
4367                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4368                         }
4369                         rx = CALLREGCOMP(ret, pm_flags);
4370                         if (!(SvFLAGS(ret)
4371                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4372                                  | SVs_GMG))) {
4373                             /* This isn't a first class regexp. Instead, it's
4374                                caching a regexp onto an existing, Perl visible
4375                                scalar.  */
4376                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4377                         }
4378                         PL_regsize = osize;
4379                     }
4380                     re_sv = rx;
4381                     re = (struct regexp *)SvANY(rx);
4382                 }
4383                 RXp_MATCH_COPIED_off(re);
4384                 re->subbeg = rex->subbeg;
4385                 re->sublen = rex->sublen;
4386                 rei = RXi_GET(re);
4387                 DEBUG_EXECUTE_r(
4388                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4389                         "Matching embedded");
4390                 );
4391                 startpoint = rei->program + 1;
4392                 ST.close_paren = 0; /* only used for GOSUB */
4393                 /* borrowed from regtry */
4394                 if (PL_reg_start_tmpl <= re->nparens) {
4395                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4396                     if(PL_reg_start_tmp)
4397                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4398                     else
4399                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4400                 }
4401
4402         eval_recurse_doit: /* Share code with GOSUB below this line */
4403                 /* run the pattern returned from (??{...}) */
4404                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4405                 REGCP_SET(ST.lastcp);
4406
4407                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4408
4409                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4410                 PL_reglastparen = &re->lastparen;
4411                 PL_reglastcloseparen = &re->lastcloseparen;
4412                 re->lastparen = 0;
4413                 re->lastcloseparen = 0;
4414
4415                 PL_reginput = locinput;
4416                 PL_regsize = 0;
4417
4418                 /* XXXX This is too dramatic a measure... */
4419                 PL_reg_maxiter = 0;
4420
4421                 ST.toggle_reg_flags = PL_reg_flags;
4422                 if (RX_UTF8(re_sv))
4423                     PL_reg_flags |= RF_utf8;
4424                 else
4425                     PL_reg_flags &= ~RF_utf8;
4426                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4427
4428                 ST.prev_rex = rex_sv;
4429                 ST.prev_curlyx = cur_curlyx;
4430                 SETREX(rex_sv,re_sv);
4431                 rex = re;
4432                 rexi = rei;
4433                 cur_curlyx = NULL;
4434                 ST.B = next;
4435                 ST.prev_eval = cur_eval;
4436                 cur_eval = st;
4437                 /* now continue from first node in postoned RE */
4438                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4439                 /* NOTREACHED */
4440             }
4441             /* logical is 1,   /(?(?{...})X|Y)/ */
4442             sw = cBOOL(SvTRUE(ret));
4443             logical = 0;
4444             break;
4445         }
4446
4447         case EVAL_AB: /* cleanup after a successful (??{A})B */
4448             /* note: this is called twice; first after popping B, then A */
4449             PL_reg_flags ^= ST.toggle_reg_flags;
4450             ReREFCNT_dec(rex_sv);
4451             SETREX(rex_sv,ST.prev_rex);
4452             rex = (struct regexp *)SvANY(rex_sv);
4453             rexi = RXi_GET(rex);
4454             regcpblow(ST.cp);
4455             cur_eval = ST.prev_eval;
4456             cur_curlyx = ST.prev_curlyx;
4457
4458             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4459             PL_reglastparen = &rex->lastparen;
4460             PL_reglastcloseparen = &rex->lastcloseparen;
4461             /* also update PL_regoffs */
4462             PL_regoffs = rex->offs;
4463
4464             /* XXXX This is too dramatic a measure... */
4465             PL_reg_maxiter = 0;
4466             if ( nochange_depth )
4467                 nochange_depth--;
4468             sayYES;
4469
4470
4471         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4472             /* note: this is called twice; first after popping B, then A */
4473             PL_reg_flags ^= ST.toggle_reg_flags;
4474             ReREFCNT_dec(rex_sv);
4475             SETREX(rex_sv,ST.prev_rex);
4476             rex = (struct regexp *)SvANY(rex_sv);
4477             rexi = RXi_GET(rex);
4478             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4479             PL_reglastparen = &rex->lastparen;
4480             PL_reglastcloseparen = &rex->lastcloseparen;
4481
4482             PL_reginput = locinput;
4483             REGCP_UNWIND(ST.lastcp);
4484             regcppop(rex);
4485             cur_eval = ST.prev_eval;
4486             cur_curlyx = ST.prev_curlyx;
4487             /* XXXX This is too dramatic a measure... */
4488             PL_reg_maxiter = 0;
4489             if ( nochange_depth )
4490                 nochange_depth--;
4491             sayNO_SILENT;
4492 #undef ST
4493
4494         case OPEN:
4495             n = ARG(scan);  /* which paren pair */
4496             PL_reg_start_tmp[n] = locinput;
4497             if (n > PL_regsize)
4498                 PL_regsize = n;
4499             lastopen = n;
4500             break;
4501         case CLOSE:
4502             n = ARG(scan);  /* which paren pair */
4503             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4504             PL_regoffs[n].end = locinput - PL_bostr;
4505             /*if (n > PL_regsize)
4506                 PL_regsize = n;*/
4507             if (n > *PL_reglastparen)
4508                 *PL_reglastparen = n;
4509             *PL_reglastcloseparen = n;
4510             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4511                 goto fake_end;
4512             }
4513             break;
4514         case ACCEPT:
4515             if (ARG(scan)){
4516                 regnode *cursor;
4517                 for (cursor=scan;
4518                      cursor && OP(cursor)!=END;
4519                      cursor=regnext(cursor))
4520                 {
4521                     if ( OP(cursor)==CLOSE ){
4522                         n = ARG(cursor);
4523                         if ( n <= lastopen ) {
4524                             PL_regoffs[n].start
4525                                 = PL_reg_start_tmp[n] - PL_bostr;
4526                             PL_regoffs[n].end = locinput - PL_bostr;
4527                             /*if (n > PL_regsize)
4528                             PL_regsize = n;*/
4529                             if (n > *PL_reglastparen)
4530                                 *PL_reglastparen = n;
4531                             *PL_reglastcloseparen = n;
4532                             if ( n == ARG(scan) || (cur_eval &&
4533                                 cur_eval->u.eval.close_paren == n))
4534                                 break;
4535                         }
4536                     }
4537                 }
4538             }
4539             goto fake_end;
4540             /*NOTREACHED*/
4541         case GROUPP:
4542             n = ARG(scan);  /* which paren pair */
4543             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4544             break;
4545         case NGROUPP:
4546             /* reg_check_named_buff_matched returns 0 for no match */
4547             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4548             break;
4549         case INSUBP:
4550             n = ARG(scan);
4551             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4552             break;
4553         case DEFINEP:
4554             sw = 0;
4555             break;
4556         case IFTHEN:
4557             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4558             if (sw)
4559                 next = NEXTOPER(NEXTOPER(scan));
4560             else {
4561                 next = scan + ARG(scan);
4562                 if (OP(next) == IFTHEN) /* Fake one. */
4563                     next = NEXTOPER(NEXTOPER(next));
4564             }
4565             break;
4566         case LOGICAL:
4567             logical = scan->flags;
4568             break;
4569
4570 /*******************************************************************
4571
4572 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4573 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4574 STAR/PLUS/CURLY/CURLYN are used instead.)
4575
4576 A*B is compiled as <CURLYX><A><WHILEM><B>
4577
4578 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4579 state, which contains the current count, initialised to -1. It also sets
4580 cur_curlyx to point to this state, with any previous value saved in the
4581 state block.
4582
4583 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4584 since the pattern may possibly match zero times (i.e. it's a while {} loop
4585 rather than a do {} while loop).
4586
4587 Each entry to WHILEM represents a successful match of A. The count in the
4588 CURLYX block is incremented, another WHILEM state is pushed, and execution
4589 passes to A or B depending on greediness and the current count.
4590
4591 For example, if matching against the string a1a2a3b (where the aN are
4592 substrings that match /A/), then the match progresses as follows: (the
4593 pushed states are interspersed with the bits of strings matched so far):
4594
4595     <CURLYX cnt=-1>
4596     <CURLYX cnt=0><WHILEM>
4597     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4598     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4599     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4600     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4601
4602 (Contrast this with something like CURLYM, which maintains only a single
4603 backtrack state:
4604
4605     <CURLYM cnt=0> a1
4606     a1 <CURLYM cnt=1> a2
4607     a1 a2 <CURLYM cnt=2> a3
4608     a1 a2 a3 <CURLYM cnt=3> b
4609 )
4610
4611 Each WHILEM state block marks a point to backtrack to upon partial failure
4612 of A or B, and also contains some minor state data related to that
4613 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4614 overall state, such as the count, and pointers to the A and B ops.
4615
4616 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4617 must always point to the *current* CURLYX block, the rules are:
4618
4619 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4620 and set cur_curlyx to point the new block.
4621
4622 When popping the CURLYX block after a successful or unsuccessful match,
4623 restore the previous cur_curlyx.
4624
4625 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4626 to the outer one saved in the CURLYX block.
4627
4628 When popping the WHILEM block after a successful or unsuccessful B match,
4629 restore the previous cur_curlyx.
4630
4631 Here's an example for the pattern (AI* BI)*BO
4632 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4633
4634 cur_
4635 curlyx backtrack stack
4636 ------ ---------------
4637 NULL
4638 CO     <CO prev=NULL> <WO>
4639 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4640 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4641 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4642
4643 At this point the pattern succeeds, and we work back down the stack to
4644 clean up, restoring as we go:
4645
4646 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4647 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4648 CO     <CO prev=NULL> <WO>
4649 NULL
4650
4651 *******************************************************************/
4652
4653 #define ST st->u.curlyx
4654
4655         case CURLYX:    /* start of /A*B/  (for complex A) */
4656         {
4657             /* No need to save/restore up to this paren */
4658             I32 parenfloor = scan->flags;
4659
4660             assert(next); /* keep Coverity happy */
4661             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4662                 next += ARG(next);
4663
4664             /* XXXX Probably it is better to teach regpush to support
4665                parenfloor > PL_regsize... */
4666             if (parenfloor > (I32)*PL_reglastparen)
4667                 parenfloor = *PL_reglastparen; /* Pessimization... */
4668
4669             ST.prev_curlyx= cur_curlyx;
4670             cur_curlyx = st;
4671             ST.cp = PL_savestack_ix;
4672
4673             /* these fields contain the state of the current curly.
4674              * they are accessed by subsequent WHILEMs */
4675             ST.parenfloor = parenfloor;
4676             ST.me = scan;
4677             ST.B = next;
4678             ST.minmod = minmod;
4679             minmod = 0;
4680             ST.count = -1;      /* this will be updated by WHILEM */
4681             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4682
4683             PL_reginput = locinput;
4684             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4685             /* NOTREACHED */
4686         }
4687
4688         case CURLYX_end: /* just finished matching all of A*B */
4689             cur_curlyx = ST.prev_curlyx;
4690             sayYES;
4691             /* NOTREACHED */
4692
4693         case CURLYX_end_fail: /* just failed to match all of A*B */
4694             regcpblow(ST.cp);
4695             cur_curlyx = ST.prev_curlyx;
4696             sayNO;
4697             /* NOTREACHED */
4698
4699
4700 #undef ST
4701 #define ST st->u.whilem
4702
4703         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4704         {
4705             /* see the discussion above about CURLYX/WHILEM */
4706             I32 n;
4707             int min = ARG1(cur_curlyx->u.curlyx.me);
4708             int max = ARG2(cur_curlyx->u.curlyx.me);
4709             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4710
4711             assert(cur_curlyx); /* keep Coverity happy */
4712             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4713             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4714             ST.cache_offset = 0;
4715             ST.cache_mask = 0;
4716
4717             PL_reginput = locinput;
4718
4719             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4720                   "%*s  whilem: matched %ld out of %d..%d\n",
4721                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4722             );
4723
4724             /* First just match a string of min A's. */
4725
4726             if (n < min) {
4727                 cur_curlyx->u.curlyx.lastloc = locinput;
4728                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4729                 /* NOTREACHED */
4730             }
4731
4732             /* If degenerate A matches "", assume A done. */
4733
4734             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4735                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4736                    "%*s  whilem: empty match detected, trying continuation...\n",
4737                    REPORT_CODE_OFF+depth*2, "")
4738                 );
4739                 goto do_whilem_B_max;
4740             }
4741
4742             /* super-linear cache processing */
4743
4744             if (scan->flags) {
4745
4746                 if (!PL_reg_maxiter) {
4747                     /* start the countdown: Postpone detection until we
4748                      * know the match is not *that* much linear. */
4749                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4750                     /* possible overflow for long strings and many CURLYX's */
4751                     if (PL_reg_maxiter < 0)
4752                         PL_reg_maxiter = I32_MAX;
4753                     PL_reg_leftiter = PL_reg_maxiter;
4754                 }
4755
4756                 if (PL_reg_leftiter-- == 0) {
4757                     /* initialise cache */
4758                     const I32 size = (PL_reg_maxiter + 7)/8;
4759                     if (PL_reg_poscache) {
4760                         if ((I32)PL_reg_poscache_size < size) {
4761                             Renew(PL_reg_poscache, size, char);
4762                             PL_reg_poscache_size = size;
4763                         }
4764                         Zero(PL_reg_poscache, size, char);
4765                     }
4766                     else {
4767                         PL_reg_poscache_size = size;
4768                         Newxz(PL_reg_poscache, size, char);
4769                     }
4770                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4771       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4772                               PL_colors[4], PL_colors[5])
4773                     );
4774                 }
4775
4776                 if (PL_reg_leftiter < 0) {
4777                     /* have we already failed at this position? */
4778                     I32 offset, mask;
4779                     offset  = (scan->flags & 0xf) - 1
4780                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4781                     mask    = 1 << (offset % 8);
4782                     offset /= 8;
4783                     if (PL_reg_poscache[offset] & mask) {
4784                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4785                             "%*s  whilem: (cache) already tried at this position...\n",
4786                             REPORT_CODE_OFF+depth*2, "")
4787                         );
4788                         sayNO; /* cache records failure */
4789                     }
4790                     ST.cache_offset = offset;
4791                     ST.cache_mask   = mask;
4792                 }
4793             }
4794
4795             /* Prefer B over A for minimal matching. */
4796
4797             if (cur_curlyx->u.curlyx.minmod) {
4798                 ST.save_curlyx = cur_curlyx;
4799                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4800                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4801                 REGCP_SET(ST.lastcp);
4802                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4803                 /* NOTREACHED */
4804             }
4805
4806             /* Prefer A over B for maximal matching. */
4807
4808             if (n < max) { /* More greed allowed? */
4809                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4810                 cur_curlyx->u.curlyx.lastloc = locinput;
4811                 REGCP_SET(ST.lastcp);
4812                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4813                 /* NOTREACHED */
4814             }
4815             goto do_whilem_B_max;
4816         }
4817         /* NOTREACHED */
4818
4819         case WHILEM_B_min: /* just matched B in a minimal match */
4820         case WHILEM_B_max: /* just matched B in a maximal match */
4821             cur_curlyx = ST.save_curlyx;
4822             sayYES;
4823             /* NOTREACHED */
4824
4825         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4826             cur_curlyx = ST.save_curlyx;
4827             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4828             cur_curlyx->u.curlyx.count--;
4829             CACHEsayNO;
4830             /* NOTREACHED */
4831
4832         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4833             REGCP_UNWIND(ST.lastcp);
4834             regcppop(rex);
4835             /* FALL THROUGH */
4836         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4837             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4838             cur_curlyx->u.curlyx.count--;
4839             CACHEsayNO;
4840             /* NOTREACHED */
4841
4842         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4843             REGCP_UNWIND(ST.lastcp);
4844             regcppop(rex);      /* Restore some previous $<digit>s? */
4845             PL_reginput = locinput;
4846             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4847                 "%*s  whilem: failed, trying continuation...\n",
4848                 REPORT_CODE_OFF+depth*2, "")
4849             );
4850           do_whilem_B_max:
4851             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4852                 && ckWARN(WARN_REGEXP)
4853                 && !(PL_reg_flags & RF_warned))
4854             {
4855                 PL_reg_flags |= RF_warned;
4856                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4857                      "Complex regular subexpression recursion",
4858                      REG_INFTY - 1);
4859             }
4860
4861             /* now try B */
4862             ST.save_curlyx = cur_curlyx;
4863             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4864             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4865             /* NOTREACHED */
4866
4867         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4868             cur_curlyx = ST.save_curlyx;
4869             REGCP_UNWIND(ST.lastcp);
4870             regcppop(rex);
4871
4872             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4873                 /* Maximum greed exceeded */
4874                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4875                     && ckWARN(WARN_REGEXP)
4876                     && !(PL_reg_flags & RF_warned))
4877                 {
4878                     PL_reg_flags |= RF_warned;
4879                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4880                         "%s limit (%d) exceeded",
4881                         "Complex regular subexpression recursion",
4882                         REG_INFTY - 1);
4883                 }
4884                 cur_curlyx->u.curlyx.count--;
4885                 CACHEsayNO;
4886             }
4887
4888             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4889                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4890             );
4891             /* Try grabbing another A and see if it helps. */
4892             PL_reginput = locinput;
4893             cur_curlyx->u.curlyx.lastloc = locinput;
4894             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4895             REGCP_SET(ST.lastcp);
4896             PUSH_STATE_GOTO(WHILEM_A_min,
4897                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4898             /* NOTREACHED */
4899
4900 #undef  ST
4901 #define ST st->u.branch
4902
4903         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4904             next = scan + ARG(scan);
4905             if (next == scan)
4906                 next = NULL;
4907             scan = NEXTOPER(scan);
4908             /* FALL THROUGH */
4909
4910         case BRANCH:        /*  /(...|A|...)/ */
4911             scan = NEXTOPER(scan); /* scan now points to inner node */
4912             ST.lastparen = *PL_reglastparen;
4913             ST.next_branch = next;
4914             REGCP_SET(ST.cp);
4915             PL_reginput = locinput;
4916
4917             /* Now go into the branch */
4918             if (has_cutgroup) {
4919                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4920             } else {
4921                 PUSH_STATE_GOTO(BRANCH_next, scan);
4922             }
4923             /* NOTREACHED */
4924         case CUTGROUP:
4925             PL_reginput = locinput;
4926             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4927                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4928             PUSH_STATE_GOTO(CUTGROUP_next,next);
4929             /* NOTREACHED */
4930         case CUTGROUP_next_fail:
4931             do_cutgroup = 1;
4932             no_final = 1;
4933             if (st->u.mark.mark_name)
4934                 sv_commit = st->u.mark.mark_name;
4935             sayNO;
4936             /* NOTREACHED */
4937         case BRANCH_next:
4938             sayYES;
4939             /* NOTREACHED */
4940         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4941             if (do_cutgroup) {
4942                 do_cutgroup = 0;
4943                 no_final = 0;
4944             }
4945             REGCP_UNWIND(ST.cp);
4946             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4947                 PL_regoffs[n].end = -1;
4948             *PL_reglastparen = n;
4949             /*dmq: *PL_reglastcloseparen = n; */
4950             scan = ST.next_branch;
4951             /* no more branches? */
4952             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4953                 DEBUG_EXECUTE_r({
4954                     PerlIO_printf( Perl_debug_log,
4955                         "%*s  %sBRANCH failed...%s\n",
4956                         REPORT_CODE_OFF+depth*2, "",
4957                         PL_colors[4],
4958                         PL_colors[5] );
4959                 });
4960                 sayNO_SILENT;
4961             }
4962             continue; /* execute next BRANCH[J] op */
4963             /* NOTREACHED */
4964
4965         case MINMOD:
4966             minmod = 1;
4967             break;
4968
4969 #undef  ST
4970 #define ST st->u.curlym
4971
4972         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4973
4974             /* This is an optimisation of CURLYX that enables us to push
4975              * only a single backtracking state, no matter how many matches
4976              * there are in {m,n}. It relies on the pattern being constant
4977              * length, with no parens to influence future backrefs
4978              */
4979
4980             ST.me = scan;
4981             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4982
4983             /* if paren positive, emulate an OPEN/CLOSE around A */
4984             if (ST.me->flags) {
4985                 U32 paren = ST.me->flags;
4986                 if (paren > PL_regsize)
4987                     PL_regsize = paren;
4988                 if (paren > *PL_reglastparen)
4989                     *PL_reglastparen = paren;
4990                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4991             }
4992             ST.A = scan;
4993             ST.B = next;
4994             ST.alen = 0;
4995             ST.count = 0;
4996             ST.minmod = minmod;
4997             minmod = 0;
4998             ST.c1 = CHRTEST_UNINIT;
4999             REGCP_SET(ST.cp);
5000
5001             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5002                 goto curlym_do_B;
5003
5004           curlym_do_A: /* execute the A in /A{m,n}B/  */
5005             PL_reginput = locinput;
5006             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5007             /* NOTREACHED */
5008
5009         case CURLYM_A: /* we've just matched an A */
5010             locinput = st->locinput;
5011             nextchr = UCHARAT(locinput);
5012
5013             ST.count++;
5014             /* after first match, determine A's length: u.curlym.alen */
5015             if (ST.count == 1) {
5016                 if (PL_reg_match_utf8) {
5017                     char *s = locinput;
5018                     while (s < PL_reginput) {
5019                         ST.alen++;
5020                         s += UTF8SKIP(s);
5021                     }
5022                 }
5023                 else {
5024                     ST.alen = PL_reginput - locinput;
5025                 }
5026                 if (ST.alen == 0)
5027                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5028             }
5029             DEBUG_EXECUTE_r(
5030                 PerlIO_printf(Perl_debug_log,
5031                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5032                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5033                           (IV) ST.count, (IV)ST.alen)
5034             );
5035
5036             locinput = PL_reginput;
5037
5038             if (cur_eval && cur_eval->u.eval.close_paren &&
5039                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5040                 goto fake_end;
5041
5042             {
5043                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5044                 if ( max == REG_INFTY || ST.count < max )
5045                     goto curlym_do_A; /* try to match another A */
5046             }
5047             goto curlym_do_B; /* try to match B */
5048
5049         case CURLYM_A_fail: /* just failed to match an A */
5050             REGCP_UNWIND(ST.cp);
5051
5052             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5053                 || (cur_eval && cur_eval->u.eval.close_paren &&
5054                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5055                 sayNO;
5056
5057           curlym_do_B: /* execute the B in /A{m,n}B/  */
5058             PL_reginput = locinput;
5059             if (ST.c1 == CHRTEST_UNINIT) {
5060                 /* calculate c1 and c2 for possible match of 1st char
5061                  * following curly */
5062                 ST.c1 = ST.c2 = CHRTEST_VOID;
5063                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5064                     regnode *text_node = ST.B;
5065                     if (! HAS_TEXT(text_node))
5066                         FIND_NEXT_IMPT(text_node);
5067                     /* this used to be
5068
5069                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5070
5071                         But the former is redundant in light of the latter.
5072
5073                         if this changes back then the macro for
5074                         IS_TEXT and friends need to change.
5075                      */
5076                     if (PL_regkind[OP(text_node)] == EXACT)
5077                     {
5078
5079                         ST.c1 = (U8)*STRING(text_node);
5080                         switch (OP(text_node)) {
5081                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5082                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5083                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5084                             default: ST.c2 = ST.c1;
5085                         }
5086                     }
5087                 }
5088             }
5089
5090             DEBUG_EXECUTE_r(
5091                 PerlIO_printf(Perl_debug_log,
5092                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5093                     (int)(REPORT_CODE_OFF+(depth*2)),
5094                     "", (IV)ST.count)
5095                 );
5096             if (ST.c1 != CHRTEST_VOID
5097                     && UCHARAT(PL_reginput) != ST.c1
5098                     && UCHARAT(PL_reginput) != ST.c2)
5099             {
5100                 /* simulate B failing */
5101                 DEBUG_OPTIMISE_r(
5102                     PerlIO_printf(Perl_debug_log,
5103                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5104                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5105                         (IV)ST.c1,(IV)ST.c2
5106                 ));
5107                 state_num = CURLYM_B_fail;
5108                 goto reenter_switch;
5109             }
5110
5111             if (ST.me->flags) {
5112                 /* mark current A as captured */
5113                 I32 paren = ST.me->flags;
5114                 if (ST.count) {
5115                     PL_regoffs[paren].start
5116                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5117                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5118                     /*dmq: *PL_reglastcloseparen = paren; */
5119                 }
5120                 else
5121                     PL_regoffs[paren].end = -1;
5122                 if (cur_eval && cur_eval->u.eval.close_paren &&
5123                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5124                 {
5125                     if (ST.count)
5126                         goto fake_end;
5127                     else
5128                         sayNO;
5129                 }
5130             }
5131
5132             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5133             /* NOTREACHED */
5134
5135         case CURLYM_B_fail: /* just failed to match a B */
5136             REGCP_UNWIND(ST.cp);
5137             if (ST.minmod) {
5138                 I32 max = ARG2(ST.me);
5139                 if (max != REG_INFTY && ST.count == max)
5140                     sayNO;
5141                 goto curlym_do_A; /* try to match a further A */
5142             }
5143             /* backtrack one A */
5144             if (ST.count == ARG1(ST.me) /* min */)
5145                 sayNO;
5146             ST.count--;
5147             locinput = HOPc(locinput, -ST.alen);
5148             goto curlym_do_B; /* try to match B */
5149
5150 #undef ST
5151 #define ST st->u.curly
5152
5153 #define CURLY_SETPAREN(paren, success) \
5154     if (paren) { \
5155         if (success) { \
5156             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5157             PL_regoffs[paren].end = locinput - PL_bostr; \
5158             *PL_reglastcloseparen = paren; \
5159         } \
5160         else \
5161             PL_regoffs[paren].end = -1; \
5162     }
5163
5164         case STAR:              /*  /A*B/ where A is width 1 */
5165             ST.paren = 0;
5166             ST.min = 0;
5167             ST.max = REG_INFTY;
5168             scan = NEXTOPER(scan);
5169             goto repeat;
5170         case PLUS:              /*  /A+B/ where A is width 1 */
5171             ST.paren = 0;
5172             ST.min = 1;
5173             ST.max = REG_INFTY;
5174             scan = NEXTOPER(scan);
5175             goto repeat;
5176         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5177             ST.paren = scan->flags;     /* Which paren to set */
5178             if (ST.paren > PL_regsize)
5179                 PL_regsize = ST.paren;
5180             if (ST.paren > *PL_reglastparen)
5181                 *PL_reglastparen = ST.paren;
5182             ST.min = ARG1(scan);  /* min to match */
5183             ST.max = ARG2(scan);  /* max to match */
5184             if (cur_eval && cur_eval->u.eval.close_paren &&
5185                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5186                 ST.min=1;
5187                 ST.max=1;
5188             }
5189             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5190             goto repeat;
5191         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5192             ST.paren = 0;
5193             ST.min = ARG1(scan);  /* min to match */
5194             ST.max = ARG2(scan);  /* max to match */
5195             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5196           repeat:
5197             /*
5198             * Lookahead to avoid useless match attempts
5199             * when we know what character comes next.
5200             *
5201             * Used to only do .*x and .*?x, but now it allows
5202             * for )'s, ('s and (?{ ... })'s to be in the way
5203             * of the quantifier and the EXACT-like node.  -- japhy
5204             */
5205
5206             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5207                 sayNO;
5208             if (HAS_TEXT(next) || JUMPABLE(next)) {
5209                 U8 *s;
5210                 regnode *text_node = next;
5211
5212                 if (! HAS_TEXT(text_node))
5213                     FIND_NEXT_IMPT(text_node);
5214
5215                 if (! HAS_TEXT(text_node))
5216                     ST.c1 = ST.c2 = CHRTEST_VOID;
5217                 else {
5218                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5219                         ST.c1 = ST.c2 = CHRTEST_VOID;
5220                         goto assume_ok_easy;
5221                     }
5222                     else
5223                         s = (U8*)STRING(text_node);
5224
5225                     /*  Currently we only get here when
5226
5227                         PL_rekind[OP(text_node)] == EXACT
5228
5229                         if this changes back then the macro for IS_TEXT and
5230                         friends need to change. */
5231                     if (!UTF_PATTERN) {
5232                         ST.c1 = *s;
5233                         switch (OP(text_node)) {
5234                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5235                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5236                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5237                             default: ST.c2 = ST.c1; break;
5238                         }
5239                     }
5240                     else { /* UTF_PATTERN */
5241                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5242                              STRLEN ulen1, ulen2;
5243                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5244                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5245
5246                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5247                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5248 #ifdef EBCDIC
5249                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5250                                                     ckWARN(WARN_UTF8) ?
5251                                                     0 : UTF8_ALLOW_ANY);
5252                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5253                                                     ckWARN(WARN_UTF8) ?
5254                                                     0 : UTF8_ALLOW_ANY);
5255 #else
5256                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5257                                                     uniflags);
5258                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5259                                                     uniflags);
5260 #endif
5261                         }
5262                         else {
5263                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5264                                                      uniflags);
5265                         }
5266                     }
5267                 }
5268             }
5269             else
5270                 ST.c1 = ST.c2 = CHRTEST_VOID;
5271         assume_ok_easy:
5272
5273             ST.A = scan;
5274             ST.B = next;
5275             PL_reginput = locinput;
5276             if (minmod) {
5277                 minmod = 0;
5278                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5279                     sayNO;
5280                 ST.count = ST.min;
5281                 locinput = PL_reginput;
5282                 REGCP_SET(ST.cp);
5283                 if (ST.c1 == CHRTEST_VOID)
5284                     goto curly_try_B_min;
5285
5286                 ST.oldloc = locinput;
5287
5288                 /* set ST.maxpos to the furthest point along the
5289                  * string that could possibly match */
5290                 if  (ST.max == REG_INFTY) {
5291                     ST.maxpos = PL_regeol - 1;
5292                     if (utf8_target)
5293                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5294                             ST.maxpos--;
5295                 }
5296                 else if (utf8_target) {
5297                     int m = ST.max - ST.min;
5298                     for (ST.maxpos = locinput;
5299                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5300                         ST.maxpos += UTF8SKIP(ST.maxpos);
5301                 }
5302                 else {
5303                     ST.maxpos = locinput + ST.max - ST.min;
5304                     if (ST.maxpos >= PL_regeol)
5305                         ST.maxpos = PL_regeol - 1;
5306                 }
5307                 goto curly_try_B_min_known;
5308
5309             }
5310             else {
5311                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5312                 locinput = PL_reginput;
5313                 if (ST.count < ST.min)
5314                     sayNO;
5315                 if ((ST.count > ST.min)
5316                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5317                 {
5318                     /* A{m,n} must come at the end of the string, there's
5319                      * no point in backing off ... */
5320                     ST.min = ST.count;
5321                     /* ...except that $ and \Z can match before *and* after
5322                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5323                        We may back off by one in this case. */
5324                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5325                         ST.min--;
5326                 }
5327                 REGCP_SET(ST.cp);
5328                 goto curly_try_B_max;
5329             }
5330             /* NOTREACHED */
5331
5332
5333         case CURLY_B_min_known_fail:
5334             /* failed to find B in a non-greedy match where c1,c2 valid */
5335             if (ST.paren && ST.count)
5336                 PL_regoffs[ST.paren].end = -1;
5337
5338             PL_reginput = locinput;     /* Could be reset... */
5339             REGCP_UNWIND(ST.cp);
5340             /* Couldn't or didn't -- move forward. */
5341             ST.oldloc = locinput;
5342             if (utf8_target)
5343                 locinput += UTF8SKIP(locinput);
5344             else
5345                 locinput++;
5346             ST.count++;
5347           curly_try_B_min_known:
5348              /* find the next place where 'B' could work, then call B */
5349             {
5350                 int n;
5351                 if (utf8_target) {
5352                     n = (ST.oldloc == locinput) ? 0 : 1;
5353                     if (ST.c1 == ST.c2) {
5354                         STRLEN len;
5355                         /* set n to utf8_distance(oldloc, locinput) */
5356                         while (locinput <= ST.maxpos &&
5357                                utf8n_to_uvchr((U8*)locinput,
5358                                               UTF8_MAXBYTES, &len,
5359                                               uniflags) != (UV)ST.c1) {
5360                             locinput += len;
5361                             n++;
5362                         }
5363                     }
5364                     else {
5365                         /* set n to utf8_distance(oldloc, locinput) */
5366                         while (locinput <= ST.maxpos) {
5367                             STRLEN len;
5368                             const UV c = utf8n_to_uvchr((U8*)locinput,
5369                                                   UTF8_MAXBYTES, &len,
5370                                                   uniflags);
5371                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5372                                 break;
5373                             locinput += len;
5374                             n++;
5375                         }
5376                     }
5377                 }
5378                 else {
5379                     if (ST.c1 == ST.c2) {
5380                         while (locinput <= ST.maxpos &&
5381                                UCHARAT(locinput) != ST.c1)
5382                             locinput++;
5383                     }
5384                     else {
5385                         while (locinput <= ST.maxpos
5386                                && UCHARAT(locinput) != ST.c1
5387                                && UCHARAT(locinput) != ST.c2)
5388                             locinput++;
5389                     }
5390                     n = locinput - ST.oldloc;
5391                 }
5392                 if (locinput > ST.maxpos)
5393                     sayNO;
5394                 /* PL_reginput == oldloc now */
5395                 if (n) {
5396                     ST.count += n;
5397                     if (regrepeat(rex, ST.A, n, depth) < n)
5398                         sayNO;
5399                 }
5400                 PL_reginput = locinput;
5401                 CURLY_SETPAREN(ST.paren, ST.count);
5402                 if (cur_eval && cur_eval->u.eval.close_paren &&
5403                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5404                     goto fake_end;
5405                 }
5406                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5407             }
5408             /* NOTREACHED */
5409
5410
5411         case CURLY_B_min_fail:
5412             /* failed to find B in a non-greedy match where c1,c2 invalid */
5413             if (ST.paren && ST.count)
5414                 PL_regoffs[ST.paren].end = -1;
5415
5416             REGCP_UNWIND(ST.cp);
5417             /* failed -- move forward one */
5418             PL_reginput = locinput;
5419             if (regrepeat(rex, ST.A, 1, depth)) {
5420                 ST.count++;
5421                 locinput = PL_reginput;
5422                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5423                         ST.count > 0)) /* count overflow ? */
5424                 {
5425                   curly_try_B_min:
5426                     CURLY_SETPAREN(ST.paren, ST.count);
5427                     if (cur_eval && cur_eval->u.eval.close_paren &&
5428                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5429                         goto fake_end;
5430                     }
5431                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5432                 }
5433             }
5434             sayNO;
5435             /* NOTREACHED */
5436
5437
5438         curly_try_B_max:
5439             /* a successful greedy match: now try to match B */
5440             if (cur_eval && cur_eval->u.eval.close_paren &&
5441                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5442                 goto fake_end;
5443             }
5444             {
5445                 UV c = 0;
5446                 if (ST.c1 != CHRTEST_VOID)
5447                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5448                                            UTF8_MAXBYTES, 0, uniflags)
5449                                 : (UV) UCHARAT(PL_reginput);
5450                 /* If it could work, try it. */
5451                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5452                     CURLY_SETPAREN(ST.paren, ST.count);
5453                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5454                     /* NOTREACHED */
5455                 }
5456             }
5457             /* FALL THROUGH */
5458         case CURLY_B_max_fail:
5459             /* failed to find B in a greedy match */
5460             if (ST.paren && ST.count)
5461                 PL_regoffs[ST.paren].end = -1;
5462
5463             REGCP_UNWIND(ST.cp);
5464             /*  back up. */
5465             if (--ST.count < ST.min)
5466                 sayNO;
5467             PL_reginput = locinput = HOPc(locinput, -1);
5468             goto curly_try_B_max;
5469
5470 #undef ST
5471
5472         case END:
5473             fake_end:
5474             if (cur_eval) {
5475                 /* we've just finished A in /(??{A})B/; now continue with B */
5476                 I32 tmpix;
5477                 st->u.eval.toggle_reg_flags
5478                             = cur_eval->u.eval.toggle_reg_flags;
5479                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5480
5481                 st->u.eval.prev_rex = rex_sv;           /* inner */
5482                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5483                 rex = (struct regexp *)SvANY(rex_sv);
5484                 rexi = RXi_GET(rex);
5485                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5486                 ReREFCNT_inc(rex_sv);
5487                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5488
5489                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5490                 PL_reglastparen = &rex->lastparen;
5491                 PL_reglastcloseparen = &rex->lastcloseparen;
5492
5493                 REGCP_SET(st->u.eval.lastcp);
5494                 PL_reginput = locinput;
5495
5496                 /* Restore parens of the outer rex without popping the
5497                  * savestack */
5498                 tmpix = PL_savestack_ix;
5499                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5500                 regcppop(rex);
5501                 PL_savestack_ix = tmpix;
5502
5503                 st->u.eval.prev_eval = cur_eval;
5504                 cur_eval = cur_eval->u.eval.prev_eval;
5505                 DEBUG_EXECUTE_r(
5506                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5507                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5508                 if ( nochange_depth )
5509                     nochange_depth--;
5510
5511                 PUSH_YES_STATE_GOTO(EVAL_AB,
5512                         st->u.eval.prev_eval->u.eval.B); /* match B */
5513             }
5514
5515             if (locinput < reginfo->till) {
5516                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5517                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5518                                       PL_colors[4],
5519                                       (long)(locinput - PL_reg_starttry),
5520                                       (long)(reginfo->till - PL_reg_starttry),
5521                                       PL_colors[5]));
5522
5523                 sayNO_SILENT;           /* Cannot match: too short. */
5524             }
5525             PL_reginput = locinput;     /* put where regtry can find it */
5526             sayYES;                     /* Success! */
5527
5528         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5529             DEBUG_EXECUTE_r(
5530             PerlIO_printf(Perl_debug_log,
5531                 "%*s  %ssubpattern success...%s\n",
5532                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5533             PL_reginput = locinput;     /* put where regtry can find it */
5534             sayYES;                     /* Success! */
5535
5536 #undef  ST
5537 #define ST st->u.ifmatch
5538
5539         case SUSPEND:   /* (?>A) */
5540             ST.wanted = 1;
5541             PL_reginput = locinput;
5542             goto do_ifmatch;
5543
5544         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5545             ST.wanted = 0;
5546             goto ifmatch_trivial_fail_test;
5547
5548         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5549             ST.wanted = 1;
5550           ifmatch_trivial_fail_test:
5551             if (scan->flags) {
5552                 char * const s = HOPBACKc(locinput, scan->flags);
5553                 if (!s) {
5554                     /* trivial fail */
5555                     if (logical) {
5556                         logical = 0;
5557                         sw = 1 - cBOOL(ST.wanted);
5558                     }
5559                     else if (ST.wanted)
5560                         sayNO;
5561                     next = scan + ARG(scan);
5562                     if (next == scan)
5563                         next = NULL;
5564                     break;
5565                 }
5566                 PL_reginput = s;
5567             }
5568             else
5569                 PL_reginput = locinput;
5570
5571           do_ifmatch:
5572             ST.me = scan;
5573             ST.logical = logical;
5574             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5575
5576             /* execute body of (?...A) */
5577             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5578             /* NOTREACHED */
5579
5580         case IFMATCH_A_fail: /* body of (?...A) failed */
5581             ST.wanted = !ST.wanted;
5582             /* FALL THROUGH */
5583
5584         case IFMATCH_A: /* body of (?...A) succeeded */
5585             if (ST.logical) {
5586                 sw = cBOOL(ST.wanted);
5587             }
5588             else if (!ST.wanted)
5589                 sayNO;
5590
5591             if (OP(ST.me) == SUSPEND)
5592                 locinput = PL_reginput;
5593             else {
5594                 locinput = PL_reginput = st->locinput;
5595                 nextchr = UCHARAT(locinput);
5596             }
5597             scan = ST.me + ARG(ST.me);
5598             if (scan == ST.me)
5599                 scan = NULL;
5600             continue; /* execute B */
5601
5602 #undef ST
5603
5604         case LONGJMP:
5605             next = scan + ARG(scan);
5606             if (next == scan)
5607                 next = NULL;
5608             break;
5609         case COMMIT:
5610             reginfo->cutpoint = PL_regeol;
5611             /* FALLTHROUGH */
5612         case PRUNE:
5613             PL_reginput = locinput;
5614             if (!scan->flags)
5615                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5616             PUSH_STATE_GOTO(COMMIT_next,next);
5617             /* NOTREACHED */
5618         case COMMIT_next_fail:
5619             no_final = 1;
5620             /* FALLTHROUGH */
5621         case OPFAIL:
5622             sayNO;
5623             /* NOTREACHED */
5624
5625 #define ST st->u.mark
5626         case MARKPOINT:
5627             ST.prev_mark = mark_state;
5628             ST.mark_name = sv_commit = sv_yes_mark
5629                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5630             mark_state = st;
5631             ST.mark_loc = PL_reginput = locinput;
5632             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5633             /* NOTREACHED */
5634         case MARKPOINT_next:
5635             mark_state = ST.prev_mark;
5636             sayYES;
5637             /* NOTREACHED */
5638         case MARKPOINT_next_fail:
5639             if (popmark && sv_eq(ST.mark_name,popmark))
5640             {
5641                 if (ST.mark_loc > startpoint)
5642                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5643                 popmark = NULL; /* we found our mark */
5644                 sv_commit = ST.mark_name;
5645
5646                 DEBUG_EXECUTE_r({
5647                         PerlIO_printf(Perl_debug_log,
5648                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5649                             REPORT_CODE_OFF+depth*2, "",
5650                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5651                 });
5652             }
5653             mark_state = ST.prev_mark;
5654             sv_yes_mark = mark_state ?
5655                 mark_state->u.mark.mark_name : NULL;
5656             sayNO;
5657             /* NOTREACHED */
5658         case SKIP:
5659             PL_reginput = locinput;
5660             if (scan->flags) {
5661                 /* (*SKIP) : if we fail we cut here*/
5662                 ST.mark_name = NULL;
5663                 ST.mark_loc = locinput;
5664                 PUSH_STATE_GOTO(SKIP_next,next);
5665             } else {
5666                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5667                    otherwise do nothing.  Meaning we need to scan
5668                  */
5669                 regmatch_state *cur = mark_state;
5670                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5671
5672                 while (cur) {
5673                     if ( sv_eq( cur->u.mark.mark_name,
5674                                 find ) )
5675                     {
5676                         ST.mark_name = find;
5677                         PUSH_STATE_GOTO( SKIP_next, next );
5678                     }
5679                     cur = cur->u.mark.prev_mark;
5680                 }
5681             }
5682             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5683             break;
5684         case SKIP_next_fail:
5685             if (ST.mark_name) {
5686                 /* (*CUT:NAME) - Set up to search for the name as we
5687                    collapse the stack*/
5688                 popmark = ST.mark_name;
5689             } else {
5690                 /* (*CUT) - No name, we cut here.*/
5691                 if (ST.mark_loc > startpoint)
5692                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5693                 /* but we set sv_commit to latest mark_name if there
5694                    is one so they can test to see how things lead to this
5695                    cut */
5696                 if (mark_state)
5697                     sv_commit=mark_state->u.mark.mark_name;
5698             }
5699             no_final = 1;
5700             sayNO;
5701             /* NOTREACHED */
5702 #undef ST
5703         case FOLDCHAR:
5704             n = ARG(scan);
5705             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5706                 locinput += ln;
5707             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5708                 sayNO;
5709             } else  {
5710                 U8 folded[UTF8_MAXBYTES_CASE+1];
5711                 STRLEN foldlen;
5712                 const char * const l = locinput;
5713                 char *e = PL_regeol;
5714                 to_uni_fold(n, folded, &foldlen);
5715
5716                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5717                                l, &e, 0,  utf8_target)) {
5718                         sayNO;
5719                 }
5720                 locinput = e;
5721             }
5722             nextchr = UCHARAT(locinput);
5723             break;
5724         case LNBREAK:
5725             if ((n=is_LNBREAK(locinput,utf8_target))) {
5726                 locinput += n;
5727                 nextchr = UCHARAT(locinput);
5728             } else
5729                 sayNO;
5730             break;
5731
5732 #define CASE_CLASS(nAmE)                              \
5733         case nAmE:                                    \
5734             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5735                 locinput += n;                        \
5736                 nextchr = UCHARAT(locinput);          \
5737             } else                                    \
5738                 sayNO;                                \
5739             break;                                    \
5740         case N##nAmE:                                 \
5741             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5742                 sayNO;                                \
5743             } else {                                  \
5744                 locinput += UTF8SKIP(locinput);       \
5745                 nextchr = UCHARAT(locinput);          \
5746             }                                         \
5747             break
5748
5749         CASE_CLASS(VERTWS);
5750         CASE_CLASS(HORIZWS);
5751 #undef CASE_CLASS
5752
5753         default:
5754             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5755                           PTR2UV(scan), OP(scan));
5756             Perl_croak(aTHX_ "regexp memory corruption");
5757
5758         } /* end switch */
5759
5760         /* switch break jumps here */
5761         scan = next; /* prepare to execute the next op and ... */
5762         continue;    /* ... jump back to the top, reusing st */
5763         /* NOTREACHED */
5764
5765       push_yes_state:
5766         /* push a state that backtracks on success */
5767         st->u.yes.prev_yes_state = yes_state;
5768         yes_state = st;
5769         /* FALL THROUGH */
5770       push_state:
5771         /* push a new regex state, then continue at scan  */
5772         {
5773             regmatch_state *newst;
5774
5775             DEBUG_STACK_r({
5776                 regmatch_state *cur = st;
5777                 regmatch_state *curyes = yes_state;
5778                 int curd = depth;
5779                 regmatch_slab *slab = PL_regmatch_slab;
5780                 for (;curd > -1;cur--,curd--) {
5781                     if (cur < SLAB_FIRST(slab)) {
5782                         slab = slab->prev;
5783                         cur = SLAB_LAST(slab);
5784                     }
5785                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5786                         REPORT_CODE_OFF + 2 + depth * 2,"",
5787                         curd, PL_reg_name[cur->resume_state],
5788                         (curyes == cur) ? "yes" : ""
5789                     );
5790                     if (curyes == cur)
5791                         curyes = cur->u.yes.prev_yes_state;
5792                 }
5793             } else
5794                 DEBUG_STATE_pp("push")
5795             );
5796             depth++;
5797             st->locinput = locinput;
5798             newst = st+1;
5799             if (newst >  SLAB_LAST(PL_regmatch_slab))
5800                 newst = S_push_slab(aTHX);
5801             PL_regmatch_state = newst;
5802
5803             locinput = PL_reginput;
5804             nextchr = UCHARAT(locinput);
5805             st = newst;
5806             continue;
5807             /* NOTREACHED */
5808         }
5809     }
5810
5811     /*
5812     * We get here only if there's trouble -- normally "case END" is
5813     * the terminating point.
5814     */
5815     Perl_croak(aTHX_ "corrupted regexp pointers");
5816     /*NOTREACHED*/
5817     sayNO;
5818
5819 yes:
5820     if (yes_state) {
5821         /* we have successfully completed a subexpression, but we must now
5822          * pop to the state marked by yes_state and continue from there */
5823         assert(st != yes_state);
5824 #ifdef DEBUGGING
5825         while (st != yes_state) {
5826             st--;
5827             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5828                 PL_regmatch_slab = PL_regmatch_slab->prev;
5829                 st = SLAB_LAST(PL_regmatch_slab);
5830             }
5831             DEBUG_STATE_r({
5832                 if (no_final) {
5833                     DEBUG_STATE_pp("pop (no final)");
5834                 } else {
5835                     DEBUG_STATE_pp("pop (yes)");
5836                 }
5837             });
5838             depth--;
5839         }
5840 #else
5841         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5842             || yes_state > SLAB_LAST(PL_regmatch_slab))
5843         {
5844             /* not in this slab, pop slab */
5845             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5846             PL_regmatch_slab = PL_regmatch_slab->prev;
5847             st = SLAB_LAST(PL_regmatch_slab);
5848         }
5849         depth -= (st - yes_state);
5850 #endif
5851         st = yes_state;
5852         yes_state = st->u.yes.prev_yes_state;
5853         PL_regmatch_state = st;
5854
5855         if (no_final) {
5856             locinput= st->locinput;
5857             nextchr = UCHARAT(locinput);
5858         }
5859         state_num = st->resume_state + no_final;
5860         goto reenter_switch;
5861     }
5862
5863     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5864                           PL_colors[4], PL_colors[5]));
5865
5866     if (PL_reg_eval_set) {
5867         /* each successfully executed (?{...}) block does the equivalent of
5868          *   local $^R = do {...}
5869          * When popping the save stack, all these locals would be undone;
5870          * bypass this by setting the outermost saved $^R to the latest
5871          * value */
5872         if (oreplsv != GvSV(PL_replgv))
5873             sv_setsv(oreplsv, GvSV(PL_replgv));
5874     }
5875     result = 1;
5876     goto final_exit;
5877
5878 no:
5879     DEBUG_EXECUTE_r(
5880         PerlIO_printf(Perl_debug_log,
5881             "%*s  %sfailed...%s\n",
5882             REPORT_CODE_OFF+depth*2, "",
5883             PL_colors[4], PL_colors[5])
5884         );
5885
5886 no_silent:
5887     if (no_final) {
5888         if (yes_state) {
5889             goto yes;
5890         } else {
5891             goto final_exit;
5892         }
5893     }
5894     if (depth) {
5895         /* there's a previous state to backtrack to */
5896         st--;
5897         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5898             PL_regmatch_slab = PL_regmatch_slab->prev;
5899             st = SLAB_LAST(PL_regmatch_slab);
5900         }
5901         PL_regmatch_state = st;
5902         locinput= st->locinput;
5903         nextchr = UCHARAT(locinput);
5904
5905         DEBUG_STATE_pp("pop");
5906         depth--;
5907         if (yes_state == st)
5908             yes_state = st->u.yes.prev_yes_state;
5909
5910         state_num = st->resume_state + 1; /* failure = success + 1 */
5911         goto reenter_switch;
5912     }
5913     result = 0;
5914
5915   final_exit:
5916     if (rex->intflags & PREGf_VERBARG_SEEN) {
5917         SV *sv_err = get_sv("REGERROR", 1);
5918         SV *sv_mrk = get_sv("REGMARK", 1);
5919         if (result) {
5920             sv_commit = &PL_sv_no;
5921             if (!sv_yes_mark)
5922                 sv_yes_mark = &PL_sv_yes;
5923         } else {
5924             if (!sv_commit)
5925                 sv_commit = &PL_sv_yes;
5926             sv_yes_mark = &PL_sv_no;
5927         }
5928         sv_setsv(sv_err, sv_commit);
5929         sv_setsv(sv_mrk, sv_yes_mark);
5930     }
5931
5932     /* clean up; in particular, free all slabs above current one */
5933     LEAVE_SCOPE(oldsave);
5934
5935     return result;
5936 }
5937
5938 /*
5939  - regrepeat - repeatedly match something simple, report how many
5940  */
5941 /*
5942  * [This routine now assumes that it will only match on things of length 1.
5943  * That was true before, but now we assume scan - reginput is the count,
5944  * rather than incrementing count on every character.  [Er, except utf8.]]
5945  */
5946 STATIC I32
5947 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5948 {
5949     dVAR;
5950     register char *scan;
5951     register I32 c;
5952     register char *loceol = PL_regeol;
5953     register I32 hardcount = 0;
5954     register bool utf8_target = PL_reg_match_utf8;
5955 #ifndef DEBUGGING
5956     PERL_UNUSED_ARG(depth);
5957 #endif
5958
5959     PERL_ARGS_ASSERT_REGREPEAT;
5960
5961     scan = PL_reginput;
5962     if (max == REG_INFTY)
5963         max = I32_MAX;
5964     else if (max < loceol - scan)
5965         loceol = scan + max;
5966     switch (OP(p)) {
5967     case REG_ANY:
5968         if (utf8_target) {
5969             loceol = PL_regeol;
5970             while (scan < loceol && hardcount < max && *scan != '\n') {
5971                 scan += UTF8SKIP(scan);
5972                 hardcount++;
5973             }
5974         } else {
5975             while (scan < loceol && *scan != '\n')
5976                 scan++;
5977         }
5978         break;
5979     case SANY:
5980         if (utf8_target) {
5981             loceol = PL_regeol;
5982             while (scan < loceol && hardcount < max) {
5983                 scan += UTF8SKIP(scan);
5984                 hardcount++;
5985             }
5986         }
5987         else
5988             scan = loceol;
5989         break;
5990     case CANY:
5991         scan = loceol;
5992         break;
5993     case EXACT:
5994         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5995          * means they match only characters in the string that can be expressed
5996          * as a single byte.  For non-utf8 strings, that means a simple match.
5997          * For utf8 strings, the character matched must be an invariant, or
5998          * downgradable to a single byte.  The pattern's utf8ness is
5999          * irrelevant, as since it's a single byte, it either isn't utf8, or if
6000          * it is, it's an invariant */
6001
6002         c = (U8)*STRING(p);
6003         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6004
6005         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6006             while (scan < loceol && UCHARAT(scan) == c) {
6007                 scan++;
6008             }
6009         }
6010         else {
6011
6012             /* Here, the string is utf8, and the pattern char is different
6013              * in utf8 than not, so can't compare them directly.  Outside the
6014              * loop, find find the two utf8 bytes that represent c, and then
6015              * look for those in sequence in the utf8 string */
6016             U8 high = UTF8_TWO_BYTE_HI(c);
6017             U8 low = UTF8_TWO_BYTE_LO(c);
6018             loceol = PL_regeol;
6019
6020             while (hardcount < max
6021                     && scan + 1 < loceol
6022                     && UCHARAT(scan) == high
6023                     && UCHARAT(scan + 1) == low)
6024             {
6025                 scan += 2;
6026                 hardcount++;
6027             }
6028         }
6029         break;
6030     case EXACTFL:
6031         PL_reg_flags |= RF_tainted;
6032         /* FALL THROUGH */
6033     case EXACTF:
6034     case EXACTFU:
6035
6036         /* The comments for the EXACT case above apply as well to these fold
6037          * ones */
6038
6039         c = (U8)*STRING(p);
6040         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6041
6042         if (utf8_target) { /* Use full Unicode fold matching */
6043
6044             /* For the EXACTFL case, It doesn't really make sense to compare
6045              * locale and utf8, but it is best we can do.  The documents warn
6046              * against mixing them */
6047
6048             char *tmpeol = loceol;
6049             while (hardcount < max
6050                     && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
6051                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN)))
6052             {
6053                 scan = tmpeol;
6054                 tmpeol = loceol;
6055                 hardcount++;
6056             }
6057
6058             /* XXX Note that the above handles properly the German sharp s in
6059              * the pattern matching ss in the string.  But it doesn't handle
6060              * properly cases where the string contains say 'LIGATURE ff' and
6061              * the pattern is 'f+'.  This would require, say, a new function or
6062              * revised interface to foldEQ_utf8(), in which the maximum number
6063              * of characters to match could be passed and it would return how
6064              * many actually did.  This is just one of many cases where
6065              * multi-char folds don't work properly, and so the fix is being
6066              * deferred */
6067         }
6068         else {
6069             U8 folded;
6070
6071             /* Here, the string isn't utf8 and c is a single byte; and either
6072              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6073              * doesn't affect c.  Can just do simple comparisons for exact or
6074              * fold matching. */
6075             switch (OP(p)) {
6076                 case EXACTF: folded = PL_fold[c]; break;
6077                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6078                 case EXACTFL: folded = PL_fold_locale[c]; break;
6079                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6080             }
6081             while (scan < loceol &&
6082                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6083             {
6084                 scan++;
6085             }
6086         }
6087         break;
6088     case ANYOFV:
6089     case ANYOF:
6090         if (utf8_target) {
6091             loceol = PL_regeol;
6092             while (hardcount < max && scan < loceol &&
6093                    reginclass(prog, p, (U8*)scan, 0, utf8_target)) {
6094                 scan += UTF8SKIP(scan);
6095                 hardcount++;
6096             }
6097         } else {
6098             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6099                 scan++;
6100         }
6101         break;
6102     case ALNUMU:
6103         if (utf8_target) {
6104     utf8_wordchar:
6105             loceol = PL_regeol;
6106             LOAD_UTF8_CHARCLASS_ALNUM();
6107             while (hardcount < max && scan < loceol &&
6108                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6109             {
6110                 scan += UTF8SKIP(scan);
6111                 hardcount++;
6112             }
6113         } else {
6114             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6115                 scan++;
6116             }
6117         }
6118         break;
6119     case ALNUM:
6120         if (utf8_target)
6121             goto utf8_wordchar;
6122         while (scan < loceol && isALNUM((U8) *scan)) {
6123             scan++;
6124         }
6125         break;
6126     case ALNUMA:
6127         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6128             scan++;
6129         }
6130         break;
6131     case ALNUML:
6132         PL_reg_flags |= RF_tainted;
6133         if (utf8_target) {
6134             loceol = PL_regeol;
6135             while (hardcount < max && scan < loceol &&
6136                    isALNUM_LC_utf8((U8*)scan)) {
6137                 scan += UTF8SKIP(scan);
6138                 hardcount++;
6139             }
6140         } else {
6141             while (scan < loceol && isALNUM_LC(*scan))
6142                 scan++;
6143         }
6144         break;
6145     case NALNUMU:
6146         if (utf8_target) {
6147
6148     utf8_Nwordchar:
6149
6150             loceol = PL_regeol;
6151             LOAD_UTF8_CHARCLASS_ALNUM();
6152             while (hardcount < max && scan < loceol &&
6153                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6154             {
6155                 scan += UTF8SKIP(scan);
6156                 hardcount++;
6157             }
6158         } else {
6159             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6160                 scan++;
6161             }
6162         }
6163         break;
6164     case NALNUM:
6165         if (utf8_target)
6166             goto utf8_Nwordchar;
6167         while (scan < loceol && ! isALNUM((U8) *scan)) {
6168             scan++;
6169         }
6170         break;
6171     case NALNUMA:
6172         if (utf8_target) {
6173             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6174                 scan += UTF8SKIP(scan);
6175             }
6176         }
6177         else {
6178             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6179                 scan++;
6180             }
6181         }
6182         break;
6183     case NALNUML:
6184         PL_reg_flags |= RF_tainted;
6185         if (utf8_target) {
6186             loceol = PL_regeol;
6187             while (hardcount < max && scan < loceol &&
6188                    !isALNUM_LC_utf8((U8*)scan)) {
6189                 scan += UTF8SKIP(scan);
6190                 hardcount++;
6191             }
6192         } else {
6193             while (scan < loceol && !isALNUM_LC(*scan))
6194                 scan++;
6195         }
6196         break;
6197     case SPACEU:
6198         if (utf8_target) {
6199
6200     utf8_space:
6201
6202             loceol = PL_regeol;
6203             LOAD_UTF8_CHARCLASS_SPACE();
6204             while (hardcount < max && scan < loceol &&
6205                    (*scan == ' ' ||
6206                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6207             {
6208                 scan += UTF8SKIP(scan);
6209                 hardcount++;
6210             }
6211             break;
6212         }
6213         else {
6214             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6215                 scan++;
6216             }
6217             break;
6218         }
6219     case SPACE:
6220         if (utf8_target)
6221             goto utf8_space;
6222
6223         while (scan < loceol && isSPACE((U8) *scan)) {
6224             scan++;
6225         }
6226         break;
6227     case SPACEA:
6228         while (scan < loceol && isSPACE_A((U8) *scan)) {
6229             scan++;
6230         }
6231         break;
6232     case SPACEL:
6233         PL_reg_flags |= RF_tainted;
6234         if (utf8_target) {
6235             loceol = PL_regeol;
6236             while (hardcount < max && scan < loceol &&
6237                    isSPACE_LC_utf8((U8*)scan)) {
6238                 scan += UTF8SKIP(scan);
6239                 hardcount++;
6240             }
6241         } else {
6242             while (scan < loceol && isSPACE_LC(*scan))
6243                 scan++;
6244         }
6245         break;
6246     case NSPACEU:
6247         if (utf8_target) {
6248
6249     utf8_Nspace:
6250
6251             loceol = PL_regeol;
6252             LOAD_UTF8_CHARCLASS_SPACE();
6253             while (hardcount < max && scan < loceol &&
6254                    ! (*scan == ' ' ||
6255                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6256             {
6257                 scan += UTF8SKIP(scan);
6258                 hardcount++;
6259             }
6260             break;
6261         }
6262         else {
6263             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6264                 scan++;
6265             }
6266         }
6267         break;
6268     case NSPACE:
6269         if (utf8_target)
6270             goto utf8_Nspace;
6271
6272         while (scan < loceol && ! isSPACE((U8) *scan)) {
6273             scan++;
6274         }
6275         break;
6276     case NSPACEA:
6277         if (utf8_target) {
6278             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6279                 scan += UTF8SKIP(scan);
6280             }
6281         }
6282         else {
6283             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6284                 scan++;
6285             }
6286         }
6287         break;
6288     case NSPACEL:
6289         PL_reg_flags |= RF_tainted;
6290         if (utf8_target) {
6291             loceol = PL_regeol;
6292             while (hardcount < max && scan < loceol &&
6293                    !isSPACE_LC_utf8((U8*)scan)) {
6294                 scan += UTF8SKIP(scan);
6295                 hardcount++;
6296             }
6297         } else {
6298             while (scan < loceol && !isSPACE_LC(*scan))
6299                 scan++;
6300         }
6301         break;
6302     case DIGIT:
6303         if (utf8_target) {
6304             loceol = PL_regeol;
6305             LOAD_UTF8_CHARCLASS_DIGIT();
6306             while (hardcount < max && scan < loceol &&
6307                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6308                 scan += UTF8SKIP(scan);
6309                 hardcount++;
6310             }
6311         } else {
6312             while (scan < loceol && isDIGIT(*scan))
6313                 scan++;
6314         }
6315         break;
6316     case DIGITA:
6317         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6318             scan++;
6319         }
6320         break;
6321     case DIGITL:
6322         PL_reg_flags |= RF_tainted;
6323         if (utf8_target) {
6324             loceol = PL_regeol;
6325             while (hardcount < max && scan < loceol &&
6326                    isDIGIT_LC_utf8((U8*)scan)) {
6327                 scan += UTF8SKIP(scan);
6328                 hardcount++;
6329             }
6330         } else {
6331             while (scan < loceol && isDIGIT_LC(*scan))
6332                 scan++;
6333         }
6334         break;
6335     case NDIGIT:
6336         if (utf8_target) {
6337             loceol = PL_regeol;
6338             LOAD_UTF8_CHARCLASS_DIGIT();
6339             while (hardcount < max && scan < loceol &&
6340                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6341                 scan += UTF8SKIP(scan);
6342                 hardcount++;
6343             }
6344         } else {
6345             while (scan < loceol && !isDIGIT(*scan))
6346                 scan++;
6347         }
6348         break;
6349     case NDIGITA:
6350         if (utf8_target) {
6351             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6352                 scan += UTF8SKIP(scan);
6353             }
6354         }
6355         else {
6356             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6357                 scan++;
6358             }
6359         }
6360         break;
6361     case NDIGITL:
6362         PL_reg_flags |= RF_tainted;
6363         if (utf8_target) {
6364             loceol = PL_regeol;
6365             while (hardcount < max && scan < loceol &&
6366                    !isDIGIT_LC_utf8((U8*)scan)) {
6367                 scan += UTF8SKIP(scan);
6368                 hardcount++;
6369             }
6370         } else {
6371             while (scan < loceol && !isDIGIT_LC(*scan))
6372                 scan++;
6373         }
6374         break;
6375     case LNBREAK:
6376         if (utf8_target) {
6377             loceol = PL_regeol;
6378             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6379                 scan += c;
6380                 hardcount++;
6381             }
6382         } else {
6383             /*
6384               LNBREAK can match two latin chars, which is ok,
6385               because we have a null terminated string, but we
6386               have to use hardcount in this situation
6387             */
6388             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6389                 scan+=c;
6390                 hardcount++;
6391             }
6392         }
6393         break;
6394     case HORIZWS:
6395         if (utf8_target) {
6396             loceol = PL_regeol;
6397             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6398                 scan += c;
6399                 hardcount++;
6400             }
6401         } else {
6402             while (scan < loceol && is_HORIZWS_latin1(scan))
6403                 scan++;
6404         }
6405         break;
6406     case NHORIZWS:
6407         if (utf8_target) {
6408             loceol = PL_regeol;
6409             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6410                 scan += UTF8SKIP(scan);
6411                 hardcount++;
6412             }
6413         } else {
6414             while (scan < loceol && !is_HORIZWS_latin1(scan))
6415                 scan++;
6416
6417         }
6418         break;
6419     case VERTWS:
6420         if (utf8_target) {
6421             loceol = PL_regeol;
6422             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6423                 scan += c;
6424                 hardcount++;
6425             }
6426         } else {
6427             while (scan < loceol && is_VERTWS_latin1(scan))
6428                 scan++;
6429
6430         }
6431         break;
6432     case NVERTWS:
6433         if (utf8_target) {
6434             loceol = PL_regeol;
6435             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6436                 scan += UTF8SKIP(scan);
6437                 hardcount++;
6438             }
6439         } else {
6440             while (scan < loceol && !is_VERTWS_latin1(scan))
6441                 scan++;
6442
6443         }
6444         break;
6445
6446     default:            /* Called on something of 0 width. */
6447         break;          /* So match right here or not at all. */
6448     }
6449
6450     if (hardcount)
6451         c = hardcount;
6452     else
6453         c = scan - PL_reginput;
6454     PL_reginput = scan;
6455
6456     DEBUG_r({
6457         GET_RE_DEBUG_FLAGS_DECL;
6458         DEBUG_EXECUTE_r({
6459             SV * const prop = sv_newmortal();
6460             regprop(prog, prop, p);
6461             PerlIO_printf(Perl_debug_log,
6462                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6463                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6464         });
6465     });
6466
6467     return(c);
6468 }
6469
6470
6471 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6472 /*
6473 - regclass_swash - prepare the utf8 swash
6474 */
6475
6476 SV *
6477 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6478 {
6479     dVAR;
6480     SV *sw  = NULL;
6481     SV *si  = NULL;
6482     SV *alt = NULL;
6483     RXi_GET_DECL(prog,progi);
6484     const struct reg_data * const data = prog ? progi->data : NULL;
6485
6486     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6487
6488     if (data && data->count) {
6489         const U32 n = ARG(node);
6490
6491         if (data->what[n] == 's') {
6492             SV * const rv = MUTABLE_SV(data->data[n]);
6493             AV * const av = MUTABLE_AV(SvRV(rv));
6494             SV **const ary = AvARRAY(av);
6495             SV **a, **b;
6496
6497             /* See the end of regcomp.c:S_regclass() for
6498              * documentation of these array elements. */
6499
6500             si = *ary;
6501             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6502             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6503
6504             if (a)
6505                 sw = *a;
6506             else if (si && doinit) {
6507                 sw = swash_init("utf8", "", si, 1, 0);
6508                 (void)av_store(av, 1, sw);
6509             }
6510             if (b)
6511                 alt = *b;
6512         }
6513     }
6514
6515     if (listsvp)
6516         *listsvp = si;
6517     if (altsvp)
6518         *altsvp  = alt;
6519
6520     return sw;
6521 }
6522 #endif
6523
6524 /*
6525  - reginclass - determine if a character falls into a character class
6526
6527   n is the ANYOF regnode
6528   p is the target string
6529   lenp is pointer to the maximum number of bytes of how far to go in p
6530     (This is assumed wthout checking to always be at least the current
6531     character's size)
6532   utf8_target tells whether p is in UTF-8.
6533
6534   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6535   from a successful match, the value it points to will be updated to how many
6536   bytes in p were matched.  If there was no match, the value is undefined,
6537   possibly changed from the input.
6538
6539   Note that this can be a synthetic start class, a combination of various
6540   nodes, so things you think might be mutually exclusive, such as locale,
6541   aren't.  It can match both locale and non-locale
6542
6543  */
6544
6545 STATIC bool
6546 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6547 {
6548     dVAR;
6549     const char flags = ANYOF_FLAGS(n);
6550     bool match = FALSE;
6551     UV c = *p;
6552     STRLEN c_len = 0;
6553     STRLEN maxlen;
6554
6555     PERL_ARGS_ASSERT_REGINCLASS;
6556
6557     /* If c is not already the code point, get it */
6558     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6559         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6560                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6561                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6562                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6563                  * UTF8_ALLOW_FFFF */
6564         if (c_len == (STRLEN)-1)
6565             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6566     }
6567     else {
6568         c_len = 1;
6569     }
6570
6571     /* Use passed in max length, or one character if none passed in or less
6572      * than one character.  And assume will match just one character.  This is
6573      * overwritten later if matched more. */
6574     if (lenp) {
6575         maxlen = (*lenp > c_len) ? *lenp : c_len;
6576         *lenp = c_len;
6577
6578     }
6579     else {
6580         maxlen = c_len;
6581     }
6582
6583     /* If this character is potentially in the bitmap, check it */
6584     if (c < 256) {
6585         if (ANYOF_BITMAP_TEST(n, c))
6586             match = TRUE;
6587         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6588                 && ! utf8_target
6589                 && ! isASCII(c))
6590         {
6591             match = TRUE;
6592         }
6593
6594         else if (flags & ANYOF_LOCALE) {
6595             PL_reg_flags |= RF_tainted;
6596
6597             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6598                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6599             {
6600                 match = TRUE;
6601             }
6602             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6603                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6604                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6605                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6606                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6607                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6608                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6609                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6610                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6611                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6612                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6613                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6614                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6615                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6616                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6617                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6618                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6619                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6620                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6621                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6622                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6623                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6624                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6625                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6626                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6627                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6628                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6629                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6630                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6631                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6632                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6633                      ) /* How's that for a conditional? */
6634             ) {
6635                 match = TRUE;
6636             }
6637         }
6638     }
6639
6640     /* If the bitmap didn't (or couldn't) match, and something outside the
6641      * bitmap could match, try that */
6642     if (!match) {
6643         if (utf8_target && (flags & ANYOF_UNICODE_ALL)) {
6644             if (c >= 256
6645                 || ((flags & ANYOF_LOC_NONBITMAP_FOLD) /* Latin1 1 that has a
6646                                                           non-Latin1 fold
6647                                                           should match */
6648                     && _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c)))
6649             {
6650                 match = TRUE;
6651             }
6652         }
6653         if (!match && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6654                        || (utf8_target && flags & ANYOF_UTF8)))
6655         {
6656             AV *av;
6657             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6658
6659             if (sw) {
6660                 U8 * utf8_p;
6661                 if (utf8_target) {
6662                     utf8_p = (U8 *) p;
6663                 } else {
6664
6665                     /* Not utf8.  Convert as much of the string as available up
6666                      * to the limit of how far the (single) character in the
6667                      * pattern can possibly match (no need to go further).  If
6668                      * the node is a straight ANYOF or not folding, it can't
6669                      * match more than one.  Otherwise, It can match up to how
6670                      * far a single char can fold to.  Since not utf8, each
6671                      * character is a single byte, so the max it can be in
6672                      * bytes is the same as the max it can be in characters */
6673                     STRLEN len = (OP(n) == ANYOF
6674                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6675                                   ? 1
6676                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6677                                     ? maxlen
6678                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6679                     utf8_p = bytes_to_utf8(p, &len);
6680                 }
6681
6682                 if (swash_fetch(sw, utf8_p, TRUE))
6683                     match = TRUE;
6684                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6685
6686                     /* Here, we need to test if the fold of the target string
6687                      * matches.  In the case of a multi-char fold that is
6688                      * caught by regcomp.c, it has stored all such folds into
6689                      * 'av'; we linearly check to see if any match the target
6690                      * string (folded).   We know that the originals were each
6691                      * one character, but we don't currently know how many
6692                      * characters/bytes each folded to, except we do know that
6693                      * there are small limits imposed by Unicode.  XXX A
6694                      * performance enhancement would be to have regcomp.c store
6695                      * the max number of chars/bytes that are in an av entry,
6696                      * as, say the 0th element.  Even better would be to have a
6697                      * hash of the few characters that can start a multi-char
6698                      * fold to the max number of chars of those folds.
6699                      *
6700                      * Further down, if there isn't a
6701                      * match in the av, we will check if there is another
6702                      * fold-type match.  For that, we also need the fold, but
6703                      * only the first character.  No sense in folding it twice,
6704                      * so we do it here, even if there isn't any multi-char
6705                      * fold, so we always fold at least the first character.
6706                      * If the node is a straight ANYOF node, or there is only
6707                      * one character available in the string, or if there isn't
6708                      * any av, that's all we have to fold.  In the case of a
6709                      * multi-char fold, we do have guarantees in Unicode that
6710                      * it can only expand up to so many characters and so many
6711                      * bytes.  We keep track so don't exceed either.
6712                      *
6713                      * If there is a match, we will need to advance (if lenp is
6714                      * specified) the match pointer in the target string.  But
6715                      * what we are comparing here isn't that string directly,
6716                      * but its fold, whose length may differ from the original.
6717                      * As we go along in constructing the fold, therefore, we
6718                      * create a map so that we know how many bytes in the
6719                      * source to advance given that we have matched a certain
6720                      * number of bytes in the fold.  This map is stored in
6721                      * 'map_fold_len_back'.  The first character in the fold
6722                      * has array element 1 contain the number of bytes in the
6723                      * source that folded to it; the 2nd is the cumulative
6724                      * number to match it; ... */
6725                     U8 map_fold_len_back[UTF8_MAX_FOLD_CHAR_EXPAND] = { 0 };
6726                     U8 folded[UTF8_MAXBYTES_CASE+1];
6727                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6728                     STRLEN foldlen_for_av; /* num bytes in fold of all chars */
6729
6730                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6731
6732                         /* Here, only need to fold the first char of the target
6733                          * string */
6734                         to_utf8_fold(utf8_p, folded, &foldlen);
6735                         foldlen_for_av = foldlen;
6736                         map_fold_len_back[1] = UTF8SKIP(utf8_p);
6737                     }
6738                     else {
6739
6740                         /* Here, need to fold more than the first char.  Do so
6741                          * up to the limits */
6742                         UV which_char = 0;
6743                         U8* source_ptr = utf8_p;    /* The source for the fold
6744                                                        is the regex target
6745                                                        string */
6746                         U8* folded_ptr = folded;
6747                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6748                                                        available byte in the
6749                                                        target string */
6750                         while (which_char < UTF8_MAX_FOLD_CHAR_EXPAND
6751                                && source_ptr < e)
6752                         {
6753
6754                             /* Fold the next character */
6755                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6756                             STRLEN this_char_foldlen;
6757                             to_utf8_fold(source_ptr,
6758                                          this_char_folded,
6759                                          &this_char_foldlen);
6760
6761                             /* Bail if it would exceed the byte limit for
6762                              * folding a single char. */
6763                             if (this_char_foldlen + folded_ptr - folded >
6764                                                             UTF8_MAXBYTES_CASE)
6765                             {
6766                                 break;
6767                             }
6768
6769                             /* Save the first character's folded length, in
6770                              * case we have to use it later */
6771                             if (! foldlen) {
6772                                 foldlen = this_char_foldlen;
6773                             }
6774
6775                             /* Here, add the fold of this character */
6776                             Copy(this_char_folded,
6777                                  folded_ptr,
6778                                  this_char_foldlen,
6779                                  U8);
6780                             which_char++;
6781                             map_fold_len_back[which_char] =
6782                                 map_fold_len_back[which_char - 1]
6783                                 + UTF8SKIP(source_ptr);
6784                             folded_ptr += this_char_foldlen;
6785                             source_ptr += UTF8SKIP(source_ptr);
6786                         }
6787                         *folded_ptr = '\0';
6788                         foldlen_for_av = folded_ptr - folded;
6789                     }
6790
6791
6792                     /* Do the linear search to see if the fold is in the list
6793                      * of multi-char folds.  (Useless to look if won't be able
6794                      * to store that it is a multi-char fold in *lenp) */
6795                     if (lenp && av) {
6796                         I32 i;
6797                         for (i = 0; i <= av_len(av); i++) {
6798                             SV* const sv = *av_fetch(av, i, FALSE);
6799                             STRLEN len;
6800                             const char * const s = SvPV_const(sv, len);
6801                             if (len <= foldlen_for_av && memEQ(s,
6802                                                                (char*)folded,
6803                                                                len))
6804                             {
6805
6806                                 /* Advance the target string ptr to account for
6807                                  * this fold, but have to translate from the
6808                                  * folded length to the corresponding source
6809                                  * length.  The array is indexed by how many
6810                                  * characters in the match */
6811                                 *lenp = map_fold_len_back[
6812                                         utf8_length(folded, folded + len)];
6813                                 match = TRUE;
6814                                 break;
6815                             }
6816                         }
6817                     }
6818                     if (!match) { /* See if the folded version matches */
6819                         SV** listp;
6820
6821                         /* Consider "k" =~ /[K]/i.  The line above would have
6822                          * just folded the 'k' to itself, and that isn't going
6823                          * to match 'K'.  So we look through the closure of
6824                          * everything that folds to 'k'.  That will find the
6825                          * 'K'.  Initialize the list, if necessary */
6826                         if (! PL_utf8_foldclosures) {
6827
6828                             /* If the folds haven't been read in, call a fold
6829                              * function to force that */
6830                             if (! PL_utf8_tofold) {
6831                                 U8 dummy[UTF8_MAXBYTES+1];
6832                                 STRLEN dummy_len;
6833                                 to_utf8_fold((U8*) "A", dummy, &dummy_len);
6834                             }
6835                             PL_utf8_foldclosures =
6836                                   _swash_inversion_hash(PL_utf8_tofold);
6837                         }
6838
6839                         /* The data structure is a hash with the keys every
6840                          * character that is folded to, like 'k', and the
6841                          * values each an array of everything that folds to its
6842                          * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
6843                         if ((listp = hv_fetch(PL_utf8_foldclosures,
6844                                       (char *) folded, foldlen, FALSE)))
6845                         {
6846                             AV* list = (AV*) *listp;
6847                             IV i;
6848                             for (i = 0; i <= av_len(list); i++) {
6849                                 SV** try_p = av_fetch(list, i, FALSE);
6850                                 char* try_c;
6851                                 if (try_p == NULL) {
6852                                     Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
6853                                 }
6854                                 /* Don't have to worry about embedded nulls
6855                                  * since NULL isn't folded or foldable */
6856                                 try_c = SvPVX(*try_p);
6857
6858                                 /* The fold in a few cases  of an above Latin1
6859                                  * char is in the Latin1 range, and hence may
6860                                  * be in the bitmap */
6861                                 if (UTF8_IS_INVARIANT(*try_c)
6862                                     && ANYOF_BITMAP_TEST(n,
6863                                                     UNI_TO_NATIVE(*try_c)))
6864                                 {
6865                                     match = TRUE;
6866                                     break;
6867                                 }
6868                                 else if
6869                                     (UTF8_IS_DOWNGRADEABLE_START(*try_c)
6870                                      && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(
6871                                                 TWO_BYTE_UTF8_TO_UNI(try_c[0],
6872                                                                     try_c[1]))))
6873                                 {
6874                                    /* Since the fold comes from internally
6875                                     * generated data, we can safely assume it
6876                                     * is valid utf8 in the test above */
6877                                     match = TRUE;
6878                                     break;
6879                                 } else if (swash_fetch(sw, (U8*) try_c, TRUE)) {
6880                                     match = TRUE;
6881                                     break;
6882                                 }
6883                             }
6884                         }
6885                     }
6886                 }
6887
6888                 /* If we allocated a string above, free it */
6889                 if (! utf8_target) Safefree(utf8_p);
6890             }
6891         }
6892     }
6893
6894     return (flags & ANYOF_INVERT) ? !match : match;
6895 }
6896
6897 STATIC U8 *
6898 S_reghop3(U8 *s, I32 off, const U8* lim)
6899 {
6900     dVAR;
6901
6902     PERL_ARGS_ASSERT_REGHOP3;
6903
6904     if (off >= 0) {
6905         while (off-- && s < lim) {
6906             /* XXX could check well-formedness here */
6907             s += UTF8SKIP(s);
6908         }
6909     }
6910     else {
6911         while (off++ && s > lim) {
6912             s--;
6913             if (UTF8_IS_CONTINUED(*s)) {
6914                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6915                     s--;
6916             }
6917             /* XXX could check well-formedness here */
6918         }
6919     }
6920     return s;
6921 }
6922
6923 #ifdef XXX_dmq
6924 /* there are a bunch of places where we use two reghop3's that should
6925    be replaced with this routine. but since thats not done yet
6926    we ifdef it out - dmq
6927 */
6928 STATIC U8 *
6929 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6930 {
6931     dVAR;
6932
6933     PERL_ARGS_ASSERT_REGHOP4;
6934
6935     if (off >= 0) {
6936         while (off-- && s < rlim) {
6937             /* XXX could check well-formedness here */
6938             s += UTF8SKIP(s);
6939         }
6940     }
6941     else {
6942         while (off++ && s > llim) {
6943             s--;
6944             if (UTF8_IS_CONTINUED(*s)) {
6945                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6946                     s--;
6947             }
6948             /* XXX could check well-formedness here */
6949         }
6950     }
6951     return s;
6952 }
6953 #endif
6954
6955 STATIC U8 *
6956 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6957 {
6958     dVAR;
6959
6960     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6961
6962     if (off >= 0) {
6963         while (off-- && s < lim) {
6964             /* XXX could check well-formedness here */
6965             s += UTF8SKIP(s);
6966         }
6967         if (off >= 0)
6968             return NULL;
6969     }
6970     else {
6971         while (off++ && s > lim) {
6972             s--;
6973             if (UTF8_IS_CONTINUED(*s)) {
6974                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6975                     s--;
6976             }
6977             /* XXX could check well-formedness here */
6978         }
6979         if (off <= 0)
6980             return NULL;
6981     }
6982     return s;
6983 }
6984
6985 static void
6986 restore_pos(pTHX_ void *arg)
6987 {
6988     dVAR;
6989     regexp * const rex = (regexp *)arg;
6990     if (PL_reg_eval_set) {
6991         if (PL_reg_oldsaved) {
6992             rex->subbeg = PL_reg_oldsaved;
6993             rex->sublen = PL_reg_oldsavedlen;
6994 #ifdef PERL_OLD_COPY_ON_WRITE
6995             rex->saved_copy = PL_nrs;
6996 #endif
6997             RXp_MATCH_COPIED_on(rex);
6998         }
6999         PL_reg_magic->mg_len = PL_reg_oldpos;
7000         PL_reg_eval_set = 0;
7001         PL_curpm = PL_reg_oldcurpm;
7002     }
7003 }
7004
7005 STATIC void
7006 S_to_utf8_substr(pTHX_ register regexp *prog)
7007 {
7008     int i = 1;
7009
7010     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7011
7012     do {
7013         if (prog->substrs->data[i].substr
7014             && !prog->substrs->data[i].utf8_substr) {
7015             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7016             prog->substrs->data[i].utf8_substr = sv;
7017             sv_utf8_upgrade(sv);
7018             if (SvVALID(prog->substrs->data[i].substr)) {
7019                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
7020                 if (flags & FBMcf_TAIL) {
7021                     /* Trim the trailing \n that fbm_compile added last
7022                        time.  */
7023                     SvCUR_set(sv, SvCUR(sv) - 1);
7024                     /* Whilst this makes the SV technically "invalid" (as its
7025                        buffer is no longer followed by "\0") when fbm_compile()
7026                        adds the "\n" back, a "\0" is restored.  */
7027                 }
7028                 fbm_compile(sv, flags);
7029             }
7030             if (prog->substrs->data[i].substr == prog->check_substr)
7031                 prog->check_utf8 = sv;
7032         }
7033     } while (i--);
7034 }
7035
7036 STATIC void
7037 S_to_byte_substr(pTHX_ register regexp *prog)
7038 {
7039     dVAR;
7040     int i = 1;
7041
7042     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7043
7044     do {
7045         if (prog->substrs->data[i].utf8_substr
7046             && !prog->substrs->data[i].substr) {
7047             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7048             if (sv_utf8_downgrade(sv, TRUE)) {
7049                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7050                     const U8 flags
7051                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
7052                     if (flags & FBMcf_TAIL) {
7053                         /* Trim the trailing \n that fbm_compile added last
7054                            time.  */
7055                         SvCUR_set(sv, SvCUR(sv) - 1);
7056                     }
7057                     fbm_compile(sv, flags);
7058                 }
7059             } else {
7060                 SvREFCNT_dec(sv);
7061                 sv = &PL_sv_undef;
7062             }
7063             prog->substrs->data[i].substr = sv;
7064             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7065                 prog->check_substr = sv;
7066         }
7067     } while (i--);
7068 }
7069
7070 /*
7071  * Local variables:
7072  * c-indentation-style: bsd
7073  * c-basic-offset: 4
7074  * indent-tabs-mode: t
7075  * End:
7076  *
7077  * ex: set ts=8 sts=4 sw=4 noet:
7078  */