src/5013010/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1       /* tainted information used? e.g. locale */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  99  * call if there are no complications: i.e., if everything matchable is
 100  * straight forward in the bitmap */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102                                               : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 110
 111 #define HOPc(pos,off) \
 112         (char *)(PL_reg_match_utf8 \
 113             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 114             : (U8*)(pos + off))
 115 #define HOPBACKc(pos, off) \
 116         (char*)(PL_reg_match_utf8\
 117             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 118             : (pos - off >= PL_bostr)           \
 119                 ? (U8*)pos - off                \
 120                 : NULL)
 121
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defined */
 126 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 127     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 128
 129 /* Doesn't do an assert to verify that is correct */
 130 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 131     if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 132
 133 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 134 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 135 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 136
 137 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 138         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 139         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 140         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 141             * assert should likely and hopefully fail on an EBCDIC machine */ \
 142         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 143                                                                             \
 144         /* No asserts are done for these, in case called on an early        \
 145             * Unicode version in which they map to nothing */               \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 152         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 153
 154 /*
 155    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 156    so that it is possible to override the option here without having to
 157    rebuild the entire core. as we are required to do if we change regcomp.h
 158    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 159 */
 160 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 161 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 162 #endif
 163
 164 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 165 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 166 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 167 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 168 #define RE_utf8_perl_word   PL_utf8_alnum
 169 #define RE_utf8_perl_space  PL_utf8_space
 170 #define RE_utf8_posix_digit PL_utf8_digit
 171 #define perl_word  alnum
 172 #define perl_space space
 173 #define posix_digit digit
 174 #else
 175 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 176 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 177 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 178 #define RE_utf8_perl_word   PL_utf8_perl_word
 179 #define RE_utf8_perl_space  PL_utf8_perl_space
 180 #define RE_utf8_posix_digit PL_utf8_posix_digit
 181 #endif
 182
 183 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 184
 185 /* The actual code for CCC_TRY, which uses several variables from the routine
 186  * it's callable from.  It is designed to be the bulk of a case statement.
 187  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 188  *      nextchr matches the class.
 189  * UTF8_TEST is the whole test string to use for utf8 targets
 190  * LOAD is what to use to test, and if not present to load in the swash for the
 191  *      class
 192  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 193  *      UTF8_TEST test.
 194  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 195  * utf8 and a variant, load the swash if necessary and test using the utf8
 196  * test.  Advance to the next character if test is ok, otherwise fail; If not
 197  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 198  * fails, or advance to the next character */
 199
 200 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 201     if (locinput >= PL_regeol) {                                              \
 202         sayNO;                                                                \
 203     }                                                                         \
 204     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 205         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 206         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 207             sayNO;                                                            \
 208         }                                                                     \
 209         locinput += PL_utf8skip[nextchr];                                     \
 210         nextchr = UCHARAT(locinput);                                          \
 211         break;                                                                \
 212     }                                                                         \
 213     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 214         sayNO;                                                                \
 215     }                                                                         \
 216     nextchr = UCHARAT(++locinput);                                            \
 217     break;
 218
 219 /* Handle the non-locale cases for a character class and its complement.  It
 220  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 221  * This is because that code fails when the test succeeds, so we want to have
 222  * the test fail so that the code succeeds.  The swash is stored in a
 223  * predictable PL_ place */
 224 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 225                            CLASS, STR)                                        \
 226     case NAME:                                                                \
 227         _CCC_TRY_CODE( !, FUNC,                                               \
 228                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 229                                             (U8*)locinput, TRUE)),            \
 230                           CLASS, STR)                                         \
 231     case NNAME:                                                               \
 232         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 233                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 234                                             (U8*)locinput, TRUE)),            \
 235                           CLASS, STR)                                         \
 236
 237 /* Generate the case statements for both locale and non-locale character
 238  * classes in regmatch for classes that don't have special unicode semantics.
 239  * Locales don't use an immediate swash, but an intermediary special locale
 240  * function that is called on the pointer to the current place in the input
 241  * string.  That function will resolve to needing the same swash.  One might
 242  * think that because we don't know what the locale will match, we shouldn't
 243  * check with the swash loading function that it loaded properly; ie, that we
 244  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 245  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 246  * irrelevant here */
 247 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 248                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 249                 NAMEA, NNAMEA, FUNCA,                                         \
 250                 CLASS, STR)                                                   \
 251     case NAMEL:                                                               \
 252         PL_reg_flags |= RF_tainted;                                           \
 253         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 254     case NNAMEL:                                                              \
 255         PL_reg_flags |= RF_tainted;                                           \
 256         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 257                        CLASS, STR)                                            \
 258     case NAMEA:                                                               \
 259         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 260             sayNO;                                                            \
 261         }                                                                     \
 262         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 263         nextchr = UCHARAT(++locinput);                                        \
 264         break;                                                                \
 265     case NNAMEA:                                                              \
 266         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 267             sayNO;                                                            \
 268         }                                                                     \
 269         if (utf8_target) {                                                    \
 270             locinput += PL_utf8skip[nextchr];                                 \
 271             nextchr = UCHARAT(locinput);                                      \
 272         }                                                                     \
 273         else {                                                                \
 274             nextchr = UCHARAT(++locinput);                                    \
 275         }                                                                     \
 276         break;                                                                \
 277     /* Generate the non-locale cases */                                       \
 278     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 279
 280 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 281  * statements to handle separate Unicode semantics nodes */
 282 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 283                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 284                   NAMEU, NNAMEU, FUNCU,                                        \
 285                   NAMEA, NNAMEA, FUNCA,                                        \
 286                   CLASS, STR)                                                  \
 287     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 288             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 289             NAMEA, NNAMEA, FUNCA,                                              \
 290             CLASS, STR)                                                        \
 291     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 292
 293 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 294
 295 /* for use after a quantifier and before an EXACT-like node -- japhy */
 296 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 297  *
 298  * NOTE that *nothing* that affects backtracking should be in here, specifically
 299  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 300  * node that is in between two EXACT like nodes when ascertaining what the required
 301  * "follow" character is. This should probably be moved to regex compile time
 302  * although it may be done at run time beause of the REF possibility - more
 303  * investigation required. -- demerphq
 304 */
 305 #define JUMPABLE(rn) (      \
 306     OP(rn) == OPEN ||       \
 307     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 308     OP(rn) == EVAL ||   \
 309     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 310     OP(rn) == PLUS || OP(rn) == MINMOD || \
 311     OP(rn) == KEEPS || \
 312     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 313 )
 314 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 315
 316 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 317
 318 #if 0
 319 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 320    we don't need this definition. */
 321 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 322 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 323 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 324
 325 #else
 326 /* ... so we use this as its faster. */
 327 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 328 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 329 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 330 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 331
 332 #endif
 333
 334 /*
 335   Search for mandatory following text node; for lookahead, the text must
 336   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 337 */
 338 #define FIND_NEXT_IMPT(rn) STMT_START { \
 339     while (JUMPABLE(rn)) { \
 340         const OPCODE type = OP(rn); \
 341         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 342             rn = NEXTOPER(NEXTOPER(rn)); \
 343         else if (type == PLUS) \
 344             rn = NEXTOPER(rn); \
 345         else if (type == IFMATCH) \
 346             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 347         else rn += NEXT_OFF(rn); \
 348     } \
 349 } STMT_END
 350
 351
 352 static void restore_pos(pTHX_ void *arg);
 353
 354 #define REGCP_PAREN_ELEMS 4
 355 #define REGCP_OTHER_ELEMS 5
 356 #define REGCP_FRAME_ELEMS 1
 357 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 358  * are needed for the regexp context stack bookkeeping. */
 359
 360 STATIC CHECKPOINT
 361 S_regcppush(pTHX_ I32 parenfloor)
 362 {
 363     dVAR;
 364     const int retval = PL_savestack_ix;
 365     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 366     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 367     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 368     int p;
 369     GET_RE_DEBUG_FLAGS_DECL;
 370
 371     if (paren_elems_to_push < 0)
 372         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 373
 374     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 375         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 376                    " out of range (%lu-%ld)",
 377                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 378
 379     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 380
 381     for (p = PL_regsize; p > parenfloor; p--) {
 382 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 383         SSPUSHINT(PL_regoffs[p].end);
 384         SSPUSHINT(PL_regoffs[p].start);
 385         SSPUSHPTR(PL_reg_start_tmp[p]);
 386         SSPUSHINT(p);
 387         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 388           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 389                       (UV)p, (IV)PL_regoffs[p].start,
 390                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 391                       (IV)PL_regoffs[p].end
 392         ));
 393     }
 394 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 395     SSPUSHPTR(PL_regoffs);
 396     SSPUSHINT(PL_regsize);
 397     SSPUSHINT(*PL_reglastparen);
 398     SSPUSHINT(*PL_reglastcloseparen);
 399     SSPUSHPTR(PL_reginput);
 400     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 401
 402     return retval;
 403 }
 404
 405 /* These are needed since we do not localize EVAL nodes: */
 406 #define REGCP_SET(cp)                                           \
 407     DEBUG_STATE_r(                                              \
 408             PerlIO_printf(Perl_debug_log,                       \
 409                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 410                 (IV)PL_savestack_ix));                          \
 411     cp = PL_savestack_ix
 412
 413 #define REGCP_UNWIND(cp)                                        \
 414     DEBUG_STATE_r(                                              \
 415         if (cp != PL_savestack_ix)                              \
 416             PerlIO_printf(Perl_debug_log,                       \
 417                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 418                 (IV)(cp), (IV)PL_savestack_ix));                \
 419     regcpblow(cp)
 420
 421 STATIC char *
 422 S_regcppop(pTHX_ const regexp *rex)
 423 {
 424     dVAR;
 425     UV i;
 426     char *input;
 427     GET_RE_DEBUG_FLAGS_DECL;
 428
 429     PERL_ARGS_ASSERT_REGCPPOP;
 430
 431     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 432     i = SSPOPUV;
 433     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 434     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 435     input = (char *) SSPOPPTR;
 436     *PL_reglastcloseparen = SSPOPINT;
 437     *PL_reglastparen = SSPOPINT;
 438     PL_regsize = SSPOPINT;
 439     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 440
 441     i -= REGCP_OTHER_ELEMS;
 442     /* Now restore the parentheses context. */
 443     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 444         I32 tmps;
 445         U32 paren = (U32)SSPOPINT;
 446         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 447         PL_regoffs[paren].start = SSPOPINT;
 448         tmps = SSPOPINT;
 449         if (paren <= *PL_reglastparen)
 450             PL_regoffs[paren].end = tmps;
 451         DEBUG_BUFFERS_r(
 452             PerlIO_printf(Perl_debug_log,
 453                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 454                           (UV)paren, (IV)PL_regoffs[paren].start,
 455                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 456                           (IV)PL_regoffs[paren].end,
 457                           (paren > *PL_reglastparen ? "(no)" : ""));
 458         );
 459     }
 460     DEBUG_BUFFERS_r(
 461         if (*PL_reglastparen + 1 <= rex->nparens) {
 462             PerlIO_printf(Perl_debug_log,
 463                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 464                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 465         }
 466     );
 467 #if 1
 468     /* It would seem that the similar code in regtry()
 469      * already takes care of this, and in fact it is in
 470      * a better location to since this code can #if 0-ed out
 471      * but the code in regtry() is needed or otherwise tests
 472      * requiring null fields (pat.t#187 and split.t#{13,14}
 473      * (as of patchlevel 7877)  will fail.  Then again,
 474      * this code seems to be necessary or otherwise
 475      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 476      * --jhi updated by dapm */
 477     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 478         if (i > PL_regsize)
 479             PL_regoffs[i].start = -1;
 480         PL_regoffs[i].end = -1;
 481     }
 482 #endif
 483     return input;
 484 }
 485
 486 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 487
 488 /*
 489  * pregexec and friends
 490  */
 491
 492 #ifndef PERL_IN_XSUB_RE
 493 /*
 494  - pregexec - match a regexp against a string
 495  */
 496 I32
 497 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 498          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 499 /* strend: pointer to null at end of string */
 500 /* strbeg: real beginning of string */
 501 /* minend: end of match must be >=minend after stringarg. */
 502 /* nosave: For optimizations. */
 503 {
 504     PERL_ARGS_ASSERT_PREGEXEC;
 505
 506     return
 507         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 508                       nosave ? 0 : REXEC_COPY_STR);
 509 }
 510 #endif
 511
 512 /*
 513  * Need to implement the following flags for reg_anch:
 514  *
 515  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 516  * USE_INTUIT_ML
 517  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 518  * INTUIT_AUTORITATIVE_ML
 519  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 520  * INTUIT_ONCE_ML
 521  *
 522  * Another flag for this function: SECOND_TIME (so that float substrs
 523  * with giant delta may be not rechecked).
 524  */
 525
 526 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 527
 528 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 529    Otherwise, only SvCUR(sv) is used to get strbeg. */
 530
 531 /* XXXX We assume that strpos is strbeg unless sv. */
 532
 533 /* XXXX Some places assume that there is a fixed substring.
 534         An update may be needed if optimizer marks as "INTUITable"
 535         RExen without fixed substrings.  Similarly, it is assumed that
 536         lengths of all the strings are no more than minlen, thus they
 537         cannot come from lookahead.
 538         (Or minlen should take into account lookahead.)
 539   NOTE: Some of this comment is not correct. minlen does now take account
 540   of lookahead/behind. Further research is required. -- demerphq
 541
 542 */
 543
 544 /* A failure to find a constant substring means that there is no need to make
 545    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 546    finding a substring too deep into the string means that less calls to
 547    regtry() should be needed.
 548
 549    REx compiler's optimizer found 4 possible hints:
 550         a) Anchored substring;
 551         b) Fixed substring;
 552         c) Whether we are anchored (beginning-of-line or \G);
 553         d) First node (of those at offset 0) which may distinguish positions;
 554    We use a)b)d) and multiline-part of c), and try to find a position in the
 555    string which does not contradict any of them.
 556  */
 557
 558 /* Most of decisions we do here should have been done at compile time.
 559    The nodes of the REx which we used for the search should have been
 560    deleted from the finite automaton. */
 561
 562 char *
 563 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 564                      char *strend, const U32 flags, re_scream_pos_data *data)
 565 {
 566     dVAR;
 567     struct regexp *const prog = (struct regexp *)SvANY(rx);
 568     register I32 start_shift = 0;
 569     /* Should be nonnegative! */
 570     register I32 end_shift   = 0;
 571     register char *s;
 572     register SV *check;
 573     char *strbeg;
 574     char *t;
 575     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 576     I32 ml_anch;
 577     register char *other_last = NULL;   /* other substr checked before this */
 578     char *check_at = NULL;              /* check substr found at this pos */
 579     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 580     RXi_GET_DECL(prog,progi);
 581 #ifdef DEBUGGING
 582     const char * const i_strpos = strpos;
 583 #endif
 584     GET_RE_DEBUG_FLAGS_DECL;
 585
 586     PERL_ARGS_ASSERT_RE_INTUIT_START;
 587
 588     RX_MATCH_UTF8_set(rx,utf8_target);
 589
 590     if (RX_UTF8(rx)) {
 591         PL_reg_flags |= RF_utf8;
 592     }
 593     DEBUG_EXECUTE_r(
 594         debug_start_match(rx, utf8_target, strpos, strend,
 595             sv ? "Guessing start of match in sv for"
 596                : "Guessing start of match in string for");
 597               );
 598
 599     /* CHR_DIST() would be more correct here but it makes things slow. */
 600     if (prog->minlen > strend - strpos) {
 601         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 602                               "String too short... [re_intuit_start]\n"));
 603         goto fail;
 604     }
 605
 606     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 607     PL_regeol = strend;
 608     if (utf8_target) {
 609         if (!prog->check_utf8 && prog->check_substr)
 610             to_utf8_substr(prog);
 611         check = prog->check_utf8;
 612     } else {
 613         if (!prog->check_substr && prog->check_utf8)
 614             to_byte_substr(prog);
 615         check = prog->check_substr;
 616     }
 617     if (check == &PL_sv_undef) {
 618         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 619                 "Non-utf8 string cannot match utf8 check string\n"));
 620         goto fail;
 621     }
 622     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 623         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 624                      || ( (prog->extflags & RXf_ANCH_BOL)
 625                           && !multiline ) );    /* Check after \n? */
 626
 627         if (!ml_anch) {
 628           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 629                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 630                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 631                && sv && !SvROK(sv)
 632                && (strpos != strbeg)) {
 633               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 634               goto fail;
 635           }
 636           if (prog->check_offset_min == prog->check_offset_max &&
 637               !(prog->extflags & RXf_CANY_SEEN)) {
 638             /* Substring at constant offset from beg-of-str... */
 639             I32 slen;
 640
 641             s = HOP3c(strpos, prog->check_offset_min, strend);
 642
 643             if (SvTAIL(check)) {
 644                 slen = SvCUR(check);    /* >= 1 */
 645
 646                 if ( strend - s > slen || strend - s < slen - 1
 647                      || (strend - s == slen && strend[-1] != '\n')) {
 648                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 649                     goto fail_finish;
 650                 }
 651                 /* Now should match s[0..slen-2] */
 652                 slen--;
 653                 if (slen && (*SvPVX_const(check) != *s
 654                              || (slen > 1
 655                                  && memNE(SvPVX_const(check), s, slen)))) {
 656                   report_neq:
 657                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 658                     goto fail_finish;
 659                 }
 660             }
 661             else if (*SvPVX_const(check) != *s
 662                      || ((slen = SvCUR(check)) > 1
 663                          && memNE(SvPVX_const(check), s, slen)))
 664                 goto report_neq;
 665             check_at = s;
 666             goto success_at_start;
 667           }
 668         }
 669         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 670         s = strpos;
 671         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 672         end_shift = prog->check_end_shift;
 673
 674         if (!ml_anch) {
 675             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 676                                          - (SvTAIL(check) != 0);
 677             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 678
 679             if (end_shift < eshift)
 680                 end_shift = eshift;
 681         }
 682     }
 683     else {                              /* Can match at random position */
 684         ml_anch = 0;
 685         s = strpos;
 686         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 687         end_shift = prog->check_end_shift;
 688
 689         /* end shift should be non negative here */
 690     }
 691
 692 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 693     if (end_shift < 0)
 694         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 695                    (IV)end_shift, RX_PRECOMP(prog));
 696 #endif
 697
 698   restart:
 699     /* Find a possible match in the region s..strend by looking for
 700        the "check" substring in the region corrected by start/end_shift. */
 701
 702     {
 703         I32 srch_start_shift = start_shift;
 704         I32 srch_end_shift = end_shift;
 705         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 706             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 707             srch_start_shift = strbeg - s;
 708         }
 709     DEBUG_OPTIMISE_MORE_r({
 710         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 711             (IV)prog->check_offset_min,
 712             (IV)srch_start_shift,
 713             (IV)srch_end_shift,
 714             (IV)prog->check_end_shift);
 715     });
 716
 717     if (flags & REXEC_SCREAM) {
 718         I32 p = -1;                     /* Internal iterator of scream. */
 719         I32 * const pp = data ? data->scream_pos : &p;
 720
 721         if (PL_screamfirst[BmRARE(check)] >= 0
 722             || ( BmRARE(check) == '\n'
 723                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 724                  && SvTAIL(check) ))
 725             s = screaminstr(sv, check,
 726                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 727         else
 728             goto fail_finish;
 729         /* we may be pointing at the wrong string */
 730         if (s && RXp_MATCH_COPIED(prog))
 731             s = strbeg + (s - SvPVX_const(sv));
 732         if (data)
 733             *data->scream_olds = s;
 734     }
 735     else {
 736         U8* start_point;
 737         U8* end_point;
 738         if (prog->extflags & RXf_CANY_SEEN) {
 739             start_point= (U8*)(s + srch_start_shift);
 740             end_point= (U8*)(strend - srch_end_shift);
 741         } else {
 742             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 743             end_point= HOP3(strend, -srch_end_shift, strbeg);
 744         }
 745         DEBUG_OPTIMISE_MORE_r({
 746             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 747                 (int)(end_point - start_point),
 748                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 749                 start_point);
 750         });
 751
 752         s = fbm_instr( start_point, end_point,
 753                       check, multiline ? FBMrf_MULTILINE : 0);
 754     }
 755     }
 756     /* Update the count-of-usability, remove useless subpatterns,
 757         unshift s.  */
 758
 759     DEBUG_EXECUTE_r({
 760         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 761             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 762         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 763                           (s ? "Found" : "Did not find"),
 764             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 765                 ? "anchored" : "floating"),
 766             quoted,
 767             RE_SV_TAIL(check),
 768             (s ? " at offset " : "...\n") );
 769     });
 770
 771     if (!s)
 772         goto fail_finish;
 773     /* Finish the diagnostic message */
 774     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 775
 776     /* XXX dmq: first branch is for positive lookbehind...
 777        Our check string is offset from the beginning of the pattern.
 778        So we need to do any stclass tests offset forward from that
 779        point. I think. :-(
 780      */
 781
 782
 783
 784     check_at=s;
 785
 786
 787     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 788        Start with the other substr.
 789        XXXX no SCREAM optimization yet - and a very coarse implementation
 790        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 791                 *always* match.  Probably should be marked during compile...
 792        Probably it is right to do no SCREAM here...
 793      */
 794
 795     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 796                 : (prog->float_substr && prog->anchored_substr))
 797     {
 798         /* Take into account the "other" substring. */
 799         /* XXXX May be hopelessly wrong for UTF... */
 800         if (!other_last)
 801             other_last = strpos;
 802         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 803           do_other_anchored:
 804             {
 805                 char * const last = HOP3c(s, -start_shift, strbeg);
 806                 char *last1, *last2;
 807                 char * const saved_s = s;
 808                 SV* must;
 809
 810                 t = s - prog->check_offset_max;
 811                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 812                     && (!utf8_target
 813                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 814                             && t > strpos)))
 815                     NOOP;
 816                 else
 817                     t = strpos;
 818                 t = HOP3c(t, prog->anchored_offset, strend);
 819                 if (t < other_last)     /* These positions already checked */
 820                     t = other_last;
 821                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 822                 if (last < last1)
 823                     last1 = last;
 824                 /* XXXX It is not documented what units *_offsets are in.
 825                    We assume bytes, but this is clearly wrong.
 826                    Meaning this code needs to be carefully reviewed for errors.
 827                    dmq.
 828                   */
 829
 830                 /* On end-of-str: see comment below. */
 831                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 832                 if (must == &PL_sv_undef) {
 833                     s = (char*)NULL;
 834                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 835                 }
 836                 else
 837                     s = fbm_instr(
 838                         (unsigned char*)t,
 839                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 840                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 841                         must,
 842                         multiline ? FBMrf_MULTILINE : 0
 843                     );
 844                 DEBUG_EXECUTE_r({
 845                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 846                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 847                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 848                         (s ? "Found" : "Contradicts"),
 849                         quoted, RE_SV_TAIL(must));
 850                 });
 851
 852
 853                 if (!s) {
 854                     if (last1 >= last2) {
 855                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 856                                                 ", giving up...\n"));
 857                         goto fail_finish;
 858                     }
 859                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 860                         ", trying floating at offset %ld...\n",
 861                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 862                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 863                     s = HOP3c(last, 1, strend);
 864                     goto restart;
 865                 }
 866                 else {
 867                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 868                           (long)(s - i_strpos)));
 869                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 870                     other_last = HOP3c(s, 1, strend);
 871                     s = saved_s;
 872                     if (t == strpos)
 873                         goto try_at_start;
 874                     goto try_at_offset;
 875                 }
 876             }
 877         }
 878         else {          /* Take into account the floating substring. */
 879             char *last, *last1;
 880             char * const saved_s = s;
 881             SV* must;
 882
 883             t = HOP3c(s, -start_shift, strbeg);
 884             last1 = last =
 885                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 886             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 887                 last = HOP3c(t, prog->float_max_offset, strend);
 888             s = HOP3c(t, prog->float_min_offset, strend);
 889             if (s < other_last)
 890                 s = other_last;
 891  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 892             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 893             /* fbm_instr() takes into account exact value of end-of-str
 894                if the check is SvTAIL(ed).  Since false positives are OK,
 895                and end-of-str is not later than strend we are OK. */
 896             if (must == &PL_sv_undef) {
 897                 s = (char*)NULL;
 898                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 899             }
 900             else
 901                 s = fbm_instr((unsigned char*)s,
 902                               (unsigned char*)last + SvCUR(must)
 903                                   - (SvTAIL(must)!=0),
 904                               must, multiline ? FBMrf_MULTILINE : 0);
 905             DEBUG_EXECUTE_r({
 906                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 907                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 908                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 909                     (s ? "Found" : "Contradicts"),
 910                     quoted, RE_SV_TAIL(must));
 911             });
 912             if (!s) {
 913                 if (last1 == last) {
 914                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 915                                             ", giving up...\n"));
 916                     goto fail_finish;
 917                 }
 918                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 919                     ", trying anchored starting at offset %ld...\n",
 920                     (long)(saved_s + 1 - i_strpos)));
 921                 other_last = last;
 922                 s = HOP3c(t, 1, strend);
 923                 goto restart;
 924             }
 925             else {
 926                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 927                       (long)(s - i_strpos)));
 928                 other_last = s; /* Fix this later. --Hugo */
 929                 s = saved_s;
 930                 if (t == strpos)
 931                     goto try_at_start;
 932                 goto try_at_offset;
 933             }
 934         }
 935     }
 936
 937
 938     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 939
 940     DEBUG_OPTIMISE_MORE_r(
 941         PerlIO_printf(Perl_debug_log,
 942             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 943             (IV)prog->check_offset_min,
 944             (IV)prog->check_offset_max,
 945             (IV)(s-strpos),
 946             (IV)(t-strpos),
 947             (IV)(t-s),
 948             (IV)(strend-strpos)
 949         )
 950     );
 951
 952     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 953         && (!utf8_target
 954             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 955                  && t > strpos)))
 956     {
 957         /* Fixed substring is found far enough so that the match
 958            cannot start at strpos. */
 959       try_at_offset:
 960         if (ml_anch && t[-1] != '\n') {
 961             /* Eventually fbm_*() should handle this, but often
 962                anchored_offset is not 0, so this check will not be wasted. */
 963             /* XXXX In the code below we prefer to look for "^" even in
 964                presence of anchored substrings.  And we search even
 965                beyond the found float position.  These pessimizations
 966                are historical artefacts only.  */
 967           find_anchor:
 968             while (t < strend - prog->minlen) {
 969                 if (*t == '\n') {
 970                     if (t < check_at - prog->check_offset_min) {
 971                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 972                             /* Since we moved from the found position,
 973                                we definitely contradict the found anchored
 974                                substr.  Due to the above check we do not
 975                                contradict "check" substr.
 976                                Thus we can arrive here only if check substr
 977                                is float.  Redo checking for "other"=="fixed".
 978                              */
 979                             strpos = t + 1;
 980                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 981                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 982                             goto do_other_anchored;
 983                         }
 984                         /* We don't contradict the found floating substring. */
 985                         /* XXXX Why not check for STCLASS? */
 986                         s = t + 1;
 987                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 988                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 989                         goto set_useful;
 990                     }
 991                     /* Position contradicts check-string */
 992                     /* XXXX probably better to look for check-string
 993                        than for "\n", so one should lower the limit for t? */
 994                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 995                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 996                     other_last = strpos = s = t + 1;
 997                     goto restart;
 998                 }
 999                 t++;
1000             }
1001             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1002                         PL_colors[0], PL_colors[1]));
1003             goto fail_finish;
1004         }
1005         else {
1006             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1007                         PL_colors[0], PL_colors[1]));
1008         }
1009         s = t;
1010       set_useful:
1011         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1012     }
1013     else {
1014         /* The found string does not prohibit matching at strpos,
1015            - no optimization of calling REx engine can be performed,
1016            unless it was an MBOL and we are not after MBOL,
1017            or a future STCLASS check will fail this. */
1018       try_at_start:
1019         /* Even in this situation we may use MBOL flag if strpos is offset
1020            wrt the start of the string. */
1021         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1022             && (strpos != strbeg) && strpos[-1] != '\n'
1023             /* May be due to an implicit anchor of m{.*foo}  */
1024             && !(prog->intflags & PREGf_IMPLICIT))
1025         {
1026             t = strpos;
1027             goto find_anchor;
1028         }
1029         DEBUG_EXECUTE_r( if (ml_anch)
1030             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1031                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1032         );
1033       success_at_start:
1034         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1035             && (utf8_target ? (
1036                 prog->check_utf8                /* Could be deleted already */
1037                 && --BmUSEFUL(prog->check_utf8) < 0
1038                 && (prog->check_utf8 == prog->float_utf8)
1039             ) : (
1040                 prog->check_substr              /* Could be deleted already */
1041                 && --BmUSEFUL(prog->check_substr) < 0
1042                 && (prog->check_substr == prog->float_substr)
1043             )))
1044         {
1045             /* If flags & SOMETHING - do not do it many times on the same match */
1046             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1047             /* XXX Does the destruction order has to change with utf8_target? */
1048             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1049             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1050             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1051             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1052             check = NULL;                       /* abort */
1053             s = strpos;
1054             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1055                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1056             if (prog->intflags & PREGf_IMPLICIT)
1057                 prog->extflags &= ~RXf_ANCH_MBOL;
1058             /* XXXX This is a remnant of the old implementation.  It
1059                     looks wasteful, since now INTUIT can use many
1060                     other heuristics. */
1061             prog->extflags &= ~RXf_USE_INTUIT;
1062             /* XXXX What other flags might need to be cleared in this branch? */
1063         }
1064         else
1065             s = strpos;
1066     }
1067
1068     /* Last resort... */
1069     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1070     /* trie stclasses are too expensive to use here, we are better off to
1071        leave it to regmatch itself */
1072     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1073         /* minlen == 0 is possible if regstclass is \b or \B,
1074            and the fixed substr is ''$.
1075            Since minlen is already taken into account, s+1 is before strend;
1076            accidentally, minlen >= 1 guaranties no false positives at s + 1
1077            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1078            regstclass does not come from lookahead...  */
1079         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1080            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1081         const U8* const str = (U8*)STRING(progi->regstclass);
1082         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1083                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1084                     : 1);
1085         char * endpos;
1086         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1087             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1088         else if (prog->float_substr || prog->float_utf8)
1089             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1090         else
1091             endpos= strend;
1092
1093         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1094                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1095
1096         t = s;
1097         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1098         if (!s) {
1099 #ifdef DEBUGGING
1100             const char *what = NULL;
1101 #endif
1102             if (endpos == strend) {
1103                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1104                                 "Could not match STCLASS...\n") );
1105                 goto fail;
1106             }
1107             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1108                                    "This position contradicts STCLASS...\n") );
1109             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1110                 goto fail;
1111             /* Contradict one of substrings */
1112             if (prog->anchored_substr || prog->anchored_utf8) {
1113                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1114                     DEBUG_EXECUTE_r( what = "anchored" );
1115                   hop_and_restart:
1116                     s = HOP3c(t, 1, strend);
1117                     if (s + start_shift + end_shift > strend) {
1118                         /* XXXX Should be taken into account earlier? */
1119                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1120                                                "Could not match STCLASS...\n") );
1121                         goto fail;
1122                     }
1123                     if (!check)
1124                         goto giveup;
1125                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1126                                 "Looking for %s substr starting at offset %ld...\n",
1127                                  what, (long)(s + start_shift - i_strpos)) );
1128                     goto restart;
1129                 }
1130                 /* Have both, check_string is floating */
1131                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1132                     goto retry_floating_check;
1133                 /* Recheck anchored substring, but not floating... */
1134                 s = check_at;
1135                 if (!check)
1136                     goto giveup;
1137                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1138                           "Looking for anchored substr starting at offset %ld...\n",
1139                           (long)(other_last - i_strpos)) );
1140                 goto do_other_anchored;
1141             }
1142             /* Another way we could have checked stclass at the
1143                current position only: */
1144             if (ml_anch) {
1145                 s = t = t + 1;
1146                 if (!check)
1147                     goto giveup;
1148                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1149                           "Looking for /%s^%s/m starting at offset %ld...\n",
1150                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1151                 goto try_at_offset;
1152             }
1153             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1154                 goto fail;
1155             /* Check is floating substring. */
1156           retry_floating_check:
1157             t = check_at - start_shift;
1158             DEBUG_EXECUTE_r( what = "floating" );
1159             goto hop_and_restart;
1160         }
1161         if (t != s) {
1162             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1163                         "By STCLASS: moving %ld --> %ld\n",
1164                                   (long)(t - i_strpos), (long)(s - i_strpos))
1165                    );
1166         }
1167         else {
1168             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1169                                   "Does not contradict STCLASS...\n");
1170                    );
1171         }
1172     }
1173   giveup:
1174     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1175                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1176                           PL_colors[5], (long)(s - i_strpos)) );
1177     return s;
1178
1179   fail_finish:                          /* Substring not found */
1180     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1181         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1182   fail:
1183     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1184                           PL_colors[4], PL_colors[5]));
1185     return NULL;
1186 }
1187
1188 #define DECL_TRIE_TYPE(scan) \
1189     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1190                     trie_type = (scan->flags != EXACT) \
1191                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1192                               : (utf8_target ? trie_utf8 : trie_plain)
1193
1194 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1195 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1196     switch (trie_type) {                                                    \
1197     case trie_utf8_fold:                                                    \
1198         if ( foldlen>0 ) {                                                  \
1199             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1200             foldlen -= len;                                                 \
1201             uscan += len;                                                   \
1202             len=0;                                                          \
1203         } else {                                                            \
1204             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1205             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1206             foldlen -= UNISKIP( uvc );                                      \
1207             uscan = foldbuf + UNISKIP( uvc );                               \
1208         }                                                                   \
1209         break;                                                              \
1210     case trie_latin_utf8_fold:                                              \
1211         if ( foldlen>0 ) {                                                  \
1212             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1213             foldlen -= len;                                                 \
1214             uscan += len;                                                   \
1215             len=0;                                                          \
1216         } else {                                                            \
1217             len = 1;                                                        \
1218             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1219             foldlen -= UNISKIP( uvc );                                      \
1220             uscan = foldbuf + UNISKIP( uvc );                               \
1221         }                                                                   \
1222         break;                                                              \
1223     case trie_utf8:                                                         \
1224         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1225         break;                                                              \
1226     case trie_plain:                                                        \
1227         uvc = (UV)*uc;                                                      \
1228         len = 1;                                                            \
1229     }                                                                       \
1230     if (uvc < 256) {                                                        \
1231         charid = trie->charmap[ uvc ];                                      \
1232     }                                                                       \
1233     else {                                                                  \
1234         charid = 0;                                                         \
1235         if (widecharmap) {                                                  \
1236             SV** const svpp = hv_fetch(widecharmap,                         \
1237                         (char*)&uvc, sizeof(UV), 0);                        \
1238             if (svpp)                                                       \
1239                 charid = (U16)SvIV(*svpp);                                  \
1240         }                                                                   \
1241     }                                                                       \
1242 } STMT_END
1243
1244 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1245 STMT_START {                                              \
1246     while (s <= e) {                                      \
1247         if ( (CoNd)                                       \
1248              && (ln == 1 || folder(s, pat_string, ln))    \
1249              && (!reginfo || regtry(reginfo, &s)) )       \
1250             goto got_it;                                  \
1251         s++;                                              \
1252     }                                                     \
1253 } STMT_END
1254
1255 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1256 STMT_START {                                          \
1257     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1258         CoDe                                          \
1259         s += uskip;                                   \
1260     }                                                 \
1261 } STMT_END
1262
1263 #define REXEC_FBC_SCAN(CoDe)                          \
1264 STMT_START {                                          \
1265     while (s < strend) {                              \
1266         CoDe                                          \
1267         s++;                                          \
1268     }                                                 \
1269 } STMT_END
1270
1271 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1272 REXEC_FBC_UTF8_SCAN(                                  \
1273     if (CoNd) {                                       \
1274         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1275             goto got_it;                              \
1276         else                                          \
1277             tmp = doevery;                            \
1278     }                                                 \
1279     else                                              \
1280         tmp = 1;                                      \
1281 )
1282
1283 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1284 REXEC_FBC_SCAN(                                       \
1285     if (CoNd) {                                       \
1286         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1287             goto got_it;                              \
1288         else                                          \
1289             tmp = doevery;                            \
1290     }                                                 \
1291     else                                              \
1292         tmp = 1;                                      \
1293 )
1294
1295 #define REXEC_FBC_TRYIT               \
1296 if ((!reginfo || regtry(reginfo, &s))) \
1297     goto got_it
1298
1299 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1300     if (utf8_target) {                                             \
1301         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1302     }                                                          \
1303     else {                                                     \
1304         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1305     }
1306
1307 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1308     if (utf8_target) {                                             \
1309         UtFpReLoAd;                                            \
1310         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1311     }                                                          \
1312     else {                                                     \
1313         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1314     }
1315
1316 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1317     PL_reg_flags |= RF_tainted;                                \
1318     if (utf8_target) {                                             \
1319         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1320     }                                                          \
1321     else {                                                     \
1322         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1323     }
1324
1325 #define DUMP_EXEC_POS(li,s,doutf8) \
1326     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1327
1328
1329 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1330         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1331         tmp = TEST_NON_UTF8(tmp);                                              \
1332         REXEC_FBC_UTF8_SCAN(                                                   \
1333             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1334                 tmp = !tmp;                                                    \
1335                 IF_SUCCESS;                                                    \
1336             }                                                                  \
1337             else {                                                             \
1338                 IF_FAIL;                                                       \
1339             }                                                                  \
1340         );                                                                     \
1341
1342 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1343         if (s == PL_bostr) {                                                   \
1344             tmp = '\n';                                                        \
1345         }                                                                      \
1346         else {                                                                 \
1347             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1348             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1349         }                                                                      \
1350         tmp = TeSt1_UtF8;                                                      \
1351         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1352         REXEC_FBC_UTF8_SCAN(                                                   \
1353             if (tmp == ! (TeSt2_UtF8)) { \
1354                 tmp = !tmp;                                                    \
1355                 IF_SUCCESS;                                                    \
1356             }                                                                  \
1357             else {                                                             \
1358                 IF_FAIL;                                                       \
1359             }                                                                  \
1360         );                                                                     \
1361
1362 /* The only difference between the BOUND and NBOUND cases is that
1363  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1364  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1365  * with the other one being empty */
1366 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1367     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1368
1369 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1370     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1371
1372 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1373     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1374
1375 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1376     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1377
1378
1379 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1380  * be passed in completely with the variable name being tested, which isn't
1381  * such a clean interface, but this is easier to read than it was before.  We
1382  * are looking for the boundary (or non-boundary between a word and non-word
1383  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1384  * must be different.  Find the "wordness" of the character just prior to this
1385  * one, and compare it with the wordness of this one.  If they differ, we have
1386  * a boundary.  At the beginning of the string, pretend that the previous
1387  * character was a new-line */
1388 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1389     if (utf8_target) {                                                         \
1390                 UTF8_CODE \
1391     }                                                                          \
1392     else {  /* Not utf8 */                                                     \
1393         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1394         tmp = TEST_NON_UTF8(tmp);                                              \
1395         REXEC_FBC_SCAN(                                                        \
1396             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1397                 tmp = !tmp;                                                    \
1398                 IF_SUCCESS;                                                    \
1399             }                                                                  \
1400             else {                                                             \
1401                 IF_FAIL;                                                       \
1402             }                                                                  \
1403         );                                                                     \
1404     }                                                                          \
1405     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1406         goto got_it;
1407
1408 /* We know what class REx starts with.  Try to find this position... */
1409 /* if reginfo is NULL, its a dryrun */
1410 /* annoyingly all the vars in this routine have different names from their counterparts
1411    in regmatch. /grrr */
1412
1413 STATIC char *
1414 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1415     const char *strend, regmatch_info *reginfo)
1416 {
1417         dVAR;
1418         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1419         char *pat_string;   /* The pattern's exactish string */
1420         char *pat_end;      /* ptr to end char of pat_string */
1421         re_fold_t folder;       /* Function for computing non-utf8 folds */
1422         const U8 *fold_array;   /* array for folding ords < 256 */
1423         STRLEN ln;
1424         STRLEN lnc;
1425         register STRLEN uskip;
1426         U8 c1;
1427         U8 c2;
1428         char *e;
1429         register I32 tmp = 1;   /* Scratch variable? */
1430         register const bool utf8_target = PL_reg_match_utf8;
1431         UV utf8_fold_flags = 0;
1432         RXi_GET_DECL(prog,progi);
1433
1434         PERL_ARGS_ASSERT_FIND_BYCLASS;
1435
1436         /* We know what class it must start with. */
1437         switch (OP(c)) {
1438         case ANYOFV:
1439         case ANYOF:
1440             if (utf8_target || OP(c) == ANYOFV) {
1441                 STRLEN inclasslen = strend - s;
1442                 REXEC_FBC_UTF8_CLASS_SCAN(
1443                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1444             }
1445             else {
1446                  while (s < strend) {
1447                       STRLEN skip = 1;
1448
1449                       if (REGINCLASS(prog, c, (U8*)s) ||
1450                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1451                            /* The assignment of 2 is intentional:
1452                             * for the folded sharp s, the skip is 2. */
1453                            (skip = SHARP_S_SKIP))) {
1454                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1455                                 goto got_it;
1456                            else
1457                                 tmp = doevery;
1458                       }
1459                       else
1460                            tmp = 1;
1461                       s += skip;
1462                  }
1463             }
1464             break;
1465         case CANY:
1466             REXEC_FBC_SCAN(
1467                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1468                     goto got_it;
1469                 else
1470                     tmp = doevery;
1471             );
1472             break;
1473
1474         case EXACTFA:
1475             if (UTF_PATTERN || utf8_target) {
1476                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1477                 goto do_exactf_utf8;
1478             }
1479             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1480             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1481             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1482
1483         case EXACTFU:
1484             if (UTF_PATTERN || utf8_target) {
1485                 utf8_fold_flags = 0;
1486                 goto do_exactf_utf8;
1487             }
1488             fold_array = PL_fold_latin1;
1489             folder = foldEQ_latin1;
1490             /* XXX This uses the full utf8 fold because if the pattern contains
1491              * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
1492              * There could be a new node type, say EXACTFU_SS, which is
1493              * generated by regcomp only if there is an 'ss', and then every
1494              * other case could goto do_exactf_non_utf8;*/
1495             goto do_exactf_utf8;
1496
1497         case EXACTF:
1498             if (UTF_PATTERN || utf8_target) {
1499                 utf8_fold_flags = 0;
1500                 goto do_exactf_utf8;
1501             }
1502             fold_array = PL_fold;
1503             folder = foldEQ;
1504             goto do_exactf_non_utf8;
1505
1506         case EXACTFL:
1507             if (UTF_PATTERN || utf8_target) {
1508                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1509                 goto do_exactf_utf8;
1510             }
1511             fold_array = PL_fold_locale;
1512             folder = foldEQ_locale;
1513
1514             /* FALL THROUGH */
1515
1516         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1517
1518             /* The idea in the non-utf8 EXACTF* cases is to first find the
1519              * first character of the EXACTF* node and then, if necessary,
1520              * case-insensitively compare the full text of the node.  c1 is the
1521              * first character.  c2 is its fold.  This logic will not work for
1522              * Unicode semantics and the german sharp ss, which hence should
1523              * not be compiled into a node that gets here. */
1524             pat_string = STRING(c);
1525             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1526
1527             e = HOP3c(strend, -((I32)ln), s);
1528
1529             if (!reginfo && e < s) {
1530                 e = s;                  /* Due to minlen logic of intuit() */
1531             }
1532
1533             c1 = *pat_string;
1534             c2 = fold_array[c1];
1535             if (c1 == c2) { /* If char and fold are the same */
1536                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1537             }
1538             else {
1539                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1540             }
1541             break;
1542
1543         do_exactf_utf8:
1544
1545             /* If one of the operands is in utf8, we can't use the simpler
1546              * folding above, due to the fact that many different characters
1547              * can have the same fold, or portion of a fold, or different-
1548              * length fold */
1549             pat_string = STRING(c);
1550             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1551             pat_end = pat_string + ln;
1552             lnc = (UTF_PATTERN) /* length to match in characters */
1553                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1554                     : ln;
1555
1556             e = HOP3c(strend, -((I32)lnc), s);
1557
1558             if (!reginfo && e < s) {
1559                 e = s;                  /* Due to minlen logic of intuit() */
1560             }
1561
1562             while (s <= e) {
1563                 char *my_strend= (char *)strend;
1564                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1565                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1566                     && (!reginfo || regtry(reginfo, &s)) )
1567                 {
1568                     goto got_it;
1569                 }
1570                 s += UTF8SKIP(s);
1571             }
1572             break;
1573         case BOUNDL:
1574             PL_reg_flags |= RF_tainted;
1575             FBC_BOUND(isALNUM_LC,
1576                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1577                       isALNUM_LC_utf8((U8*)s));
1578             break;
1579         case NBOUNDL:
1580             PL_reg_flags |= RF_tainted;
1581             FBC_NBOUND(isALNUM_LC,
1582                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1583                        isALNUM_LC_utf8((U8*)s));
1584             break;
1585         case BOUND:
1586             FBC_BOUND(isWORDCHAR,
1587                       isALNUM_uni(tmp),
1588                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1589             break;
1590         case BOUNDA:
1591             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1592                              isWORDCHAR_A(tmp),
1593                              isWORDCHAR_A((U8*)s));
1594             break;
1595         case NBOUND:
1596             FBC_NBOUND(isWORDCHAR,
1597                        isALNUM_uni(tmp),
1598                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1599             break;
1600         case NBOUNDA:
1601             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1602                               isWORDCHAR_A(tmp),
1603                               isWORDCHAR_A((U8*)s));
1604             break;
1605         case BOUNDU:
1606             FBC_BOUND(isWORDCHAR_L1,
1607                       isALNUM_uni(tmp),
1608                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1609             break;
1610         case NBOUNDU:
1611             FBC_NBOUND(isWORDCHAR_L1,
1612                        isALNUM_uni(tmp),
1613                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1614             break;
1615         case ALNUML:
1616             REXEC_FBC_CSCAN_TAINT(
1617                 isALNUM_LC_utf8((U8*)s),
1618                 isALNUM_LC(*s)
1619             );
1620             break;
1621         case ALNUMU:
1622             REXEC_FBC_CSCAN_PRELOAD(
1623                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1624                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1625                 isWORDCHAR_L1((U8) *s)
1626             );
1627             break;
1628         case ALNUM:
1629             REXEC_FBC_CSCAN_PRELOAD(
1630                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1631                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1632                 isWORDCHAR((U8) *s)
1633             );
1634             break;
1635         case ALNUMA:
1636             /* Don't need to worry about utf8, as it can match only a single
1637              * byte invariant character */
1638             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1639             break;
1640         case NALNUMU:
1641             REXEC_FBC_CSCAN_PRELOAD(
1642                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1643                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1644                 ! isWORDCHAR_L1((U8) *s)
1645             );
1646             break;
1647         case NALNUM:
1648             REXEC_FBC_CSCAN_PRELOAD(
1649                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1650                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1651                 ! isALNUM(*s)
1652             );
1653             break;
1654         case NALNUMA:
1655             REXEC_FBC_CSCAN(
1656                 !isWORDCHAR_A(*s),
1657                 !isWORDCHAR_A(*s)
1658             );
1659             break;
1660         case NALNUML:
1661             REXEC_FBC_CSCAN_TAINT(
1662                 !isALNUM_LC_utf8((U8*)s),
1663                 !isALNUM_LC(*s)
1664             );
1665             break;
1666         case SPACEU:
1667             REXEC_FBC_CSCAN_PRELOAD(
1668                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1669                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1670                 isSPACE_L1((U8) *s)
1671             );
1672             break;
1673         case SPACE:
1674             REXEC_FBC_CSCAN_PRELOAD(
1675                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1676                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1677                 isSPACE((U8) *s)
1678             );
1679             break;
1680         case SPACEA:
1681             /* Don't need to worry about utf8, as it can match only a single
1682              * byte invariant character */
1683             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1684             break;
1685         case SPACEL:
1686             REXEC_FBC_CSCAN_TAINT(
1687                 isSPACE_LC_utf8((U8*)s),
1688                 isSPACE_LC(*s)
1689             );
1690             break;
1691         case NSPACEU:
1692             REXEC_FBC_CSCAN_PRELOAD(
1693                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1694                 !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1695                 ! isSPACE_L1((U8) *s)
1696             );
1697             break;
1698         case NSPACE:
1699             REXEC_FBC_CSCAN_PRELOAD(
1700                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1701                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1702                 ! isSPACE((U8) *s)
1703             );
1704             break;
1705         case NSPACEA:
1706             REXEC_FBC_CSCAN(
1707                 !isSPACE_A(*s),
1708                 !isSPACE_A(*s)
1709             );
1710             break;
1711         case NSPACEL:
1712             REXEC_FBC_CSCAN_TAINT(
1713                 !isSPACE_LC_utf8((U8*)s),
1714                 !isSPACE_LC(*s)
1715             );
1716             break;
1717         case DIGIT:
1718             REXEC_FBC_CSCAN_PRELOAD(
1719                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1720                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1721                 isDIGIT(*s)
1722             );
1723             break;
1724         case DIGITA:
1725             /* Don't need to worry about utf8, as it can match only a single
1726              * byte invariant character */
1727             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1728             break;
1729         case DIGITL:
1730             REXEC_FBC_CSCAN_TAINT(
1731                 isDIGIT_LC_utf8((U8*)s),
1732                 isDIGIT_LC(*s)
1733             );
1734             break;
1735         case NDIGIT:
1736             REXEC_FBC_CSCAN_PRELOAD(
1737                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1738                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1739                 !isDIGIT(*s)
1740             );
1741             break;
1742         case NDIGITA:
1743             REXEC_FBC_CSCAN(
1744                 !isDIGIT_A(*s),
1745                 !isDIGIT_A(*s)
1746             );
1747             break;
1748         case NDIGITL:
1749             REXEC_FBC_CSCAN_TAINT(
1750                 !isDIGIT_LC_utf8((U8*)s),
1751                 !isDIGIT_LC(*s)
1752             );
1753             break;
1754         case LNBREAK:
1755             REXEC_FBC_CSCAN(
1756                 is_LNBREAK_utf8(s),
1757                 is_LNBREAK_latin1(s)
1758             );
1759             break;
1760         case VERTWS:
1761             REXEC_FBC_CSCAN(
1762                 is_VERTWS_utf8(s),
1763                 is_VERTWS_latin1(s)
1764             );
1765             break;
1766         case NVERTWS:
1767             REXEC_FBC_CSCAN(
1768                 !is_VERTWS_utf8(s),
1769                 !is_VERTWS_latin1(s)
1770             );
1771             break;
1772         case HORIZWS:
1773             REXEC_FBC_CSCAN(
1774                 is_HORIZWS_utf8(s),
1775                 is_HORIZWS_latin1(s)
1776             );
1777             break;
1778         case NHORIZWS:
1779             REXEC_FBC_CSCAN(
1780                 !is_HORIZWS_utf8(s),
1781                 !is_HORIZWS_latin1(s)
1782             );
1783             break;
1784         case AHOCORASICKC:
1785         case AHOCORASICK:
1786             {
1787                 DECL_TRIE_TYPE(c);
1788                 /* what trie are we using right now */
1789                 reg_ac_data *aho
1790                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1791                 reg_trie_data *trie
1792                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1793                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1794
1795                 const char *last_start = strend - trie->minlen;
1796 #ifdef DEBUGGING
1797                 const char *real_start = s;
1798 #endif
1799                 STRLEN maxlen = trie->maxlen;
1800                 SV *sv_points;
1801                 U8 **points; /* map of where we were in the input string
1802                                 when reading a given char. For ASCII this
1803                                 is unnecessary overhead as the relationship
1804                                 is always 1:1, but for Unicode, especially
1805                                 case folded Unicode this is not true. */
1806                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1807                 U8 *bitmap=NULL;
1808
1809
1810                 GET_RE_DEBUG_FLAGS_DECL;
1811
1812                 /* We can't just allocate points here. We need to wrap it in
1813                  * an SV so it gets freed properly if there is a croak while
1814                  * running the match */
1815                 ENTER;
1816                 SAVETMPS;
1817                 sv_points=newSV(maxlen * sizeof(U8 *));
1818                 SvCUR_set(sv_points,
1819                     maxlen * sizeof(U8 *));
1820                 SvPOK_on(sv_points);
1821                 sv_2mortal(sv_points);
1822                 points=(U8**)SvPV_nolen(sv_points );
1823                 if ( trie_type != trie_utf8_fold
1824                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1825                 {
1826                     if (trie->bitmap)
1827                         bitmap=(U8*)trie->bitmap;
1828                     else
1829                         bitmap=(U8*)ANYOF_BITMAP(c);
1830                 }
1831                 /* this is the Aho-Corasick algorithm modified a touch
1832                    to include special handling for long "unknown char"
1833                    sequences. The basic idea being that we use AC as long
1834                    as we are dealing with a possible matching char, when
1835                    we encounter an unknown char (and we have not encountered
1836                    an accepting state) we scan forward until we find a legal
1837                    starting char.
1838                    AC matching is basically that of trie matching, except
1839                    that when we encounter a failing transition, we fall back
1840                    to the current states "fail state", and try the current char
1841                    again, a process we repeat until we reach the root state,
1842                    state 1, or a legal transition. If we fail on the root state
1843                    then we can either terminate if we have reached an accepting
1844                    state previously, or restart the entire process from the beginning
1845                    if we have not.
1846
1847                  */
1848                 while (s <= last_start) {
1849                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1850                     U8 *uc = (U8*)s;
1851                     U16 charid = 0;
1852                     U32 base = 1;
1853                     U32 state = 1;
1854                     UV uvc = 0;
1855                     STRLEN len = 0;
1856                     STRLEN foldlen = 0;
1857                     U8 *uscan = (U8*)NULL;
1858                     U8 *leftmost = NULL;
1859 #ifdef DEBUGGING
1860                     U32 accepted_word= 0;
1861 #endif
1862                     U32 pointpos = 0;
1863
1864                     while ( state && uc <= (U8*)strend ) {
1865                         int failed=0;
1866                         U32 word = aho->states[ state ].wordnum;
1867
1868                         if( state==1 ) {
1869                             if ( bitmap ) {
1870                                 DEBUG_TRIE_EXECUTE_r(
1871                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1872                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1873                                             (char *)uc, utf8_target );
1874                                         PerlIO_printf( Perl_debug_log,
1875                                             " Scanning for legal start char...\n");
1876                                     }
1877                                 );
1878                                 if (utf8_target) {
1879                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1880                                         uc += UTF8SKIP(uc);
1881                                     }
1882                                 } else {
1883                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1884                                         uc++;
1885                                     }
1886                                 }
1887                                 s= (char *)uc;
1888                             }
1889                             if (uc >(U8*)last_start) break;
1890                         }
1891
1892                         if ( word ) {
1893                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1894                             if (!leftmost || lpos < leftmost) {
1895                                 DEBUG_r(accepted_word=word);
1896                                 leftmost= lpos;
1897                             }
1898                             if (base==0) break;
1899
1900                         }
1901                         points[pointpos++ % maxlen]= uc;
1902                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1903                                              uscan, len, uvc, charid, foldlen,
1904                                              foldbuf, uniflags);
1905                         DEBUG_TRIE_EXECUTE_r({
1906                             dump_exec_pos( (char *)uc, c, strend, real_start,
1907                                 s,   utf8_target );
1908                             PerlIO_printf(Perl_debug_log,
1909                                 " Charid:%3u CP:%4"UVxf" ",
1910                                  charid, uvc);
1911                         });
1912
1913                         do {
1914 #ifdef DEBUGGING
1915                             word = aho->states[ state ].wordnum;
1916 #endif
1917                             base = aho->states[ state ].trans.base;
1918
1919                             DEBUG_TRIE_EXECUTE_r({
1920                                 if (failed)
1921                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1922                                         s,   utf8_target );
1923                                 PerlIO_printf( Perl_debug_log,
1924                                     "%sState: %4"UVxf", word=%"UVxf,
1925                                     failed ? " Fail transition to " : "",
1926                                     (UV)state, (UV)word);
1927                             });
1928                             if ( base ) {
1929                                 U32 tmp;
1930                                 I32 offset;
1931                                 if (charid &&
1932                                      ( ((offset = base + charid
1933                                         - 1 - trie->uniquecharcount)) >= 0)
1934                                      && ((U32)offset < trie->lasttrans)
1935                                      && trie->trans[offset].check == state
1936                                      && (tmp=trie->trans[offset].next))
1937                                 {
1938                                     DEBUG_TRIE_EXECUTE_r(
1939                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1940                                     state = tmp;
1941                                     break;
1942                                 }
1943                                 else {
1944                                     DEBUG_TRIE_EXECUTE_r(
1945                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1946                                     failed = 1;
1947                                     state = aho->fail[state];
1948                                 }
1949                             }
1950                             else {
1951                                 /* we must be accepting here */
1952                                 DEBUG_TRIE_EXECUTE_r(
1953                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1954                                 failed = 1;
1955                                 break;
1956                             }
1957                         } while(state);
1958                         uc += len;
1959                         if (failed) {
1960                             if (leftmost)
1961                                 break;
1962                             if (!state) state = 1;
1963                         }
1964                     }
1965                     if ( aho->states[ state ].wordnum ) {
1966                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1967                         if (!leftmost || lpos < leftmost) {
1968                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1969                             leftmost = lpos;
1970                         }
1971                     }
1972                     if (leftmost) {
1973                         s = (char*)leftmost;
1974                         DEBUG_TRIE_EXECUTE_r({
1975                             PerlIO_printf(
1976                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1977                                 (UV)accepted_word, (IV)(s - real_start)
1978                             );
1979                         });
1980                         if (!reginfo || regtry(reginfo, &s)) {
1981                             FREETMPS;
1982                             LEAVE;
1983                             goto got_it;
1984                         }
1985                         s = HOPc(s,1);
1986                         DEBUG_TRIE_EXECUTE_r({
1987                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1988                         });
1989                     } else {
1990                         DEBUG_TRIE_EXECUTE_r(
1991                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1992                         break;
1993                     }
1994                 }
1995                 FREETMPS;
1996                 LEAVE;
1997             }
1998             break;
1999         default:
2000             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2001             break;
2002         }
2003         return 0;
2004       got_it:
2005         return s;
2006 }
2007
2008
2009 /*
2010  - regexec_flags - match a regexp against a string
2011  */
2012 I32
2013 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2014               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2015 /* strend: pointer to null at end of string */
2016 /* strbeg: real beginning of string */
2017 /* minend: end of match must be >=minend after stringarg. */
2018 /* data: May be used for some additional optimizations.
2019          Currently its only used, with a U32 cast, for transmitting
2020          the ganch offset when doing a /g match. This will change */
2021 /* nosave: For optimizations. */
2022 {
2023     dVAR;
2024     struct regexp *const prog = (struct regexp *)SvANY(rx);
2025     /*register*/ char *s;
2026     register regnode *c;
2027     /*register*/ char *startpos = stringarg;
2028     I32 minlen;         /* must match at least this many chars */
2029     I32 dontbother = 0; /* how many characters not to try at end */
2030     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2031     I32 scream_pos = -1;                /* Internal iterator of scream. */
2032     char *scream_olds = NULL;
2033     const bool utf8_target = cBOOL(DO_UTF8(sv));
2034     I32 multiline;
2035     RXi_GET_DECL(prog,progi);
2036     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2037     regexp_paren_pair *swap = NULL;
2038     GET_RE_DEBUG_FLAGS_DECL;
2039
2040     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2041     PERL_UNUSED_ARG(data);
2042
2043     /* Be paranoid... */
2044     if (prog == NULL || startpos == NULL) {
2045         Perl_croak(aTHX_ "NULL regexp parameter");
2046         return 0;
2047     }
2048
2049     multiline = prog->extflags & RXf_PMf_MULTILINE;
2050     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2051
2052     RX_MATCH_UTF8_set(rx, utf8_target);
2053     DEBUG_EXECUTE_r(
2054         debug_start_match(rx, utf8_target, startpos, strend,
2055         "Matching");
2056     );
2057
2058     minlen = prog->minlen;
2059
2060     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2061         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2062                               "String too short [regexec_flags]...\n"));
2063         goto phooey;
2064     }
2065
2066
2067     /* Check validity of program. */
2068     if (UCHARAT(progi->program) != REG_MAGIC) {
2069         Perl_croak(aTHX_ "corrupted regexp program");
2070     }
2071
2072     PL_reg_flags = 0;
2073     PL_reg_eval_set = 0;
2074     PL_reg_maxiter = 0;
2075
2076     if (RX_UTF8(rx))
2077         PL_reg_flags |= RF_utf8;
2078
2079     /* Mark beginning of line for ^ and lookbehind. */
2080     reginfo.bol = startpos; /* XXX not used ??? */
2081     PL_bostr  = strbeg;
2082     reginfo.sv = sv;
2083
2084     /* Mark end of line for $ (and such) */
2085     PL_regeol = strend;
2086
2087     /* see how far we have to get to not match where we matched before */
2088     reginfo.till = startpos+minend;
2089
2090     /* If there is a "must appear" string, look for it. */
2091     s = startpos;
2092
2093     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2094         MAGIC *mg;
2095         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2096             reginfo.ganch = startpos + prog->gofs;
2097             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2098               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2099         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2100                   && SvMAGIC(sv)
2101                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2102                   && mg->mg_len >= 0) {
2103             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2104             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2105                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2106
2107             if (prog->extflags & RXf_ANCH_GPOS) {
2108                 if (s > reginfo.ganch)
2109                     goto phooey;
2110                 s = reginfo.ganch - prog->gofs;
2111                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2112                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2113                 if (s < strbeg)
2114                     goto phooey;
2115             }
2116         }
2117         else if (data) {
2118             reginfo.ganch = strbeg + PTR2UV(data);
2119             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2120                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2121
2122         } else {                                /* pos() not defined */
2123             reginfo.ganch = strbeg;
2124             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2125                  "GPOS: reginfo.ganch = strbeg\n"));
2126         }
2127     }
2128     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2129         /* We have to be careful. If the previous successful match
2130            was from this regex we don't want a subsequent partially
2131            successful match to clobber the old results.
2132            So when we detect this possibility we add a swap buffer
2133            to the re, and switch the buffer each match. If we fail
2134            we switch it back, otherwise we leave it swapped.
2135         */
2136         swap = prog->offs;
2137         /* do we need a save destructor here for eval dies? */
2138         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2139     }
2140     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2141         re_scream_pos_data d;
2142
2143         d.scream_olds = &scream_olds;
2144         d.scream_pos = &scream_pos;
2145         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2146         if (!s) {
2147             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2148             goto phooey;        /* not present */
2149         }
2150     }
2151
2152
2153
2154     /* Simplest case:  anchored match need be tried only once. */
2155     /*  [unless only anchor is BOL and multiline is set] */
2156     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2157         if (s == startpos && regtry(&reginfo, &startpos))
2158             goto got_it;
2159         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2160                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2161         {
2162             char *end;
2163
2164             if (minlen)
2165                 dontbother = minlen - 1;
2166             end = HOP3c(strend, -dontbother, strbeg) - 1;
2167             /* for multiline we only have to try after newlines */
2168             if (prog->check_substr || prog->check_utf8) {
2169                 /* because of the goto we can not easily reuse the macros for bifurcating the
2170                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2171                 if (utf8_target) {
2172                     if (s == startpos)
2173                         goto after_try_utf8;
2174                     while (1) {
2175                         if (regtry(&reginfo, &s)) {
2176                             goto got_it;
2177                         }
2178                       after_try_utf8:
2179                         if (s > end) {
2180                             goto phooey;
2181                         }
2182                         if (prog->extflags & RXf_USE_INTUIT) {
2183                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2184                             if (!s) {
2185                                 goto phooey;
2186                             }
2187                         }
2188                         else {
2189                             s += UTF8SKIP(s);
2190                         }
2191                     }
2192                 } /* end search for check string in unicode */
2193                 else {
2194                     if (s == startpos) {
2195                         goto after_try_latin;
2196                     }
2197                     while (1) {
2198                         if (regtry(&reginfo, &s)) {
2199                             goto got_it;
2200                         }
2201                       after_try_latin:
2202                         if (s > end) {
2203                             goto phooey;
2204                         }
2205                         if (prog->extflags & RXf_USE_INTUIT) {
2206                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2207                             if (!s) {
2208                                 goto phooey;
2209                             }
2210                         }
2211                         else {
2212                             s++;
2213                         }
2214                     }
2215                 } /* end search for check string in latin*/
2216             } /* end search for check string */
2217             else { /* search for newline */
2218                 if (s > startpos) {
2219                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2220                     s--;
2221                 }
2222                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2223                 while (s < end) {
2224                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2225                         if (regtry(&reginfo, &s))
2226                             goto got_it;
2227                     }
2228                 }
2229             } /* end search for newline */
2230         } /* end anchored/multiline check string search */
2231         goto phooey;
2232     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2233     {
2234         /* the warning about reginfo.ganch being used without initialization
2235            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2236            and we only enter this block when the same bit is set. */
2237         char *tmp_s = reginfo.ganch - prog->gofs;
2238
2239         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2240             goto got_it;
2241         goto phooey;
2242     }
2243
2244     /* Messy cases:  unanchored match. */
2245     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2246         /* we have /x+whatever/ */
2247         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2248         char ch;
2249 #ifdef DEBUGGING
2250         int did_match = 0;
2251 #endif
2252         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2253             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2254         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2255
2256         if (utf8_target) {
2257             REXEC_FBC_SCAN(
2258                 if (*s == ch) {
2259                     DEBUG_EXECUTE_r( did_match = 1 );
2260                     if (regtry(&reginfo, &s)) goto got_it;
2261                     s += UTF8SKIP(s);
2262                     while (s < strend && *s == ch)
2263                         s += UTF8SKIP(s);
2264                 }
2265             );
2266         }
2267         else {
2268             REXEC_FBC_SCAN(
2269                 if (*s == ch) {
2270                     DEBUG_EXECUTE_r( did_match = 1 );
2271                     if (regtry(&reginfo, &s)) goto got_it;
2272                     s++;
2273                     while (s < strend && *s == ch)
2274                         s++;
2275                 }
2276             );
2277         }
2278         DEBUG_EXECUTE_r(if (!did_match)
2279                 PerlIO_printf(Perl_debug_log,
2280                                   "Did not find anchored character...\n")
2281                );
2282     }
2283     else if (prog->anchored_substr != NULL
2284               || prog->anchored_utf8 != NULL
2285               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2286                   && prog->float_max_offset < strend - s)) {
2287         SV *must;
2288         I32 back_max;
2289         I32 back_min;
2290         char *last;
2291         char *last1;            /* Last position checked before */
2292 #ifdef DEBUGGING
2293         int did_match = 0;
2294 #endif
2295         if (prog->anchored_substr || prog->anchored_utf8) {
2296             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2297                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2298             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2299             back_max = back_min = prog->anchored_offset;
2300         } else {
2301             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2302                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2303             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2304             back_max = prog->float_max_offset;
2305             back_min = prog->float_min_offset;
2306         }
2307
2308
2309         if (must == &PL_sv_undef)
2310             /* could not downgrade utf8 check substring, so must fail */
2311             goto phooey;
2312
2313         if (back_min<0) {
2314             last = strend;
2315         } else {
2316             last = HOP3c(strend,        /* Cannot start after this */
2317                   -(I32)(CHR_SVLEN(must)
2318                          - (SvTAIL(must) != 0) + back_min), strbeg);
2319         }
2320         if (s > PL_bostr)
2321             last1 = HOPc(s, -1);
2322         else
2323             last1 = s - 1;      /* bogus */
2324
2325         /* XXXX check_substr already used to find "s", can optimize if
2326            check_substr==must. */
2327         scream_pos = -1;
2328         dontbother = end_shift;
2329         strend = HOPc(strend, -dontbother);
2330         while ( (s <= last) &&
2331                 ((flags & REXEC_SCREAM)
2332                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2333                                     end_shift, &scream_pos, 0))
2334                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2335                                   (unsigned char*)strend, must,
2336                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2337             /* we may be pointing at the wrong string */
2338             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2339                 s = strbeg + (s - SvPVX_const(sv));
2340             DEBUG_EXECUTE_r( did_match = 1 );
2341             if (HOPc(s, -back_max) > last1) {
2342                 last1 = HOPc(s, -back_min);
2343                 s = HOPc(s, -back_max);
2344             }
2345             else {
2346                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2347
2348                 last1 = HOPc(s, -back_min);
2349                 s = t;
2350             }
2351             if (utf8_target) {
2352                 while (s <= last1) {
2353                     if (regtry(&reginfo, &s))
2354                         goto got_it;
2355                     s += UTF8SKIP(s);
2356                 }
2357             }
2358             else {
2359                 while (s <= last1) {
2360                     if (regtry(&reginfo, &s))
2361                         goto got_it;
2362                     s++;
2363                 }
2364             }
2365         }
2366         DEBUG_EXECUTE_r(if (!did_match) {
2367             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2368                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2369             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2370                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2371                                ? "anchored" : "floating"),
2372                 quoted, RE_SV_TAIL(must));
2373         });
2374         goto phooey;
2375     }
2376     else if ( (c = progi->regstclass) ) {
2377         if (minlen) {
2378             const OPCODE op = OP(progi->regstclass);
2379             /* don't bother with what can't match */
2380             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2381                 strend = HOPc(strend, -(minlen - 1));
2382         }
2383         DEBUG_EXECUTE_r({
2384             SV * const prop = sv_newmortal();
2385             regprop(prog, prop, c);
2386             {
2387                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2388                     s,strend-s,60);
2389                 PerlIO_printf(Perl_debug_log,
2390                     "Matching stclass %.*s against %s (%d bytes)\n",
2391                     (int)SvCUR(prop), SvPVX_const(prop),
2392                      quoted, (int)(strend - s));
2393             }
2394         });
2395         if (find_byclass(prog, c, s, strend, &reginfo))
2396             goto got_it;
2397         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2398     }
2399     else {
2400         dontbother = 0;
2401         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2402             /* Trim the end. */
2403             char *last;
2404             SV* float_real;
2405
2406             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2407                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2408             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2409
2410             if (flags & REXEC_SCREAM) {
2411                 last = screaminstr(sv, float_real, s - strbeg,
2412                                    end_shift, &scream_pos, 1); /* last one */
2413                 if (!last)
2414                     last = scream_olds; /* Only one occurrence. */
2415                 /* we may be pointing at the wrong string */
2416                 else if (RXp_MATCH_COPIED(prog))
2417                     s = strbeg + (s - SvPVX_const(sv));
2418             }
2419             else {
2420                 STRLEN len;
2421                 const char * const little = SvPV_const(float_real, len);
2422
2423                 if (SvTAIL(float_real)) {
2424                     if (memEQ(strend - len + 1, little, len - 1))
2425                         last = strend - len + 1;
2426                     else if (!multiline)
2427                         last = memEQ(strend - len, little, len)
2428                             ? strend - len : NULL;
2429                     else
2430                         goto find_last;
2431                 } else {
2432                   find_last:
2433                     if (len)
2434                         last = rninstr(s, strend, little, little + len);
2435                     else
2436                         last = strend;  /* matching "$" */
2437                 }
2438             }
2439             if (last == NULL) {
2440                 DEBUG_EXECUTE_r(
2441                     PerlIO_printf(Perl_debug_log,
2442                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2443                         PL_colors[4], PL_colors[5]));
2444                 goto phooey; /* Should not happen! */
2445             }
2446             dontbother = strend - last + prog->float_min_offset;
2447         }
2448         if (minlen && (dontbother < minlen))
2449             dontbother = minlen - 1;
2450         strend -= dontbother;              /* this one's always in bytes! */
2451         /* We don't know much -- general case. */
2452         if (utf8_target) {
2453             for (;;) {
2454                 if (regtry(&reginfo, &s))
2455                     goto got_it;
2456                 if (s >= strend)
2457                     break;
2458                 s += UTF8SKIP(s);
2459             };
2460         }
2461         else {
2462             do {
2463                 if (regtry(&reginfo, &s))
2464                     goto got_it;
2465             } while (s++ < strend);
2466         }
2467     }
2468
2469     /* Failure. */
2470     goto phooey;
2471
2472 got_it:
2473     Safefree(swap);
2474     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2475
2476     if (PL_reg_eval_set)
2477         restore_pos(aTHX_ prog);
2478     if (RXp_PAREN_NAMES(prog))
2479         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2480
2481     /* make sure $`, $&, $', and $digit will work later */
2482     if ( !(flags & REXEC_NOT_FIRST) ) {
2483         RX_MATCH_COPY_FREE(rx);
2484         if (flags & REXEC_COPY_STR) {
2485             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2486 #ifdef PERL_OLD_COPY_ON_WRITE
2487             if ((SvIsCOW(sv)
2488                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2489                 if (DEBUG_C_TEST) {
2490                     PerlIO_printf(Perl_debug_log,
2491                                   "Copy on write: regexp capture, type %d\n",
2492                                   (int) SvTYPE(sv));
2493                 }
2494                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2495                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2496                 assert (SvPOKp(prog->saved_copy));
2497             } else
2498 #endif
2499             {
2500                 RX_MATCH_COPIED_on(rx);
2501                 s = savepvn(strbeg, i);
2502                 prog->subbeg = s;
2503             }
2504             prog->sublen = i;
2505         }
2506         else {
2507             prog->subbeg = strbeg;
2508             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2509         }
2510     }
2511
2512     return 1;
2513
2514 phooey:
2515     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2516                           PL_colors[4], PL_colors[5]));
2517     if (PL_reg_eval_set)
2518         restore_pos(aTHX_ prog);
2519     if (swap) {
2520         /* we failed :-( roll it back */
2521         Safefree(prog->offs);
2522         prog->offs = swap;
2523     }
2524
2525     return 0;
2526 }
2527
2528
2529 /*
2530  - regtry - try match at specific point
2531  */
2532 STATIC I32                      /* 0 failure, 1 success */
2533 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2534 {
2535     dVAR;
2536     CHECKPOINT lastcp;
2537     REGEXP *const rx = reginfo->prog;
2538     regexp *const prog = (struct regexp *)SvANY(rx);
2539     RXi_GET_DECL(prog,progi);
2540     GET_RE_DEBUG_FLAGS_DECL;
2541
2542     PERL_ARGS_ASSERT_REGTRY;
2543
2544     reginfo->cutpoint=NULL;
2545
2546     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2547         MAGIC *mg;
2548
2549         PL_reg_eval_set = RS_init;
2550         DEBUG_EXECUTE_r(DEBUG_s(
2551             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2552                           (IV)(PL_stack_sp - PL_stack_base));
2553             ));
2554         SAVESTACK_CXPOS();
2555         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2556         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2557         SAVETMPS;
2558         /* Apparently this is not needed, judging by wantarray. */
2559         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2560            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2561
2562         if (reginfo->sv) {
2563             /* Make $_ available to executed code. */
2564             if (reginfo->sv != DEFSV) {
2565                 SAVE_DEFSV;
2566                 DEFSV_set(reginfo->sv);
2567             }
2568
2569             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2570                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2571                 /* prepare for quick setting of pos */
2572 #ifdef PERL_OLD_COPY_ON_WRITE
2573                 if (SvIsCOW(reginfo->sv))
2574                     sv_force_normal_flags(reginfo->sv, 0);
2575 #endif
2576                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2577                                  &PL_vtbl_mglob, NULL, 0);
2578                 mg->mg_len = -1;
2579             }
2580             PL_reg_magic    = mg;
2581             PL_reg_oldpos   = mg->mg_len;
2582             SAVEDESTRUCTOR_X(restore_pos, prog);
2583         }
2584         if (!PL_reg_curpm) {
2585             Newxz(PL_reg_curpm, 1, PMOP);
2586 #ifdef USE_ITHREADS
2587             {
2588                 SV* const repointer = &PL_sv_undef;
2589                 /* this regexp is also owned by the new PL_reg_curpm, which
2590                    will try to free it.  */
2591                 av_push(PL_regex_padav, repointer);
2592                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2593                 PL_regex_pad = AvARRAY(PL_regex_padav);
2594             }
2595 #endif
2596         }
2597 #ifdef USE_ITHREADS
2598         /* It seems that non-ithreads works both with and without this code.
2599            So for efficiency reasons it seems best not to have the code
2600            compiled when it is not needed.  */
2601         /* This is safe against NULLs: */
2602         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2603         /* PM_reg_curpm owns a reference to this regexp.  */
2604         ReREFCNT_inc(rx);
2605 #endif
2606         PM_SETRE(PL_reg_curpm, rx);
2607         PL_reg_oldcurpm = PL_curpm;
2608         PL_curpm = PL_reg_curpm;
2609         if (RXp_MATCH_COPIED(prog)) {
2610             /*  Here is a serious problem: we cannot rewrite subbeg,
2611                 since it may be needed if this match fails.  Thus
2612                 $` inside (?{}) could fail... */
2613             PL_reg_oldsaved = prog->subbeg;
2614             PL_reg_oldsavedlen = prog->sublen;
2615 #ifdef PERL_OLD_COPY_ON_WRITE
2616             PL_nrs = prog->saved_copy;
2617 #endif
2618             RXp_MATCH_COPIED_off(prog);
2619         }
2620         else
2621             PL_reg_oldsaved = NULL;
2622         prog->subbeg = PL_bostr;
2623         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2624     }
2625     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2626     prog->offs[0].start = *startpos - PL_bostr;
2627     PL_reginput = *startpos;
2628     PL_reglastparen = &prog->lastparen;
2629     PL_reglastcloseparen = &prog->lastcloseparen;
2630     prog->lastparen = 0;
2631     prog->lastcloseparen = 0;
2632     PL_regsize = 0;
2633     PL_regoffs = prog->offs;
2634     if (PL_reg_start_tmpl <= prog->nparens) {
2635         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2636         if(PL_reg_start_tmp)
2637             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2638         else
2639             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2640     }
2641
2642     /* XXXX What this code is doing here?!!!  There should be no need
2643        to do this again and again, PL_reglastparen should take care of
2644        this!  --ilya*/
2645
2646     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2647      * Actually, the code in regcppop() (which Ilya may be meaning by
2648      * PL_reglastparen), is not needed at all by the test suite
2649      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2650      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2651      * Meanwhile, this code *is* needed for the
2652      * above-mentioned test suite tests to succeed.  The common theme
2653      * on those tests seems to be returning null fields from matches.
2654      * --jhi updated by dapm */
2655 #if 1
2656     if (prog->nparens) {
2657         regexp_paren_pair *pp = PL_regoffs;
2658         register I32 i;
2659         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2660             ++pp;
2661             pp->start = -1;
2662             pp->end = -1;
2663         }
2664     }
2665 #endif
2666     REGCP_SET(lastcp);
2667     if (regmatch(reginfo, progi->program + 1)) {
2668         PL_regoffs[0].end = PL_reginput - PL_bostr;
2669         return 1;
2670     }
2671     if (reginfo->cutpoint)
2672         *startpos= reginfo->cutpoint;
2673     REGCP_UNWIND(lastcp);
2674     return 0;
2675 }
2676
2677
2678 #define sayYES goto yes
2679 #define sayNO goto no
2680 #define sayNO_SILENT goto no_silent
2681
2682 /* we dont use STMT_START/END here because it leads to
2683    "unreachable code" warnings, which are bogus, but distracting. */
2684 #define CACHEsayNO \
2685     if (ST.cache_mask) \
2686        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2687     sayNO
2688
2689 /* this is used to determine how far from the left messages like
2690    'failed...' are printed. It should be set such that messages
2691    are inline with the regop output that created them.
2692 */
2693 #define REPORT_CODE_OFF 32
2694
2695
2696 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2697 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2698
2699 #define SLAB_FIRST(s) (&(s)->states[0])
2700 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2701
2702 /* grab a new slab and return the first slot in it */
2703
2704 STATIC regmatch_state *
2705 S_push_slab(pTHX)
2706 {
2707 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2708     dMY_CXT;
2709 #endif
2710     regmatch_slab *s = PL_regmatch_slab->next;
2711     if (!s) {
2712         Newx(s, 1, regmatch_slab);
2713         s->prev = PL_regmatch_slab;
2714         s->next = NULL;
2715         PL_regmatch_slab->next = s;
2716     }
2717     PL_regmatch_slab = s;
2718     return SLAB_FIRST(s);
2719 }
2720
2721
2722 /* push a new state then goto it */
2723
2724 #define PUSH_STATE_GOTO(state, node) \
2725     scan = node; \
2726     st->resume_state = state; \
2727     goto push_state;
2728
2729 /* push a new state with success backtracking, then goto it */
2730
2731 #define PUSH_YES_STATE_GOTO(state, node) \
2732     scan = node; \
2733     st->resume_state = state; \
2734     goto push_yes_state;
2735
2736
2737
2738 /*
2739
2740 regmatch() - main matching routine
2741
2742 This is basically one big switch statement in a loop. We execute an op,
2743 set 'next' to point the next op, and continue. If we come to a point which
2744 we may need to backtrack to on failure such as (A|B|C), we push a
2745 backtrack state onto the backtrack stack. On failure, we pop the top
2746 state, and re-enter the loop at the state indicated. If there are no more
2747 states to pop, we return failure.
2748
2749 Sometimes we also need to backtrack on success; for example /A+/, where
2750 after successfully matching one A, we need to go back and try to
2751 match another one; similarly for lookahead assertions: if the assertion
2752 completes successfully, we backtrack to the state just before the assertion
2753 and then carry on.  In these cases, the pushed state is marked as
2754 'backtrack on success too'. This marking is in fact done by a chain of
2755 pointers, each pointing to the previous 'yes' state. On success, we pop to
2756 the nearest yes state, discarding any intermediate failure-only states.
2757 Sometimes a yes state is pushed just to force some cleanup code to be
2758 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2759 it to free the inner regex.
2760
2761 Note that failure backtracking rewinds the cursor position, while
2762 success backtracking leaves it alone.
2763
2764 A pattern is complete when the END op is executed, while a subpattern
2765 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2766 ops trigger the "pop to last yes state if any, otherwise return true"
2767 behaviour.
2768
2769 A common convention in this function is to use A and B to refer to the two
2770 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2771 the subpattern to be matched possibly multiple times, while B is the entire
2772 rest of the pattern. Variable and state names reflect this convention.
2773
2774 The states in the main switch are the union of ops and failure/success of
2775 substates associated with with that op.  For example, IFMATCH is the op
2776 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2777 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2778 successfully matched A and IFMATCH_A_fail is a state saying that we have
2779 just failed to match A. Resume states always come in pairs. The backtrack
2780 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2781 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2782 on success or failure.
2783
2784 The struct that holds a backtracking state is actually a big union, with
2785 one variant for each major type of op. The variable st points to the
2786 top-most backtrack struct. To make the code clearer, within each
2787 block of code we #define ST to alias the relevant union.
2788
2789 Here's a concrete example of a (vastly oversimplified) IFMATCH
2790 implementation:
2791
2792     switch (state) {
2793     ....
2794
2795 #define ST st->u.ifmatch
2796
2797     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2798         ST.foo = ...; // some state we wish to save
2799         ...
2800         // push a yes backtrack state with a resume value of
2801         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2802         // first node of A:
2803         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2804         // NOTREACHED
2805
2806     case IFMATCH_A: // we have successfully executed A; now continue with B
2807         next = B;
2808         bar = ST.foo; // do something with the preserved value
2809         break;
2810
2811     case IFMATCH_A_fail: // A failed, so the assertion failed
2812         ...;   // do some housekeeping, then ...
2813         sayNO; // propagate the failure
2814
2815 #undef ST
2816
2817     ...
2818     }
2819
2820 For any old-timers reading this who are familiar with the old recursive
2821 approach, the code above is equivalent to:
2822
2823     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2824     {
2825         int foo = ...
2826         ...
2827         if (regmatch(A)) {
2828             next = B;
2829             bar = foo;
2830             break;
2831         }
2832         ...;   // do some housekeeping, then ...
2833         sayNO; // propagate the failure
2834     }
2835
2836 The topmost backtrack state, pointed to by st, is usually free. If you
2837 want to claim it, populate any ST.foo fields in it with values you wish to
2838 save, then do one of
2839
2840         PUSH_STATE_GOTO(resume_state, node);
2841         PUSH_YES_STATE_GOTO(resume_state, node);
2842
2843 which sets that backtrack state's resume value to 'resume_state', pushes a
2844 new free entry to the top of the backtrack stack, then goes to 'node'.
2845 On backtracking, the free slot is popped, and the saved state becomes the
2846 new free state. An ST.foo field in this new top state can be temporarily
2847 accessed to retrieve values, but once the main loop is re-entered, it
2848 becomes available for reuse.
2849
2850 Note that the depth of the backtrack stack constantly increases during the
2851 left-to-right execution of the pattern, rather than going up and down with
2852 the pattern nesting. For example the stack is at its maximum at Z at the
2853 end of the pattern, rather than at X in the following:
2854
2855     /(((X)+)+)+....(Y)+....Z/
2856
2857 The only exceptions to this are lookahead/behind assertions and the cut,
2858 (?>A), which pop all the backtrack states associated with A before
2859 continuing.
2860
2861 Backtrack state structs are allocated in slabs of about 4K in size.
2862 PL_regmatch_state and st always point to the currently active state,
2863 and PL_regmatch_slab points to the slab currently containing
2864 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2865 allocated, and is never freed until interpreter destruction. When the slab
2866 is full, a new one is allocated and chained to the end. At exit from
2867 regmatch(), slabs allocated since entry are freed.
2868
2869 */
2870
2871
2872 #define DEBUG_STATE_pp(pp)                                  \
2873     DEBUG_STATE_r({                                         \
2874         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2875         PerlIO_printf(Perl_debug_log,                       \
2876             "    %*s"pp" %s%s%s%s%s\n",                     \
2877             depth*2, "",                                    \
2878             PL_reg_name[st->resume_state],                     \
2879             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2880             ((st==yes_state) ? "Y" : ""),                   \
2881             ((st==mark_state) ? "M" : ""),                  \
2882             ((st==yes_state||st==mark_state) ? "]" : "")    \
2883         );                                                  \
2884     });
2885
2886
2887 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2888
2889 #ifdef DEBUGGING
2890
2891 STATIC void
2892 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2893     const char *start, const char *end, const char *blurb)
2894 {
2895     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2896
2897     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2898
2899     if (!PL_colorset)
2900             reginitcolors();
2901     {
2902         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2903             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2904
2905         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2906             start, end - start, 60);
2907
2908         PerlIO_printf(Perl_debug_log,
2909             "%s%s REx%s %s against %s\n",
2910                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2911
2912         if (utf8_target||utf8_pat)
2913             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2914                 utf8_pat ? "pattern" : "",
2915                 utf8_pat && utf8_target ? " and " : "",
2916                 utf8_target ? "string" : ""
2917             );
2918     }
2919 }
2920
2921 STATIC void
2922 S_dump_exec_pos(pTHX_ const char *locinput,
2923                       const regnode *scan,
2924                       const char *loc_regeol,
2925                       const char *loc_bostr,
2926                       const char *loc_reg_starttry,
2927                       const bool utf8_target)
2928 {
2929     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2930     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2931     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2932     /* The part of the string before starttry has one color
2933        (pref0_len chars), between starttry and current
2934        position another one (pref_len - pref0_len chars),
2935        after the current position the third one.
2936        We assume that pref0_len <= pref_len, otherwise we
2937        decrease pref0_len.  */
2938     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2939         ? (5 + taill) - l : locinput - loc_bostr;
2940     int pref0_len;
2941
2942     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2943
2944     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2945         pref_len++;
2946     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2947     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2948         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2949               ? (5 + taill) - pref_len : loc_regeol - locinput);
2950     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2951         l--;
2952     if (pref0_len < 0)
2953         pref0_len = 0;
2954     if (pref0_len > pref_len)
2955         pref0_len = pref_len;
2956     {
2957         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2958
2959         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2960             (locinput - pref_len),pref0_len, 60, 4, 5);
2961
2962         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2963                     (locinput - pref_len + pref0_len),
2964                     pref_len - pref0_len, 60, 2, 3);
2965
2966         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2967                     locinput, loc_regeol - locinput, 10, 0, 1);
2968
2969         const STRLEN tlen=len0+len1+len2;
2970         PerlIO_printf(Perl_debug_log,
2971                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2972                     (IV)(locinput - loc_bostr),
2973                     len0, s0,
2974                     len1, s1,
2975                     (docolor ? "" : "> <"),
2976                     len2, s2,
2977                     (int)(tlen > 19 ? 0 :  19 - tlen),
2978                     "");
2979     }
2980 }
2981
2982 #endif
2983
2984 /* reg_check_named_buff_matched()
2985  * Checks to see if a named buffer has matched. The data array of
2986  * buffer numbers corresponding to the buffer is expected to reside
2987  * in the regexp->data->data array in the slot stored in the ARG() of
2988  * node involved. Note that this routine doesn't actually care about the
2989  * name, that information is not preserved from compilation to execution.
2990  * Returns the index of the leftmost defined buffer with the given name
2991  * or 0 if non of the buffers matched.
2992  */
2993 STATIC I32
2994 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2995 {
2996     I32 n;
2997     RXi_GET_DECL(rex,rexi);
2998     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2999     I32 *nums=(I32*)SvPVX(sv_dat);
3000
3001     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3002
3003     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3004         if ((I32)*PL_reglastparen >= nums[n] &&
3005             PL_regoffs[nums[n]].end != -1)
3006         {
3007             return nums[n];
3008         }
3009     }
3010     return 0;
3011 }
3012
3013
3014 /* free all slabs above current one  - called during LEAVE_SCOPE */
3015
3016 STATIC void
3017 S_clear_backtrack_stack(pTHX_ void *p)
3018 {
3019     regmatch_slab *s = PL_regmatch_slab->next;
3020     PERL_UNUSED_ARG(p);
3021
3022     if (!s)
3023         return;
3024     PL_regmatch_slab->next = NULL;
3025     while (s) {
3026         regmatch_slab * const osl = s;
3027         s = s->next;
3028         Safefree(osl);
3029     }
3030 }
3031
3032
3033 #define SETREX(Re1,Re2) \
3034     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3035     Re1 = (Re2)
3036
3037 STATIC I32                      /* 0 failure, 1 success */
3038 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3039 {
3040 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3041     dMY_CXT;
3042 #endif
3043     dVAR;
3044     register const bool utf8_target = PL_reg_match_utf8;
3045     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3046     REGEXP *rex_sv = reginfo->prog;
3047     regexp *rex = (struct regexp *)SvANY(rex_sv);
3048     RXi_GET_DECL(rex,rexi);
3049     I32 oldsave;
3050     /* the current state. This is a cached copy of PL_regmatch_state */
3051     register regmatch_state *st;
3052     /* cache heavy used fields of st in registers */
3053     register regnode *scan;
3054     register regnode *next;
3055     register U32 n = 0; /* general value; init to avoid compiler warning */
3056     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3057     register char *locinput = PL_reginput;
3058     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3059
3060     bool result = 0;        /* return value of S_regmatch */
3061     int depth = 0;          /* depth of backtrack stack */
3062     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3063     const U32 max_nochange_depth =
3064         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3065         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3066     regmatch_state *yes_state = NULL; /* state to pop to on success of
3067                                                             subpattern */
3068     /* mark_state piggy backs on the yes_state logic so that when we unwind
3069        the stack on success we can update the mark_state as we go */
3070     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3071     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3072     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3073     U32 state_num;
3074     bool no_final = 0;      /* prevent failure from backtracking? */
3075     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3076     char *startpoint = PL_reginput;
3077     SV *popmark = NULL;     /* are we looking for a mark? */
3078     SV *sv_commit = NULL;   /* last mark name seen in failure */
3079     SV *sv_yes_mark = NULL; /* last mark name we have seen
3080                                during a successful match */
3081     U32 lastopen = 0;       /* last open we saw */
3082     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3083     SV* const oreplsv = GvSV(PL_replgv);
3084     /* these three flags are set by various ops to signal information to
3085      * the very next op. They have a useful lifetime of exactly one loop
3086      * iteration, and are not preserved or restored by state pushes/pops
3087      */
3088     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3089     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3090     int logical = 0;        /* the following EVAL is:
3091                                 0: (?{...})
3092                                 1: (?(?{...})X|Y)
3093                                 2: (??{...})
3094                                or the following IFMATCH/UNLESSM is:
3095                                 false: plain (?=foo)
3096                                 true:  used as a condition: (?(?=foo))
3097                             */
3098 #ifdef DEBUGGING
3099     GET_RE_DEBUG_FLAGS_DECL;
3100 #endif
3101
3102     PERL_ARGS_ASSERT_REGMATCH;
3103
3104     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3105             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3106     }));
3107     /* on first ever call to regmatch, allocate first slab */
3108     if (!PL_regmatch_slab) {
3109         Newx(PL_regmatch_slab, 1, regmatch_slab);
3110         PL_regmatch_slab->prev = NULL;
3111         PL_regmatch_slab->next = NULL;
3112         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3113     }
3114
3115     oldsave = PL_savestack_ix;
3116     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3117     SAVEVPTR(PL_regmatch_slab);
3118     SAVEVPTR(PL_regmatch_state);
3119
3120     /* grab next free state slot */
3121     st = ++PL_regmatch_state;
3122     if (st >  SLAB_LAST(PL_regmatch_slab))
3123         st = PL_regmatch_state = S_push_slab(aTHX);
3124
3125     /* Note that nextchr is a byte even in UTF */
3126     nextchr = UCHARAT(locinput);
3127     scan = prog;
3128     while (scan != NULL) {
3129
3130         DEBUG_EXECUTE_r( {
3131             SV * const prop = sv_newmortal();
3132             regnode *rnext=regnext(scan);
3133             DUMP_EXEC_POS( locinput, scan, utf8_target );
3134             regprop(rex, prop, scan);
3135
3136             PerlIO_printf(Perl_debug_log,
3137                     "%3"IVdf":%*s%s(%"IVdf")\n",
3138                     (IV)(scan - rexi->program), depth*2, "",
3139                     SvPVX_const(prop),
3140                     (PL_regkind[OP(scan)] == END || !rnext) ?
3141                         0 : (IV)(rnext - rexi->program));
3142         });
3143
3144         next = scan + NEXT_OFF(scan);
3145         if (next == scan)
3146             next = NULL;
3147         state_num = OP(scan);
3148
3149         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3150       reenter_switch:
3151
3152         assert(PL_reglastparen == &rex->lastparen);
3153         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3154         assert(PL_regoffs == rex->offs);
3155
3156         switch (state_num) {
3157         case BOL:
3158             if (locinput == PL_bostr)
3159             {
3160                 /* reginfo->till = reginfo->bol; */
3161                 break;
3162             }
3163             sayNO;
3164         case MBOL:
3165             if (locinput == PL_bostr ||
3166                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3167             {
3168                 break;
3169             }
3170             sayNO;
3171         case SBOL:
3172             if (locinput == PL_bostr)
3173                 break;
3174             sayNO;
3175         case GPOS:
3176             if (locinput == reginfo->ganch)
3177                 break;
3178             sayNO;
3179
3180         case KEEPS:
3181             /* update the startpoint */
3182             st->u.keeper.val = PL_regoffs[0].start;
3183             PL_reginput = locinput;
3184             PL_regoffs[0].start = locinput - PL_bostr;
3185             PUSH_STATE_GOTO(KEEPS_next, next);
3186             /*NOT-REACHED*/
3187         case KEEPS_next_fail:
3188             /* rollback the start point change */
3189             PL_regoffs[0].start = st->u.keeper.val;
3190             sayNO_SILENT;
3191             /*NOT-REACHED*/
3192         case EOL:
3193                 goto seol;
3194         case MEOL:
3195             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3196                 sayNO;
3197             break;
3198         case SEOL:
3199           seol:
3200             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3201                 sayNO;
3202             if (PL_regeol - locinput > 1)
3203                 sayNO;
3204             break;
3205         case EOS:
3206             if (PL_regeol != locinput)
3207                 sayNO;
3208             break;
3209         case SANY:
3210             if (!nextchr && locinput >= PL_regeol)
3211                 sayNO;
3212             if (utf8_target) {
3213                 locinput += PL_utf8skip[nextchr];
3214                 if (locinput > PL_regeol)
3215                     sayNO;
3216                 nextchr = UCHARAT(locinput);
3217             }
3218             else
3219                 nextchr = UCHARAT(++locinput);
3220             break;
3221         case CANY:
3222             if (!nextchr && locinput >= PL_regeol)
3223                 sayNO;
3224             nextchr = UCHARAT(++locinput);
3225             break;
3226         case REG_ANY:
3227             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3228                 sayNO;
3229             if (utf8_target) {
3230                 locinput += PL_utf8skip[nextchr];
3231                 if (locinput > PL_regeol)
3232                     sayNO;
3233                 nextchr = UCHARAT(locinput);
3234             }
3235             else
3236                 nextchr = UCHARAT(++locinput);
3237             break;
3238
3239 #undef  ST
3240 #define ST st->u.trie
3241         case TRIEC:
3242             /* In this case the charclass data is available inline so
3243                we can fail fast without a lot of extra overhead.
3244              */
3245             if (scan->flags == EXACT || !utf8_target) {
3246                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3247                     DEBUG_EXECUTE_r(
3248                         PerlIO_printf(Perl_debug_log,
3249                                   "%*s  %sfailed to match trie start class...%s\n",
3250                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3251                     );
3252                     sayNO_SILENT;
3253                     /* NOTREACHED */
3254                 }
3255             }
3256             /* FALL THROUGH */
3257         case TRIE:
3258             /* the basic plan of execution of the trie is:
3259              * At the beginning, run though all the states, and
3260              * find the longest-matching word. Also remember the position
3261              * of the shortest matching word. For example, this pattern:
3262              *    1  2 3 4    5
3263              *    ab|a|x|abcd|abc
3264              * when matched against the string "abcde", will generate
3265              * accept states for all words except 3, with the longest
3266              * matching word being 4, and the shortest being 1 (with
3267              * the position being after char 1 of the string).
3268              *
3269              * Then for each matching word, in word order (i.e. 1,2,4,5),
3270              * we run the remainder of the pattern; on each try setting
3271              * the current position to the character following the word,
3272              * returning to try the next word on failure.
3273              *
3274              * We avoid having to build a list of words at runtime by
3275              * using a compile-time structure, wordinfo[].prev, which
3276              * gives, for each word, the previous accepting word (if any).
3277              * In the case above it would contain the mappings 1->2, 2->0,
3278              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3279              * the longest word (4 above), a list of all words, by
3280              * following the list of prev pointers; this gives us the
3281              * unordered list 4,5,1,2. Then given the current word we have
3282              * just tried, we can go through the list and find the
3283              * next-biggest word to try (so if we just failed on word 2,
3284              * the next in the list is 4).
3285              *
3286              * Since at runtime we don't record the matching position in
3287              * the string for each word, we have to work that out for
3288              * each word we're about to process. The wordinfo table holds
3289              * the character length of each word; given that we recorded
3290              * at the start: the position of the shortest word and its
3291              * length in chars, we just need to move the pointer the
3292              * difference between the two char lengths. Depending on
3293              * Unicode status and folding, that's cheap or expensive.
3294              *
3295              * This algorithm is optimised for the case where are only a
3296              * small number of accept states, i.e. 0,1, or maybe 2.
3297              * With lots of accepts states, and having to try all of them,
3298              * it becomes quadratic on number of accept states to find all
3299              * the next words.
3300              */
3301
3302             {
3303                 /* what type of TRIE am I? (utf8 makes this contextual) */
3304                 DECL_TRIE_TYPE(scan);
3305
3306                 /* what trie are we using right now */
3307                 reg_trie_data * const trie
3308                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3309                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3310                 U32 state = trie->startstate;
3311
3312                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3313                     !TRIE_BITMAP_TEST(trie,*locinput)
3314                 ) {
3315                     if (trie->states[ state ].wordnum) {
3316                          DEBUG_EXECUTE_r(
3317                             PerlIO_printf(Perl_debug_log,
3318                                           "%*s  %smatched empty string...%s\n",
3319                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3320                         );
3321                         if (!trie->jump)
3322                             break;
3323                     } else {
3324                         DEBUG_EXECUTE_r(
3325                             PerlIO_printf(Perl_debug_log,
3326                                           "%*s  %sfailed to match trie start class...%s\n",
3327                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3328                         );
3329                         sayNO_SILENT;
3330                    }
3331                 }
3332
3333             {
3334                 U8 *uc = ( U8* )locinput;
3335
3336                 STRLEN len = 0;
3337                 STRLEN foldlen = 0;
3338                 U8 *uscan = (U8*)NULL;
3339                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3340                 U32 charcount = 0; /* how many input chars we have matched */
3341                 U32 accepted = 0; /* have we seen any accepting states? */
3342
3343                 ST.B = next;
3344                 ST.jump = trie->jump;
3345                 ST.me = scan;
3346                 ST.firstpos = NULL;
3347                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3348                 ST.nextword = 0;
3349
3350                 /* fully traverse the TRIE; note the position of the
3351                    shortest accept state and the wordnum of the longest
3352                    accept state */
3353
3354                 while ( state && uc <= (U8*)PL_regeol ) {
3355                     U32 base = trie->states[ state ].trans.base;
3356                     UV uvc = 0;
3357                     U16 charid = 0;
3358                     U16 wordnum;
3359                     wordnum = trie->states[ state ].wordnum;
3360
3361                     if (wordnum) { /* it's an accept state */
3362                         if (!accepted) {
3363                             accepted = 1;
3364                             /* record first match position */
3365                             if (ST.longfold) {
3366                                 ST.firstpos = (U8*)locinput;
3367                                 ST.firstchars = 0;
3368                             }
3369                             else {
3370                                 ST.firstpos = uc;
3371                                 ST.firstchars = charcount;
3372                             }
3373                         }
3374                         if (!ST.nextword || wordnum < ST.nextword)
3375                             ST.nextword = wordnum;
3376                         ST.topword = wordnum;
3377                     }
3378
3379                     DEBUG_TRIE_EXECUTE_r({
3380                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3381                                 PerlIO_printf( Perl_debug_log,
3382                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3383                                     2+depth * 2, "", PL_colors[4],
3384                                     (UV)state, (accepted ? 'Y' : 'N'));
3385                     });
3386
3387                     /* read a char and goto next state */
3388                     if ( base ) {
3389                         I32 offset;
3390                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3391                                              uscan, len, uvc, charid, foldlen,
3392                                              foldbuf, uniflags);
3393                         charcount++;
3394                         if (foldlen>0)
3395                             ST.longfold = TRUE;
3396                         if (charid &&
3397                              ( ((offset =
3398                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3399
3400                              && ((U32)offset < trie->lasttrans)
3401                              && trie->trans[offset].check == state)
3402                         {
3403                             state = trie->trans[offset].next;
3404                         }
3405                         else {
3406                             state = 0;
3407                         }
3408                         uc += len;
3409
3410                     }
3411                     else {
3412                         state = 0;
3413                     }
3414                     DEBUG_TRIE_EXECUTE_r(
3415                         PerlIO_printf( Perl_debug_log,
3416                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3417                             charid, uvc, (UV)state, PL_colors[5] );
3418                     );
3419                 }
3420                 if (!accepted)
3421                    sayNO;
3422
3423                 /* calculate total number of accept states */
3424                 {
3425                     U16 w = ST.topword;
3426                     accepted = 0;
3427                     while (w) {
3428                         w = trie->wordinfo[w].prev;
3429                         accepted++;
3430                     }
3431                     ST.accepted = accepted;
3432                 }
3433
3434                 DEBUG_EXECUTE_r(
3435                     PerlIO_printf( Perl_debug_log,
3436                         "%*s  %sgot %"IVdf" possible matches%s\n",
3437                         REPORT_CODE_OFF + depth * 2, "",
3438                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3439                 );
3440                 goto trie_first_try; /* jump into the fail handler */
3441             }}
3442             /* NOTREACHED */
3443
3444         case TRIE_next_fail: /* we failed - try next alternative */
3445             if ( ST.jump) {
3446                 REGCP_UNWIND(ST.cp);
3447                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3448                     PL_regoffs[n].end = -1;
3449                 *PL_reglastparen = n;
3450             }
3451             if (!--ST.accepted) {
3452                 DEBUG_EXECUTE_r({
3453                     PerlIO_printf( Perl_debug_log,
3454                         "%*s  %sTRIE failed...%s\n",
3455                         REPORT_CODE_OFF+depth*2, "",
3456                         PL_colors[4],
3457                         PL_colors[5] );
3458                 });
3459                 sayNO_SILENT;
3460             }
3461             {
3462                 /* Find next-highest word to process.  Note that this code
3463                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3464                 register U16 min = 0;
3465                 register U16 word;
3466                 register U16 const nextword = ST.nextword;
3467                 register reg_trie_wordinfo * const wordinfo
3468                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3469                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3470                     if (word > nextword && (!min || word < min))
3471                         min = word;
3472                 }
3473                 ST.nextword = min;
3474             }
3475
3476           trie_first_try:
3477             if (do_cutgroup) {
3478                 do_cutgroup = 0;
3479                 no_final = 0;
3480             }
3481
3482             if ( ST.jump) {
3483                 ST.lastparen = *PL_reglastparen;
3484                 REGCP_SET(ST.cp);
3485             }
3486
3487             /* find start char of end of current word */
3488             {
3489                 U32 chars; /* how many chars to skip */
3490                 U8 *uc = ST.firstpos;
3491                 reg_trie_data * const trie
3492                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3493
3494                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3495                             >=  ST.firstchars);
3496                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3497                             - ST.firstchars;
3498
3499                 if (ST.longfold) {
3500                     /* the hard option - fold each char in turn and find
3501                      * its folded length (which may be different */
3502                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3503                     STRLEN foldlen;
3504                     STRLEN len;
3505                     UV uvc;
3506                     U8 *uscan;
3507
3508                     while (chars) {
3509                         if (utf8_target) {
3510                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3511                                                     uniflags);
3512                             uc += len;
3513                         }
3514                         else {
3515                             uvc = *uc;
3516                             uc++;
3517                         }
3518                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3519                         uscan = foldbuf;
3520                         while (foldlen) {
3521                             if (!--chars)
3522                                 break;
3523                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3524                                             uniflags);
3525                             uscan += len;
3526                             foldlen -= len;
3527                         }
3528                     }
3529                 }
3530                 else {
3531                     if (utf8_target)
3532                         while (chars--)
3533                             uc += UTF8SKIP(uc);
3534                     else
3535                         uc += chars;
3536                 }
3537                 PL_reginput = (char *)uc;
3538             }
3539
3540             scan = (ST.jump && ST.jump[ST.nextword])
3541                         ? ST.me + ST.jump[ST.nextword]
3542                         : ST.B;
3543
3544             DEBUG_EXECUTE_r({
3545                 PerlIO_printf( Perl_debug_log,
3546                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3547                     REPORT_CODE_OFF+depth*2, "",
3548                     PL_colors[4],
3549                     ST.nextword,
3550                     PL_colors[5]
3551                     );
3552             });
3553
3554             if (ST.accepted > 1 || has_cutgroup) {
3555                 PUSH_STATE_GOTO(TRIE_next, scan);
3556                 /* NOTREACHED */
3557             }
3558             /* only one choice left - just continue */
3559             DEBUG_EXECUTE_r({
3560                 AV *const trie_words
3561                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3562                 SV ** const tmp = av_fetch( trie_words,
3563                     ST.nextword-1, 0 );
3564                 SV *sv= tmp ? sv_newmortal() : NULL;
3565
3566                 PerlIO_printf( Perl_debug_log,
3567                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3568                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3569                     ST.nextword,
3570                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3571                             PL_colors[0], PL_colors[1],
3572                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3573                         )
3574                     : "not compiled under -Dr",
3575                     PL_colors[5] );
3576             });
3577
3578             locinput = PL_reginput;
3579             nextchr = UCHARAT(locinput);
3580             continue; /* execute rest of RE */
3581             /* NOTREACHED */
3582 #undef  ST
3583
3584         case EXACT: {
3585             char *s = STRING(scan);
3586             ln = STR_LEN(scan);
3587             if (utf8_target != UTF_PATTERN) {
3588                 /* The target and the pattern have differing utf8ness. */
3589                 char *l = locinput;
3590                 const char * const e = s + ln;
3591
3592                 if (utf8_target) {
3593                     /* The target is utf8, the pattern is not utf8. */
3594                     while (s < e) {
3595                         STRLEN ulen;
3596                         if (l >= PL_regeol)
3597                              sayNO;
3598                         if (NATIVE_TO_UNI(*(U8*)s) !=
3599                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3600                                             uniflags))
3601                              sayNO;
3602                         l += ulen;
3603                         s ++;
3604                     }
3605                 }
3606                 else {
3607                     /* The target is not utf8, the pattern is utf8. */
3608                     while (s < e) {
3609                         STRLEN ulen;
3610                         if (l >= PL_regeol)
3611                             sayNO;
3612                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3613                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3614                                            uniflags))
3615                             sayNO;
3616                         s += ulen;
3617                         l ++;
3618                     }
3619                 }
3620                 locinput = l;
3621                 nextchr = UCHARAT(locinput);
3622                 break;
3623             }
3624             /* The target and the pattern have the same utf8ness. */
3625             /* Inline the first character, for speed. */
3626             if (UCHARAT(s) != nextchr)
3627                 sayNO;
3628             if (PL_regeol - locinput < ln)
3629                 sayNO;
3630             if (ln > 1 && memNE(s, locinput, ln))
3631                 sayNO;
3632             locinput += ln;
3633             nextchr = UCHARAT(locinput);
3634             break;
3635             }
3636         case EXACTFL: {
3637             re_fold_t folder;
3638             const U8 * fold_array;
3639             const char * s;
3640             U32 fold_utf8_flags;
3641
3642             PL_reg_flags |= RF_tainted;
3643             folder = foldEQ_locale;
3644             fold_array = PL_fold_locale;
3645             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3646             goto do_exactf;
3647
3648         case EXACTFU:
3649             folder = foldEQ_latin1;
3650             fold_array = PL_fold_latin1;
3651             fold_utf8_flags = 0;
3652             goto do_exactf;
3653
3654         case EXACTFA:
3655             folder = foldEQ_latin1;
3656             fold_array = PL_fold_latin1;
3657             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3658             goto do_exactf;
3659
3660         case EXACTF:
3661             folder = foldEQ;
3662             fold_array = PL_fold;
3663             fold_utf8_flags = 0;
3664
3665           do_exactf:
3666             s = STRING(scan);
3667             ln = STR_LEN(scan);
3668
3669             if (utf8_target || UTF_PATTERN) {
3670               /* Either target or the pattern are utf8. */
3671                 const char * const l = locinput;
3672                 char *e = PL_regeol;
3673
3674                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3675                                l, &e, 0,  utf8_target, fold_utf8_flags)) {
3676                      /* One more case for the sharp s:
3677                       * pack("U0U*", 0xDF) =~ /ss/i,
3678                       * the 0xC3 0x9F are the UTF-8
3679                       * byte sequence for the U+00DF. */
3680
3681                      if (!(utf8_target &&
3682                            toLOWER(s[0]) == 's' &&
3683                            ln >= 2 &&
3684                            toLOWER(s[1]) == 's' &&
3685                            (U8)l[0] == 0xC3 &&
3686                            e - l >= 2 &&
3687                            (U8)l[1] == 0x9F))
3688                           sayNO;
3689                 }
3690                 locinput = e;
3691                 nextchr = UCHARAT(locinput);
3692                 break;
3693             }
3694
3695             /* Neither the target nor the pattern are utf8 */
3696             if (UCHARAT(s) != nextchr &&
3697                 UCHARAT(s) != fold_array[nextchr])
3698             {
3699                 sayNO;
3700             }
3701             if (PL_regeol - locinput < ln)
3702                 sayNO;
3703             if (ln > 1 && ! folder(s, locinput, ln))
3704                 sayNO;
3705             locinput += ln;
3706             nextchr = UCHARAT(locinput);
3707             break;
3708         }
3709
3710         /* XXX Could improve efficiency by separating these all out using a
3711          * macro or in-line function.  At that point regcomp.c would no longer
3712          * have to set the FLAGS fields of these */
3713         case BOUNDL:
3714         case NBOUNDL:
3715             PL_reg_flags |= RF_tainted;
3716             /* FALL THROUGH */
3717         case BOUND:
3718         case BOUNDU:
3719         case BOUNDA:
3720         case NBOUND:
3721         case NBOUNDU:
3722         case NBOUNDA:
3723             /* was last char in word? */
3724             if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3725                 if (locinput == PL_bostr)
3726                     ln = '\n';
3727                 else {
3728                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3729
3730                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3731                 }
3732                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3733                     ln = isALNUM_uni(ln);
3734                     LOAD_UTF8_CHARCLASS_ALNUM();
3735                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3736                 }
3737                 else {
3738                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3739                     n = isALNUM_LC_utf8((U8*)locinput);
3740                 }
3741             }
3742             else {
3743
3744                 /* Here the string isn't utf8, or is utf8 and only ascii
3745                  * characters are to match \w.  In the latter case looking at
3746                  * the byte just prior to the current one may be just the final
3747                  * byte of a multi-byte character.  This is ok.  There are two
3748                  * cases:
3749                  * 1) it is a single byte character, and then the test is doing
3750                  *      just what it's supposed to.
3751                  * 2) it is a multi-byte character, in which case the final
3752                  *      byte is never mistakable for ASCII, and so the test
3753                  *      will say it is not a word character, which is the
3754                  *      correct answer. */
3755                 ln = (locinput != PL_bostr) ?
3756                     UCHARAT(locinput - 1) : '\n';
3757                 switch (FLAGS(scan)) {
3758                     case REGEX_UNICODE_CHARSET:
3759                         ln = isWORDCHAR_L1(ln);
3760                         n = isWORDCHAR_L1(nextchr);
3761                         break;
3762                     case REGEX_LOCALE_CHARSET:
3763                         ln = isALNUM_LC(ln);
3764                         n = isALNUM_LC(nextchr);
3765                         break;
3766                     case REGEX_DEPENDS_CHARSET:
3767                         ln = isALNUM(ln);
3768                         n = isALNUM(nextchr);
3769                         break;
3770                     case REGEX_ASCII_RESTRICTED_CHARSET:
3771                         ln = isWORDCHAR_A(ln);
3772                         n = isWORDCHAR_A(nextchr);
3773                         break;
3774                     default:
3775                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3776                         break;
3777                 }
3778             }
3779             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3780              * regcomp.sym */
3781             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3782                     sayNO;
3783             break;
3784         case ANYOFV:
3785         case ANYOF:
3786             if (utf8_target || state_num == ANYOFV) {
3787                 STRLEN inclasslen = PL_regeol - locinput;
3788                 if (locinput >= PL_regeol)
3789                     sayNO;
3790
3791                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3792                     sayNO;
3793                 locinput += inclasslen;
3794                 nextchr = UCHARAT(locinput);
3795                 break;
3796             }
3797             else {
3798                 if (nextchr < 0)
3799                     nextchr = UCHARAT(locinput);
3800                 if (!nextchr && locinput >= PL_regeol)
3801                     sayNO;
3802                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3803                     sayNO;
3804                 nextchr = UCHARAT(++locinput);
3805                 break;
3806             }
3807             break;
3808         /* Special char classes - The defines start on line 129 or so */
3809         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3810                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3811                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3812                   ALNUMA, NALNUMA, isWORDCHAR_A,
3813                   perl_word, "a");
3814
3815         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3816                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3817                   SPACEU, NSPACEU, isSPACE_L1,
3818                   SPACEA, NSPACEA, isSPACE_A,
3819                   perl_space, " ");
3820
3821         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3822                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3823                 DIGITA, NDIGITA, isDIGIT_A,
3824                 posix_digit, "0");
3825
3826         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3827                        a Unicode extended Grapheme Cluster */
3828             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3829               extended Grapheme Cluster is:
3830
3831                CR LF
3832                | Prepend* Begin Extend*
3833                | .
3834
3835                Begin is (Hangul-syllable | ! Control)
3836                Extend is (Grapheme_Extend | Spacing_Mark)
3837                Control is [ GCB_Control CR LF ]
3838
3839                The discussion below shows how the code for CLUMP is derived
3840                from this regex.  Note that most of these concepts are from
3841                property values of the Grapheme Cluster Boundary (GCB) property.
3842                No code point can have multiple property values for a given
3843                property.  Thus a code point in Prepend can't be in Control, but
3844                it must be in !Control.  This is why Control above includes
3845                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3846                property separately, and so can't be in GCB_Control, even though
3847                they logically are controls.  Control is not the same as gc=cc,
3848                but includes format and other characters as well.
3849
3850                The Unicode definition of Hangul-syllable is:
3851                    L+
3852                    | (L* ( ( V | LV ) V* | LVT ) T*)
3853                    | T+
3854                   )
3855                Each of these is a value for the GCB property, and hence must be
3856                disjoint, so the order they are tested is immaterial, so the
3857                above can safely be changed to
3858                    T+
3859                    | L+
3860                    | (L* ( LVT | ( V | LV ) V*) T*)
3861
3862                The last two terms can be combined like this:
3863                    L* ( L
3864                         | (( LVT | ( V | LV ) V*) T*))
3865
3866                And refactored into this:
3867                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3868
3869                That means that if we have seen any L's at all we can quit
3870                there, but if the next character is a LVT, a V or and LV we
3871                should keep going.
3872
3873                There is a subtlety with Prepend* which showed up in testing.
3874                Note that the Begin, and only the Begin is required in:
3875                 | Prepend* Begin Extend*
3876                Also, Begin contains '! Control'.  A Prepend must be a '!
3877                Control', which means it must be a Begin.  What it comes down to
3878                is that if we match Prepend* and then find no suitable Begin
3879                afterwards, that if we backtrack the last Prepend, that one will
3880                be a suitable Begin.
3881             */
3882
3883             if (locinput >= PL_regeol)
3884                 sayNO;
3885             if  (! utf8_target) {
3886
3887                 /* Match either CR LF  or '.', as all the other possibilities
3888                  * require utf8 */
3889                 locinput++;         /* Match the . or CR */
3890                 if (nextchr == '\r'
3891                     && locinput < PL_regeol
3892                     && UCHARAT(locinput) == '\n') locinput++;
3893             }
3894             else {
3895
3896                 /* Utf8: See if is ( CR LF ); already know that locinput <
3897                  * PL_regeol, so locinput+1 is in bounds */
3898                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3899                     locinput += 2;
3900                 }
3901                 else {
3902                     /* In case have to backtrack to beginning, then match '.' */
3903                     char *starting = locinput;
3904
3905                     /* In case have to backtrack the last prepend */
3906                     char *previous_prepend = 0;
3907
3908                     LOAD_UTF8_CHARCLASS_GCB();
3909
3910                     /* Match (prepend)* */
3911                     while (locinput < PL_regeol
3912                            && swash_fetch(PL_utf8_X_prepend,
3913                                           (U8*)locinput, utf8_target))
3914                     {
3915                         previous_prepend = locinput;
3916                         locinput += UTF8SKIP(locinput);
3917                     }
3918
3919                     /* As noted above, if we matched a prepend character, but
3920                      * the next thing won't match, back off the last prepend we
3921                      * matched, as it is guaranteed to match the begin */
3922                     if (previous_prepend
3923                         && (locinput >=  PL_regeol
3924                             || ! swash_fetch(PL_utf8_X_begin,
3925                                              (U8*)locinput, utf8_target)))
3926                     {
3927                         locinput = previous_prepend;
3928                     }
3929
3930                     /* Note that here we know PL_regeol > locinput, as we
3931                      * tested that upon input to this switch case, and if we
3932                      * moved locinput forward, we tested the result just above
3933                      * and it either passed, or we backed off so that it will
3934                      * now pass */
3935                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3936
3937                         /* Here did not match the required 'Begin' in the
3938                          * second term.  So just match the very first
3939                          * character, the '.' of the final term of the regex */
3940                         locinput = starting + UTF8SKIP(starting);
3941                     } else {
3942
3943                         /* Here is the beginning of a character that can have
3944                          * an extender.  It is either a hangul syllable, or a
3945                          * non-control */
3946                         if (swash_fetch(PL_utf8_X_non_hangul,
3947                                         (U8*)locinput, utf8_target))
3948                         {
3949
3950                             /* Here not a Hangul syllable, must be a
3951                              * ('!  * Control') */
3952                             locinput += UTF8SKIP(locinput);
3953                         } else {
3954
3955                             /* Here is a Hangul syllable.  It can be composed
3956                              * of several individual characters.  One
3957                              * possibility is T+ */
3958                             if (swash_fetch(PL_utf8_X_T,
3959                                             (U8*)locinput, utf8_target))
3960                             {
3961                                 while (locinput < PL_regeol
3962                                         && swash_fetch(PL_utf8_X_T,
3963                                                         (U8*)locinput, utf8_target))
3964                                 {
3965                                     locinput += UTF8SKIP(locinput);
3966                                 }
3967                             } else {
3968
3969                                 /* Here, not T+, but is a Hangul.  That means
3970                                  * it is one of the others: L, LV, LVT or V,
3971                                  * and matches:
3972                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3973
3974                                 /* Match L*           */
3975                                 while (locinput < PL_regeol
3976                                         && swash_fetch(PL_utf8_X_L,
3977                                                         (U8*)locinput, utf8_target))
3978                                 {
3979                                     locinput += UTF8SKIP(locinput);
3980                                 }
3981
3982                                 /* Here, have exhausted L*.  If the next
3983                                  * character is not an LV, LVT nor V, it means
3984                                  * we had to have at least one L, so matches L+
3985                                  * in the original equation, we have a complete
3986                                  * hangul syllable.  Are done. */
3987
3988                                 if (locinput < PL_regeol
3989                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3990                                                     (U8*)locinput, utf8_target))
3991                                 {
3992
3993                                     /* Otherwise keep going.  Must be LV, LVT
3994                                      * or V.  See if LVT */
3995                                     if (swash_fetch(PL_utf8_X_LVT,
3996                                                     (U8*)locinput, utf8_target))
3997                                     {
3998                                         locinput += UTF8SKIP(locinput);
3999                                     } else {
4000
4001                                         /* Must be  V or LV.  Take it, then
4002                                          * match V*     */
4003                                         locinput += UTF8SKIP(locinput);
4004                                         while (locinput < PL_regeol
4005                                                 && swash_fetch(PL_utf8_X_V,
4006                                                          (U8*)locinput, utf8_target))
4007                                         {
4008                                             locinput += UTF8SKIP(locinput);
4009                                         }
4010                                     }
4011
4012                                     /* And any of LV, LVT, or V can be followed
4013                                      * by T*            */
4014                                     while (locinput < PL_regeol
4015                                            && swash_fetch(PL_utf8_X_T,
4016                                                            (U8*)locinput,
4017                                                            utf8_target))
4018                                     {
4019                                         locinput += UTF8SKIP(locinput);
4020                                     }
4021                                 }
4022                             }
4023                         }
4024
4025                         /* Match any extender */
4026                         while (locinput < PL_regeol
4027                                 && swash_fetch(PL_utf8_X_extend,
4028                                                 (U8*)locinput, utf8_target))
4029                         {
4030                             locinput += UTF8SKIP(locinput);
4031                         }
4032                     }
4033                 }
4034                 if (locinput > PL_regeol) sayNO;
4035             }
4036             nextchr = UCHARAT(locinput);
4037             break;
4038
4039         case NREFFL:
4040         {   /* The capture buffer cases.  The ones beginning with N for the
4041                named buffers just convert to the equivalent numbered and
4042                pretend they were called as the corresponding numbered buffer
4043                op.  */
4044             /* don't initialize these in the declaration, it makes C++
4045                unhappy */
4046             char *s;
4047             char type;
4048             re_fold_t folder;
4049             const U8 *fold_array;
4050             UV utf8_fold_flags;
4051
4052             PL_reg_flags |= RF_tainted;
4053             folder = foldEQ_locale;
4054             fold_array = PL_fold_locale;
4055             type = REFFL;
4056             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4057             goto do_nref;
4058
4059         case NREFFA:
4060             folder = foldEQ_latin1;
4061             fold_array = PL_fold_latin1;
4062             type = REFFA;
4063             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4064             goto do_nref;
4065
4066         case NREFFU:
4067             folder = foldEQ_latin1;
4068             fold_array = PL_fold_latin1;
4069             type = REFFU;
4070             utf8_fold_flags = 0;
4071             goto do_nref;
4072
4073         case NREFF:
4074             folder = foldEQ;
4075             fold_array = PL_fold;
4076             type = REFF;
4077             utf8_fold_flags = 0;
4078             goto do_nref;
4079
4080         case NREF:
4081             type = REF;
4082             folder = NULL;
4083             fold_array = NULL;
4084             utf8_fold_flags = 0;
4085           do_nref:
4086
4087             /* For the named back references, find the corresponding buffer
4088              * number */
4089             n = reg_check_named_buff_matched(rex,scan);
4090
4091             if ( ! n ) {
4092                 sayNO;
4093             }
4094             goto do_nref_ref_common;
4095
4096         case REFFL:
4097             PL_reg_flags |= RF_tainted;
4098             folder = foldEQ_locale;
4099             fold_array = PL_fold_locale;
4100             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4101             goto do_ref;
4102
4103         case REFFA:
4104             folder = foldEQ_latin1;
4105             fold_array = PL_fold_latin1;
4106             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4107             goto do_ref;
4108
4109         case REFFU:
4110             folder = foldEQ_latin1;
4111             fold_array = PL_fold_latin1;
4112             utf8_fold_flags = 0;
4113             goto do_ref;
4114
4115         case REFF:
4116             folder = foldEQ;
4117             fold_array = PL_fold;
4118             utf8_fold_flags = 0;
4119             goto do_ref;
4120
4121         case REF:
4122             folder = NULL;
4123             fold_array = NULL;
4124             utf8_fold_flags = 0;
4125
4126           do_ref:
4127             type = OP(scan);
4128             n = ARG(scan);  /* which paren pair */
4129
4130           do_nref_ref_common:
4131             ln = PL_regoffs[n].start;
4132             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4133             if (*PL_reglastparen < n || ln == -1)
4134                 sayNO;                  /* Do not match unless seen CLOSEn. */
4135             if (ln == PL_regoffs[n].end)
4136                 break;
4137
4138             s = PL_bostr + ln;
4139             if (type != REF     /* REF can do byte comparison */
4140                 && (utf8_target || type == REFFU))
4141             { /* XXX handle REFFL better */
4142                 char * limit = PL_regeol;
4143
4144                 /* This call case insensitively compares the entire buffer
4145                     * at s, with the current input starting at locinput, but
4146                     * not going off the end given by PL_regeol, and returns in
4147                     * limit upon success, how much of the current input was
4148                     * matched */
4149                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4150                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4151                 {
4152                     sayNO;
4153                 }
4154                 locinput = limit;
4155                 nextchr = UCHARAT(locinput);
4156                 break;
4157             }
4158
4159             /* Not utf8:  Inline the first character, for speed. */
4160             if (UCHARAT(s) != nextchr &&
4161                 (type == REF ||
4162                  UCHARAT(s) != fold_array[nextchr]))
4163                 sayNO;
4164             ln = PL_regoffs[n].end - ln;
4165             if (locinput + ln > PL_regeol)
4166                 sayNO;
4167             if (ln > 1 && (type == REF
4168                            ? memNE(s, locinput, ln)
4169                            : ! folder(s, locinput, ln)))
4170                 sayNO;
4171             locinput += ln;
4172             nextchr = UCHARAT(locinput);
4173             break;
4174         }
4175         case NOTHING:
4176         case TAIL:
4177             break;
4178         case BACK:
4179             break;
4180
4181 #undef  ST
4182 #define ST st->u.eval
4183         {
4184             SV *ret;
4185             REGEXP *re_sv;
4186             regexp *re;
4187             regexp_internal *rei;
4188             regnode *startpoint;
4189
4190         case GOSTART:
4191         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4192             if (cur_eval && cur_eval->locinput==locinput) {
4193                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4194                     Perl_croak(aTHX_ "Infinite recursion in regex");
4195                 if ( ++nochange_depth > max_nochange_depth )
4196                     Perl_croak(aTHX_
4197                         "Pattern subroutine nesting without pos change"
4198                         " exceeded limit in regex");
4199             } else {
4200                 nochange_depth = 0;
4201             }
4202             re_sv = rex_sv;
4203             re = rex;
4204             rei = rexi;
4205             (void)ReREFCNT_inc(rex_sv);
4206             if (OP(scan)==GOSUB) {
4207                 startpoint = scan + ARG2L(scan);
4208                 ST.close_paren = ARG(scan);
4209             } else {
4210                 startpoint = rei->program+1;
4211                 ST.close_paren = 0;
4212             }
4213             goto eval_recurse_doit;
4214             /* NOTREACHED */
4215         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4216             if (cur_eval && cur_eval->locinput==locinput) {
4217                 if ( ++nochange_depth > max_nochange_depth )
4218                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4219             } else {
4220                 nochange_depth = 0;
4221             }
4222             {
4223                 /* execute the code in the {...} */
4224                 dSP;
4225                 SV ** const before = SP;
4226                 OP_4tree * const oop = PL_op;
4227                 COP * const ocurcop = PL_curcop;
4228                 PAD *old_comppad;
4229                 char *saved_regeol = PL_regeol;
4230                 struct re_save_state saved_state;
4231
4232                 /* To not corrupt the existing regex state while executing the
4233                  * eval we would normally put it on the save stack, like with
4234                  * save_re_context. However, re-evals have a weird scoping so we
4235                  * can't just add ENTER/LEAVE here. With that, things like
4236                  *
4237                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4238                  *
4239                  * would break, as they expect the localisation to be unwound
4240                  * only when the re-engine backtracks through the bit that
4241                  * localised it.
4242                  *
4243                  * What we do instead is just saving the state in a local c
4244                  * variable.
4245                  */
4246                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4247
4248                 n = ARG(scan);
4249                 PL_op = (OP_4tree*)rexi->data->data[n];
4250                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4251                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4252                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4253                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4254
4255                 if (sv_yes_mark) {
4256                     SV *sv_mrk = get_sv("REGMARK", 1);
4257                     sv_setsv(sv_mrk, sv_yes_mark);
4258                 }
4259
4260                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4261                 SPAGAIN;
4262                 if (SP == before)
4263                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4264                 else {
4265                     ret = POPs;
4266                     PUTBACK;
4267                 }
4268
4269                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4270
4271                 PL_op = oop;
4272                 PAD_RESTORE_LOCAL(old_comppad);
4273                 PL_curcop = ocurcop;
4274                 PL_regeol = saved_regeol;
4275                 if (!logical) {
4276                     /* /(?{...})/ */
4277                     sv_setsv(save_scalar(PL_replgv), ret);
4278                     break;
4279                 }
4280             }
4281             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4282                 logical = 0;
4283                 {
4284                     /* extract RE object from returned value; compiling if
4285                      * necessary */
4286                     MAGIC *mg = NULL;
4287                     REGEXP *rx = NULL;
4288
4289                     if (SvROK(ret)) {
4290                         SV *const sv = SvRV(ret);
4291
4292                         if (SvTYPE(sv) == SVt_REGEXP) {
4293                             rx = (REGEXP*) sv;
4294                         } else if (SvSMAGICAL(sv)) {
4295                             mg = mg_find(sv, PERL_MAGIC_qr);
4296                             assert(mg);
4297                         }
4298                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4299                         rx = (REGEXP*) ret;
4300                     } else if (SvSMAGICAL(ret)) {
4301                         if (SvGMAGICAL(ret)) {
4302                             /* I don't believe that there is ever qr magic
4303                                here.  */
4304                             assert(!mg_find(ret, PERL_MAGIC_qr));
4305                             sv_unmagic(ret, PERL_MAGIC_qr);
4306                         }
4307                         else {
4308                             mg = mg_find(ret, PERL_MAGIC_qr);
4309                             /* testing suggests mg only ends up non-NULL for
4310                                scalars who were upgraded and compiled in the
4311                                else block below. In turn, this is only
4312                                triggered in the "postponed utf8 string" tests
4313                                in t/op/pat.t  */
4314                         }
4315                     }
4316
4317                     if (mg) {
4318                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4319                         assert(rx);
4320                     }
4321                     if (rx) {
4322                         rx = reg_temp_copy(NULL, rx);
4323                     }
4324                     else {
4325                         U32 pm_flags = 0;
4326                         const I32 osize = PL_regsize;
4327
4328                         if (DO_UTF8(ret)) {
4329                             assert (SvUTF8(ret));
4330                         } else if (SvUTF8(ret)) {
4331                             /* Not doing UTF-8, despite what the SV says. Is
4332                                this only if we're trapped in use 'bytes'?  */
4333                             /* Make a copy of the octet sequence, but without
4334                                the flag on, as the compiler now honours the
4335                                SvUTF8 flag on ret.  */
4336                             STRLEN len;
4337                             const char *const p = SvPV(ret, len);
4338                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4339                         }
4340                         rx = CALLREGCOMP(ret, pm_flags);
4341                         if (!(SvFLAGS(ret)
4342                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4343                                  | SVs_GMG))) {
4344                             /* This isn't a first class regexp. Instead, it's
4345                                caching a regexp onto an existing, Perl visible
4346                                scalar.  */
4347                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4348                         }
4349                         PL_regsize = osize;
4350                     }
4351                     re_sv = rx;
4352                     re = (struct regexp *)SvANY(rx);
4353                 }
4354                 RXp_MATCH_COPIED_off(re);
4355                 re->subbeg = rex->subbeg;
4356                 re->sublen = rex->sublen;
4357                 rei = RXi_GET(re);
4358                 DEBUG_EXECUTE_r(
4359                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4360                         "Matching embedded");
4361                 );
4362                 startpoint = rei->program + 1;
4363                 ST.close_paren = 0; /* only used for GOSUB */
4364                 /* borrowed from regtry */
4365                 if (PL_reg_start_tmpl <= re->nparens) {
4366                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4367                     if(PL_reg_start_tmp)
4368                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4369                     else
4370                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4371                 }
4372
4373         eval_recurse_doit: /* Share code with GOSUB below this line */
4374                 /* run the pattern returned from (??{...}) */
4375                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4376                 REGCP_SET(ST.lastcp);
4377
4378                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4379
4380                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4381                 PL_reglastparen = &re->lastparen;
4382                 PL_reglastcloseparen = &re->lastcloseparen;
4383                 re->lastparen = 0;
4384                 re->lastcloseparen = 0;
4385
4386                 PL_reginput = locinput;
4387                 PL_regsize = 0;
4388
4389                 /* XXXX This is too dramatic a measure... */
4390                 PL_reg_maxiter = 0;
4391
4392                 ST.toggle_reg_flags = PL_reg_flags;
4393                 if (RX_UTF8(re_sv))
4394                     PL_reg_flags |= RF_utf8;
4395                 else
4396                     PL_reg_flags &= ~RF_utf8;
4397                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4398
4399                 ST.prev_rex = rex_sv;
4400                 ST.prev_curlyx = cur_curlyx;
4401                 SETREX(rex_sv,re_sv);
4402                 rex = re;
4403                 rexi = rei;
4404                 cur_curlyx = NULL;
4405                 ST.B = next;
4406                 ST.prev_eval = cur_eval;
4407                 cur_eval = st;
4408                 /* now continue from first node in postoned RE */
4409                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4410                 /* NOTREACHED */
4411             }
4412             /* logical is 1,   /(?(?{...})X|Y)/ */
4413             sw = cBOOL(SvTRUE(ret));
4414             logical = 0;
4415             break;
4416         }
4417
4418         case EVAL_AB: /* cleanup after a successful (??{A})B */
4419             /* note: this is called twice; first after popping B, then A */
4420             PL_reg_flags ^= ST.toggle_reg_flags;
4421             ReREFCNT_dec(rex_sv);
4422             SETREX(rex_sv,ST.prev_rex);
4423             rex = (struct regexp *)SvANY(rex_sv);
4424             rexi = RXi_GET(rex);
4425             regcpblow(ST.cp);
4426             cur_eval = ST.prev_eval;
4427             cur_curlyx = ST.prev_curlyx;
4428
4429             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4430             PL_reglastparen = &rex->lastparen;
4431             PL_reglastcloseparen = &rex->lastcloseparen;
4432             /* also update PL_regoffs */
4433             PL_regoffs = rex->offs;
4434
4435             /* XXXX This is too dramatic a measure... */
4436             PL_reg_maxiter = 0;
4437             if ( nochange_depth )
4438                 nochange_depth--;
4439             sayYES;
4440
4441
4442         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4443             /* note: this is called twice; first after popping B, then A */
4444             PL_reg_flags ^= ST.toggle_reg_flags;
4445             ReREFCNT_dec(rex_sv);
4446             SETREX(rex_sv,ST.prev_rex);
4447             rex = (struct regexp *)SvANY(rex_sv);
4448             rexi = RXi_GET(rex);
4449             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4450             PL_reglastparen = &rex->lastparen;
4451             PL_reglastcloseparen = &rex->lastcloseparen;
4452
4453             PL_reginput = locinput;
4454             REGCP_UNWIND(ST.lastcp);
4455             regcppop(rex);
4456             cur_eval = ST.prev_eval;
4457             cur_curlyx = ST.prev_curlyx;
4458             /* XXXX This is too dramatic a measure... */
4459             PL_reg_maxiter = 0;
4460             if ( nochange_depth )
4461                 nochange_depth--;
4462             sayNO_SILENT;
4463 #undef ST
4464
4465         case OPEN:
4466             n = ARG(scan);  /* which paren pair */
4467             PL_reg_start_tmp[n] = locinput;
4468             if (n > PL_regsize)
4469                 PL_regsize = n;
4470             lastopen = n;
4471             break;
4472         case CLOSE:
4473             n = ARG(scan);  /* which paren pair */
4474             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4475             PL_regoffs[n].end = locinput - PL_bostr;
4476             /*if (n > PL_regsize)
4477                 PL_regsize = n;*/
4478             if (n > *PL_reglastparen)
4479                 *PL_reglastparen = n;
4480             *PL_reglastcloseparen = n;
4481             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4482                 goto fake_end;
4483             }
4484             break;
4485         case ACCEPT:
4486             if (ARG(scan)){
4487                 regnode *cursor;
4488                 for (cursor=scan;
4489                      cursor && OP(cursor)!=END;
4490                      cursor=regnext(cursor))
4491                 {
4492                     if ( OP(cursor)==CLOSE ){
4493                         n = ARG(cursor);
4494                         if ( n <= lastopen ) {
4495                             PL_regoffs[n].start
4496                                 = PL_reg_start_tmp[n] - PL_bostr;
4497                             PL_regoffs[n].end = locinput - PL_bostr;
4498                             /*if (n > PL_regsize)
4499                             PL_regsize = n;*/
4500                             if (n > *PL_reglastparen)
4501                                 *PL_reglastparen = n;
4502                             *PL_reglastcloseparen = n;
4503                             if ( n == ARG(scan) || (cur_eval &&
4504                                 cur_eval->u.eval.close_paren == n))
4505                                 break;
4506                         }
4507                     }
4508                 }
4509             }
4510             goto fake_end;
4511             /*NOTREACHED*/
4512         case GROUPP:
4513             n = ARG(scan);  /* which paren pair */
4514             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4515             break;
4516         case NGROUPP:
4517             /* reg_check_named_buff_matched returns 0 for no match */
4518             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4519             break;
4520         case INSUBP:
4521             n = ARG(scan);
4522             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4523             break;
4524         case DEFINEP:
4525             sw = 0;
4526             break;
4527         case IFTHEN:
4528             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4529             if (sw)
4530                 next = NEXTOPER(NEXTOPER(scan));
4531             else {
4532                 next = scan + ARG(scan);
4533                 if (OP(next) == IFTHEN) /* Fake one. */
4534                     next = NEXTOPER(NEXTOPER(next));
4535             }
4536             break;
4537         case LOGICAL:
4538             logical = scan->flags;
4539             break;
4540
4541 /*******************************************************************
4542
4543 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4544 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4545 STAR/PLUS/CURLY/CURLYN are used instead.)
4546
4547 A*B is compiled as <CURLYX><A><WHILEM><B>
4548
4549 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4550 state, which contains the current count, initialised to -1. It also sets
4551 cur_curlyx to point to this state, with any previous value saved in the
4552 state block.
4553
4554 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4555 since the pattern may possibly match zero times (i.e. it's a while {} loop
4556 rather than a do {} while loop).
4557
4558 Each entry to WHILEM represents a successful match of A. The count in the
4559 CURLYX block is incremented, another WHILEM state is pushed, and execution
4560 passes to A or B depending on greediness and the current count.
4561
4562 For example, if matching against the string a1a2a3b (where the aN are
4563 substrings that match /A/), then the match progresses as follows: (the
4564 pushed states are interspersed with the bits of strings matched so far):
4565
4566     <CURLYX cnt=-1>
4567     <CURLYX cnt=0><WHILEM>
4568     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4569     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4570     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4571     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4572
4573 (Contrast this with something like CURLYM, which maintains only a single
4574 backtrack state:
4575
4576     <CURLYM cnt=0> a1
4577     a1 <CURLYM cnt=1> a2
4578     a1 a2 <CURLYM cnt=2> a3
4579     a1 a2 a3 <CURLYM cnt=3> b
4580 )
4581
4582 Each WHILEM state block marks a point to backtrack to upon partial failure
4583 of A or B, and also contains some minor state data related to that
4584 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4585 overall state, such as the count, and pointers to the A and B ops.
4586
4587 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4588 must always point to the *current* CURLYX block, the rules are:
4589
4590 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4591 and set cur_curlyx to point the new block.
4592
4593 When popping the CURLYX block after a successful or unsuccessful match,
4594 restore the previous cur_curlyx.
4595
4596 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4597 to the outer one saved in the CURLYX block.
4598
4599 When popping the WHILEM block after a successful or unsuccessful B match,
4600 restore the previous cur_curlyx.
4601
4602 Here's an example for the pattern (AI* BI)*BO
4603 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4604
4605 cur_
4606 curlyx backtrack stack
4607 ------ ---------------
4608 NULL
4609 CO     <CO prev=NULL> <WO>
4610 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4611 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4612 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4613
4614 At this point the pattern succeeds, and we work back down the stack to
4615 clean up, restoring as we go:
4616
4617 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4618 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4619 CO     <CO prev=NULL> <WO>
4620 NULL
4621
4622 *******************************************************************/
4623
4624 #define ST st->u.curlyx
4625
4626         case CURLYX:    /* start of /A*B/  (for complex A) */
4627         {
4628             /* No need to save/restore up to this paren */
4629             I32 parenfloor = scan->flags;
4630
4631             assert(next); /* keep Coverity happy */
4632             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4633                 next += ARG(next);
4634
4635             /* XXXX Probably it is better to teach regpush to support
4636                parenfloor > PL_regsize... */
4637             if (parenfloor > (I32)*PL_reglastparen)
4638                 parenfloor = *PL_reglastparen; /* Pessimization... */
4639
4640             ST.prev_curlyx= cur_curlyx;
4641             cur_curlyx = st;
4642             ST.cp = PL_savestack_ix;
4643
4644             /* these fields contain the state of the current curly.
4645              * they are accessed by subsequent WHILEMs */
4646             ST.parenfloor = parenfloor;
4647             ST.me = scan;
4648             ST.B = next;
4649             ST.minmod = minmod;
4650             minmod = 0;
4651             ST.count = -1;      /* this will be updated by WHILEM */
4652             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4653
4654             PL_reginput = locinput;
4655             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4656             /* NOTREACHED */
4657         }
4658
4659         case CURLYX_end: /* just finished matching all of A*B */
4660             cur_curlyx = ST.prev_curlyx;
4661             sayYES;
4662             /* NOTREACHED */
4663
4664         case CURLYX_end_fail: /* just failed to match all of A*B */
4665             regcpblow(ST.cp);
4666             cur_curlyx = ST.prev_curlyx;
4667             sayNO;
4668             /* NOTREACHED */
4669
4670
4671 #undef ST
4672 #define ST st->u.whilem
4673
4674         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4675         {
4676             /* see the discussion above about CURLYX/WHILEM */
4677             I32 n;
4678             int min = ARG1(cur_curlyx->u.curlyx.me);
4679             int max = ARG2(cur_curlyx->u.curlyx.me);
4680             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4681
4682             assert(cur_curlyx); /* keep Coverity happy */
4683             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4684             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4685             ST.cache_offset = 0;
4686             ST.cache_mask = 0;
4687
4688             PL_reginput = locinput;
4689
4690             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4691                   "%*s  whilem: matched %ld out of %d..%d\n",
4692                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4693             );
4694
4695             /* First just match a string of min A's. */
4696
4697             if (n < min) {
4698                 cur_curlyx->u.curlyx.lastloc = locinput;
4699                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4700                 /* NOTREACHED */
4701             }
4702
4703             /* If degenerate A matches "", assume A done. */
4704
4705             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4706                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4707                    "%*s  whilem: empty match detected, trying continuation...\n",
4708                    REPORT_CODE_OFF+depth*2, "")
4709                 );
4710                 goto do_whilem_B_max;
4711             }
4712
4713             /* super-linear cache processing */
4714
4715             if (scan->flags) {
4716
4717                 if (!PL_reg_maxiter) {
4718                     /* start the countdown: Postpone detection until we
4719                      * know the match is not *that* much linear. */
4720                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4721                     /* possible overflow for long strings and many CURLYX's */
4722                     if (PL_reg_maxiter < 0)
4723                         PL_reg_maxiter = I32_MAX;
4724                     PL_reg_leftiter = PL_reg_maxiter;
4725                 }
4726
4727                 if (PL_reg_leftiter-- == 0) {
4728                     /* initialise cache */
4729                     const I32 size = (PL_reg_maxiter + 7)/8;
4730                     if (PL_reg_poscache) {
4731                         if ((I32)PL_reg_poscache_size < size) {
4732                             Renew(PL_reg_poscache, size, char);
4733                             PL_reg_poscache_size = size;
4734                         }
4735                         Zero(PL_reg_poscache, size, char);
4736                     }
4737                     else {
4738                         PL_reg_poscache_size = size;
4739                         Newxz(PL_reg_poscache, size, char);
4740                     }
4741                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4742       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4743                               PL_colors[4], PL_colors[5])
4744                     );
4745                 }
4746
4747                 if (PL_reg_leftiter < 0) {
4748                     /* have we already failed at this position? */
4749                     I32 offset, mask;
4750                     offset  = (scan->flags & 0xf) - 1
4751                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4752                     mask    = 1 << (offset % 8);
4753                     offset /= 8;
4754                     if (PL_reg_poscache[offset] & mask) {
4755                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4756                             "%*s  whilem: (cache) already tried at this position...\n",
4757                             REPORT_CODE_OFF+depth*2, "")
4758                         );
4759                         sayNO; /* cache records failure */
4760                     }
4761                     ST.cache_offset = offset;
4762                     ST.cache_mask   = mask;
4763                 }
4764             }
4765
4766             /* Prefer B over A for minimal matching. */
4767
4768             if (cur_curlyx->u.curlyx.minmod) {
4769                 ST.save_curlyx = cur_curlyx;
4770                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4771                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4772                 REGCP_SET(ST.lastcp);
4773                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4774                 /* NOTREACHED */
4775             }
4776
4777             /* Prefer A over B for maximal matching. */
4778
4779             if (n < max) { /* More greed allowed? */
4780                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4781                 cur_curlyx->u.curlyx.lastloc = locinput;
4782                 REGCP_SET(ST.lastcp);
4783                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4784                 /* NOTREACHED */
4785             }
4786             goto do_whilem_B_max;
4787         }
4788         /* NOTREACHED */
4789
4790         case WHILEM_B_min: /* just matched B in a minimal match */
4791         case WHILEM_B_max: /* just matched B in a maximal match */
4792             cur_curlyx = ST.save_curlyx;
4793             sayYES;
4794             /* NOTREACHED */
4795
4796         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4797             cur_curlyx = ST.save_curlyx;
4798             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4799             cur_curlyx->u.curlyx.count--;
4800             CACHEsayNO;
4801             /* NOTREACHED */
4802
4803         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4804             REGCP_UNWIND(ST.lastcp);
4805             regcppop(rex);
4806             /* FALL THROUGH */
4807         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4808             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4809             cur_curlyx->u.curlyx.count--;
4810             CACHEsayNO;
4811             /* NOTREACHED */
4812
4813         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4814             REGCP_UNWIND(ST.lastcp);
4815             regcppop(rex);      /* Restore some previous $<digit>s? */
4816             PL_reginput = locinput;
4817             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4818                 "%*s  whilem: failed, trying continuation...\n",
4819                 REPORT_CODE_OFF+depth*2, "")
4820             );
4821           do_whilem_B_max:
4822             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4823                 && ckWARN(WARN_REGEXP)
4824                 && !(PL_reg_flags & RF_warned))
4825             {
4826                 PL_reg_flags |= RF_warned;
4827                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4828                      "Complex regular subexpression recursion",
4829                      REG_INFTY - 1);
4830             }
4831
4832             /* now try B */
4833             ST.save_curlyx = cur_curlyx;
4834             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4835             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4836             /* NOTREACHED */
4837
4838         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4839             cur_curlyx = ST.save_curlyx;
4840             REGCP_UNWIND(ST.lastcp);
4841             regcppop(rex);
4842
4843             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4844                 /* Maximum greed exceeded */
4845                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4846                     && ckWARN(WARN_REGEXP)
4847                     && !(PL_reg_flags & RF_warned))
4848                 {
4849                     PL_reg_flags |= RF_warned;
4850                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4851                         "%s limit (%d) exceeded",
4852                         "Complex regular subexpression recursion",
4853                         REG_INFTY - 1);
4854                 }
4855                 cur_curlyx->u.curlyx.count--;
4856                 CACHEsayNO;
4857             }
4858
4859             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4860                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4861             );
4862             /* Try grabbing another A and see if it helps. */
4863             PL_reginput = locinput;
4864             cur_curlyx->u.curlyx.lastloc = locinput;
4865             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4866             REGCP_SET(ST.lastcp);
4867             PUSH_STATE_GOTO(WHILEM_A_min,
4868                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4869             /* NOTREACHED */
4870
4871 #undef  ST
4872 #define ST st->u.branch
4873
4874         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4875             next = scan + ARG(scan);
4876             if (next == scan)
4877                 next = NULL;
4878             scan = NEXTOPER(scan);
4879             /* FALL THROUGH */
4880
4881         case BRANCH:        /*  /(...|A|...)/ */
4882             scan = NEXTOPER(scan); /* scan now points to inner node */
4883             ST.lastparen = *PL_reglastparen;
4884             ST.next_branch = next;
4885             REGCP_SET(ST.cp);
4886             PL_reginput = locinput;
4887
4888             /* Now go into the branch */
4889             if (has_cutgroup) {
4890                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4891             } else {
4892                 PUSH_STATE_GOTO(BRANCH_next, scan);
4893             }
4894             /* NOTREACHED */
4895         case CUTGROUP:
4896             PL_reginput = locinput;
4897             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4898                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4899             PUSH_STATE_GOTO(CUTGROUP_next,next);
4900             /* NOTREACHED */
4901         case CUTGROUP_next_fail:
4902             do_cutgroup = 1;
4903             no_final = 1;
4904             if (st->u.mark.mark_name)
4905                 sv_commit = st->u.mark.mark_name;
4906             sayNO;
4907             /* NOTREACHED */
4908         case BRANCH_next:
4909             sayYES;
4910             /* NOTREACHED */
4911         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4912             if (do_cutgroup) {
4913                 do_cutgroup = 0;
4914                 no_final = 0;
4915             }
4916             REGCP_UNWIND(ST.cp);
4917             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4918                 PL_regoffs[n].end = -1;
4919             *PL_reglastparen = n;
4920             /*dmq: *PL_reglastcloseparen = n; */
4921             scan = ST.next_branch;
4922             /* no more branches? */
4923             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4924                 DEBUG_EXECUTE_r({
4925                     PerlIO_printf( Perl_debug_log,
4926                         "%*s  %sBRANCH failed...%s\n",
4927                         REPORT_CODE_OFF+depth*2, "",
4928                         PL_colors[4],
4929                         PL_colors[5] );
4930                 });
4931                 sayNO_SILENT;
4932             }
4933             continue; /* execute next BRANCH[J] op */
4934             /* NOTREACHED */
4935
4936         case MINMOD:
4937             minmod = 1;
4938             break;
4939
4940 #undef  ST
4941 #define ST st->u.curlym
4942
4943         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4944
4945             /* This is an optimisation of CURLYX that enables us to push
4946              * only a single backtracking state, no matter how many matches
4947              * there are in {m,n}. It relies on the pattern being constant
4948              * length, with no parens to influence future backrefs
4949              */
4950
4951             ST.me = scan;
4952             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4953
4954             /* if paren positive, emulate an OPEN/CLOSE around A */
4955             if (ST.me->flags) {
4956                 U32 paren = ST.me->flags;
4957                 if (paren > PL_regsize)
4958                     PL_regsize = paren;
4959                 if (paren > *PL_reglastparen)
4960                     *PL_reglastparen = paren;
4961                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4962             }
4963             ST.A = scan;
4964             ST.B = next;
4965             ST.alen = 0;
4966             ST.count = 0;
4967             ST.minmod = minmod;
4968             minmod = 0;
4969             ST.c1 = CHRTEST_UNINIT;
4970             REGCP_SET(ST.cp);
4971
4972             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4973                 goto curlym_do_B;
4974
4975           curlym_do_A: /* execute the A in /A{m,n}B/  */
4976             PL_reginput = locinput;
4977             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4978             /* NOTREACHED */
4979
4980         case CURLYM_A: /* we've just matched an A */
4981             locinput = st->locinput;
4982             nextchr = UCHARAT(locinput);
4983
4984             ST.count++;
4985             /* after first match, determine A's length: u.curlym.alen */
4986             if (ST.count == 1) {
4987                 if (PL_reg_match_utf8) {
4988                     char *s = locinput;
4989                     while (s < PL_reginput) {
4990                         ST.alen++;
4991                         s += UTF8SKIP(s);
4992                     }
4993                 }
4994                 else {
4995                     ST.alen = PL_reginput - locinput;
4996                 }
4997                 if (ST.alen == 0)
4998                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4999             }
5000             DEBUG_EXECUTE_r(
5001                 PerlIO_printf(Perl_debug_log,
5002                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5003                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5004                           (IV) ST.count, (IV)ST.alen)
5005             );
5006
5007             locinput = PL_reginput;
5008
5009             if (cur_eval && cur_eval->u.eval.close_paren &&
5010                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5011                 goto fake_end;
5012
5013             {
5014                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5015                 if ( max == REG_INFTY || ST.count < max )
5016                     goto curlym_do_A; /* try to match another A */
5017             }
5018             goto curlym_do_B; /* try to match B */
5019
5020         case CURLYM_A_fail: /* just failed to match an A */
5021             REGCP_UNWIND(ST.cp);
5022
5023             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5024                 || (cur_eval && cur_eval->u.eval.close_paren &&
5025                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5026                 sayNO;
5027
5028           curlym_do_B: /* execute the B in /A{m,n}B/  */
5029             PL_reginput = locinput;
5030             if (ST.c1 == CHRTEST_UNINIT) {
5031                 /* calculate c1 and c2 for possible match of 1st char
5032                  * following curly */
5033                 ST.c1 = ST.c2 = CHRTEST_VOID;
5034                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5035                     regnode *text_node = ST.B;
5036                     if (! HAS_TEXT(text_node))
5037                         FIND_NEXT_IMPT(text_node);
5038                     /* this used to be
5039
5040                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5041
5042                         But the former is redundant in light of the latter.
5043
5044                         if this changes back then the macro for
5045                         IS_TEXT and friends need to change.
5046                      */
5047                     if (PL_regkind[OP(text_node)] == EXACT)
5048                     {
5049
5050                         ST.c1 = (U8)*STRING(text_node);
5051                         switch (OP(text_node)) {
5052                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5053                             case EXACTFA:
5054                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5055                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5056                             default: ST.c2 = ST.c1;
5057                         }
5058                     }
5059                 }
5060             }
5061
5062             DEBUG_EXECUTE_r(
5063                 PerlIO_printf(Perl_debug_log,
5064                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5065                     (int)(REPORT_CODE_OFF+(depth*2)),
5066                     "", (IV)ST.count)
5067                 );
5068             if (ST.c1 != CHRTEST_VOID
5069                     && UCHARAT(PL_reginput) != ST.c1
5070                     && UCHARAT(PL_reginput) != ST.c2)
5071             {
5072                 /* simulate B failing */
5073                 DEBUG_OPTIMISE_r(
5074                     PerlIO_printf(Perl_debug_log,
5075                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5076                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5077                         (IV)ST.c1,(IV)ST.c2
5078                 ));
5079                 state_num = CURLYM_B_fail;
5080                 goto reenter_switch;
5081             }
5082
5083             if (ST.me->flags) {
5084                 /* mark current A as captured */
5085                 I32 paren = ST.me->flags;
5086                 if (ST.count) {
5087                     PL_regoffs[paren].start
5088                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5089                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5090                     /*dmq: *PL_reglastcloseparen = paren; */
5091                 }
5092                 else
5093                     PL_regoffs[paren].end = -1;
5094                 if (cur_eval && cur_eval->u.eval.close_paren &&
5095                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5096                 {
5097                     if (ST.count)
5098                         goto fake_end;
5099                     else
5100                         sayNO;
5101                 }
5102             }
5103
5104             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5105             /* NOTREACHED */
5106
5107         case CURLYM_B_fail: /* just failed to match a B */
5108             REGCP_UNWIND(ST.cp);
5109             if (ST.minmod) {
5110                 I32 max = ARG2(ST.me);
5111                 if (max != REG_INFTY && ST.count == max)
5112                     sayNO;
5113                 goto curlym_do_A; /* try to match a further A */
5114             }
5115             /* backtrack one A */
5116             if (ST.count == ARG1(ST.me) /* min */)
5117                 sayNO;
5118             ST.count--;
5119             locinput = HOPc(locinput, -ST.alen);
5120             goto curlym_do_B; /* try to match B */
5121
5122 #undef ST
5123 #define ST st->u.curly
5124
5125 #define CURLY_SETPAREN(paren, success) \
5126     if (paren) { \
5127         if (success) { \
5128             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5129             PL_regoffs[paren].end = locinput - PL_bostr; \
5130             *PL_reglastcloseparen = paren; \
5131         } \
5132         else \
5133             PL_regoffs[paren].end = -1; \
5134     }
5135
5136         case STAR:              /*  /A*B/ where A is width 1 */
5137             ST.paren = 0;
5138             ST.min = 0;
5139             ST.max = REG_INFTY;
5140             scan = NEXTOPER(scan);
5141             goto repeat;
5142         case PLUS:              /*  /A+B/ where A is width 1 */
5143             ST.paren = 0;
5144             ST.min = 1;
5145             ST.max = REG_INFTY;
5146             scan = NEXTOPER(scan);
5147             goto repeat;
5148         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5149             ST.paren = scan->flags;     /* Which paren to set */
5150             if (ST.paren > PL_regsize)
5151                 PL_regsize = ST.paren;
5152             if (ST.paren > *PL_reglastparen)
5153                 *PL_reglastparen = ST.paren;
5154             ST.min = ARG1(scan);  /* min to match */
5155             ST.max = ARG2(scan);  /* max to match */
5156             if (cur_eval && cur_eval->u.eval.close_paren &&
5157                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5158                 ST.min=1;
5159                 ST.max=1;
5160             }
5161             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5162             goto repeat;
5163         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5164             ST.paren = 0;
5165             ST.min = ARG1(scan);  /* min to match */
5166             ST.max = ARG2(scan);  /* max to match */
5167             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5168           repeat:
5169             /*
5170             * Lookahead to avoid useless match attempts
5171             * when we know what character comes next.
5172             *
5173             * Used to only do .*x and .*?x, but now it allows
5174             * for )'s, ('s and (?{ ... })'s to be in the way
5175             * of the quantifier and the EXACT-like node.  -- japhy
5176             */
5177
5178             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5179                 sayNO;
5180             if (HAS_TEXT(next) || JUMPABLE(next)) {
5181                 U8 *s;
5182                 regnode *text_node = next;
5183
5184                 if (! HAS_TEXT(text_node))
5185                     FIND_NEXT_IMPT(text_node);
5186
5187                 if (! HAS_TEXT(text_node))
5188                     ST.c1 = ST.c2 = CHRTEST_VOID;
5189                 else {
5190                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5191                         ST.c1 = ST.c2 = CHRTEST_VOID;
5192                         goto assume_ok_easy;
5193                     }
5194                     else
5195                         s = (U8*)STRING(text_node);
5196
5197                     /*  Currently we only get here when
5198
5199                         PL_rekind[OP(text_node)] == EXACT
5200
5201                         if this changes back then the macro for IS_TEXT and
5202                         friends need to change. */
5203                     if (!UTF_PATTERN) {
5204                         ST.c1 = *s;
5205                         switch (OP(text_node)) {
5206                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5207                             case EXACTFA:
5208                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5209                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5210                             default: ST.c2 = ST.c1; break;
5211                         }
5212                     }
5213                     else { /* UTF_PATTERN */
5214                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5215                              STRLEN ulen1, ulen2;
5216                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5217                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5218
5219                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5220                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5221 #ifdef EBCDIC
5222                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5223                                                     ckWARN(WARN_UTF8) ?
5224                                                     0 : UTF8_ALLOW_ANY);
5225                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5226                                                     ckWARN(WARN_UTF8) ?
5227                                                     0 : UTF8_ALLOW_ANY);
5228 #else
5229                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5230                                                     uniflags);
5231                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5232                                                     uniflags);
5233 #endif
5234                         }
5235                         else {
5236                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5237                                                      uniflags);
5238                         }
5239                     }
5240                 }
5241             }
5242             else
5243                 ST.c1 = ST.c2 = CHRTEST_VOID;
5244         assume_ok_easy:
5245
5246             ST.A = scan;
5247             ST.B = next;
5248             PL_reginput = locinput;
5249             if (minmod) {
5250                 minmod = 0;
5251                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5252                     sayNO;
5253                 ST.count = ST.min;
5254                 locinput = PL_reginput;
5255                 REGCP_SET(ST.cp);
5256                 if (ST.c1 == CHRTEST_VOID)
5257                     goto curly_try_B_min;
5258
5259                 ST.oldloc = locinput;
5260
5261                 /* set ST.maxpos to the furthest point along the
5262                  * string that could possibly match */
5263                 if  (ST.max == REG_INFTY) {
5264                     ST.maxpos = PL_regeol - 1;
5265                     if (utf8_target)
5266                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5267                             ST.maxpos--;
5268                 }
5269                 else if (utf8_target) {
5270                     int m = ST.max - ST.min;
5271                     for (ST.maxpos = locinput;
5272                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5273                         ST.maxpos += UTF8SKIP(ST.maxpos);
5274                 }
5275                 else {
5276                     ST.maxpos = locinput + ST.max - ST.min;
5277                     if (ST.maxpos >= PL_regeol)
5278                         ST.maxpos = PL_regeol - 1;
5279                 }
5280                 goto curly_try_B_min_known;
5281
5282             }
5283             else {
5284                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5285                 locinput = PL_reginput;
5286                 if (ST.count < ST.min)
5287                     sayNO;
5288                 if ((ST.count > ST.min)
5289                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5290                 {
5291                     /* A{m,n} must come at the end of the string, there's
5292                      * no point in backing off ... */
5293                     ST.min = ST.count;
5294                     /* ...except that $ and \Z can match before *and* after
5295                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5296                        We may back off by one in this case. */
5297                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5298                         ST.min--;
5299                 }
5300                 REGCP_SET(ST.cp);
5301                 goto curly_try_B_max;
5302             }
5303             /* NOTREACHED */
5304
5305
5306         case CURLY_B_min_known_fail:
5307             /* failed to find B in a non-greedy match where c1,c2 valid */
5308             if (ST.paren && ST.count)
5309                 PL_regoffs[ST.paren].end = -1;
5310
5311             PL_reginput = locinput;     /* Could be reset... */
5312             REGCP_UNWIND(ST.cp);
5313             /* Couldn't or didn't -- move forward. */
5314             ST.oldloc = locinput;
5315             if (utf8_target)
5316                 locinput += UTF8SKIP(locinput);
5317             else
5318                 locinput++;
5319             ST.count++;
5320           curly_try_B_min_known:
5321              /* find the next place where 'B' could work, then call B */
5322             {
5323                 int n;
5324                 if (utf8_target) {
5325                     n = (ST.oldloc == locinput) ? 0 : 1;
5326                     if (ST.c1 == ST.c2) {
5327                         STRLEN len;
5328                         /* set n to utf8_distance(oldloc, locinput) */
5329                         while (locinput <= ST.maxpos &&
5330                                utf8n_to_uvchr((U8*)locinput,
5331                                               UTF8_MAXBYTES, &len,
5332                                               uniflags) != (UV)ST.c1) {
5333                             locinput += len;
5334                             n++;
5335                         }
5336                     }
5337                     else {
5338                         /* set n to utf8_distance(oldloc, locinput) */
5339                         while (locinput <= ST.maxpos) {
5340                             STRLEN len;
5341                             const UV c = utf8n_to_uvchr((U8*)locinput,
5342                                                   UTF8_MAXBYTES, &len,
5343                                                   uniflags);
5344                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5345                                 break;
5346                             locinput += len;
5347                             n++;
5348                         }
5349                     }
5350                 }
5351                 else {
5352                     if (ST.c1 == ST.c2) {
5353                         while (locinput <= ST.maxpos &&
5354                                UCHARAT(locinput) != ST.c1)
5355                             locinput++;
5356                     }
5357                     else {
5358                         while (locinput <= ST.maxpos
5359                                && UCHARAT(locinput) != ST.c1
5360                                && UCHARAT(locinput) != ST.c2)
5361                             locinput++;
5362                     }
5363                     n = locinput - ST.oldloc;
5364                 }
5365                 if (locinput > ST.maxpos)
5366                     sayNO;
5367                 /* PL_reginput == oldloc now */
5368                 if (n) {
5369                     ST.count += n;
5370                     if (regrepeat(rex, ST.A, n, depth) < n)
5371                         sayNO;
5372                 }
5373                 PL_reginput = locinput;
5374                 CURLY_SETPAREN(ST.paren, ST.count);
5375                 if (cur_eval && cur_eval->u.eval.close_paren &&
5376                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5377                     goto fake_end;
5378                 }
5379                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5380             }
5381             /* NOTREACHED */
5382
5383
5384         case CURLY_B_min_fail:
5385             /* failed to find B in a non-greedy match where c1,c2 invalid */
5386             if (ST.paren && ST.count)
5387                 PL_regoffs[ST.paren].end = -1;
5388
5389             REGCP_UNWIND(ST.cp);
5390             /* failed -- move forward one */
5391             PL_reginput = locinput;
5392             if (regrepeat(rex, ST.A, 1, depth)) {
5393                 ST.count++;
5394                 locinput = PL_reginput;
5395                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5396                         ST.count > 0)) /* count overflow ? */
5397                 {
5398                   curly_try_B_min:
5399                     CURLY_SETPAREN(ST.paren, ST.count);
5400                     if (cur_eval && cur_eval->u.eval.close_paren &&
5401                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5402                         goto fake_end;
5403                     }
5404                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5405                 }
5406             }
5407             sayNO;
5408             /* NOTREACHED */
5409
5410
5411         curly_try_B_max:
5412             /* a successful greedy match: now try to match B */
5413             if (cur_eval && cur_eval->u.eval.close_paren &&
5414                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5415                 goto fake_end;
5416             }
5417             {
5418                 UV c = 0;
5419                 if (ST.c1 != CHRTEST_VOID)
5420                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5421                                            UTF8_MAXBYTES, 0, uniflags)
5422                                 : (UV) UCHARAT(PL_reginput);
5423                 /* If it could work, try it. */
5424                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5425                     CURLY_SETPAREN(ST.paren, ST.count);
5426                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5427                     /* NOTREACHED */
5428                 }
5429             }
5430             /* FALL THROUGH */
5431         case CURLY_B_max_fail:
5432             /* failed to find B in a greedy match */
5433             if (ST.paren && ST.count)
5434                 PL_regoffs[ST.paren].end = -1;
5435
5436             REGCP_UNWIND(ST.cp);
5437             /*  back up. */
5438             if (--ST.count < ST.min)
5439                 sayNO;
5440             PL_reginput = locinput = HOPc(locinput, -1);
5441             goto curly_try_B_max;
5442
5443 #undef ST
5444
5445         case END:
5446             fake_end:
5447             if (cur_eval) {
5448                 /* we've just finished A in /(??{A})B/; now continue with B */
5449                 I32 tmpix;
5450                 st->u.eval.toggle_reg_flags
5451                             = cur_eval->u.eval.toggle_reg_flags;
5452                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5453
5454                 st->u.eval.prev_rex = rex_sv;           /* inner */
5455                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5456                 rex = (struct regexp *)SvANY(rex_sv);
5457                 rexi = RXi_GET(rex);
5458                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5459                 ReREFCNT_inc(rex_sv);
5460                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5461
5462                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5463                 PL_reglastparen = &rex->lastparen;
5464                 PL_reglastcloseparen = &rex->lastcloseparen;
5465
5466                 REGCP_SET(st->u.eval.lastcp);
5467                 PL_reginput = locinput;
5468
5469                 /* Restore parens of the outer rex without popping the
5470                  * savestack */
5471                 tmpix = PL_savestack_ix;
5472                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5473                 regcppop(rex);
5474                 PL_savestack_ix = tmpix;
5475
5476                 st->u.eval.prev_eval = cur_eval;
5477                 cur_eval = cur_eval->u.eval.prev_eval;
5478                 DEBUG_EXECUTE_r(
5479                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5480                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5481                 if ( nochange_depth )
5482                     nochange_depth--;
5483
5484                 PUSH_YES_STATE_GOTO(EVAL_AB,
5485                         st->u.eval.prev_eval->u.eval.B); /* match B */
5486             }
5487
5488             if (locinput < reginfo->till) {
5489                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5490                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5491                                       PL_colors[4],
5492                                       (long)(locinput - PL_reg_starttry),
5493                                       (long)(reginfo->till - PL_reg_starttry),
5494                                       PL_colors[5]));
5495
5496                 sayNO_SILENT;           /* Cannot match: too short. */
5497             }
5498             PL_reginput = locinput;     /* put where regtry can find it */
5499             sayYES;                     /* Success! */
5500
5501         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5502             DEBUG_EXECUTE_r(
5503             PerlIO_printf(Perl_debug_log,
5504                 "%*s  %ssubpattern success...%s\n",
5505                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5506             PL_reginput = locinput;     /* put where regtry can find it */
5507             sayYES;                     /* Success! */
5508
5509 #undef  ST
5510 #define ST st->u.ifmatch
5511
5512         case SUSPEND:   /* (?>A) */
5513             ST.wanted = 1;
5514             PL_reginput = locinput;
5515             goto do_ifmatch;
5516
5517         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5518             ST.wanted = 0;
5519             goto ifmatch_trivial_fail_test;
5520
5521         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5522             ST.wanted = 1;
5523           ifmatch_trivial_fail_test:
5524             if (scan->flags) {
5525                 char * const s = HOPBACKc(locinput, scan->flags);
5526                 if (!s) {
5527                     /* trivial fail */
5528                     if (logical) {
5529                         logical = 0;
5530                         sw = 1 - cBOOL(ST.wanted);
5531                     }
5532                     else if (ST.wanted)
5533                         sayNO;
5534                     next = scan + ARG(scan);
5535                     if (next == scan)
5536                         next = NULL;
5537                     break;
5538                 }
5539                 PL_reginput = s;
5540             }
5541             else
5542                 PL_reginput = locinput;
5543
5544           do_ifmatch:
5545             ST.me = scan;
5546             ST.logical = logical;
5547             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5548
5549             /* execute body of (?...A) */
5550             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5551             /* NOTREACHED */
5552
5553         case IFMATCH_A_fail: /* body of (?...A) failed */
5554             ST.wanted = !ST.wanted;
5555             /* FALL THROUGH */
5556
5557         case IFMATCH_A: /* body of (?...A) succeeded */
5558             if (ST.logical) {
5559                 sw = cBOOL(ST.wanted);
5560             }
5561             else if (!ST.wanted)
5562                 sayNO;
5563
5564             if (OP(ST.me) == SUSPEND)
5565                 locinput = PL_reginput;
5566             else {
5567                 locinput = PL_reginput = st->locinput;
5568                 nextchr = UCHARAT(locinput);
5569             }
5570             scan = ST.me + ARG(ST.me);
5571             if (scan == ST.me)
5572                 scan = NULL;
5573             continue; /* execute B */
5574
5575 #undef ST
5576
5577         case LONGJMP:
5578             next = scan + ARG(scan);
5579             if (next == scan)
5580                 next = NULL;
5581             break;
5582         case COMMIT:
5583             reginfo->cutpoint = PL_regeol;
5584             /* FALLTHROUGH */
5585         case PRUNE:
5586             PL_reginput = locinput;
5587             if (!scan->flags)
5588                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5589             PUSH_STATE_GOTO(COMMIT_next,next);
5590             /* NOTREACHED */
5591         case COMMIT_next_fail:
5592             no_final = 1;
5593             /* FALLTHROUGH */
5594         case OPFAIL:
5595             sayNO;
5596             /* NOTREACHED */
5597
5598 #define ST st->u.mark
5599         case MARKPOINT:
5600             ST.prev_mark = mark_state;
5601             ST.mark_name = sv_commit = sv_yes_mark
5602                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5603             mark_state = st;
5604             ST.mark_loc = PL_reginput = locinput;
5605             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5606             /* NOTREACHED */
5607         case MARKPOINT_next:
5608             mark_state = ST.prev_mark;
5609             sayYES;
5610             /* NOTREACHED */
5611         case MARKPOINT_next_fail:
5612             if (popmark && sv_eq(ST.mark_name,popmark))
5613             {
5614                 if (ST.mark_loc > startpoint)
5615                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5616                 popmark = NULL; /* we found our mark */
5617                 sv_commit = ST.mark_name;
5618
5619                 DEBUG_EXECUTE_r({
5620                         PerlIO_printf(Perl_debug_log,
5621                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5622                             REPORT_CODE_OFF+depth*2, "",
5623                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5624                 });
5625             }
5626             mark_state = ST.prev_mark;
5627             sv_yes_mark = mark_state ?
5628                 mark_state->u.mark.mark_name : NULL;
5629             sayNO;
5630             /* NOTREACHED */
5631         case SKIP:
5632             PL_reginput = locinput;
5633             if (scan->flags) {
5634                 /* (*SKIP) : if we fail we cut here*/
5635                 ST.mark_name = NULL;
5636                 ST.mark_loc = locinput;
5637                 PUSH_STATE_GOTO(SKIP_next,next);
5638             } else {
5639                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5640                    otherwise do nothing.  Meaning we need to scan
5641                  */
5642                 regmatch_state *cur = mark_state;
5643                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5644
5645                 while (cur) {
5646                     if ( sv_eq( cur->u.mark.mark_name,
5647                                 find ) )
5648                     {
5649                         ST.mark_name = find;
5650                         PUSH_STATE_GOTO( SKIP_next, next );
5651                     }
5652                     cur = cur->u.mark.prev_mark;
5653                 }
5654             }
5655             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5656             break;
5657         case SKIP_next_fail:
5658             if (ST.mark_name) {
5659                 /* (*CUT:NAME) - Set up to search for the name as we
5660                    collapse the stack*/
5661                 popmark = ST.mark_name;
5662             } else {
5663                 /* (*CUT) - No name, we cut here.*/
5664                 if (ST.mark_loc > startpoint)
5665                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5666                 /* but we set sv_commit to latest mark_name if there
5667                    is one so they can test to see how things lead to this
5668                    cut */
5669                 if (mark_state)
5670                     sv_commit=mark_state->u.mark.mark_name;
5671             }
5672             no_final = 1;
5673             sayNO;
5674             /* NOTREACHED */
5675 #undef ST
5676         case FOLDCHAR:
5677             n = ARG(scan);
5678             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5679                 locinput += ln;
5680             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5681                 sayNO;
5682             } else  {
5683                 U8 folded[UTF8_MAXBYTES_CASE+1];
5684                 STRLEN foldlen;
5685                 const char * const l = locinput;
5686                 char *e = PL_regeol;
5687                 to_uni_fold(n, folded, &foldlen);
5688
5689                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5690                                l, &e, 0,  utf8_target)) {
5691                         sayNO;
5692                 }
5693                 locinput = e;
5694             }
5695             nextchr = UCHARAT(locinput);
5696             break;
5697         case LNBREAK:
5698             if ((n=is_LNBREAK(locinput,utf8_target))) {
5699                 locinput += n;
5700                 nextchr = UCHARAT(locinput);
5701             } else
5702                 sayNO;
5703             break;
5704
5705 #define CASE_CLASS(nAmE)                              \
5706         case nAmE:                                    \
5707             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5708                 locinput += n;                        \
5709                 nextchr = UCHARAT(locinput);          \
5710             } else                                    \
5711                 sayNO;                                \
5712             break;                                    \
5713         case N##nAmE:                                 \
5714             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5715                 sayNO;                                \
5716             } else {                                  \
5717                 locinput += UTF8SKIP(locinput);       \
5718                 nextchr = UCHARAT(locinput);          \
5719             }                                         \
5720             break
5721
5722         CASE_CLASS(VERTWS);
5723         CASE_CLASS(HORIZWS);
5724 #undef CASE_CLASS
5725
5726         default:
5727             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5728                           PTR2UV(scan), OP(scan));
5729             Perl_croak(aTHX_ "regexp memory corruption");
5730
5731         } /* end switch */
5732
5733         /* switch break jumps here */
5734         scan = next; /* prepare to execute the next op and ... */
5735         continue;    /* ... jump back to the top, reusing st */
5736         /* NOTREACHED */
5737
5738       push_yes_state:
5739         /* push a state that backtracks on success */
5740         st->u.yes.prev_yes_state = yes_state;
5741         yes_state = st;
5742         /* FALL THROUGH */
5743       push_state:
5744         /* push a new regex state, then continue at scan  */
5745         {
5746             regmatch_state *newst;
5747
5748             DEBUG_STACK_r({
5749                 regmatch_state *cur = st;
5750                 regmatch_state *curyes = yes_state;
5751                 int curd = depth;
5752                 regmatch_slab *slab = PL_regmatch_slab;
5753                 for (;curd > -1;cur--,curd--) {
5754                     if (cur < SLAB_FIRST(slab)) {
5755                         slab = slab->prev;
5756                         cur = SLAB_LAST(slab);
5757                     }
5758                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5759                         REPORT_CODE_OFF + 2 + depth * 2,"",
5760                         curd, PL_reg_name[cur->resume_state],
5761                         (curyes == cur) ? "yes" : ""
5762                     );
5763                     if (curyes == cur)
5764                         curyes = cur->u.yes.prev_yes_state;
5765                 }
5766             } else
5767                 DEBUG_STATE_pp("push")
5768             );
5769             depth++;
5770             st->locinput = locinput;
5771             newst = st+1;
5772             if (newst >  SLAB_LAST(PL_regmatch_slab))
5773                 newst = S_push_slab(aTHX);
5774             PL_regmatch_state = newst;
5775
5776             locinput = PL_reginput;
5777             nextchr = UCHARAT(locinput);
5778             st = newst;
5779             continue;
5780             /* NOTREACHED */
5781         }
5782     }
5783
5784     /*
5785     * We get here only if there's trouble -- normally "case END" is
5786     * the terminating point.
5787     */
5788     Perl_croak(aTHX_ "corrupted regexp pointers");
5789     /*NOTREACHED*/
5790     sayNO;
5791
5792 yes:
5793     if (yes_state) {
5794         /* we have successfully completed a subexpression, but we must now
5795          * pop to the state marked by yes_state and continue from there */
5796         assert(st != yes_state);
5797 #ifdef DEBUGGING
5798         while (st != yes_state) {
5799             st--;
5800             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5801                 PL_regmatch_slab = PL_regmatch_slab->prev;
5802                 st = SLAB_LAST(PL_regmatch_slab);
5803             }
5804             DEBUG_STATE_r({
5805                 if (no_final) {
5806                     DEBUG_STATE_pp("pop (no final)");
5807                 } else {
5808                     DEBUG_STATE_pp("pop (yes)");
5809                 }
5810             });
5811             depth--;
5812         }
5813 #else
5814         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5815             || yes_state > SLAB_LAST(PL_regmatch_slab))
5816         {
5817             /* not in this slab, pop slab */
5818             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5819             PL_regmatch_slab = PL_regmatch_slab->prev;
5820             st = SLAB_LAST(PL_regmatch_slab);
5821         }
5822         depth -= (st - yes_state);
5823 #endif
5824         st = yes_state;
5825         yes_state = st->u.yes.prev_yes_state;
5826         PL_regmatch_state = st;
5827
5828         if (no_final) {
5829             locinput= st->locinput;
5830             nextchr = UCHARAT(locinput);
5831         }
5832         state_num = st->resume_state + no_final;
5833         goto reenter_switch;
5834     }
5835
5836     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5837                           PL_colors[4], PL_colors[5]));
5838
5839     if (PL_reg_eval_set) {
5840         /* each successfully executed (?{...}) block does the equivalent of
5841          *   local $^R = do {...}
5842          * When popping the save stack, all these locals would be undone;
5843          * bypass this by setting the outermost saved $^R to the latest
5844          * value */
5845         if (oreplsv != GvSV(PL_replgv))
5846             sv_setsv(oreplsv, GvSV(PL_replgv));
5847     }
5848     result = 1;
5849     goto final_exit;
5850
5851 no:
5852     DEBUG_EXECUTE_r(
5853         PerlIO_printf(Perl_debug_log,
5854             "%*s  %sfailed...%s\n",
5855             REPORT_CODE_OFF+depth*2, "",
5856             PL_colors[4], PL_colors[5])
5857         );
5858
5859 no_silent:
5860     if (no_final) {
5861         if (yes_state) {
5862             goto yes;
5863         } else {
5864             goto final_exit;
5865         }
5866     }
5867     if (depth) {
5868         /* there's a previous state to backtrack to */
5869         st--;
5870         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5871             PL_regmatch_slab = PL_regmatch_slab->prev;
5872             st = SLAB_LAST(PL_regmatch_slab);
5873         }
5874         PL_regmatch_state = st;
5875         locinput= st->locinput;
5876         nextchr = UCHARAT(locinput);
5877
5878         DEBUG_STATE_pp("pop");
5879         depth--;
5880         if (yes_state == st)
5881             yes_state = st->u.yes.prev_yes_state;
5882
5883         state_num = st->resume_state + 1; /* failure = success + 1 */
5884         goto reenter_switch;
5885     }
5886     result = 0;
5887
5888   final_exit:
5889     if (rex->intflags & PREGf_VERBARG_SEEN) {
5890         SV *sv_err = get_sv("REGERROR", 1);
5891         SV *sv_mrk = get_sv("REGMARK", 1);
5892         if (result) {
5893             sv_commit = &PL_sv_no;
5894             if (!sv_yes_mark)
5895                 sv_yes_mark = &PL_sv_yes;
5896         } else {
5897             if (!sv_commit)
5898                 sv_commit = &PL_sv_yes;
5899             sv_yes_mark = &PL_sv_no;
5900         }
5901         sv_setsv(sv_err, sv_commit);
5902         sv_setsv(sv_mrk, sv_yes_mark);
5903     }
5904
5905     /* clean up; in particular, free all slabs above current one */
5906     LEAVE_SCOPE(oldsave);
5907
5908     return result;
5909 }
5910
5911 /*
5912  - regrepeat - repeatedly match something simple, report how many
5913  */
5914 /*
5915  * [This routine now assumes that it will only match on things of length 1.
5916  * That was true before, but now we assume scan - reginput is the count,
5917  * rather than incrementing count on every character.  [Er, except utf8.]]
5918  */
5919 STATIC I32
5920 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5921 {
5922     dVAR;
5923     register char *scan;
5924     register I32 c;
5925     register char *loceol = PL_regeol;
5926     register I32 hardcount = 0;
5927     register bool utf8_target = PL_reg_match_utf8;
5928     UV utf8_flags;
5929 #ifndef DEBUGGING
5930     PERL_UNUSED_ARG(depth);
5931 #endif
5932
5933     PERL_ARGS_ASSERT_REGREPEAT;
5934
5935     scan = PL_reginput;
5936     if (max == REG_INFTY)
5937         max = I32_MAX;
5938     else if (max < loceol - scan)
5939         loceol = scan + max;
5940     switch (OP(p)) {
5941     case REG_ANY:
5942         if (utf8_target) {
5943             loceol = PL_regeol;
5944             while (scan < loceol && hardcount < max && *scan != '\n') {
5945                 scan += UTF8SKIP(scan);
5946                 hardcount++;
5947             }
5948         } else {
5949             while (scan < loceol && *scan != '\n')
5950                 scan++;
5951         }
5952         break;
5953     case SANY:
5954         if (utf8_target) {
5955             loceol = PL_regeol;
5956             while (scan < loceol && hardcount < max) {
5957                 scan += UTF8SKIP(scan);
5958                 hardcount++;
5959             }
5960         }
5961         else
5962             scan = loceol;
5963         break;
5964     case CANY:
5965         scan = loceol;
5966         break;
5967     case EXACT:
5968         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5969          * means they match only characters in the string that can be expressed
5970          * as a single byte.  For non-utf8 strings, that means a simple match.
5971          * For utf8 strings, the character matched must be an invariant, or
5972          * downgradable to a single byte.  The pattern's utf8ness is
5973          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5974          * it is, it's an invariant */
5975
5976         c = (U8)*STRING(p);
5977         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5978
5979         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5980             while (scan < loceol && UCHARAT(scan) == c) {
5981                 scan++;
5982             }
5983         }
5984         else {
5985
5986             /* Here, the string is utf8, and the pattern char is different
5987              * in utf8 than not, so can't compare them directly.  Outside the
5988              * loop, find find the two utf8 bytes that represent c, and then
5989              * look for those in sequence in the utf8 string */
5990             U8 high = UTF8_TWO_BYTE_HI(c);
5991             U8 low = UTF8_TWO_BYTE_LO(c);
5992             loceol = PL_regeol;
5993
5994             while (hardcount < max
5995                     && scan + 1 < loceol
5996                     && UCHARAT(scan) == high
5997                     && UCHARAT(scan + 1) == low)
5998             {
5999                 scan += 2;
6000                 hardcount++;
6001             }
6002         }
6003         break;
6004     case EXACTFA:
6005         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6006         goto do_exactf;
6007
6008     case EXACTFL:
6009         PL_reg_flags |= RF_tainted;
6010         utf8_flags = FOLDEQ_UTF8_LOCALE;
6011         goto do_exactf;
6012
6013     case EXACTF:
6014     case EXACTFU:
6015         utf8_flags = 0;
6016
6017         /* The comments for the EXACT case above apply as well to these fold
6018          * ones */
6019
6020     do_exactf:
6021         c = (U8)*STRING(p);
6022         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6023
6024         if (utf8_target) { /* Use full Unicode fold matching */
6025             char *tmpeol = loceol;
6026             while (hardcount < max
6027                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6028                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6029             {
6030                 scan = tmpeol;
6031                 tmpeol = loceol;
6032                 hardcount++;
6033             }
6034
6035             /* XXX Note that the above handles properly the German sharp s in
6036              * the pattern matching ss in the string.  But it doesn't handle
6037              * properly cases where the string contains say 'LIGATURE ff' and
6038              * the pattern is 'f+'.  This would require, say, a new function or
6039              * revised interface to foldEQ_utf8(), in which the maximum number
6040              * of characters to match could be passed and it would return how
6041              * many actually did.  This is just one of many cases where
6042              * multi-char folds don't work properly, and so the fix is being
6043              * deferred */
6044         }
6045         else {
6046             U8 folded;
6047
6048             /* Here, the string isn't utf8 and c is a single byte; and either
6049              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6050              * doesn't affect c.  Can just do simple comparisons for exact or
6051              * fold matching. */
6052             switch (OP(p)) {
6053                 case EXACTF: folded = PL_fold[c]; break;
6054                 case EXACTFA:
6055                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6056                 case EXACTFL: folded = PL_fold_locale[c]; break;
6057                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6058             }
6059             while (scan < loceol &&
6060                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6061             {
6062                 scan++;
6063             }
6064         }
6065         break;
6066     case ANYOFV:
6067     case ANYOF:
6068         if (utf8_target || OP(p) == ANYOFV) {
6069             STRLEN inclasslen;
6070             loceol = PL_regeol;
6071             inclasslen = loceol - scan;
6072             while (hardcount < max
6073                    && ((inclasslen = loceol - scan) > 0)
6074                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6075             {
6076                 scan += inclasslen;
6077                 hardcount++;
6078             }
6079         } else {
6080             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6081                 scan++;
6082         }
6083         break;
6084     case ALNUMU:
6085         if (utf8_target) {
6086     utf8_wordchar:
6087             loceol = PL_regeol;
6088             LOAD_UTF8_CHARCLASS_ALNUM();
6089             while (hardcount < max && scan < loceol &&
6090                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6091             {
6092                 scan += UTF8SKIP(scan);
6093                 hardcount++;
6094             }
6095         } else {
6096             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6097                 scan++;
6098             }
6099         }
6100         break;
6101     case ALNUM:
6102         if (utf8_target)
6103             goto utf8_wordchar;
6104         while (scan < loceol && isALNUM((U8) *scan)) {
6105             scan++;
6106         }
6107         break;
6108     case ALNUMA:
6109         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6110             scan++;
6111         }
6112         break;
6113     case ALNUML:
6114         PL_reg_flags |= RF_tainted;
6115         if (utf8_target) {
6116             loceol = PL_regeol;
6117             while (hardcount < max && scan < loceol &&
6118                    isALNUM_LC_utf8((U8*)scan)) {
6119                 scan += UTF8SKIP(scan);
6120                 hardcount++;
6121             }
6122         } else {
6123             while (scan < loceol && isALNUM_LC(*scan))
6124                 scan++;
6125         }
6126         break;
6127     case NALNUMU:
6128         if (utf8_target) {
6129
6130     utf8_Nwordchar:
6131
6132             loceol = PL_regeol;
6133             LOAD_UTF8_CHARCLASS_ALNUM();
6134             while (hardcount < max && scan < loceol &&
6135                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6136             {
6137                 scan += UTF8SKIP(scan);
6138                 hardcount++;
6139             }
6140         } else {
6141             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6142                 scan++;
6143             }
6144         }
6145         break;
6146     case NALNUM:
6147         if (utf8_target)
6148             goto utf8_Nwordchar;
6149         while (scan < loceol && ! isALNUM((U8) *scan)) {
6150             scan++;
6151         }
6152         break;
6153     case NALNUMA:
6154         if (utf8_target) {
6155             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6156                 scan += UTF8SKIP(scan);
6157             }
6158         }
6159         else {
6160             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6161                 scan++;
6162             }
6163         }
6164         break;
6165     case NALNUML:
6166         PL_reg_flags |= RF_tainted;
6167         if (utf8_target) {
6168             loceol = PL_regeol;
6169             while (hardcount < max && scan < loceol &&
6170                    !isALNUM_LC_utf8((U8*)scan)) {
6171                 scan += UTF8SKIP(scan);
6172                 hardcount++;
6173             }
6174         } else {
6175             while (scan < loceol && !isALNUM_LC(*scan))
6176                 scan++;
6177         }
6178         break;
6179     case SPACEU:
6180         if (utf8_target) {
6181
6182     utf8_space:
6183
6184             loceol = PL_regeol;
6185             LOAD_UTF8_CHARCLASS_SPACE();
6186             while (hardcount < max && scan < loceol &&
6187                    (*scan == ' ' ||
6188                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6189             {
6190                 scan += UTF8SKIP(scan);
6191                 hardcount++;
6192             }
6193             break;
6194         }
6195         else {
6196             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6197                 scan++;
6198             }
6199             break;
6200         }
6201     case SPACE:
6202         if (utf8_target)
6203             goto utf8_space;
6204
6205         while (scan < loceol && isSPACE((U8) *scan)) {
6206             scan++;
6207         }
6208         break;
6209     case SPACEA:
6210         while (scan < loceol && isSPACE_A((U8) *scan)) {
6211             scan++;
6212         }
6213         break;
6214     case SPACEL:
6215         PL_reg_flags |= RF_tainted;
6216         if (utf8_target) {
6217             loceol = PL_regeol;
6218             while (hardcount < max && scan < loceol &&
6219                    isSPACE_LC_utf8((U8*)scan)) {
6220                 scan += UTF8SKIP(scan);
6221                 hardcount++;
6222             }
6223         } else {
6224             while (scan < loceol && isSPACE_LC(*scan))
6225                 scan++;
6226         }
6227         break;
6228     case NSPACEU:
6229         if (utf8_target) {
6230
6231     utf8_Nspace:
6232
6233             loceol = PL_regeol;
6234             LOAD_UTF8_CHARCLASS_SPACE();
6235             while (hardcount < max && scan < loceol &&
6236                    ! (*scan == ' ' ||
6237                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6238             {
6239                 scan += UTF8SKIP(scan);
6240                 hardcount++;
6241             }
6242             break;
6243         }
6244         else {
6245             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6246                 scan++;
6247             }
6248         }
6249         break;
6250     case NSPACE:
6251         if (utf8_target)
6252             goto utf8_Nspace;
6253
6254         while (scan < loceol && ! isSPACE((U8) *scan)) {
6255             scan++;
6256         }
6257         break;
6258     case NSPACEA:
6259         if (utf8_target) {
6260             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6261                 scan += UTF8SKIP(scan);
6262             }
6263         }
6264         else {
6265             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6266                 scan++;
6267             }
6268         }
6269         break;
6270     case NSPACEL:
6271         PL_reg_flags |= RF_tainted;
6272         if (utf8_target) {
6273             loceol = PL_regeol;
6274             while (hardcount < max && scan < loceol &&
6275                    !isSPACE_LC_utf8((U8*)scan)) {
6276                 scan += UTF8SKIP(scan);
6277                 hardcount++;
6278             }
6279         } else {
6280             while (scan < loceol && !isSPACE_LC(*scan))
6281                 scan++;
6282         }
6283         break;
6284     case DIGIT:
6285         if (utf8_target) {
6286             loceol = PL_regeol;
6287             LOAD_UTF8_CHARCLASS_DIGIT();
6288             while (hardcount < max && scan < loceol &&
6289                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6290                 scan += UTF8SKIP(scan);
6291                 hardcount++;
6292             }
6293         } else {
6294             while (scan < loceol && isDIGIT(*scan))
6295                 scan++;
6296         }
6297         break;
6298     case DIGITA:
6299         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6300             scan++;
6301         }
6302         break;
6303     case DIGITL:
6304         PL_reg_flags |= RF_tainted;
6305         if (utf8_target) {
6306             loceol = PL_regeol;
6307             while (hardcount < max && scan < loceol &&
6308                    isDIGIT_LC_utf8((U8*)scan)) {
6309                 scan += UTF8SKIP(scan);
6310                 hardcount++;
6311             }
6312         } else {
6313             while (scan < loceol && isDIGIT_LC(*scan))
6314                 scan++;
6315         }
6316         break;
6317     case NDIGIT:
6318         if (utf8_target) {
6319             loceol = PL_regeol;
6320             LOAD_UTF8_CHARCLASS_DIGIT();
6321             while (hardcount < max && scan < loceol &&
6322                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6323                 scan += UTF8SKIP(scan);
6324                 hardcount++;
6325             }
6326         } else {
6327             while (scan < loceol && !isDIGIT(*scan))
6328                 scan++;
6329         }
6330         break;
6331     case NDIGITA:
6332         if (utf8_target) {
6333             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6334                 scan += UTF8SKIP(scan);
6335             }
6336         }
6337         else {
6338             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6339                 scan++;
6340             }
6341         }
6342         break;
6343     case NDIGITL:
6344         PL_reg_flags |= RF_tainted;
6345         if (utf8_target) {
6346             loceol = PL_regeol;
6347             while (hardcount < max && scan < loceol &&
6348                    !isDIGIT_LC_utf8((U8*)scan)) {
6349                 scan += UTF8SKIP(scan);
6350                 hardcount++;
6351             }
6352         } else {
6353             while (scan < loceol && !isDIGIT_LC(*scan))
6354                 scan++;
6355         }
6356         break;
6357     case LNBREAK:
6358         if (utf8_target) {
6359             loceol = PL_regeol;
6360             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6361                 scan += c;
6362                 hardcount++;
6363             }
6364         } else {
6365             /*
6366               LNBREAK can match two latin chars, which is ok,
6367               because we have a null terminated string, but we
6368               have to use hardcount in this situation
6369             */
6370             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6371                 scan+=c;
6372                 hardcount++;
6373             }
6374         }
6375         break;
6376     case HORIZWS:
6377         if (utf8_target) {
6378             loceol = PL_regeol;
6379             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6380                 scan += c;
6381                 hardcount++;
6382             }
6383         } else {
6384             while (scan < loceol && is_HORIZWS_latin1(scan))
6385                 scan++;
6386         }
6387         break;
6388     case NHORIZWS:
6389         if (utf8_target) {
6390             loceol = PL_regeol;
6391             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6392                 scan += UTF8SKIP(scan);
6393                 hardcount++;
6394             }
6395         } else {
6396             while (scan < loceol && !is_HORIZWS_latin1(scan))
6397                 scan++;
6398
6399         }
6400         break;
6401     case VERTWS:
6402         if (utf8_target) {
6403             loceol = PL_regeol;
6404             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6405                 scan += c;
6406                 hardcount++;
6407             }
6408         } else {
6409             while (scan < loceol && is_VERTWS_latin1(scan))
6410                 scan++;
6411
6412         }
6413         break;
6414     case NVERTWS:
6415         if (utf8_target) {
6416             loceol = PL_regeol;
6417             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6418                 scan += UTF8SKIP(scan);
6419                 hardcount++;
6420             }
6421         } else {
6422             while (scan < loceol && !is_VERTWS_latin1(scan))
6423                 scan++;
6424
6425         }
6426         break;
6427
6428     default:            /* Called on something of 0 width. */
6429         break;          /* So match right here or not at all. */
6430     }
6431
6432     if (hardcount)
6433         c = hardcount;
6434     else
6435         c = scan - PL_reginput;
6436     PL_reginput = scan;
6437
6438     DEBUG_r({
6439         GET_RE_DEBUG_FLAGS_DECL;
6440         DEBUG_EXECUTE_r({
6441             SV * const prop = sv_newmortal();
6442             regprop(prog, prop, p);
6443             PerlIO_printf(Perl_debug_log,
6444                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6445                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6446         });
6447     });
6448
6449     return(c);
6450 }
6451
6452
6453 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6454 /*
6455 - regclass_swash - prepare the utf8 swash
6456 */
6457
6458 SV *
6459 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6460 {
6461     dVAR;
6462     SV *sw  = NULL;
6463     SV *si  = NULL;
6464     SV *alt = NULL;
6465     RXi_GET_DECL(prog,progi);
6466     const struct reg_data * const data = prog ? progi->data : NULL;
6467
6468     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6469
6470     if (data && data->count) {
6471         const U32 n = ARG(node);
6472
6473         if (data->what[n] == 's') {
6474             SV * const rv = MUTABLE_SV(data->data[n]);
6475             AV * const av = MUTABLE_AV(SvRV(rv));
6476             SV **const ary = AvARRAY(av);
6477             SV **a, **b;
6478
6479             /* See the end of regcomp.c:S_regclass() for
6480              * documentation of these array elements. */
6481
6482             si = *ary;
6483             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6484             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6485
6486             if (a)
6487                 sw = *a;
6488             else if (si && doinit) {
6489                 sw = swash_init("utf8", "", si, 1, 0);
6490                 (void)av_store(av, 1, sw);
6491             }
6492             if (b)
6493                 alt = *b;
6494         }
6495     }
6496
6497     if (listsvp)
6498         *listsvp = si;
6499     if (altsvp)
6500         *altsvp  = alt;
6501
6502     return sw;
6503 }
6504 #endif
6505
6506 /*
6507  - reginclass - determine if a character falls into a character class
6508
6509   n is the ANYOF regnode
6510   p is the target string
6511   lenp is pointer to the maximum number of bytes of how far to go in p
6512     (This is assumed wthout checking to always be at least the current
6513     character's size)
6514   utf8_target tells whether p is in UTF-8.
6515
6516   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6517   from a successful match, the value it points to will be updated to how many
6518   bytes in p were matched.  If there was no match, the value is undefined,
6519   possibly changed from the input.
6520
6521   Note that this can be a synthetic start class, a combination of various
6522   nodes, so things you think might be mutually exclusive, such as locale,
6523   aren't.  It can match both locale and non-locale
6524
6525  */
6526
6527 STATIC bool
6528 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6529 {
6530     dVAR;
6531     const char flags = ANYOF_FLAGS(n);
6532     bool match = FALSE;
6533     UV c = *p;
6534     STRLEN c_len = 0;
6535     STRLEN maxlen;
6536
6537     PERL_ARGS_ASSERT_REGINCLASS;
6538
6539     /* If c is not already the code point, get it */
6540     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6541         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6542                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6543                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6544                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6545                  * UTF8_ALLOW_FFFF */
6546         if (c_len == (STRLEN)-1)
6547             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6548     }
6549     else {
6550         c_len = 1;
6551     }
6552
6553     /* Use passed in max length, or one character if none passed in or less
6554      * than one character.  And assume will match just one character.  This is
6555      * overwritten later if matched more. */
6556     if (lenp) {
6557         maxlen = (*lenp > c_len) ? *lenp : c_len;
6558         *lenp = c_len;
6559
6560     }
6561     else {
6562         maxlen = c_len;
6563     }
6564
6565     /* If this character is potentially in the bitmap, check it */
6566     if (c < 256) {
6567         if (ANYOF_BITMAP_TEST(n, c))
6568             match = TRUE;
6569         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6570                 && ! utf8_target
6571                 && ! isASCII(c))
6572         {
6573             match = TRUE;
6574         }
6575
6576         else if (flags & ANYOF_LOCALE) {
6577             PL_reg_flags |= RF_tainted;
6578
6579             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6580                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6581             {
6582                 match = TRUE;
6583             }
6584             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6585                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6586                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6587                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6588                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6589                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6590                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6591                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6592                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6593                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6594                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6595                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6596                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6597                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6598                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6599                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6600                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6601                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6602                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6603                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6604                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6605                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6606                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6607                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6608                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6609                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6610                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6611                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6612                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6613                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6614                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6615                      ) /* How's that for a conditional? */
6616             ) {
6617                 match = TRUE;
6618             }
6619         }
6620     }
6621
6622     /* If the bitmap didn't (or couldn't) match, and something outside the
6623      * bitmap could match, try that.  Locale nodes specifiy completely the
6624      * behavior of code points in the bit map (otherwise, a utf8 target would
6625      * cause them to be treated as Unicode and not locale), except XXX in
6626      * the very unlikely event when this node is a synthetic start class, which
6627      * could be a combination of locale and non-locale nodes */
6628     if (!match) {
6629         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6630             match = TRUE;       /* Everything above 255 matches */
6631         }
6632         else if ((flags & ANYOF_NONBITMAP_NON_UTF8
6633                   || (utf8_target && flags & ANYOF_UTF8
6634                       && (c >=256 || ! (flags & ANYOF_LOCALE)))))
6635         {
6636             AV *av;
6637             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6638
6639             if (sw) {
6640                 U8 * utf8_p;
6641                 if (utf8_target) {
6642                     utf8_p = (U8 *) p;
6643                 } else {
6644
6645                     /* Not utf8.  Convert as much of the string as available up
6646                      * to the limit of how far the (single) character in the
6647                      * pattern can possibly match (no need to go further).  If
6648                      * the node is a straight ANYOF or not folding, it can't
6649                      * match more than one.  Otherwise, It can match up to how
6650                      * far a single char can fold to.  Since not utf8, each
6651                      * character is a single byte, so the max it can be in
6652                      * bytes is the same as the max it can be in characters */
6653                     STRLEN len = (OP(n) == ANYOF
6654                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6655                                   ? 1
6656                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6657                                     ? maxlen
6658                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6659                     utf8_p = bytes_to_utf8(p, &len);
6660                 }
6661
6662                 if (swash_fetch(sw, utf8_p, TRUE))
6663                     match = TRUE;
6664                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6665
6666                     /* Here, we need to test if the fold of the target string
6667                      * matches.  In the case of a multi-char fold that is
6668                      * caught by regcomp.c, it has stored all such folds into
6669                      * 'av'; we linearly check to see if any match the target
6670                      * string (folded).   We know that the originals were each
6671                      * one character, but we don't currently know how many
6672                      * characters/bytes each folded to, except we do know that
6673                      * there are small limits imposed by Unicode.  XXX A
6674                      * performance enhancement would be to have regcomp.c store
6675                      * the max number of chars/bytes that are in an av entry,
6676                      * as, say the 0th element.  Even better would be to have a
6677                      * hash of the few characters that can start a multi-char
6678                      * fold to the max number of chars of those folds.
6679                      *
6680                      * Further down, if there isn't a
6681                      * match in the av, we will check if there is another
6682                      * fold-type match.  For that, we also need the fold, but
6683                      * only the first character.  No sense in folding it twice,
6684                      * so we do it here, even if there isn't any multi-char
6685                      * fold, so we always fold at least the first character.
6686                      * If the node is a straight ANYOF node, or there is only
6687                      * one character available in the string, or if there isn't
6688                      * any av, that's all we have to fold.  In the case of a
6689                      * multi-char fold, we do have guarantees in Unicode that
6690                      * it can only expand up to so many characters and so many
6691                      * bytes.  We keep track so don't exceed either.
6692                      *
6693                      * If there is a match, we will need to advance (if lenp is
6694                      * specified) the match pointer in the target string.  But
6695                      * what we are comparing here isn't that string directly,
6696                      * but its fold, whose length may differ from the original.
6697                      * As we go along in constructing the fold, therefore, we
6698                      * create a map so that we know how many bytes in the
6699                      * source to advance given that we have matched a certain
6700                      * number of bytes in the fold.  This map is stored in
6701                      * 'map_fold_len_back'.  The first character in the fold
6702                      * has array element 1 contain the number of bytes in the
6703                      * source that folded to it; the 2nd is the cumulative
6704                      * number to match it; ... */
6705                     U8 map_fold_len_back[UTF8_MAX_FOLD_CHAR_EXPAND] = { 0 };
6706                     U8 folded[UTF8_MAXBYTES_CASE+1];
6707                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6708                     STRLEN foldlen_for_av; /* num bytes in fold of all chars */
6709
6710                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6711
6712                         /* Here, only need to fold the first char of the target
6713                          * string */
6714                         to_utf8_fold(utf8_p, folded, &foldlen);
6715                         foldlen_for_av = foldlen;
6716                         map_fold_len_back[1] = UTF8SKIP(utf8_p);
6717                     }
6718                     else {
6719
6720                         /* Here, need to fold more than the first char.  Do so
6721                          * up to the limits */
6722                         UV which_char = 0;
6723                         U8* source_ptr = utf8_p;    /* The source for the fold
6724                                                        is the regex target
6725                                                        string */
6726                         U8* folded_ptr = folded;
6727                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6728                                                        available byte in the
6729                                                        target string */
6730                         while (which_char < UTF8_MAX_FOLD_CHAR_EXPAND
6731                                && source_ptr < e)
6732                         {
6733
6734                             /* Fold the next character */
6735                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6736                             STRLEN this_char_foldlen;
6737                             to_utf8_fold(source_ptr,
6738                                          this_char_folded,
6739                                          &this_char_foldlen);
6740
6741                             /* Bail if it would exceed the byte limit for
6742                              * folding a single char. */
6743                             if (this_char_foldlen + folded_ptr - folded >
6744                                                             UTF8_MAXBYTES_CASE)
6745                             {
6746                                 break;
6747                             }
6748
6749                             /* Save the first character's folded length, in
6750                              * case we have to use it later */
6751                             if (! foldlen) {
6752                                 foldlen = this_char_foldlen;
6753                             }
6754
6755                             /* Here, add the fold of this character */
6756                             Copy(this_char_folded,
6757                                  folded_ptr,
6758                                  this_char_foldlen,
6759                                  U8);
6760                             which_char++;
6761                             map_fold_len_back[which_char] =
6762                                 map_fold_len_back[which_char - 1]
6763                                 + UTF8SKIP(source_ptr);
6764                             folded_ptr += this_char_foldlen;
6765                             source_ptr += UTF8SKIP(source_ptr);
6766                         }
6767                         *folded_ptr = '\0';
6768                         foldlen_for_av = folded_ptr - folded;
6769                     }
6770
6771
6772                     /* Do the linear search to see if the fold is in the list
6773                      * of multi-char folds.  (Useless to look if won't be able
6774                      * to store that it is a multi-char fold in *lenp) */
6775                     if (lenp && av) {
6776                         I32 i;
6777                         for (i = 0; i <= av_len(av); i++) {
6778                             SV* const sv = *av_fetch(av, i, FALSE);
6779                             STRLEN len;
6780                             const char * const s = SvPV_const(sv, len);
6781                             if (len <= foldlen_for_av && memEQ(s,
6782                                                                (char*)folded,
6783                                                                len))
6784                             {
6785
6786                                 /* Advance the target string ptr to account for
6787                                  * this fold, but have to translate from the
6788                                  * folded length to the corresponding source
6789                                  * length.  The array is indexed by how many
6790                                  * characters in the match */
6791                                 *lenp = map_fold_len_back[
6792                                         utf8_length(folded, folded + len)];
6793                                 match = TRUE;
6794                                 break;
6795                             }
6796                         }
6797                     }
6798 #if 0
6799                     if (!match) { /* See if the folded version matches */
6800                         SV** listp;
6801
6802                         /* Consider "k" =~ /[K]/i.  The line above would have
6803                          * just folded the 'k' to itself, and that isn't going
6804                          * to match 'K'.  So we look through the closure of
6805                          * everything that folds to 'k'.  That will find the
6806                          * 'K'.  Initialize the list, if necessary */
6807                         if (! PL_utf8_foldclosures) {
6808
6809                             /* If the folds haven't been read in, call a fold
6810                              * function to force that */
6811                             if (! PL_utf8_tofold) {
6812                                 U8 dummy[UTF8_MAXBYTES+1];
6813                                 STRLEN dummy_len;
6814                                 to_utf8_fold((U8*) "A", dummy, &dummy_len);
6815                             }
6816                             PL_utf8_foldclosures =
6817                                   _swash_inversion_hash(PL_utf8_tofold);
6818                         }
6819
6820                         /* The data structure is a hash with the keys every
6821                          * character that is folded to, like 'k', and the
6822                          * values each an array of everything that folds to its
6823                          * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
6824                         if ((listp = hv_fetch(PL_utf8_foldclosures,
6825                                       (char *) folded, foldlen, FALSE)))
6826                         {
6827                             AV* list = (AV*) *listp;
6828                             IV i;
6829                             for (i = 0; i <= av_len(list); i++) {
6830                                 SV** try_p = av_fetch(list, i, FALSE);
6831                                 char* try_c;
6832                                 if (try_p == NULL) {
6833                                     Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
6834                                 }
6835                                 /* Don't have to worry about embedded nulls
6836                                  * since NULL isn't folded or foldable */
6837                                 try_c = SvPVX(*try_p);
6838
6839                                 /* The fold in a few cases  of an above Latin1
6840                                  * char is in the Latin1 range, and hence may
6841                                  * be in the bitmap */
6842                                 if (UTF8_IS_INVARIANT(*try_c)
6843                                     && ANYOF_BITMAP_TEST(n,
6844                                                     UNI_TO_NATIVE(*try_c)))
6845                                 {
6846                                     match = TRUE;
6847                                     break;
6848                                 }
6849                                 else if
6850                                     (UTF8_IS_DOWNGRADEABLE_START(*try_c)
6851                                      && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(
6852                                                 TWO_BYTE_UTF8_TO_UNI(try_c[0],
6853                                                                     try_c[1]))))
6854                                 {
6855                                    /* Since the fold comes from internally
6856                                     * generated data, we can safely assume it
6857                                     * is valid utf8 in the test above */
6858                                     match = TRUE;
6859                                     break;
6860                                 } else if (swash_fetch(sw, (U8*) try_c, TRUE)) {
6861                                     match = TRUE;
6862                                     break;
6863                                 }
6864                             }
6865                         }
6866                     }
6867 #endif
6868                 }
6869
6870                 /* If we allocated a string above, free it */
6871                 if (! utf8_target) Safefree(utf8_p);
6872             }
6873         }
6874     }
6875
6876     return (flags & ANYOF_INVERT) ? !match : match;
6877 }
6878
6879 STATIC U8 *
6880 S_reghop3(U8 *s, I32 off, const U8* lim)
6881 {
6882     dVAR;
6883
6884     PERL_ARGS_ASSERT_REGHOP3;
6885
6886     if (off >= 0) {
6887         while (off-- && s < lim) {
6888             /* XXX could check well-formedness here */
6889             s += UTF8SKIP(s);
6890         }
6891     }
6892     else {
6893         while (off++ && s > lim) {
6894             s--;
6895             if (UTF8_IS_CONTINUED(*s)) {
6896                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6897                     s--;
6898             }
6899             /* XXX could check well-formedness here */
6900         }
6901     }
6902     return s;
6903 }
6904
6905 #ifdef XXX_dmq
6906 /* there are a bunch of places where we use two reghop3's that should
6907    be replaced with this routine. but since thats not done yet
6908    we ifdef it out - dmq
6909 */
6910 STATIC U8 *
6911 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6912 {
6913     dVAR;
6914
6915     PERL_ARGS_ASSERT_REGHOP4;
6916
6917     if (off >= 0) {
6918         while (off-- && s < rlim) {
6919             /* XXX could check well-formedness here */
6920             s += UTF8SKIP(s);
6921         }
6922     }
6923     else {
6924         while (off++ && s > llim) {
6925             s--;
6926             if (UTF8_IS_CONTINUED(*s)) {
6927                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6928                     s--;
6929             }
6930             /* XXX could check well-formedness here */
6931         }
6932     }
6933     return s;
6934 }
6935 #endif
6936
6937 STATIC U8 *
6938 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6939 {
6940     dVAR;
6941
6942     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6943
6944     if (off >= 0) {
6945         while (off-- && s < lim) {
6946             /* XXX could check well-formedness here */
6947             s += UTF8SKIP(s);
6948         }
6949         if (off >= 0)
6950             return NULL;
6951     }
6952     else {
6953         while (off++ && s > lim) {
6954             s--;
6955             if (UTF8_IS_CONTINUED(*s)) {
6956                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6957                     s--;
6958             }
6959             /* XXX could check well-formedness here */
6960         }
6961         if (off <= 0)
6962             return NULL;
6963     }
6964     return s;
6965 }
6966
6967 static void
6968 restore_pos(pTHX_ void *arg)
6969 {
6970     dVAR;
6971     regexp * const rex = (regexp *)arg;
6972     if (PL_reg_eval_set) {
6973         if (PL_reg_oldsaved) {
6974             rex->subbeg = PL_reg_oldsaved;
6975             rex->sublen = PL_reg_oldsavedlen;
6976 #ifdef PERL_OLD_COPY_ON_WRITE
6977             rex->saved_copy = PL_nrs;
6978 #endif
6979             RXp_MATCH_COPIED_on(rex);
6980         }
6981         PL_reg_magic->mg_len = PL_reg_oldpos;
6982         PL_reg_eval_set = 0;
6983         PL_curpm = PL_reg_oldcurpm;
6984     }
6985 }
6986
6987 STATIC void
6988 S_to_utf8_substr(pTHX_ register regexp *prog)
6989 {
6990     int i = 1;
6991
6992     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6993
6994     do {
6995         if (prog->substrs->data[i].substr
6996             && !prog->substrs->data[i].utf8_substr) {
6997             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6998             prog->substrs->data[i].utf8_substr = sv;
6999             sv_utf8_upgrade(sv);
7000             if (SvVALID(prog->substrs->data[i].substr)) {
7001                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
7002                 if (flags & FBMcf_TAIL) {
7003                     /* Trim the trailing \n that fbm_compile added last
7004                        time.  */
7005                     SvCUR_set(sv, SvCUR(sv) - 1);
7006                     /* Whilst this makes the SV technically "invalid" (as its
7007                        buffer is no longer followed by "\0") when fbm_compile()
7008                        adds the "\n" back, a "\0" is restored.  */
7009                 }
7010                 fbm_compile(sv, flags);
7011             }
7012             if (prog->substrs->data[i].substr == prog->check_substr)
7013                 prog->check_utf8 = sv;
7014         }
7015     } while (i--);
7016 }
7017
7018 STATIC void
7019 S_to_byte_substr(pTHX_ register regexp *prog)
7020 {
7021     dVAR;
7022     int i = 1;
7023
7024     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7025
7026     do {
7027         if (prog->substrs->data[i].utf8_substr
7028             && !prog->substrs->data[i].substr) {
7029             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7030             if (sv_utf8_downgrade(sv, TRUE)) {
7031                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7032                     const U8 flags
7033                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
7034                     if (flags & FBMcf_TAIL) {
7035                         /* Trim the trailing \n that fbm_compile added last
7036                            time.  */
7037                         SvCUR_set(sv, SvCUR(sv) - 1);
7038                     }
7039                     fbm_compile(sv, flags);
7040                 }
7041             } else {
7042                 SvREFCNT_dec(sv);
7043                 sv = &PL_sv_undef;
7044             }
7045             prog->substrs->data[i].substr = sv;
7046             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7047                 prog->check_substr = sv;
7048         }
7049     } while (i--);
7050 }
7051
7052 /*
7053  * Local variables:
7054  * c-indentation-style: bsd
7055  * c-basic-offset: 4
7056  * indent-tabs-mode: t
7057  * End:
7058  *
7059  * ex: set ts=8 sts=4 sw=4 noet:
7060  */