src/5013010/orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1       /* tainted information used? e.g. locale */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  98  * call if there are no complications: i.e., if everything matchable is
  99  * straight forward in the bitmap */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 127
 128 /* Doesn't do an assert to verify that is correct */
 129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 130     if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 131
 132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 135
 136 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 137         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 138         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 139         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 140             * assert should likely and hopefully fail on an EBCDIC machine */ \
 141         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 142                                                                             \
 143         /* No asserts are done for these, in case called on an early        \
 144             * Unicode version in which they map to nothing */               \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 152
 153 /*
 154    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 155    so that it is possible to override the option here without having to
 156    rebuild the entire core. as we are required to do if we change regcomp.h
 157    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 158 */
 159 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 160 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 161 #endif
 162
 163 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 164 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 165 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 166 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 167 #define RE_utf8_perl_word   PL_utf8_alnum
 168 #define RE_utf8_perl_space  PL_utf8_space
 169 #define RE_utf8_posix_digit PL_utf8_digit
 170 #define perl_word  alnum
 171 #define perl_space space
 172 #define posix_digit digit
 173 #else
 174 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 175 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 176 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 177 #define RE_utf8_perl_word   PL_utf8_perl_word
 178 #define RE_utf8_perl_space  PL_utf8_perl_space
 179 #define RE_utf8_posix_digit PL_utf8_posix_digit
 180 #endif
 181
 182 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 183
 184 /* The actual code for CCC_TRY, which uses several variables from the routine
 185  * it's callable from.  It is designed to be the bulk of a case statement.
 186  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 187  *      nextchr matches the class.
 188  * UTF8_TEST is the whole test string to use for utf8 targets
 189  * LOAD is what to use to test, and if not present to load in the swash for the
 190  *      class
 191  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 192  *      UTF8_TEST test.
 193  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 194  * utf8 and a variant, load the swash if necessary and test using the utf8
 195  * test.  Advance to the next character if test is ok, otherwise fail; If not
 196  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 197  * fails, or advance to the next character */
 198
 199 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 200     if (locinput >= PL_regeol) {                                              \
 201         sayNO;                                                                \
 202     }                                                                         \
 203     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 204         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 205         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 206             sayNO;                                                            \
 207         }                                                                     \
 208         locinput += PL_utf8skip[nextchr];                                     \
 209         nextchr = UCHARAT(locinput);                                          \
 210         break;                                                                \
 211     }                                                                         \
 212     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 213         sayNO;                                                                \
 214     }                                                                         \
 215     nextchr = UCHARAT(++locinput);                                            \
 216     break;
 217
 218 /* Handle the non-locale cases for a character class and its complement.  It
 219  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 220  * This is because that code fails when the test succeeds, so we want to have
 221  * the test fail so that the code succeeds.  The swash is stored in a
 222  * predictable PL_ place */
 223 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 224                            CLASS, STR)                                        \
 225     case NAME:                                                                \
 226         _CCC_TRY_CODE( !, FUNC,                                               \
 227                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 228                                             (U8*)locinput, TRUE)),            \
 229                           CLASS, STR)                                         \
 230     case NNAME:                                                               \
 231         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 232                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 233                                             (U8*)locinput, TRUE)),            \
 234                           CLASS, STR)                                         \
 235
 236 /* Generate the case statements for both locale and non-locale character
 237  * classes in regmatch for classes that don't have special unicode semantics.
 238  * Locales don't use an immediate swash, but an intermediary special locale
 239  * function that is called on the pointer to the current place in the input
 240  * string.  That function will resolve to needing the same swash.  One might
 241  * think that because we don't know what the locale will match, we shouldn't
 242  * check with the swash loading function that it loaded properly; ie, that we
 243  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 244  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 245  * irrelevant here */
 246 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 247                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 248                 NAMEA, NNAMEA, FUNCA,                                         \
 249                 CLASS, STR)                                                   \
 250     case NAMEL:                                                               \
 251         PL_reg_flags |= RF_tainted;                                           \
 252         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 253     case NNAMEL:                                                              \
 254         PL_reg_flags |= RF_tainted;                                           \
 255         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 256                        CLASS, STR)                                            \
 257     case NAMEA:                                                               \
 258         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 259             sayNO;                                                            \
 260         }                                                                     \
 261         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 262         nextchr = UCHARAT(++locinput);                                        \
 263         break;                                                                \
 264     case NNAMEA:                                                              \
 265         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 266             sayNO;                                                            \
 267         }                                                                     \
 268         if (utf8_target) {                                                    \
 269             locinput += PL_utf8skip[nextchr];                                 \
 270             nextchr = UCHARAT(locinput);                                      \
 271         }                                                                     \
 272         else {                                                                \
 273             nextchr = UCHARAT(++locinput);                                    \
 274         }                                                                     \
 275         break;                                                                \
 276     /* Generate the non-locale cases */                                       \
 277     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 278
 279 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 280  * statements to handle separate Unicode semantics nodes */
 281 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 282                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 283                   NAMEU, NNAMEU, FUNCU,                                        \
 284                   NAMEA, NNAMEA, FUNCA,                                        \
 285                   CLASS, STR)                                                  \
 286     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 287             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 288             NAMEA, NNAMEA, FUNCA,                                              \
 289             CLASS, STR)                                                        \
 290     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 291
 292 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 293
 294 /* for use after a quantifier and before an EXACT-like node -- japhy */
 295 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 296  *
 297  * NOTE that *nothing* that affects backtracking should be in here, specifically
 298  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 299  * node that is in between two EXACT like nodes when ascertaining what the required
 300  * "follow" character is. This should probably be moved to regex compile time
 301  * although it may be done at run time beause of the REF possibility - more
 302  * investigation required. -- demerphq
 303 */
 304 #define JUMPABLE(rn) (      \
 305     OP(rn) == OPEN ||       \
 306     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 307     OP(rn) == EVAL ||   \
 308     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 309     OP(rn) == PLUS || OP(rn) == MINMOD || \
 310     OP(rn) == KEEPS || \
 311     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 312 )
 313 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 314
 315 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 316
 317 #if 0
 318 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 319    we don't need this definition. */
 320 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 321 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 322 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 323
 324 #else
 325 /* ... so we use this as its faster. */
 326 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 327 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 328 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 329 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 330
 331 #endif
 332
 333 /*
 334   Search for mandatory following text node; for lookahead, the text must
 335   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 336 */
 337 #define FIND_NEXT_IMPT(rn) STMT_START { \
 338     while (JUMPABLE(rn)) { \
 339         const OPCODE type = OP(rn); \
 340         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 341             rn = NEXTOPER(NEXTOPER(rn)); \
 342         else if (type == PLUS) \
 343             rn = NEXTOPER(rn); \
 344         else if (type == IFMATCH) \
 345             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 346         else rn += NEXT_OFF(rn); \
 347     } \
 348 } STMT_END
 349
 350
 351 static void restore_pos(pTHX_ void *arg);
 352
 353 #define REGCP_PAREN_ELEMS 4
 354 #define REGCP_OTHER_ELEMS 5
 355 #define REGCP_FRAME_ELEMS 1
 356 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 357  * are needed for the regexp context stack bookkeeping. */
 358
 359 STATIC CHECKPOINT
 360 S_regcppush(pTHX_ I32 parenfloor)
 361 {
 362     dVAR;
 363     const int retval = PL_savestack_ix;
 364     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 365     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 366     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 367     int p;
 368     GET_RE_DEBUG_FLAGS_DECL;
 369
 370     if (paren_elems_to_push < 0)
 371         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 372
 373     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 374         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 375                    " out of range (%lu-%ld)",
 376                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 377
 378     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 379
 380     for (p = PL_regsize; p > parenfloor; p--) {
 381 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 382         SSPUSHINT(PL_regoffs[p].end);
 383         SSPUSHINT(PL_regoffs[p].start);
 384         SSPUSHPTR(PL_reg_start_tmp[p]);
 385         SSPUSHINT(p);
 386         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 387           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 388                       (UV)p, (IV)PL_regoffs[p].start,
 389                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 390                       (IV)PL_regoffs[p].end
 391         ));
 392     }
 393 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 394     SSPUSHPTR(PL_regoffs);
 395     SSPUSHINT(PL_regsize);
 396     SSPUSHINT(*PL_reglastparen);
 397     SSPUSHINT(*PL_reglastcloseparen);
 398     SSPUSHPTR(PL_reginput);
 399     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 400
 401     return retval;
 402 }
 403
 404 /* These are needed since we do not localize EVAL nodes: */
 405 #define REGCP_SET(cp)                                           \
 406     DEBUG_STATE_r(                                              \
 407             PerlIO_printf(Perl_debug_log,                       \
 408                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 409                 (IV)PL_savestack_ix));                          \
 410     cp = PL_savestack_ix
 411
 412 #define REGCP_UNWIND(cp)                                        \
 413     DEBUG_STATE_r(                                              \
 414         if (cp != PL_savestack_ix)                              \
 415             PerlIO_printf(Perl_debug_log,                       \
 416                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 417                 (IV)(cp), (IV)PL_savestack_ix));                \
 418     regcpblow(cp)
 419
 420 STATIC char *
 421 S_regcppop(pTHX_ const regexp *rex)
 422 {
 423     dVAR;
 424     UV i;
 425     char *input;
 426     GET_RE_DEBUG_FLAGS_DECL;
 427
 428     PERL_ARGS_ASSERT_REGCPPOP;
 429
 430     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 431     i = SSPOPUV;
 432     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 433     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 434     input = (char *) SSPOPPTR;
 435     *PL_reglastcloseparen = SSPOPINT;
 436     *PL_reglastparen = SSPOPINT;
 437     PL_regsize = SSPOPINT;
 438     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 439
 440     i -= REGCP_OTHER_ELEMS;
 441     /* Now restore the parentheses context. */
 442     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 443         I32 tmps;
 444         U32 paren = (U32)SSPOPINT;
 445         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 446         PL_regoffs[paren].start = SSPOPINT;
 447         tmps = SSPOPINT;
 448         if (paren <= *PL_reglastparen)
 449             PL_regoffs[paren].end = tmps;
 450         DEBUG_BUFFERS_r(
 451             PerlIO_printf(Perl_debug_log,
 452                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 453                           (UV)paren, (IV)PL_regoffs[paren].start,
 454                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 455                           (IV)PL_regoffs[paren].end,
 456                           (paren > *PL_reglastparen ? "(no)" : ""));
 457         );
 458     }
 459     DEBUG_BUFFERS_r(
 460         if (*PL_reglastparen + 1 <= rex->nparens) {
 461             PerlIO_printf(Perl_debug_log,
 462                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 463                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 464         }
 465     );
 466 #if 1
 467     /* It would seem that the similar code in regtry()
 468      * already takes care of this, and in fact it is in
 469      * a better location to since this code can #if 0-ed out
 470      * but the code in regtry() is needed or otherwise tests
 471      * requiring null fields (pat.t#187 and split.t#{13,14}
 472      * (as of patchlevel 7877)  will fail.  Then again,
 473      * this code seems to be necessary or otherwise
 474      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 475      * --jhi updated by dapm */
 476     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 477         if (i > PL_regsize)
 478             PL_regoffs[i].start = -1;
 479         PL_regoffs[i].end = -1;
 480     }
 481 #endif
 482     return input;
 483 }
 484
 485 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 486
 487 /*
 488  * pregexec and friends
 489  */
 490
 491 #ifndef PERL_IN_XSUB_RE
 492 /*
 493  - pregexec - match a regexp against a string
 494  */
 495 I32
 496 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 497          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 498 /* strend: pointer to null at end of string */
 499 /* strbeg: real beginning of string */
 500 /* minend: end of match must be >=minend after stringarg. */
 501 /* nosave: For optimizations. */
 502 {
 503     PERL_ARGS_ASSERT_PREGEXEC;
 504
 505     return
 506         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 507                       nosave ? 0 : REXEC_COPY_STR);
 508 }
 509 #endif
 510
 511 /*
 512  * Need to implement the following flags for reg_anch:
 513  *
 514  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 515  * USE_INTUIT_ML
 516  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 517  * INTUIT_AUTORITATIVE_ML
 518  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 519  * INTUIT_ONCE_ML
 520  *
 521  * Another flag for this function: SECOND_TIME (so that float substrs
 522  * with giant delta may be not rechecked).
 523  */
 524
 525 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 526
 527 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 528    Otherwise, only SvCUR(sv) is used to get strbeg. */
 529
 530 /* XXXX We assume that strpos is strbeg unless sv. */
 531
 532 /* XXXX Some places assume that there is a fixed substring.
 533         An update may be needed if optimizer marks as "INTUITable"
 534         RExen without fixed substrings.  Similarly, it is assumed that
 535         lengths of all the strings are no more than minlen, thus they
 536         cannot come from lookahead.
 537         (Or minlen should take into account lookahead.)
 538   NOTE: Some of this comment is not correct. minlen does now take account
 539   of lookahead/behind. Further research is required. -- demerphq
 540
 541 */
 542
 543 /* A failure to find a constant substring means that there is no need to make
 544    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 545    finding a substring too deep into the string means that less calls to
 546    regtry() should be needed.
 547
 548    REx compiler's optimizer found 4 possible hints:
 549         a) Anchored substring;
 550         b) Fixed substring;
 551         c) Whether we are anchored (beginning-of-line or \G);
 552         d) First node (of those at offset 0) which may distinguish positions;
 553    We use a)b)d) and multiline-part of c), and try to find a position in the
 554    string which does not contradict any of them.
 555  */
 556
 557 /* Most of decisions we do here should have been done at compile time.
 558    The nodes of the REx which we used for the search should have been
 559    deleted from the finite automaton. */
 560
 561 char *
 562 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 563                      char *strend, const U32 flags, re_scream_pos_data *data)
 564 {
 565     dVAR;
 566     struct regexp *const prog = (struct regexp *)SvANY(rx);
 567     register I32 start_shift = 0;
 568     /* Should be nonnegative! */
 569     register I32 end_shift   = 0;
 570     register char *s;
 571     register SV *check;
 572     char *strbeg;
 573     char *t;
 574     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 575     I32 ml_anch;
 576     register char *other_last = NULL;   /* other substr checked before this */
 577     char *check_at = NULL;              /* check substr found at this pos */
 578     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 579     RXi_GET_DECL(prog,progi);
 580 #ifdef DEBUGGING
 581     const char * const i_strpos = strpos;
 582 #endif
 583     GET_RE_DEBUG_FLAGS_DECL;
 584
 585     PERL_ARGS_ASSERT_RE_INTUIT_START;
 586
 587     RX_MATCH_UTF8_set(rx,utf8_target);
 588
 589     if (RX_UTF8(rx)) {
 590         PL_reg_flags |= RF_utf8;
 591     }
 592     DEBUG_EXECUTE_r(
 593         debug_start_match(rx, utf8_target, strpos, strend,
 594             sv ? "Guessing start of match in sv for"
 595                : "Guessing start of match in string for");
 596               );
 597
 598     /* CHR_DIST() would be more correct here but it makes things slow. */
 599     if (prog->minlen > strend - strpos) {
 600         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 601                               "String too short... [re_intuit_start]\n"));
 602         goto fail;
 603     }
 604
 605     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 606     PL_regeol = strend;
 607     if (utf8_target) {
 608         if (!prog->check_utf8 && prog->check_substr)
 609             to_utf8_substr(prog);
 610         check = prog->check_utf8;
 611     } else {
 612         if (!prog->check_substr && prog->check_utf8)
 613             to_byte_substr(prog);
 614         check = prog->check_substr;
 615     }
 616     if (check == &PL_sv_undef) {
 617         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 618                 "Non-utf8 string cannot match utf8 check string\n"));
 619         goto fail;
 620     }
 621     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 622         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 623                      || ( (prog->extflags & RXf_ANCH_BOL)
 624                           && !multiline ) );    /* Check after \n? */
 625
 626         if (!ml_anch) {
 627           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 628                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 629                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 630                && sv && !SvROK(sv)
 631                && (strpos != strbeg)) {
 632               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 633               goto fail;
 634           }
 635           if (prog->check_offset_min == prog->check_offset_max &&
 636               !(prog->extflags & RXf_CANY_SEEN)) {
 637             /* Substring at constant offset from beg-of-str... */
 638             I32 slen;
 639
 640             s = HOP3c(strpos, prog->check_offset_min, strend);
 641
 642             if (SvTAIL(check)) {
 643                 slen = SvCUR(check);    /* >= 1 */
 644
 645                 if ( strend - s > slen || strend - s < slen - 1
 646                      || (strend - s == slen && strend[-1] != '\n')) {
 647                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 648                     goto fail_finish;
 649                 }
 650                 /* Now should match s[0..slen-2] */
 651                 slen--;
 652                 if (slen && (*SvPVX_const(check) != *s
 653                              || (slen > 1
 654                                  && memNE(SvPVX_const(check), s, slen)))) {
 655                   report_neq:
 656                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 657                     goto fail_finish;
 658                 }
 659             }
 660             else if (*SvPVX_const(check) != *s
 661                      || ((slen = SvCUR(check)) > 1
 662                          && memNE(SvPVX_const(check), s, slen)))
 663                 goto report_neq;
 664             check_at = s;
 665             goto success_at_start;
 666           }
 667         }
 668         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 669         s = strpos;
 670         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 671         end_shift = prog->check_end_shift;
 672
 673         if (!ml_anch) {
 674             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 675                                          - (SvTAIL(check) != 0);
 676             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 677
 678             if (end_shift < eshift)
 679                 end_shift = eshift;
 680         }
 681     }
 682     else {                              /* Can match at random position */
 683         ml_anch = 0;
 684         s = strpos;
 685         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 686         end_shift = prog->check_end_shift;
 687
 688         /* end shift should be non negative here */
 689     }
 690
 691 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 692     if (end_shift < 0)
 693         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 694                    (IV)end_shift, RX_PRECOMP(prog));
 695 #endif
 696
 697   restart:
 698     /* Find a possible match in the region s..strend by looking for
 699        the "check" substring in the region corrected by start/end_shift. */
 700
 701     {
 702         I32 srch_start_shift = start_shift;
 703         I32 srch_end_shift = end_shift;
 704         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 705             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 706             srch_start_shift = strbeg - s;
 707         }
 708     DEBUG_OPTIMISE_MORE_r({
 709         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 710             (IV)prog->check_offset_min,
 711             (IV)srch_start_shift,
 712             (IV)srch_end_shift,
 713             (IV)prog->check_end_shift);
 714     });
 715
 716     if (flags & REXEC_SCREAM) {
 717         I32 p = -1;                     /* Internal iterator of scream. */
 718         I32 * const pp = data ? data->scream_pos : &p;
 719
 720         if (PL_screamfirst[BmRARE(check)] >= 0
 721             || ( BmRARE(check) == '\n'
 722                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 723                  && SvTAIL(check) ))
 724             s = screaminstr(sv, check,
 725                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 726         else
 727             goto fail_finish;
 728         /* we may be pointing at the wrong string */
 729         if (s && RXp_MATCH_COPIED(prog))
 730             s = strbeg + (s - SvPVX_const(sv));
 731         if (data)
 732             *data->scream_olds = s;
 733     }
 734     else {
 735         U8* start_point;
 736         U8* end_point;
 737         if (prog->extflags & RXf_CANY_SEEN) {
 738             start_point= (U8*)(s + srch_start_shift);
 739             end_point= (U8*)(strend - srch_end_shift);
 740         } else {
 741             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 742             end_point= HOP3(strend, -srch_end_shift, strbeg);
 743         }
 744         DEBUG_OPTIMISE_MORE_r({
 745             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 746                 (int)(end_point - start_point),
 747                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 748                 start_point);
 749         });
 750
 751         s = fbm_instr( start_point, end_point,
 752                       check, multiline ? FBMrf_MULTILINE : 0);
 753     }
 754     }
 755     /* Update the count-of-usability, remove useless subpatterns,
 756         unshift s.  */
 757
 758     DEBUG_EXECUTE_r({
 759         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 760             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 761         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 762                           (s ? "Found" : "Did not find"),
 763             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 764                 ? "anchored" : "floating"),
 765             quoted,
 766             RE_SV_TAIL(check),
 767             (s ? " at offset " : "...\n") );
 768     });
 769
 770     if (!s)
 771         goto fail_finish;
 772     /* Finish the diagnostic message */
 773     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 774
 775     /* XXX dmq: first branch is for positive lookbehind...
 776        Our check string is offset from the beginning of the pattern.
 777        So we need to do any stclass tests offset forward from that
 778        point. I think. :-(
 779      */
 780
 781
 782
 783     check_at=s;
 784
 785
 786     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 787        Start with the other substr.
 788        XXXX no SCREAM optimization yet - and a very coarse implementation
 789        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 790                 *always* match.  Probably should be marked during compile...
 791        Probably it is right to do no SCREAM here...
 792      */
 793
 794     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 795                 : (prog->float_substr && prog->anchored_substr))
 796     {
 797         /* Take into account the "other" substring. */
 798         /* XXXX May be hopelessly wrong for UTF... */
 799         if (!other_last)
 800             other_last = strpos;
 801         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 802           do_other_anchored:
 803             {
 804                 char * const last = HOP3c(s, -start_shift, strbeg);
 805                 char *last1, *last2;
 806                 char * const saved_s = s;
 807                 SV* must;
 808
 809                 t = s - prog->check_offset_max;
 810                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 811                     && (!utf8_target
 812                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 813                             && t > strpos)))
 814                     NOOP;
 815                 else
 816                     t = strpos;
 817                 t = HOP3c(t, prog->anchored_offset, strend);
 818                 if (t < other_last)     /* These positions already checked */
 819                     t = other_last;
 820                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 821                 if (last < last1)
 822                     last1 = last;
 823                 /* XXXX It is not documented what units *_offsets are in.
 824                    We assume bytes, but this is clearly wrong.
 825                    Meaning this code needs to be carefully reviewed for errors.
 826                    dmq.
 827                   */
 828
 829                 /* On end-of-str: see comment below. */
 830                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 831                 if (must == &PL_sv_undef) {
 832                     s = (char*)NULL;
 833                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 834                 }
 835                 else
 836                     s = fbm_instr(
 837                         (unsigned char*)t,
 838                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 839                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 840                         must,
 841                         multiline ? FBMrf_MULTILINE : 0
 842                     );
 843                 DEBUG_EXECUTE_r({
 844                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 845                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 846                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 847                         (s ? "Found" : "Contradicts"),
 848                         quoted, RE_SV_TAIL(must));
 849                 });
 850
 851
 852                 if (!s) {
 853                     if (last1 >= last2) {
 854                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 855                                                 ", giving up...\n"));
 856                         goto fail_finish;
 857                     }
 858                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 859                         ", trying floating at offset %ld...\n",
 860                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 861                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 862                     s = HOP3c(last, 1, strend);
 863                     goto restart;
 864                 }
 865                 else {
 866                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 867                           (long)(s - i_strpos)));
 868                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 869                     other_last = HOP3c(s, 1, strend);
 870                     s = saved_s;
 871                     if (t == strpos)
 872                         goto try_at_start;
 873                     goto try_at_offset;
 874                 }
 875             }
 876         }
 877         else {          /* Take into account the floating substring. */
 878             char *last, *last1;
 879             char * const saved_s = s;
 880             SV* must;
 881
 882             t = HOP3c(s, -start_shift, strbeg);
 883             last1 = last =
 884                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 885             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 886                 last = HOP3c(t, prog->float_max_offset, strend);
 887             s = HOP3c(t, prog->float_min_offset, strend);
 888             if (s < other_last)
 889                 s = other_last;
 890  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 891             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 892             /* fbm_instr() takes into account exact value of end-of-str
 893                if the check is SvTAIL(ed).  Since false positives are OK,
 894                and end-of-str is not later than strend we are OK. */
 895             if (must == &PL_sv_undef) {
 896                 s = (char*)NULL;
 897                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 898             }
 899             else
 900                 s = fbm_instr((unsigned char*)s,
 901                               (unsigned char*)last + SvCUR(must)
 902                                   - (SvTAIL(must)!=0),
 903                               must, multiline ? FBMrf_MULTILINE : 0);
 904             DEBUG_EXECUTE_r({
 905                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 906                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 907                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 908                     (s ? "Found" : "Contradicts"),
 909                     quoted, RE_SV_TAIL(must));
 910             });
 911             if (!s) {
 912                 if (last1 == last) {
 913                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 914                                             ", giving up...\n"));
 915                     goto fail_finish;
 916                 }
 917                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 918                     ", trying anchored starting at offset %ld...\n",
 919                     (long)(saved_s + 1 - i_strpos)));
 920                 other_last = last;
 921                 s = HOP3c(t, 1, strend);
 922                 goto restart;
 923             }
 924             else {
 925                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 926                       (long)(s - i_strpos)));
 927                 other_last = s; /* Fix this later. --Hugo */
 928                 s = saved_s;
 929                 if (t == strpos)
 930                     goto try_at_start;
 931                 goto try_at_offset;
 932             }
 933         }
 934     }
 935
 936
 937     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 938
 939     DEBUG_OPTIMISE_MORE_r(
 940         PerlIO_printf(Perl_debug_log,
 941             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 942             (IV)prog->check_offset_min,
 943             (IV)prog->check_offset_max,
 944             (IV)(s-strpos),
 945             (IV)(t-strpos),
 946             (IV)(t-s),
 947             (IV)(strend-strpos)
 948         )
 949     );
 950
 951     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 952         && (!utf8_target
 953             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 954                  && t > strpos)))
 955     {
 956         /* Fixed substring is found far enough so that the match
 957            cannot start at strpos. */
 958       try_at_offset:
 959         if (ml_anch && t[-1] != '\n') {
 960             /* Eventually fbm_*() should handle this, but often
 961                anchored_offset is not 0, so this check will not be wasted. */
 962             /* XXXX In the code below we prefer to look for "^" even in
 963                presence of anchored substrings.  And we search even
 964                beyond the found float position.  These pessimizations
 965                are historical artefacts only.  */
 966           find_anchor:
 967             while (t < strend - prog->minlen) {
 968                 if (*t == '\n') {
 969                     if (t < check_at - prog->check_offset_min) {
 970                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 971                             /* Since we moved from the found position,
 972                                we definitely contradict the found anchored
 973                                substr.  Due to the above check we do not
 974                                contradict "check" substr.
 975                                Thus we can arrive here only if check substr
 976                                is float.  Redo checking for "other"=="fixed".
 977                              */
 978                             strpos = t + 1;
 979                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 980                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 981                             goto do_other_anchored;
 982                         }
 983                         /* We don't contradict the found floating substring. */
 984                         /* XXXX Why not check for STCLASS? */
 985                         s = t + 1;
 986                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 987                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 988                         goto set_useful;
 989                     }
 990                     /* Position contradicts check-string */
 991                     /* XXXX probably better to look for check-string
 992                        than for "\n", so one should lower the limit for t? */
 993                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 994                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 995                     other_last = strpos = s = t + 1;
 996                     goto restart;
 997                 }
 998                 t++;
 999             }
1000             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1001                         PL_colors[0], PL_colors[1]));
1002             goto fail_finish;
1003         }
1004         else {
1005             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1006                         PL_colors[0], PL_colors[1]));
1007         }
1008         s = t;
1009       set_useful:
1010         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1011     }
1012     else {
1013         /* The found string does not prohibit matching at strpos,
1014            - no optimization of calling REx engine can be performed,
1015            unless it was an MBOL and we are not after MBOL,
1016            or a future STCLASS check will fail this. */
1017       try_at_start:
1018         /* Even in this situation we may use MBOL flag if strpos is offset
1019            wrt the start of the string. */
1020         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1021             && (strpos != strbeg) && strpos[-1] != '\n'
1022             /* May be due to an implicit anchor of m{.*foo}  */
1023             && !(prog->intflags & PREGf_IMPLICIT))
1024         {
1025             t = strpos;
1026             goto find_anchor;
1027         }
1028         DEBUG_EXECUTE_r( if (ml_anch)
1029             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1030                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1031         );
1032       success_at_start:
1033         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1034             && (utf8_target ? (
1035                 prog->check_utf8                /* Could be deleted already */
1036                 && --BmUSEFUL(prog->check_utf8) < 0
1037                 && (prog->check_utf8 == prog->float_utf8)
1038             ) : (
1039                 prog->check_substr              /* Could be deleted already */
1040                 && --BmUSEFUL(prog->check_substr) < 0
1041                 && (prog->check_substr == prog->float_substr)
1042             )))
1043         {
1044             /* If flags & SOMETHING - do not do it many times on the same match */
1045             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1046             /* XXX Does the destruction order has to change with utf8_target? */
1047             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1048             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1049             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1050             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1051             check = NULL;                       /* abort */
1052             s = strpos;
1053             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1054                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1055             if (prog->intflags & PREGf_IMPLICIT)
1056                 prog->extflags &= ~RXf_ANCH_MBOL;
1057             /* XXXX This is a remnant of the old implementation.  It
1058                     looks wasteful, since now INTUIT can use many
1059                     other heuristics. */
1060             prog->extflags &= ~RXf_USE_INTUIT;
1061             /* XXXX What other flags might need to be cleared in this branch? */
1062         }
1063         else
1064             s = strpos;
1065     }
1066
1067     /* Last resort... */
1068     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1069     /* trie stclasses are too expensive to use here, we are better off to
1070        leave it to regmatch itself */
1071     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1072         /* minlen == 0 is possible if regstclass is \b or \B,
1073            and the fixed substr is ''$.
1074            Since minlen is already taken into account, s+1 is before strend;
1075            accidentally, minlen >= 1 guaranties no false positives at s + 1
1076            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1077            regstclass does not come from lookahead...  */
1078         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1079            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1080         const U8* const str = (U8*)STRING(progi->regstclass);
1081         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1082                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1083                     : 1);
1084         char * endpos;
1085         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1086             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1087         else if (prog->float_substr || prog->float_utf8)
1088             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1089         else
1090             endpos= strend;
1091
1092         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1093                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1094
1095         t = s;
1096         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1097         if (!s) {
1098 #ifdef DEBUGGING
1099             const char *what = NULL;
1100 #endif
1101             if (endpos == strend) {
1102                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1103                                 "Could not match STCLASS...\n") );
1104                 goto fail;
1105             }
1106             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1107                                    "This position contradicts STCLASS...\n") );
1108             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1109                 goto fail;
1110             /* Contradict one of substrings */
1111             if (prog->anchored_substr || prog->anchored_utf8) {
1112                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1113                     DEBUG_EXECUTE_r( what = "anchored" );
1114                   hop_and_restart:
1115                     s = HOP3c(t, 1, strend);
1116                     if (s + start_shift + end_shift > strend) {
1117                         /* XXXX Should be taken into account earlier? */
1118                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119                                                "Could not match STCLASS...\n") );
1120                         goto fail;
1121                     }
1122                     if (!check)
1123                         goto giveup;
1124                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1125                                 "Looking for %s substr starting at offset %ld...\n",
1126                                  what, (long)(s + start_shift - i_strpos)) );
1127                     goto restart;
1128                 }
1129                 /* Have both, check_string is floating */
1130                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1131                     goto retry_floating_check;
1132                 /* Recheck anchored substring, but not floating... */
1133                 s = check_at;
1134                 if (!check)
1135                     goto giveup;
1136                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1137                           "Looking for anchored substr starting at offset %ld...\n",
1138                           (long)(other_last - i_strpos)) );
1139                 goto do_other_anchored;
1140             }
1141             /* Another way we could have checked stclass at the
1142                current position only: */
1143             if (ml_anch) {
1144                 s = t = t + 1;
1145                 if (!check)
1146                     goto giveup;
1147                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1148                           "Looking for /%s^%s/m starting at offset %ld...\n",
1149                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1150                 goto try_at_offset;
1151             }
1152             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1153                 goto fail;
1154             /* Check is floating substring. */
1155           retry_floating_check:
1156             t = check_at - start_shift;
1157             DEBUG_EXECUTE_r( what = "floating" );
1158             goto hop_and_restart;
1159         }
1160         if (t != s) {
1161             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1162                         "By STCLASS: moving %ld --> %ld\n",
1163                                   (long)(t - i_strpos), (long)(s - i_strpos))
1164                    );
1165         }
1166         else {
1167             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1168                                   "Does not contradict STCLASS...\n");
1169                    );
1170         }
1171     }
1172   giveup:
1173     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1174                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1175                           PL_colors[5], (long)(s - i_strpos)) );
1176     return s;
1177
1178   fail_finish:                          /* Substring not found */
1179     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1180         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1181   fail:
1182     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1183                           PL_colors[4], PL_colors[5]));
1184     return NULL;
1185 }
1186
1187 #define DECL_TRIE_TYPE(scan) \
1188     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1189                     trie_type = (scan->flags != EXACT) \
1190                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1191                               : (utf8_target ? trie_utf8 : trie_plain)
1192
1193 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1194 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1195     switch (trie_type) {                                                    \
1196     case trie_utf8_fold:                                                    \
1197         if ( foldlen>0 ) {                                                  \
1198             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1199             foldlen -= len;                                                 \
1200             uscan += len;                                                   \
1201             len=0;                                                          \
1202         } else {                                                            \
1203             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1204             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1205             foldlen -= UNISKIP( uvc );                                      \
1206             uscan = foldbuf + UNISKIP( uvc );                               \
1207         }                                                                   \
1208         break;                                                              \
1209     case trie_latin_utf8_fold:                                              \
1210         if ( foldlen>0 ) {                                                  \
1211             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1212             foldlen -= len;                                                 \
1213             uscan += len;                                                   \
1214             len=0;                                                          \
1215         } else {                                                            \
1216             len = 1;                                                        \
1217             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1218             foldlen -= UNISKIP( uvc );                                      \
1219             uscan = foldbuf + UNISKIP( uvc );                               \
1220         }                                                                   \
1221         break;                                                              \
1222     case trie_utf8:                                                         \
1223         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1224         break;                                                              \
1225     case trie_plain:                                                        \
1226         uvc = (UV)*uc;                                                      \
1227         len = 1;                                                            \
1228     }                                                                       \
1229     if (uvc < 256) {                                                        \
1230         charid = trie->charmap[ uvc ];                                      \
1231     }                                                                       \
1232     else {                                                                  \
1233         charid = 0;                                                         \
1234         if (widecharmap) {                                                  \
1235             SV** const svpp = hv_fetch(widecharmap,                         \
1236                         (char*)&uvc, sizeof(UV), 0);                        \
1237             if (svpp)                                                       \
1238                 charid = (U16)SvIV(*svpp);                                  \
1239         }                                                                   \
1240     }                                                                       \
1241 } STMT_END
1242
1243 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1244 STMT_START {                                              \
1245     while (s <= e) {                                      \
1246         if ( (CoNd)                                       \
1247              && (ln == 1 || folder(s, pat_string, ln))    \
1248              && (!reginfo || regtry(reginfo, &s)) )       \
1249             goto got_it;                                  \
1250         s++;                                              \
1251     }                                                     \
1252 } STMT_END
1253
1254 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1255 STMT_START {                                          \
1256     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1257         CoDe                                          \
1258         s += uskip;                                   \
1259     }                                                 \
1260 } STMT_END
1261
1262 #define REXEC_FBC_SCAN(CoDe)                          \
1263 STMT_START {                                          \
1264     while (s < strend) {                              \
1265         CoDe                                          \
1266         s++;                                          \
1267     }                                                 \
1268 } STMT_END
1269
1270 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1271 REXEC_FBC_UTF8_SCAN(                                  \
1272     if (CoNd) {                                       \
1273         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1274             goto got_it;                              \
1275         else                                          \
1276             tmp = doevery;                            \
1277     }                                                 \
1278     else                                              \
1279         tmp = 1;                                      \
1280 )
1281
1282 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1283 REXEC_FBC_SCAN(                                       \
1284     if (CoNd) {                                       \
1285         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1286             goto got_it;                              \
1287         else                                          \
1288             tmp = doevery;                            \
1289     }                                                 \
1290     else                                              \
1291         tmp = 1;                                      \
1292 )
1293
1294 #define REXEC_FBC_TRYIT               \
1295 if ((!reginfo || regtry(reginfo, &s))) \
1296     goto got_it
1297
1298 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1299     if (utf8_target) {                                             \
1300         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1301     }                                                          \
1302     else {                                                     \
1303         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1304     }
1305
1306 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1307     if (utf8_target) {                                             \
1308         UtFpReLoAd;                                            \
1309         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1310     }                                                          \
1311     else {                                                     \
1312         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1313     }
1314
1315 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1316     PL_reg_flags |= RF_tainted;                                \
1317     if (utf8_target) {                                             \
1318         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1319     }                                                          \
1320     else {                                                     \
1321         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1322     }
1323
1324 #define DUMP_EXEC_POS(li,s,doutf8) \
1325     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1326
1327
1328 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1329         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1330         tmp = TEST_NON_UTF8(tmp);                                              \
1331         REXEC_FBC_UTF8_SCAN(                                                   \
1332             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1333                 tmp = !tmp;                                                    \
1334                 IF_SUCCESS;                                                    \
1335             }                                                                  \
1336             else {                                                             \
1337                 IF_FAIL;                                                       \
1338             }                                                                  \
1339         );                                                                     \
1340
1341 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1342         if (s == PL_bostr) {                                                   \
1343             tmp = '\n';                                                        \
1344         }                                                                      \
1345         else {                                                                 \
1346             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1347             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1348         }                                                                      \
1349         tmp = TeSt1_UtF8;                                                      \
1350         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1351         REXEC_FBC_UTF8_SCAN(                                                   \
1352             if (tmp == ! (TeSt2_UtF8)) { \
1353                 tmp = !tmp;                                                    \
1354                 IF_SUCCESS;                                                    \
1355             }                                                                  \
1356             else {                                                             \
1357                 IF_FAIL;                                                       \
1358             }                                                                  \
1359         );                                                                     \
1360
1361 /* The only difference between the BOUND and NBOUND cases is that
1362  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1363  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1364  * with the other one being empty */
1365 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1366     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1367
1368 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1369     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1370
1371 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1372     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1373
1374 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1375     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1376
1377
1378 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1379  * be passed in completely with the variable name being tested, which isn't
1380  * such a clean interface, but this is easier to read than it was before.  We
1381  * are looking for the boundary (or non-boundary between a word and non-word
1382  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1383  * must be different.  Find the "wordness" of the character just prior to this
1384  * one, and compare it with the wordness of this one.  If they differ, we have
1385  * a boundary.  At the beginning of the string, pretend that the previous
1386  * character was a new-line */
1387 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1388     if (utf8_target) {                                                         \
1389                 UTF8_CODE \
1390     }                                                                          \
1391     else {  /* Not utf8 */                                                     \
1392         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1393         tmp = TEST_NON_UTF8(tmp);                                              \
1394         REXEC_FBC_SCAN(                                                        \
1395             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1396                 tmp = !tmp;                                                    \
1397                 IF_SUCCESS;                                                    \
1398             }                                                                  \
1399             else {                                                             \
1400                 IF_FAIL;                                                       \
1401             }                                                                  \
1402         );                                                                     \
1403     }                                                                          \
1404     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1405         goto got_it;
1406
1407 /* We know what class REx starts with.  Try to find this position... */
1408 /* if reginfo is NULL, its a dryrun */
1409 /* annoyingly all the vars in this routine have different names from their counterparts
1410    in regmatch. /grrr */
1411
1412 STATIC char *
1413 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1414     const char *strend, regmatch_info *reginfo)
1415 {
1416         dVAR;
1417         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1418         char *pat_string;   /* The pattern's exactish string */
1419         char *pat_end;      /* ptr to end char of pat_string */
1420         re_fold_t folder;       /* Function for computing non-utf8 folds */
1421         const U8 *fold_array;   /* array for folding ords < 256 */
1422         STRLEN ln;
1423         STRLEN lnc;
1424         register STRLEN uskip;
1425         U8 c1;
1426         U8 c2;
1427         char *e;
1428         register I32 tmp = 1;   /* Scratch variable? */
1429         register const bool utf8_target = PL_reg_match_utf8;
1430         UV utf8_fold_flags = 0;
1431         RXi_GET_DECL(prog,progi);
1432
1433         PERL_ARGS_ASSERT_FIND_BYCLASS;
1434
1435         /* We know what class it must start with. */
1436         switch (OP(c)) {
1437         case ANYOFV:
1438         case ANYOF:
1439             if (utf8_target || OP(c) == ANYOFV) {
1440                 STRLEN inclasslen = strend - s;
1441                 REXEC_FBC_UTF8_CLASS_SCAN(
1442                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1443             }
1444             else {
1445                  while (s < strend) {
1446                       STRLEN skip = 1;
1447
1448                       if (REGINCLASS(prog, c, (U8*)s) ||
1449                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1450                            /* The assignment of 2 is intentional:
1451                             * for the folded sharp s, the skip is 2. */
1452                            (skip = SHARP_S_SKIP))) {
1453                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1454                                 goto got_it;
1455                            else
1456                                 tmp = doevery;
1457                       }
1458                       else
1459                            tmp = 1;
1460                       s += skip;
1461                  }
1462             }
1463             break;
1464         case CANY:
1465             REXEC_FBC_SCAN(
1466                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1467                     goto got_it;
1468                 else
1469                     tmp = doevery;
1470             );
1471             break;
1472
1473         case EXACTFA:
1474             if (UTF_PATTERN || utf8_target) {
1475                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1476                 goto do_exactf_utf8;
1477             }
1478             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1479             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1480             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1481
1482         case EXACTFU:
1483             if (UTF_PATTERN || utf8_target) {
1484                 utf8_fold_flags = 0;
1485                 goto do_exactf_utf8;
1486             }
1487             fold_array = PL_fold_latin1;
1488             folder = foldEQ_latin1;
1489             /* XXX This uses the full utf8 fold because if the pattern contains
1490              * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
1491              * There could be a new node type, say EXACTFU_SS, which is
1492              * generated by regcomp only if there is an 'ss', and then every
1493              * other case could goto do_exactf_non_utf8;*/
1494             goto do_exactf_utf8;
1495
1496         case EXACTF:
1497             if (UTF_PATTERN || utf8_target) {
1498                 utf8_fold_flags = 0;
1499                 goto do_exactf_utf8;
1500             }
1501             fold_array = PL_fold;
1502             folder = foldEQ;
1503             goto do_exactf_non_utf8;
1504
1505         case EXACTFL:
1506             if (UTF_PATTERN || utf8_target) {
1507                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1508                 goto do_exactf_utf8;
1509             }
1510             fold_array = PL_fold_locale;
1511             folder = foldEQ_locale;
1512
1513             /* FALL THROUGH */
1514
1515         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1516
1517             /* The idea in the non-utf8 EXACTF* cases is to first find the
1518              * first character of the EXACTF* node and then, if necessary,
1519              * case-insensitively compare the full text of the node.  c1 is the
1520              * first character.  c2 is its fold.  This logic will not work for
1521              * Unicode semantics and the german sharp ss, which hence should
1522              * not be compiled into a node that gets here. */
1523             pat_string = STRING(c);
1524             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1525
1526             e = HOP3c(strend, -((I32)ln), s);
1527
1528             if (!reginfo && e < s) {
1529                 e = s;                  /* Due to minlen logic of intuit() */
1530             }
1531
1532             c1 = *pat_string;
1533             c2 = fold_array[c1];
1534             if (c1 == c2) { /* If char and fold are the same */
1535                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1536             }
1537             else {
1538                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1539             }
1540             break;
1541
1542         do_exactf_utf8:
1543
1544             /* If one of the operands is in utf8, we can't use the simpler
1545              * folding above, due to the fact that many different characters
1546              * can have the same fold, or portion of a fold, or different-
1547              * length fold */
1548             pat_string = STRING(c);
1549             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1550             pat_end = pat_string + ln;
1551             lnc = (UTF_PATTERN) /* length to match in characters */
1552                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1553                     : ln;
1554
1555             e = HOP3c(strend, -((I32)lnc), s);
1556
1557             if (!reginfo && e < s) {
1558                 e = s;                  /* Due to minlen logic of intuit() */
1559             }
1560
1561             while (s <= e) {
1562                 char *my_strend= (char *)strend;
1563                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1564                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1565                     && (!reginfo || regtry(reginfo, &s)) )
1566                 {
1567                     goto got_it;
1568                 }
1569                 s += UTF8SKIP(s);
1570             }
1571             break;
1572         case BOUNDL:
1573             PL_reg_flags |= RF_tainted;
1574             FBC_BOUND(isALNUM_LC,
1575                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1576                       isALNUM_LC_utf8((U8*)s));
1577             break;
1578         case NBOUNDL:
1579             PL_reg_flags |= RF_tainted;
1580             FBC_NBOUND(isALNUM_LC,
1581                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1582                        isALNUM_LC_utf8((U8*)s));
1583             break;
1584         case BOUND:
1585             FBC_BOUND(isWORDCHAR,
1586                       isALNUM_uni(tmp),
1587                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1588             break;
1589         case BOUNDA:
1590             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1591                              isWORDCHAR_A(tmp),
1592                              isWORDCHAR_A((U8*)s));
1593             break;
1594         case NBOUND:
1595             FBC_NBOUND(isWORDCHAR,
1596                        isALNUM_uni(tmp),
1597                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1598             break;
1599         case NBOUNDA:
1600             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1601                               isWORDCHAR_A(tmp),
1602                               isWORDCHAR_A((U8*)s));
1603             break;
1604         case BOUNDU:
1605             FBC_BOUND(isWORDCHAR_L1,
1606                       isALNUM_uni(tmp),
1607                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1608             break;
1609         case NBOUNDU:
1610             FBC_NBOUND(isWORDCHAR_L1,
1611                        isALNUM_uni(tmp),
1612                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1613             break;
1614         case ALNUML:
1615             REXEC_FBC_CSCAN_TAINT(
1616                 isALNUM_LC_utf8((U8*)s),
1617                 isALNUM_LC(*s)
1618             );
1619             break;
1620         case ALNUMU:
1621             REXEC_FBC_CSCAN_PRELOAD(
1622                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1623                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1624                 isWORDCHAR_L1((U8) *s)
1625             );
1626             break;
1627         case ALNUM:
1628             REXEC_FBC_CSCAN_PRELOAD(
1629                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1630                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1631                 isWORDCHAR((U8) *s)
1632             );
1633             break;
1634         case ALNUMA:
1635             /* Don't need to worry about utf8, as it can match only a single
1636              * byte invariant character */
1637             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1638             break;
1639         case NALNUMU:
1640             REXEC_FBC_CSCAN_PRELOAD(
1641                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1642                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1643                 ! isWORDCHAR_L1((U8) *s)
1644             );
1645             break;
1646         case NALNUM:
1647             REXEC_FBC_CSCAN_PRELOAD(
1648                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1649                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1650                 ! isALNUM(*s)
1651             );
1652             break;
1653         case NALNUMA:
1654             REXEC_FBC_CSCAN(
1655                 !isWORDCHAR_A(*s),
1656                 !isWORDCHAR_A(*s)
1657             );
1658             break;
1659         case NALNUML:
1660             REXEC_FBC_CSCAN_TAINT(
1661                 !isALNUM_LC_utf8((U8*)s),
1662                 !isALNUM_LC(*s)
1663             );
1664             break;
1665         case SPACEU:
1666             REXEC_FBC_CSCAN_PRELOAD(
1667                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1668                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1669                 isSPACE_L1((U8) *s)
1670             );
1671             break;
1672         case SPACE:
1673             REXEC_FBC_CSCAN_PRELOAD(
1674                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1675                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1676                 isSPACE((U8) *s)
1677             );
1678             break;
1679         case SPACEA:
1680             /* Don't need to worry about utf8, as it can match only a single
1681              * byte invariant character */
1682             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1683             break;
1684         case SPACEL:
1685             REXEC_FBC_CSCAN_TAINT(
1686                 isSPACE_LC_utf8((U8*)s),
1687                 isSPACE_LC(*s)
1688             );
1689             break;
1690         case NSPACEU:
1691             REXEC_FBC_CSCAN_PRELOAD(
1692                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1693                 !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1694                 ! isSPACE_L1((U8) *s)
1695             );
1696             break;
1697         case NSPACE:
1698             REXEC_FBC_CSCAN_PRELOAD(
1699                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1700                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1701                 ! isSPACE((U8) *s)
1702             );
1703             break;
1704         case NSPACEA:
1705             REXEC_FBC_CSCAN(
1706                 !isSPACE_A(*s),
1707                 !isSPACE_A(*s)
1708             );
1709             break;
1710         case NSPACEL:
1711             REXEC_FBC_CSCAN_TAINT(
1712                 !isSPACE_LC_utf8((U8*)s),
1713                 !isSPACE_LC(*s)
1714             );
1715             break;
1716         case DIGIT:
1717             REXEC_FBC_CSCAN_PRELOAD(
1718                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1719                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1720                 isDIGIT(*s)
1721             );
1722             break;
1723         case DIGITA:
1724             /* Don't need to worry about utf8, as it can match only a single
1725              * byte invariant character */
1726             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1727             break;
1728         case DIGITL:
1729             REXEC_FBC_CSCAN_TAINT(
1730                 isDIGIT_LC_utf8((U8*)s),
1731                 isDIGIT_LC(*s)
1732             );
1733             break;
1734         case NDIGIT:
1735             REXEC_FBC_CSCAN_PRELOAD(
1736                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1737                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1738                 !isDIGIT(*s)
1739             );
1740             break;
1741         case NDIGITA:
1742             REXEC_FBC_CSCAN(
1743                 !isDIGIT_A(*s),
1744                 !isDIGIT_A(*s)
1745             );
1746             break;
1747         case NDIGITL:
1748             REXEC_FBC_CSCAN_TAINT(
1749                 !isDIGIT_LC_utf8((U8*)s),
1750                 !isDIGIT_LC(*s)
1751             );
1752             break;
1753         case LNBREAK:
1754             REXEC_FBC_CSCAN(
1755                 is_LNBREAK_utf8(s),
1756                 is_LNBREAK_latin1(s)
1757             );
1758             break;
1759         case VERTWS:
1760             REXEC_FBC_CSCAN(
1761                 is_VERTWS_utf8(s),
1762                 is_VERTWS_latin1(s)
1763             );
1764             break;
1765         case NVERTWS:
1766             REXEC_FBC_CSCAN(
1767                 !is_VERTWS_utf8(s),
1768                 !is_VERTWS_latin1(s)
1769             );
1770             break;
1771         case HORIZWS:
1772             REXEC_FBC_CSCAN(
1773                 is_HORIZWS_utf8(s),
1774                 is_HORIZWS_latin1(s)
1775             );
1776             break;
1777         case NHORIZWS:
1778             REXEC_FBC_CSCAN(
1779                 !is_HORIZWS_utf8(s),
1780                 !is_HORIZWS_latin1(s)
1781             );
1782             break;
1783         case AHOCORASICKC:
1784         case AHOCORASICK:
1785             {
1786                 DECL_TRIE_TYPE(c);
1787                 /* what trie are we using right now */
1788                 reg_ac_data *aho
1789                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1790                 reg_trie_data *trie
1791                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1792                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1793
1794                 const char *last_start = strend - trie->minlen;
1795 #ifdef DEBUGGING
1796                 const char *real_start = s;
1797 #endif
1798                 STRLEN maxlen = trie->maxlen;
1799                 SV *sv_points;
1800                 U8 **points; /* map of where we were in the input string
1801                                 when reading a given char. For ASCII this
1802                                 is unnecessary overhead as the relationship
1803                                 is always 1:1, but for Unicode, especially
1804                                 case folded Unicode this is not true. */
1805                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1806                 U8 *bitmap=NULL;
1807
1808
1809                 GET_RE_DEBUG_FLAGS_DECL;
1810
1811                 /* We can't just allocate points here. We need to wrap it in
1812                  * an SV so it gets freed properly if there is a croak while
1813                  * running the match */
1814                 ENTER;
1815                 SAVETMPS;
1816                 sv_points=newSV(maxlen * sizeof(U8 *));
1817                 SvCUR_set(sv_points,
1818                     maxlen * sizeof(U8 *));
1819                 SvPOK_on(sv_points);
1820                 sv_2mortal(sv_points);
1821                 points=(U8**)SvPV_nolen(sv_points );
1822                 if ( trie_type != trie_utf8_fold
1823                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1824                 {
1825                     if (trie->bitmap)
1826                         bitmap=(U8*)trie->bitmap;
1827                     else
1828                         bitmap=(U8*)ANYOF_BITMAP(c);
1829                 }
1830                 /* this is the Aho-Corasick algorithm modified a touch
1831                    to include special handling for long "unknown char"
1832                    sequences. The basic idea being that we use AC as long
1833                    as we are dealing with a possible matching char, when
1834                    we encounter an unknown char (and we have not encountered
1835                    an accepting state) we scan forward until we find a legal
1836                    starting char.
1837                    AC matching is basically that of trie matching, except
1838                    that when we encounter a failing transition, we fall back
1839                    to the current states "fail state", and try the current char
1840                    again, a process we repeat until we reach the root state,
1841                    state 1, or a legal transition. If we fail on the root state
1842                    then we can either terminate if we have reached an accepting
1843                    state previously, or restart the entire process from the beginning
1844                    if we have not.
1845
1846                  */
1847                 while (s <= last_start) {
1848                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1849                     U8 *uc = (U8*)s;
1850                     U16 charid = 0;
1851                     U32 base = 1;
1852                     U32 state = 1;
1853                     UV uvc = 0;
1854                     STRLEN len = 0;
1855                     STRLEN foldlen = 0;
1856                     U8 *uscan = (U8*)NULL;
1857                     U8 *leftmost = NULL;
1858 #ifdef DEBUGGING
1859                     U32 accepted_word= 0;
1860 #endif
1861                     U32 pointpos = 0;
1862
1863                     while ( state && uc <= (U8*)strend ) {
1864                         int failed=0;
1865                         U32 word = aho->states[ state ].wordnum;
1866
1867                         if( state==1 ) {
1868                             if ( bitmap ) {
1869                                 DEBUG_TRIE_EXECUTE_r(
1870                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1871                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1872                                             (char *)uc, utf8_target );
1873                                         PerlIO_printf( Perl_debug_log,
1874                                             " Scanning for legal start char...\n");
1875                                     }
1876                                 );
1877                                 if (utf8_target) {
1878                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1879                                         uc += UTF8SKIP(uc);
1880                                     }
1881                                 } else {
1882                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1883                                         uc++;
1884                                     }
1885                                 }
1886                                 s= (char *)uc;
1887                             }
1888                             if (uc >(U8*)last_start) break;
1889                         }
1890
1891                         if ( word ) {
1892                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1893                             if (!leftmost || lpos < leftmost) {
1894                                 DEBUG_r(accepted_word=word);
1895                                 leftmost= lpos;
1896                             }
1897                             if (base==0) break;
1898
1899                         }
1900                         points[pointpos++ % maxlen]= uc;
1901                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1902                                              uscan, len, uvc, charid, foldlen,
1903                                              foldbuf, uniflags);
1904                         DEBUG_TRIE_EXECUTE_r({
1905                             dump_exec_pos( (char *)uc, c, strend, real_start,
1906                                 s,   utf8_target );
1907                             PerlIO_printf(Perl_debug_log,
1908                                 " Charid:%3u CP:%4"UVxf" ",
1909                                  charid, uvc);
1910                         });
1911
1912                         do {
1913 #ifdef DEBUGGING
1914                             word = aho->states[ state ].wordnum;
1915 #endif
1916                             base = aho->states[ state ].trans.base;
1917
1918                             DEBUG_TRIE_EXECUTE_r({
1919                                 if (failed)
1920                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1921                                         s,   utf8_target );
1922                                 PerlIO_printf( Perl_debug_log,
1923                                     "%sState: %4"UVxf", word=%"UVxf,
1924                                     failed ? " Fail transition to " : "",
1925                                     (UV)state, (UV)word);
1926                             });
1927                             if ( base ) {
1928                                 U32 tmp;
1929                                 I32 offset;
1930                                 if (charid &&
1931                                      ( ((offset = base + charid
1932                                         - 1 - trie->uniquecharcount)) >= 0)
1933                                      && ((U32)offset < trie->lasttrans)
1934                                      && trie->trans[offset].check == state
1935                                      && (tmp=trie->trans[offset].next))
1936                                 {
1937                                     DEBUG_TRIE_EXECUTE_r(
1938                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1939                                     state = tmp;
1940                                     break;
1941                                 }
1942                                 else {
1943                                     DEBUG_TRIE_EXECUTE_r(
1944                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1945                                     failed = 1;
1946                                     state = aho->fail[state];
1947                                 }
1948                             }
1949                             else {
1950                                 /* we must be accepting here */
1951                                 DEBUG_TRIE_EXECUTE_r(
1952                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1953                                 failed = 1;
1954                                 break;
1955                             }
1956                         } while(state);
1957                         uc += len;
1958                         if (failed) {
1959                             if (leftmost)
1960                                 break;
1961                             if (!state) state = 1;
1962                         }
1963                     }
1964                     if ( aho->states[ state ].wordnum ) {
1965                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1966                         if (!leftmost || lpos < leftmost) {
1967                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1968                             leftmost = lpos;
1969                         }
1970                     }
1971                     if (leftmost) {
1972                         s = (char*)leftmost;
1973                         DEBUG_TRIE_EXECUTE_r({
1974                             PerlIO_printf(
1975                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1976                                 (UV)accepted_word, (IV)(s - real_start)
1977                             );
1978                         });
1979                         if (!reginfo || regtry(reginfo, &s)) {
1980                             FREETMPS;
1981                             LEAVE;
1982                             goto got_it;
1983                         }
1984                         s = HOPc(s,1);
1985                         DEBUG_TRIE_EXECUTE_r({
1986                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1987                         });
1988                     } else {
1989                         DEBUG_TRIE_EXECUTE_r(
1990                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1991                         break;
1992                     }
1993                 }
1994                 FREETMPS;
1995                 LEAVE;
1996             }
1997             break;
1998         default:
1999             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2000             break;
2001         }
2002         return 0;
2003       got_it:
2004         return s;
2005 }
2006
2007
2008 /*
2009  - regexec_flags - match a regexp against a string
2010  */
2011 I32
2012 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2013               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2014 /* strend: pointer to null at end of string */
2015 /* strbeg: real beginning of string */
2016 /* minend: end of match must be >=minend after stringarg. */
2017 /* data: May be used for some additional optimizations.
2018          Currently its only used, with a U32 cast, for transmitting
2019          the ganch offset when doing a /g match. This will change */
2020 /* nosave: For optimizations. */
2021 {
2022     dVAR;
2023     struct regexp *const prog = (struct regexp *)SvANY(rx);
2024     /*register*/ char *s;
2025     register regnode *c;
2026     /*register*/ char *startpos = stringarg;
2027     I32 minlen;         /* must match at least this many chars */
2028     I32 dontbother = 0; /* how many characters not to try at end */
2029     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2030     I32 scream_pos = -1;                /* Internal iterator of scream. */
2031     char *scream_olds = NULL;
2032     const bool utf8_target = cBOOL(DO_UTF8(sv));
2033     I32 multiline;
2034     RXi_GET_DECL(prog,progi);
2035     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2036     regexp_paren_pair *swap = NULL;
2037     GET_RE_DEBUG_FLAGS_DECL;
2038
2039     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2040     PERL_UNUSED_ARG(data);
2041
2042     /* Be paranoid... */
2043     if (prog == NULL || startpos == NULL) {
2044         Perl_croak(aTHX_ "NULL regexp parameter");
2045         return 0;
2046     }
2047
2048     multiline = prog->extflags & RXf_PMf_MULTILINE;
2049     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2050
2051     RX_MATCH_UTF8_set(rx, utf8_target);
2052     DEBUG_EXECUTE_r(
2053         debug_start_match(rx, utf8_target, startpos, strend,
2054         "Matching");
2055     );
2056
2057     minlen = prog->minlen;
2058
2059     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2060         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2061                               "String too short [regexec_flags]...\n"));
2062         goto phooey;
2063     }
2064
2065
2066     /* Check validity of program. */
2067     if (UCHARAT(progi->program) != REG_MAGIC) {
2068         Perl_croak(aTHX_ "corrupted regexp program");
2069     }
2070
2071     PL_reg_flags = 0;
2072     PL_reg_eval_set = 0;
2073     PL_reg_maxiter = 0;
2074
2075     if (RX_UTF8(rx))
2076         PL_reg_flags |= RF_utf8;
2077
2078     /* Mark beginning of line for ^ and lookbehind. */
2079     reginfo.bol = startpos; /* XXX not used ??? */
2080     PL_bostr  = strbeg;
2081     reginfo.sv = sv;
2082
2083     /* Mark end of line for $ (and such) */
2084     PL_regeol = strend;
2085
2086     /* see how far we have to get to not match where we matched before */
2087     reginfo.till = startpos+minend;
2088
2089     /* If there is a "must appear" string, look for it. */
2090     s = startpos;
2091
2092     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2093         MAGIC *mg;
2094         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2095             reginfo.ganch = startpos + prog->gofs;
2096             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2097               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2098         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2099                   && SvMAGIC(sv)
2100                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2101                   && mg->mg_len >= 0) {
2102             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2103             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2104                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2105
2106             if (prog->extflags & RXf_ANCH_GPOS) {
2107                 if (s > reginfo.ganch)
2108                     goto phooey;
2109                 s = reginfo.ganch - prog->gofs;
2110                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2111                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2112                 if (s < strbeg)
2113                     goto phooey;
2114             }
2115         }
2116         else if (data) {
2117             reginfo.ganch = strbeg + PTR2UV(data);
2118             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2119                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2120
2121         } else {                                /* pos() not defined */
2122             reginfo.ganch = strbeg;
2123             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2124                  "GPOS: reginfo.ganch = strbeg\n"));
2125         }
2126     }
2127     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2128         /* We have to be careful. If the previous successful match
2129            was from this regex we don't want a subsequent partially
2130            successful match to clobber the old results.
2131            So when we detect this possibility we add a swap buffer
2132            to the re, and switch the buffer each match. If we fail
2133            we switch it back, otherwise we leave it swapped.
2134         */
2135         swap = prog->offs;
2136         /* do we need a save destructor here for eval dies? */
2137         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2138     }
2139     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2140         re_scream_pos_data d;
2141
2142         d.scream_olds = &scream_olds;
2143         d.scream_pos = &scream_pos;
2144         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2145         if (!s) {
2146             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2147             goto phooey;        /* not present */
2148         }
2149     }
2150
2151
2152
2153     /* Simplest case:  anchored match need be tried only once. */
2154     /*  [unless only anchor is BOL and multiline is set] */
2155     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2156         if (s == startpos && regtry(&reginfo, &startpos))
2157             goto got_it;
2158         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2159                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2160         {
2161             char *end;
2162
2163             if (minlen)
2164                 dontbother = minlen - 1;
2165             end = HOP3c(strend, -dontbother, strbeg) - 1;
2166             /* for multiline we only have to try after newlines */
2167             if (prog->check_substr || prog->check_utf8) {
2168                 /* because of the goto we can not easily reuse the macros for bifurcating the
2169                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2170                 if (utf8_target) {
2171                     if (s == startpos)
2172                         goto after_try_utf8;
2173                     while (1) {
2174                         if (regtry(&reginfo, &s)) {
2175                             goto got_it;
2176                         }
2177                       after_try_utf8:
2178                         if (s > end) {
2179                             goto phooey;
2180                         }
2181                         if (prog->extflags & RXf_USE_INTUIT) {
2182                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2183                             if (!s) {
2184                                 goto phooey;
2185                             }
2186                         }
2187                         else {
2188                             s += UTF8SKIP(s);
2189                         }
2190                     }
2191                 } /* end search for check string in unicode */
2192                 else {
2193                     if (s == startpos) {
2194                         goto after_try_latin;
2195                     }
2196                     while (1) {
2197                         if (regtry(&reginfo, &s)) {
2198                             goto got_it;
2199                         }
2200                       after_try_latin:
2201                         if (s > end) {
2202                             goto phooey;
2203                         }
2204                         if (prog->extflags & RXf_USE_INTUIT) {
2205                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2206                             if (!s) {
2207                                 goto phooey;
2208                             }
2209                         }
2210                         else {
2211                             s++;
2212                         }
2213                     }
2214                 } /* end search for check string in latin*/
2215             } /* end search for check string */
2216             else { /* search for newline */
2217                 if (s > startpos) {
2218                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2219                     s--;
2220                 }
2221                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2222                 while (s < end) {
2223                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2224                         if (regtry(&reginfo, &s))
2225                             goto got_it;
2226                     }
2227                 }
2228             } /* end search for newline */
2229         } /* end anchored/multiline check string search */
2230         goto phooey;
2231     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2232     {
2233         /* the warning about reginfo.ganch being used without initialization
2234            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2235            and we only enter this block when the same bit is set. */
2236         char *tmp_s = reginfo.ganch - prog->gofs;
2237
2238         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2239             goto got_it;
2240         goto phooey;
2241     }
2242
2243     /* Messy cases:  unanchored match. */
2244     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2245         /* we have /x+whatever/ */
2246         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2247         char ch;
2248 #ifdef DEBUGGING
2249         int did_match = 0;
2250 #endif
2251         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2252             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2253         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2254
2255         if (utf8_target) {
2256             REXEC_FBC_SCAN(
2257                 if (*s == ch) {
2258                     DEBUG_EXECUTE_r( did_match = 1 );
2259                     if (regtry(&reginfo, &s)) goto got_it;
2260                     s += UTF8SKIP(s);
2261                     while (s < strend && *s == ch)
2262                         s += UTF8SKIP(s);
2263                 }
2264             );
2265         }
2266         else {
2267             REXEC_FBC_SCAN(
2268                 if (*s == ch) {
2269                     DEBUG_EXECUTE_r( did_match = 1 );
2270                     if (regtry(&reginfo, &s)) goto got_it;
2271                     s++;
2272                     while (s < strend && *s == ch)
2273                         s++;
2274                 }
2275             );
2276         }
2277         DEBUG_EXECUTE_r(if (!did_match)
2278                 PerlIO_printf(Perl_debug_log,
2279                                   "Did not find anchored character...\n")
2280                );
2281     }
2282     else if (prog->anchored_substr != NULL
2283               || prog->anchored_utf8 != NULL
2284               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2285                   && prog->float_max_offset < strend - s)) {
2286         SV *must;
2287         I32 back_max;
2288         I32 back_min;
2289         char *last;
2290         char *last1;            /* Last position checked before */
2291 #ifdef DEBUGGING
2292         int did_match = 0;
2293 #endif
2294         if (prog->anchored_substr || prog->anchored_utf8) {
2295             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2296                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2297             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2298             back_max = back_min = prog->anchored_offset;
2299         } else {
2300             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2301                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2302             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2303             back_max = prog->float_max_offset;
2304             back_min = prog->float_min_offset;
2305         }
2306
2307
2308         if (must == &PL_sv_undef)
2309             /* could not downgrade utf8 check substring, so must fail */
2310             goto phooey;
2311
2312         if (back_min<0) {
2313             last = strend;
2314         } else {
2315             last = HOP3c(strend,        /* Cannot start after this */
2316                   -(I32)(CHR_SVLEN(must)
2317                          - (SvTAIL(must) != 0) + back_min), strbeg);
2318         }
2319         if (s > PL_bostr)
2320             last1 = HOPc(s, -1);
2321         else
2322             last1 = s - 1;      /* bogus */
2323
2324         /* XXXX check_substr already used to find "s", can optimize if
2325            check_substr==must. */
2326         scream_pos = -1;
2327         dontbother = end_shift;
2328         strend = HOPc(strend, -dontbother);
2329         while ( (s <= last) &&
2330                 ((flags & REXEC_SCREAM)
2331                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2332                                     end_shift, &scream_pos, 0))
2333                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2334                                   (unsigned char*)strend, must,
2335                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2336             /* we may be pointing at the wrong string */
2337             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2338                 s = strbeg + (s - SvPVX_const(sv));
2339             DEBUG_EXECUTE_r( did_match = 1 );
2340             if (HOPc(s, -back_max) > last1) {
2341                 last1 = HOPc(s, -back_min);
2342                 s = HOPc(s, -back_max);
2343             }
2344             else {
2345                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2346
2347                 last1 = HOPc(s, -back_min);
2348                 s = t;
2349             }
2350             if (utf8_target) {
2351                 while (s <= last1) {
2352                     if (regtry(&reginfo, &s))
2353                         goto got_it;
2354                     s += UTF8SKIP(s);
2355                 }
2356             }
2357             else {
2358                 while (s <= last1) {
2359                     if (regtry(&reginfo, &s))
2360                         goto got_it;
2361                     s++;
2362                 }
2363             }
2364         }
2365         DEBUG_EXECUTE_r(if (!did_match) {
2366             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2367                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2368             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2369                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2370                                ? "anchored" : "floating"),
2371                 quoted, RE_SV_TAIL(must));
2372         });
2373         goto phooey;
2374     }
2375     else if ( (c = progi->regstclass) ) {
2376         if (minlen) {
2377             const OPCODE op = OP(progi->regstclass);
2378             /* don't bother with what can't match */
2379             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2380                 strend = HOPc(strend, -(minlen - 1));
2381         }
2382         DEBUG_EXECUTE_r({
2383             SV * const prop = sv_newmortal();
2384             regprop(prog, prop, c);
2385             {
2386                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2387                     s,strend-s,60);
2388                 PerlIO_printf(Perl_debug_log,
2389                     "Matching stclass %.*s against %s (%d bytes)\n",
2390                     (int)SvCUR(prop), SvPVX_const(prop),
2391                      quoted, (int)(strend - s));
2392             }
2393         });
2394         if (find_byclass(prog, c, s, strend, &reginfo))
2395             goto got_it;
2396         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2397     }
2398     else {
2399         dontbother = 0;
2400         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2401             /* Trim the end. */
2402             char *last;
2403             SV* float_real;
2404
2405             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2406                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2407             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2408
2409             if (flags & REXEC_SCREAM) {
2410                 last = screaminstr(sv, float_real, s - strbeg,
2411                                    end_shift, &scream_pos, 1); /* last one */
2412                 if (!last)
2413                     last = scream_olds; /* Only one occurrence. */
2414                 /* we may be pointing at the wrong string */
2415                 else if (RXp_MATCH_COPIED(prog))
2416                     s = strbeg + (s - SvPVX_const(sv));
2417             }
2418             else {
2419                 STRLEN len;
2420                 const char * const little = SvPV_const(float_real, len);
2421
2422                 if (SvTAIL(float_real)) {
2423                     if (memEQ(strend - len + 1, little, len - 1))
2424                         last = strend - len + 1;
2425                     else if (!multiline)
2426                         last = memEQ(strend - len, little, len)
2427                             ? strend - len : NULL;
2428                     else
2429                         goto find_last;
2430                 } else {
2431                   find_last:
2432                     if (len)
2433                         last = rninstr(s, strend, little, little + len);
2434                     else
2435                         last = strend;  /* matching "$" */
2436                 }
2437             }
2438             if (last == NULL) {
2439                 DEBUG_EXECUTE_r(
2440                     PerlIO_printf(Perl_debug_log,
2441                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2442                         PL_colors[4], PL_colors[5]));
2443                 goto phooey; /* Should not happen! */
2444             }
2445             dontbother = strend - last + prog->float_min_offset;
2446         }
2447         if (minlen && (dontbother < minlen))
2448             dontbother = minlen - 1;
2449         strend -= dontbother;              /* this one's always in bytes! */
2450         /* We don't know much -- general case. */
2451         if (utf8_target) {
2452             for (;;) {
2453                 if (regtry(&reginfo, &s))
2454                     goto got_it;
2455                 if (s >= strend)
2456                     break;
2457                 s += UTF8SKIP(s);
2458             };
2459         }
2460         else {
2461             do {
2462                 if (regtry(&reginfo, &s))
2463                     goto got_it;
2464             } while (s++ < strend);
2465         }
2466     }
2467
2468     /* Failure. */
2469     goto phooey;
2470
2471 got_it:
2472     Safefree(swap);
2473     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2474
2475     if (PL_reg_eval_set)
2476         restore_pos(aTHX_ prog);
2477     if (RXp_PAREN_NAMES(prog))
2478         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2479
2480     /* make sure $`, $&, $', and $digit will work later */
2481     if ( !(flags & REXEC_NOT_FIRST) ) {
2482         RX_MATCH_COPY_FREE(rx);
2483         if (flags & REXEC_COPY_STR) {
2484             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2485 #ifdef PERL_OLD_COPY_ON_WRITE
2486             if ((SvIsCOW(sv)
2487                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2488                 if (DEBUG_C_TEST) {
2489                     PerlIO_printf(Perl_debug_log,
2490                                   "Copy on write: regexp capture, type %d\n",
2491                                   (int) SvTYPE(sv));
2492                 }
2493                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2494                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2495                 assert (SvPOKp(prog->saved_copy));
2496             } else
2497 #endif
2498             {
2499                 RX_MATCH_COPIED_on(rx);
2500                 s = savepvn(strbeg, i);
2501                 prog->subbeg = s;
2502             }
2503             prog->sublen = i;
2504         }
2505         else {
2506             prog->subbeg = strbeg;
2507             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2508         }
2509     }
2510
2511     return 1;
2512
2513 phooey:
2514     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2515                           PL_colors[4], PL_colors[5]));
2516     if (PL_reg_eval_set)
2517         restore_pos(aTHX_ prog);
2518     if (swap) {
2519         /* we failed :-( roll it back */
2520         Safefree(prog->offs);
2521         prog->offs = swap;
2522     }
2523
2524     return 0;
2525 }
2526
2527
2528 /*
2529  - regtry - try match at specific point
2530  */
2531 STATIC I32                      /* 0 failure, 1 success */
2532 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2533 {
2534     dVAR;
2535     CHECKPOINT lastcp;
2536     REGEXP *const rx = reginfo->prog;
2537     regexp *const prog = (struct regexp *)SvANY(rx);
2538     RXi_GET_DECL(prog,progi);
2539     GET_RE_DEBUG_FLAGS_DECL;
2540
2541     PERL_ARGS_ASSERT_REGTRY;
2542
2543     reginfo->cutpoint=NULL;
2544
2545     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2546         MAGIC *mg;
2547
2548         PL_reg_eval_set = RS_init;
2549         DEBUG_EXECUTE_r(DEBUG_s(
2550             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2551                           (IV)(PL_stack_sp - PL_stack_base));
2552             ));
2553         SAVESTACK_CXPOS();
2554         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2555         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2556         SAVETMPS;
2557         /* Apparently this is not needed, judging by wantarray. */
2558         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2559            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2560
2561         if (reginfo->sv) {
2562             /* Make $_ available to executed code. */
2563             if (reginfo->sv != DEFSV) {
2564                 SAVE_DEFSV;
2565                 DEFSV_set(reginfo->sv);
2566             }
2567
2568             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2569                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2570                 /* prepare for quick setting of pos */
2571 #ifdef PERL_OLD_COPY_ON_WRITE
2572                 if (SvIsCOW(reginfo->sv))
2573                     sv_force_normal_flags(reginfo->sv, 0);
2574 #endif
2575                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2576                                  &PL_vtbl_mglob, NULL, 0);
2577                 mg->mg_len = -1;
2578             }
2579             PL_reg_magic    = mg;
2580             PL_reg_oldpos   = mg->mg_len;
2581             SAVEDESTRUCTOR_X(restore_pos, prog);
2582         }
2583         if (!PL_reg_curpm) {
2584             Newxz(PL_reg_curpm, 1, PMOP);
2585 #ifdef USE_ITHREADS
2586             {
2587                 SV* const repointer = &PL_sv_undef;
2588                 /* this regexp is also owned by the new PL_reg_curpm, which
2589                    will try to free it.  */
2590                 av_push(PL_regex_padav, repointer);
2591                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2592                 PL_regex_pad = AvARRAY(PL_regex_padav);
2593             }
2594 #endif
2595         }
2596 #ifdef USE_ITHREADS
2597         /* It seems that non-ithreads works both with and without this code.
2598            So for efficiency reasons it seems best not to have the code
2599            compiled when it is not needed.  */
2600         /* This is safe against NULLs: */
2601         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2602         /* PM_reg_curpm owns a reference to this regexp.  */
2603         ReREFCNT_inc(rx);
2604 #endif
2605         PM_SETRE(PL_reg_curpm, rx);
2606         PL_reg_oldcurpm = PL_curpm;
2607         PL_curpm = PL_reg_curpm;
2608         if (RXp_MATCH_COPIED(prog)) {
2609             /*  Here is a serious problem: we cannot rewrite subbeg,
2610                 since it may be needed if this match fails.  Thus
2611                 $` inside (?{}) could fail... */
2612             PL_reg_oldsaved = prog->subbeg;
2613             PL_reg_oldsavedlen = prog->sublen;
2614 #ifdef PERL_OLD_COPY_ON_WRITE
2615             PL_nrs = prog->saved_copy;
2616 #endif
2617             RXp_MATCH_COPIED_off(prog);
2618         }
2619         else
2620             PL_reg_oldsaved = NULL;
2621         prog->subbeg = PL_bostr;
2622         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2623     }
2624     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2625     prog->offs[0].start = *startpos - PL_bostr;
2626     PL_reginput = *startpos;
2627     PL_reglastparen = &prog->lastparen;
2628     PL_reglastcloseparen = &prog->lastcloseparen;
2629     prog->lastparen = 0;
2630     prog->lastcloseparen = 0;
2631     PL_regsize = 0;
2632     PL_regoffs = prog->offs;
2633     if (PL_reg_start_tmpl <= prog->nparens) {
2634         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2635         if(PL_reg_start_tmp)
2636             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2637         else
2638             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2639     }
2640
2641     /* XXXX What this code is doing here?!!!  There should be no need
2642        to do this again and again, PL_reglastparen should take care of
2643        this!  --ilya*/
2644
2645     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2646      * Actually, the code in regcppop() (which Ilya may be meaning by
2647      * PL_reglastparen), is not needed at all by the test suite
2648      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2649      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2650      * Meanwhile, this code *is* needed for the
2651      * above-mentioned test suite tests to succeed.  The common theme
2652      * on those tests seems to be returning null fields from matches.
2653      * --jhi updated by dapm */
2654 #if 1
2655     if (prog->nparens) {
2656         regexp_paren_pair *pp = PL_regoffs;
2657         register I32 i;
2658         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2659             ++pp;
2660             pp->start = -1;
2661             pp->end = -1;
2662         }
2663     }
2664 #endif
2665     REGCP_SET(lastcp);
2666     if (regmatch(reginfo, progi->program + 1)) {
2667         PL_regoffs[0].end = PL_reginput - PL_bostr;
2668         return 1;
2669     }
2670     if (reginfo->cutpoint)
2671         *startpos= reginfo->cutpoint;
2672     REGCP_UNWIND(lastcp);
2673     return 0;
2674 }
2675
2676
2677 #define sayYES goto yes
2678 #define sayNO goto no
2679 #define sayNO_SILENT goto no_silent
2680
2681 /* we dont use STMT_START/END here because it leads to
2682    "unreachable code" warnings, which are bogus, but distracting. */
2683 #define CACHEsayNO \
2684     if (ST.cache_mask) \
2685        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2686     sayNO
2687
2688 /* this is used to determine how far from the left messages like
2689    'failed...' are printed. It should be set such that messages
2690    are inline with the regop output that created them.
2691 */
2692 #define REPORT_CODE_OFF 32
2693
2694
2695 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2696 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2697
2698 #define SLAB_FIRST(s) (&(s)->states[0])
2699 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2700
2701 /* grab a new slab and return the first slot in it */
2702
2703 STATIC regmatch_state *
2704 S_push_slab(pTHX)
2705 {
2706 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2707     dMY_CXT;
2708 #endif
2709     regmatch_slab *s = PL_regmatch_slab->next;
2710     if (!s) {
2711         Newx(s, 1, regmatch_slab);
2712         s->prev = PL_regmatch_slab;
2713         s->next = NULL;
2714         PL_regmatch_slab->next = s;
2715     }
2716     PL_regmatch_slab = s;
2717     return SLAB_FIRST(s);
2718 }
2719
2720
2721 /* push a new state then goto it */
2722
2723 #define PUSH_STATE_GOTO(state, node) \
2724     scan = node; \
2725     st->resume_state = state; \
2726     goto push_state;
2727
2728 /* push a new state with success backtracking, then goto it */
2729
2730 #define PUSH_YES_STATE_GOTO(state, node) \
2731     scan = node; \
2732     st->resume_state = state; \
2733     goto push_yes_state;
2734
2735
2736
2737 /*
2738
2739 regmatch() - main matching routine
2740
2741 This is basically one big switch statement in a loop. We execute an op,
2742 set 'next' to point the next op, and continue. If we come to a point which
2743 we may need to backtrack to on failure such as (A|B|C), we push a
2744 backtrack state onto the backtrack stack. On failure, we pop the top
2745 state, and re-enter the loop at the state indicated. If there are no more
2746 states to pop, we return failure.
2747
2748 Sometimes we also need to backtrack on success; for example /A+/, where
2749 after successfully matching one A, we need to go back and try to
2750 match another one; similarly for lookahead assertions: if the assertion
2751 completes successfully, we backtrack to the state just before the assertion
2752 and then carry on.  In these cases, the pushed state is marked as
2753 'backtrack on success too'. This marking is in fact done by a chain of
2754 pointers, each pointing to the previous 'yes' state. On success, we pop to
2755 the nearest yes state, discarding any intermediate failure-only states.
2756 Sometimes a yes state is pushed just to force some cleanup code to be
2757 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2758 it to free the inner regex.
2759
2760 Note that failure backtracking rewinds the cursor position, while
2761 success backtracking leaves it alone.
2762
2763 A pattern is complete when the END op is executed, while a subpattern
2764 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2765 ops trigger the "pop to last yes state if any, otherwise return true"
2766 behaviour.
2767
2768 A common convention in this function is to use A and B to refer to the two
2769 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2770 the subpattern to be matched possibly multiple times, while B is the entire
2771 rest of the pattern. Variable and state names reflect this convention.
2772
2773 The states in the main switch are the union of ops and failure/success of
2774 substates associated with with that op.  For example, IFMATCH is the op
2775 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2776 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2777 successfully matched A and IFMATCH_A_fail is a state saying that we have
2778 just failed to match A. Resume states always come in pairs. The backtrack
2779 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2780 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2781 on success or failure.
2782
2783 The struct that holds a backtracking state is actually a big union, with
2784 one variant for each major type of op. The variable st points to the
2785 top-most backtrack struct. To make the code clearer, within each
2786 block of code we #define ST to alias the relevant union.
2787
2788 Here's a concrete example of a (vastly oversimplified) IFMATCH
2789 implementation:
2790
2791     switch (state) {
2792     ....
2793
2794 #define ST st->u.ifmatch
2795
2796     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2797         ST.foo = ...; // some state we wish to save
2798         ...
2799         // push a yes backtrack state with a resume value of
2800         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2801         // first node of A:
2802         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2803         // NOTREACHED
2804
2805     case IFMATCH_A: // we have successfully executed A; now continue with B
2806         next = B;
2807         bar = ST.foo; // do something with the preserved value
2808         break;
2809
2810     case IFMATCH_A_fail: // A failed, so the assertion failed
2811         ...;   // do some housekeeping, then ...
2812         sayNO; // propagate the failure
2813
2814 #undef ST
2815
2816     ...
2817     }
2818
2819 For any old-timers reading this who are familiar with the old recursive
2820 approach, the code above is equivalent to:
2821
2822     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2823     {
2824         int foo = ...
2825         ...
2826         if (regmatch(A)) {
2827             next = B;
2828             bar = foo;
2829             break;
2830         }
2831         ...;   // do some housekeeping, then ...
2832         sayNO; // propagate the failure
2833     }
2834
2835 The topmost backtrack state, pointed to by st, is usually free. If you
2836 want to claim it, populate any ST.foo fields in it with values you wish to
2837 save, then do one of
2838
2839         PUSH_STATE_GOTO(resume_state, node);
2840         PUSH_YES_STATE_GOTO(resume_state, node);
2841
2842 which sets that backtrack state's resume value to 'resume_state', pushes a
2843 new free entry to the top of the backtrack stack, then goes to 'node'.
2844 On backtracking, the free slot is popped, and the saved state becomes the
2845 new free state. An ST.foo field in this new top state can be temporarily
2846 accessed to retrieve values, but once the main loop is re-entered, it
2847 becomes available for reuse.
2848
2849 Note that the depth of the backtrack stack constantly increases during the
2850 left-to-right execution of the pattern, rather than going up and down with
2851 the pattern nesting. For example the stack is at its maximum at Z at the
2852 end of the pattern, rather than at X in the following:
2853
2854     /(((X)+)+)+....(Y)+....Z/
2855
2856 The only exceptions to this are lookahead/behind assertions and the cut,
2857 (?>A), which pop all the backtrack states associated with A before
2858 continuing.
2859
2860 Backtrack state structs are allocated in slabs of about 4K in size.
2861 PL_regmatch_state and st always point to the currently active state,
2862 and PL_regmatch_slab points to the slab currently containing
2863 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2864 allocated, and is never freed until interpreter destruction. When the slab
2865 is full, a new one is allocated and chained to the end. At exit from
2866 regmatch(), slabs allocated since entry are freed.
2867
2868 */
2869
2870
2871 #define DEBUG_STATE_pp(pp)                                  \
2872     DEBUG_STATE_r({                                         \
2873         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2874         PerlIO_printf(Perl_debug_log,                       \
2875             "    %*s"pp" %s%s%s%s%s\n",                     \
2876             depth*2, "",                                    \
2877             PL_reg_name[st->resume_state],                     \
2878             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2879             ((st==yes_state) ? "Y" : ""),                   \
2880             ((st==mark_state) ? "M" : ""),                  \
2881             ((st==yes_state||st==mark_state) ? "]" : "")    \
2882         );                                                  \
2883     });
2884
2885
2886 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2887
2888 #ifdef DEBUGGING
2889
2890 STATIC void
2891 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2892     const char *start, const char *end, const char *blurb)
2893 {
2894     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2895
2896     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2897
2898     if (!PL_colorset)
2899             reginitcolors();
2900     {
2901         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2902             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2903
2904         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2905             start, end - start, 60);
2906
2907         PerlIO_printf(Perl_debug_log,
2908             "%s%s REx%s %s against %s\n",
2909                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2910
2911         if (utf8_target||utf8_pat)
2912             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2913                 utf8_pat ? "pattern" : "",
2914                 utf8_pat && utf8_target ? " and " : "",
2915                 utf8_target ? "string" : ""
2916             );
2917     }
2918 }
2919
2920 STATIC void
2921 S_dump_exec_pos(pTHX_ const char *locinput,
2922                       const regnode *scan,
2923                       const char *loc_regeol,
2924                       const char *loc_bostr,
2925                       const char *loc_reg_starttry,
2926                       const bool utf8_target)
2927 {
2928     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2929     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2930     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2931     /* The part of the string before starttry has one color
2932        (pref0_len chars), between starttry and current
2933        position another one (pref_len - pref0_len chars),
2934        after the current position the third one.
2935        We assume that pref0_len <= pref_len, otherwise we
2936        decrease pref0_len.  */
2937     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2938         ? (5 + taill) - l : locinput - loc_bostr;
2939     int pref0_len;
2940
2941     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2942
2943     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2944         pref_len++;
2945     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2946     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2947         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2948               ? (5 + taill) - pref_len : loc_regeol - locinput);
2949     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2950         l--;
2951     if (pref0_len < 0)
2952         pref0_len = 0;
2953     if (pref0_len > pref_len)
2954         pref0_len = pref_len;
2955     {
2956         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2957
2958         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2959             (locinput - pref_len),pref0_len, 60, 4, 5);
2960
2961         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2962                     (locinput - pref_len + pref0_len),
2963                     pref_len - pref0_len, 60, 2, 3);
2964
2965         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2966                     locinput, loc_regeol - locinput, 10, 0, 1);
2967
2968         const STRLEN tlen=len0+len1+len2;
2969         PerlIO_printf(Perl_debug_log,
2970                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2971                     (IV)(locinput - loc_bostr),
2972                     len0, s0,
2973                     len1, s1,
2974                     (docolor ? "" : "> <"),
2975                     len2, s2,
2976                     (int)(tlen > 19 ? 0 :  19 - tlen),
2977                     "");
2978     }
2979 }
2980
2981 #endif
2982
2983 /* reg_check_named_buff_matched()
2984  * Checks to see if a named buffer has matched. The data array of
2985  * buffer numbers corresponding to the buffer is expected to reside
2986  * in the regexp->data->data array in the slot stored in the ARG() of
2987  * node involved. Note that this routine doesn't actually care about the
2988  * name, that information is not preserved from compilation to execution.
2989  * Returns the index of the leftmost defined buffer with the given name
2990  * or 0 if non of the buffers matched.
2991  */
2992 STATIC I32
2993 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2994 {
2995     I32 n;
2996     RXi_GET_DECL(rex,rexi);
2997     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2998     I32 *nums=(I32*)SvPVX(sv_dat);
2999
3000     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3001
3002     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3003         if ((I32)*PL_reglastparen >= nums[n] &&
3004             PL_regoffs[nums[n]].end != -1)
3005         {
3006             return nums[n];
3007         }
3008     }
3009     return 0;
3010 }
3011
3012
3013 /* free all slabs above current one  - called during LEAVE_SCOPE */
3014
3015 STATIC void
3016 S_clear_backtrack_stack(pTHX_ void *p)
3017 {
3018     regmatch_slab *s = PL_regmatch_slab->next;
3019     PERL_UNUSED_ARG(p);
3020
3021     if (!s)
3022         return;
3023     PL_regmatch_slab->next = NULL;
3024     while (s) {
3025         regmatch_slab * const osl = s;
3026         s = s->next;
3027         Safefree(osl);
3028     }
3029 }
3030
3031
3032 #define SETREX(Re1,Re2) \
3033     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3034     Re1 = (Re2)
3035
3036 STATIC I32                      /* 0 failure, 1 success */
3037 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3038 {
3039 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3040     dMY_CXT;
3041 #endif
3042     dVAR;
3043     register const bool utf8_target = PL_reg_match_utf8;
3044     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3045     REGEXP *rex_sv = reginfo->prog;
3046     regexp *rex = (struct regexp *)SvANY(rex_sv);
3047     RXi_GET_DECL(rex,rexi);
3048     I32 oldsave;
3049     /* the current state. This is a cached copy of PL_regmatch_state */
3050     register regmatch_state *st;
3051     /* cache heavy used fields of st in registers */
3052     register regnode *scan;
3053     register regnode *next;
3054     register U32 n = 0; /* general value; init to avoid compiler warning */
3055     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3056     register char *locinput = PL_reginput;
3057     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3058
3059     bool result = 0;        /* return value of S_regmatch */
3060     int depth = 0;          /* depth of backtrack stack */
3061     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3062     const U32 max_nochange_depth =
3063         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3064         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3065     regmatch_state *yes_state = NULL; /* state to pop to on success of
3066                                                             subpattern */
3067     /* mark_state piggy backs on the yes_state logic so that when we unwind
3068        the stack on success we can update the mark_state as we go */
3069     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3070     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3071     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3072     U32 state_num;
3073     bool no_final = 0;      /* prevent failure from backtracking? */
3074     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3075     char *startpoint = PL_reginput;
3076     SV *popmark = NULL;     /* are we looking for a mark? */
3077     SV *sv_commit = NULL;   /* last mark name seen in failure */
3078     SV *sv_yes_mark = NULL; /* last mark name we have seen
3079                                during a successful match */
3080     U32 lastopen = 0;       /* last open we saw */
3081     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3082     SV* const oreplsv = GvSV(PL_replgv);
3083     /* these three flags are set by various ops to signal information to
3084      * the very next op. They have a useful lifetime of exactly one loop
3085      * iteration, and are not preserved or restored by state pushes/pops
3086      */
3087     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3088     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3089     int logical = 0;        /* the following EVAL is:
3090                                 0: (?{...})
3091                                 1: (?(?{...})X|Y)
3092                                 2: (??{...})
3093                                or the following IFMATCH/UNLESSM is:
3094                                 false: plain (?=foo)
3095                                 true:  used as a condition: (?(?=foo))
3096                             */
3097 #ifdef DEBUGGING
3098     GET_RE_DEBUG_FLAGS_DECL;
3099 #endif
3100
3101     PERL_ARGS_ASSERT_REGMATCH;
3102
3103     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3104             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3105     }));
3106     /* on first ever call to regmatch, allocate first slab */
3107     if (!PL_regmatch_slab) {
3108         Newx(PL_regmatch_slab, 1, regmatch_slab);
3109         PL_regmatch_slab->prev = NULL;
3110         PL_regmatch_slab->next = NULL;
3111         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3112     }
3113
3114     oldsave = PL_savestack_ix;
3115     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3116     SAVEVPTR(PL_regmatch_slab);
3117     SAVEVPTR(PL_regmatch_state);
3118
3119     /* grab next free state slot */
3120     st = ++PL_regmatch_state;
3121     if (st >  SLAB_LAST(PL_regmatch_slab))
3122         st = PL_regmatch_state = S_push_slab(aTHX);
3123
3124     /* Note that nextchr is a byte even in UTF */
3125     nextchr = UCHARAT(locinput);
3126     scan = prog;
3127     while (scan != NULL) {
3128
3129         DEBUG_EXECUTE_r( {
3130             SV * const prop = sv_newmortal();
3131             regnode *rnext=regnext(scan);
3132             DUMP_EXEC_POS( locinput, scan, utf8_target );
3133             regprop(rex, prop, scan);
3134
3135             PerlIO_printf(Perl_debug_log,
3136                     "%3"IVdf":%*s%s(%"IVdf")\n",
3137                     (IV)(scan - rexi->program), depth*2, "",
3138                     SvPVX_const(prop),
3139                     (PL_regkind[OP(scan)] == END || !rnext) ?
3140                         0 : (IV)(rnext - rexi->program));
3141         });
3142
3143         next = scan + NEXT_OFF(scan);
3144         if (next == scan)
3145             next = NULL;
3146         state_num = OP(scan);
3147
3148       reenter_switch:
3149
3150         assert(PL_reglastparen == &rex->lastparen);
3151         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3152         assert(PL_regoffs == rex->offs);
3153
3154         switch (state_num) {
3155         case BOL:
3156             if (locinput == PL_bostr)
3157             {
3158                 /* reginfo->till = reginfo->bol; */
3159                 break;
3160             }
3161             sayNO;
3162         case MBOL:
3163             if (locinput == PL_bostr ||
3164                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3165             {
3166                 break;
3167             }
3168             sayNO;
3169         case SBOL:
3170             if (locinput == PL_bostr)
3171                 break;
3172             sayNO;
3173         case GPOS:
3174             if (locinput == reginfo->ganch)
3175                 break;
3176             sayNO;
3177
3178         case KEEPS:
3179             /* update the startpoint */
3180             st->u.keeper.val = PL_regoffs[0].start;
3181             PL_reginput = locinput;
3182             PL_regoffs[0].start = locinput - PL_bostr;
3183             PUSH_STATE_GOTO(KEEPS_next, next);
3184             /*NOT-REACHED*/
3185         case KEEPS_next_fail:
3186             /* rollback the start point change */
3187             PL_regoffs[0].start = st->u.keeper.val;
3188             sayNO_SILENT;
3189             /*NOT-REACHED*/
3190         case EOL:
3191                 goto seol;
3192         case MEOL:
3193             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3194                 sayNO;
3195             break;
3196         case SEOL:
3197           seol:
3198             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3199                 sayNO;
3200             if (PL_regeol - locinput > 1)
3201                 sayNO;
3202             break;
3203         case EOS:
3204             if (PL_regeol != locinput)
3205                 sayNO;
3206             break;
3207         case SANY:
3208             if (!nextchr && locinput >= PL_regeol)
3209                 sayNO;
3210             if (utf8_target) {
3211                 locinput += PL_utf8skip[nextchr];
3212                 if (locinput > PL_regeol)
3213                     sayNO;
3214                 nextchr = UCHARAT(locinput);
3215             }
3216             else
3217                 nextchr = UCHARAT(++locinput);
3218             break;
3219         case CANY:
3220             if (!nextchr && locinput >= PL_regeol)
3221                 sayNO;
3222             nextchr = UCHARAT(++locinput);
3223             break;
3224         case REG_ANY:
3225             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3226                 sayNO;
3227             if (utf8_target) {
3228                 locinput += PL_utf8skip[nextchr];
3229                 if (locinput > PL_regeol)
3230                     sayNO;
3231                 nextchr = UCHARAT(locinput);
3232             }
3233             else
3234                 nextchr = UCHARAT(++locinput);
3235             break;
3236
3237 #undef  ST
3238 #define ST st->u.trie
3239         case TRIEC:
3240             /* In this case the charclass data is available inline so
3241                we can fail fast without a lot of extra overhead.
3242              */
3243             if (scan->flags == EXACT || !utf8_target) {
3244                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3245                     DEBUG_EXECUTE_r(
3246                         PerlIO_printf(Perl_debug_log,
3247                                   "%*s  %sfailed to match trie start class...%s\n",
3248                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3249                     );
3250                     sayNO_SILENT;
3251                     /* NOTREACHED */
3252                 }
3253             }
3254             /* FALL THROUGH */
3255         case TRIE:
3256             /* the basic plan of execution of the trie is:
3257              * At the beginning, run though all the states, and
3258              * find the longest-matching word. Also remember the position
3259              * of the shortest matching word. For example, this pattern:
3260              *    1  2 3 4    5
3261              *    ab|a|x|abcd|abc
3262              * when matched against the string "abcde", will generate
3263              * accept states for all words except 3, with the longest
3264              * matching word being 4, and the shortest being 1 (with
3265              * the position being after char 1 of the string).
3266              *
3267              * Then for each matching word, in word order (i.e. 1,2,4,5),
3268              * we run the remainder of the pattern; on each try setting
3269              * the current position to the character following the word,
3270              * returning to try the next word on failure.
3271              *
3272              * We avoid having to build a list of words at runtime by
3273              * using a compile-time structure, wordinfo[].prev, which
3274              * gives, for each word, the previous accepting word (if any).
3275              * In the case above it would contain the mappings 1->2, 2->0,
3276              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3277              * the longest word (4 above), a list of all words, by
3278              * following the list of prev pointers; this gives us the
3279              * unordered list 4,5,1,2. Then given the current word we have
3280              * just tried, we can go through the list and find the
3281              * next-biggest word to try (so if we just failed on word 2,
3282              * the next in the list is 4).
3283              *
3284              * Since at runtime we don't record the matching position in
3285              * the string for each word, we have to work that out for
3286              * each word we're about to process. The wordinfo table holds
3287              * the character length of each word; given that we recorded
3288              * at the start: the position of the shortest word and its
3289              * length in chars, we just need to move the pointer the
3290              * difference between the two char lengths. Depending on
3291              * Unicode status and folding, that's cheap or expensive.
3292              *
3293              * This algorithm is optimised for the case where are only a
3294              * small number of accept states, i.e. 0,1, or maybe 2.
3295              * With lots of accepts states, and having to try all of them,
3296              * it becomes quadratic on number of accept states to find all
3297              * the next words.
3298              */
3299
3300             {
3301                 /* what type of TRIE am I? (utf8 makes this contextual) */
3302                 DECL_TRIE_TYPE(scan);
3303
3304                 /* what trie are we using right now */
3305                 reg_trie_data * const trie
3306                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3307                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3308                 U32 state = trie->startstate;
3309
3310                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3311                     !TRIE_BITMAP_TEST(trie,*locinput)
3312                 ) {
3313                     if (trie->states[ state ].wordnum) {
3314                          DEBUG_EXECUTE_r(
3315                             PerlIO_printf(Perl_debug_log,
3316                                           "%*s  %smatched empty string...%s\n",
3317                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3318                         );
3319                         if (!trie->jump)
3320                             break;
3321                     } else {
3322                         DEBUG_EXECUTE_r(
3323                             PerlIO_printf(Perl_debug_log,
3324                                           "%*s  %sfailed to match trie start class...%s\n",
3325                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3326                         );
3327                         sayNO_SILENT;
3328                    }
3329                 }
3330
3331             {
3332                 U8 *uc = ( U8* )locinput;
3333
3334                 STRLEN len = 0;
3335                 STRLEN foldlen = 0;
3336                 U8 *uscan = (U8*)NULL;
3337                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3338                 U32 charcount = 0; /* how many input chars we have matched */
3339                 U32 accepted = 0; /* have we seen any accepting states? */
3340
3341                 ST.B = next;
3342                 ST.jump = trie->jump;
3343                 ST.me = scan;
3344                 ST.firstpos = NULL;
3345                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3346                 ST.nextword = 0;
3347
3348                 /* fully traverse the TRIE; note the position of the
3349                    shortest accept state and the wordnum of the longest
3350                    accept state */
3351
3352                 while ( state && uc <= (U8*)PL_regeol ) {
3353                     U32 base = trie->states[ state ].trans.base;
3354                     UV uvc = 0;
3355                     U16 charid = 0;
3356                     U16 wordnum;
3357                     wordnum = trie->states[ state ].wordnum;
3358
3359                     if (wordnum) { /* it's an accept state */
3360                         if (!accepted) {
3361                             accepted = 1;
3362                             /* record first match position */
3363                             if (ST.longfold) {
3364                                 ST.firstpos = (U8*)locinput;
3365                                 ST.firstchars = 0;
3366                             }
3367                             else {
3368                                 ST.firstpos = uc;
3369                                 ST.firstchars = charcount;
3370                             }
3371                         }
3372                         if (!ST.nextword || wordnum < ST.nextword)
3373                             ST.nextword = wordnum;
3374                         ST.topword = wordnum;
3375                     }
3376
3377                     DEBUG_TRIE_EXECUTE_r({
3378                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3379                                 PerlIO_printf( Perl_debug_log,
3380                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3381                                     2+depth * 2, "", PL_colors[4],
3382                                     (UV)state, (accepted ? 'Y' : 'N'));
3383                     });
3384
3385                     /* read a char and goto next state */
3386                     if ( base ) {
3387                         I32 offset;
3388                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3389                                              uscan, len, uvc, charid, foldlen,
3390                                              foldbuf, uniflags);
3391                         charcount++;
3392                         if (foldlen>0)
3393                             ST.longfold = TRUE;
3394                         if (charid &&
3395                              ( ((offset =
3396                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3397
3398                              && ((U32)offset < trie->lasttrans)
3399                              && trie->trans[offset].check == state)
3400                         {
3401                             state = trie->trans[offset].next;
3402                         }
3403                         else {
3404                             state = 0;
3405                         }
3406                         uc += len;
3407
3408                     }
3409                     else {
3410                         state = 0;
3411                     }
3412                     DEBUG_TRIE_EXECUTE_r(
3413                         PerlIO_printf( Perl_debug_log,
3414                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3415                             charid, uvc, (UV)state, PL_colors[5] );
3416                     );
3417                 }
3418                 if (!accepted)
3419                    sayNO;
3420
3421                 /* calculate total number of accept states */
3422                 {
3423                     U16 w = ST.topword;
3424                     accepted = 0;
3425                     while (w) {
3426                         w = trie->wordinfo[w].prev;
3427                         accepted++;
3428                     }
3429                     ST.accepted = accepted;
3430                 }
3431
3432                 DEBUG_EXECUTE_r(
3433                     PerlIO_printf( Perl_debug_log,
3434                         "%*s  %sgot %"IVdf" possible matches%s\n",
3435                         REPORT_CODE_OFF + depth * 2, "",
3436                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3437                 );
3438                 goto trie_first_try; /* jump into the fail handler */
3439             }}
3440             /* NOTREACHED */
3441
3442         case TRIE_next_fail: /* we failed - try next alternative */
3443             if ( ST.jump) {
3444                 REGCP_UNWIND(ST.cp);
3445                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3446                     PL_regoffs[n].end = -1;
3447                 *PL_reglastparen = n;
3448             }
3449             if (!--ST.accepted) {
3450                 DEBUG_EXECUTE_r({
3451                     PerlIO_printf( Perl_debug_log,
3452                         "%*s  %sTRIE failed...%s\n",
3453                         REPORT_CODE_OFF+depth*2, "",
3454                         PL_colors[4],
3455                         PL_colors[5] );
3456                 });
3457                 sayNO_SILENT;
3458             }
3459             {
3460                 /* Find next-highest word to process.  Note that this code
3461                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3462                 register U16 min = 0;
3463                 register U16 word;
3464                 register U16 const nextword = ST.nextword;
3465                 register reg_trie_wordinfo * const wordinfo
3466                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3467                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3468                     if (word > nextword && (!min || word < min))
3469                         min = word;
3470                 }
3471                 ST.nextword = min;
3472             }
3473
3474           trie_first_try:
3475             if (do_cutgroup) {
3476                 do_cutgroup = 0;
3477                 no_final = 0;
3478             }
3479
3480             if ( ST.jump) {
3481                 ST.lastparen = *PL_reglastparen;
3482                 REGCP_SET(ST.cp);
3483             }
3484
3485             /* find start char of end of current word */
3486             {
3487                 U32 chars; /* how many chars to skip */
3488                 U8 *uc = ST.firstpos;
3489                 reg_trie_data * const trie
3490                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3491
3492                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3493                             >=  ST.firstchars);
3494                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3495                             - ST.firstchars;
3496
3497                 if (ST.longfold) {
3498                     /* the hard option - fold each char in turn and find
3499                      * its folded length (which may be different */
3500                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3501                     STRLEN foldlen;
3502                     STRLEN len;
3503                     UV uvc;
3504                     U8 *uscan;
3505
3506                     while (chars) {
3507                         if (utf8_target) {
3508                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3509                                                     uniflags);
3510                             uc += len;
3511                         }
3512                         else {
3513                             uvc = *uc;
3514                             uc++;
3515                         }
3516                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3517                         uscan = foldbuf;
3518                         while (foldlen) {
3519                             if (!--chars)
3520                                 break;
3521                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3522                                             uniflags);
3523                             uscan += len;
3524                             foldlen -= len;
3525                         }
3526                     }
3527                 }
3528                 else {
3529                     if (utf8_target)
3530                         while (chars--)
3531                             uc += UTF8SKIP(uc);
3532                     else
3533                         uc += chars;
3534                 }
3535                 PL_reginput = (char *)uc;
3536             }
3537
3538             scan = (ST.jump && ST.jump[ST.nextword])
3539                         ? ST.me + ST.jump[ST.nextword]
3540                         : ST.B;
3541
3542             DEBUG_EXECUTE_r({
3543                 PerlIO_printf( Perl_debug_log,
3544                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3545                     REPORT_CODE_OFF+depth*2, "",
3546                     PL_colors[4],
3547                     ST.nextword,
3548                     PL_colors[5]
3549                     );
3550             });
3551
3552             if (ST.accepted > 1 || has_cutgroup) {
3553                 PUSH_STATE_GOTO(TRIE_next, scan);
3554                 /* NOTREACHED */
3555             }
3556             /* only one choice left - just continue */
3557             DEBUG_EXECUTE_r({
3558                 AV *const trie_words
3559                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3560                 SV ** const tmp = av_fetch( trie_words,
3561                     ST.nextword-1, 0 );
3562                 SV *sv= tmp ? sv_newmortal() : NULL;
3563
3564                 PerlIO_printf( Perl_debug_log,
3565                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3566                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3567                     ST.nextword,
3568                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3569                             PL_colors[0], PL_colors[1],
3570                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3571                         )
3572                     : "not compiled under -Dr",
3573                     PL_colors[5] );
3574             });
3575
3576             locinput = PL_reginput;
3577             nextchr = UCHARAT(locinput);
3578             continue; /* execute rest of RE */
3579             /* NOTREACHED */
3580 #undef  ST
3581
3582         case EXACT: {
3583             char *s = STRING(scan);
3584             ln = STR_LEN(scan);
3585             if (utf8_target != UTF_PATTERN) {
3586                 /* The target and the pattern have differing utf8ness. */
3587                 char *l = locinput;
3588                 const char * const e = s + ln;
3589
3590                 if (utf8_target) {
3591                     /* The target is utf8, the pattern is not utf8. */
3592                     while (s < e) {
3593                         STRLEN ulen;
3594                         if (l >= PL_regeol)
3595                              sayNO;
3596                         if (NATIVE_TO_UNI(*(U8*)s) !=
3597                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3598                                             uniflags))
3599                              sayNO;
3600                         l += ulen;
3601                         s ++;
3602                     }
3603                 }
3604                 else {
3605                     /* The target is not utf8, the pattern is utf8. */
3606                     while (s < e) {
3607                         STRLEN ulen;
3608                         if (l >= PL_regeol)
3609                             sayNO;
3610                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3611                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3612                                            uniflags))
3613                             sayNO;
3614                         s += ulen;
3615                         l ++;
3616                     }
3617                 }
3618                 locinput = l;
3619                 nextchr = UCHARAT(locinput);
3620                 break;
3621             }
3622             /* The target and the pattern have the same utf8ness. */
3623             /* Inline the first character, for speed. */
3624             if (UCHARAT(s) != nextchr)
3625                 sayNO;
3626             if (PL_regeol - locinput < ln)
3627                 sayNO;
3628             if (ln > 1 && memNE(s, locinput, ln))
3629                 sayNO;
3630             locinput += ln;
3631             nextchr = UCHARAT(locinput);
3632             break;
3633             }
3634         case EXACTFL: {
3635             re_fold_t folder;
3636             const U8 * fold_array;
3637             const char * s;
3638             U32 fold_utf8_flags;
3639
3640             PL_reg_flags |= RF_tainted;
3641             folder = foldEQ_locale;
3642             fold_array = PL_fold_locale;
3643             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3644             goto do_exactf;
3645
3646         case EXACTFU:
3647             folder = foldEQ_latin1;
3648             fold_array = PL_fold_latin1;
3649             fold_utf8_flags = 0;
3650             goto do_exactf;
3651
3652         case EXACTFA:
3653             folder = foldEQ_latin1;
3654             fold_array = PL_fold_latin1;
3655             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3656             goto do_exactf;
3657
3658         case EXACTF:
3659             folder = foldEQ;
3660             fold_array = PL_fold;
3661             fold_utf8_flags = 0;
3662
3663           do_exactf:
3664             s = STRING(scan);
3665             ln = STR_LEN(scan);
3666
3667             if (utf8_target || UTF_PATTERN) {
3668               /* Either target or the pattern are utf8. */
3669                 const char * const l = locinput;
3670                 char *e = PL_regeol;
3671
3672                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3673                                l, &e, 0,  utf8_target, fold_utf8_flags)) {
3674                      /* One more case for the sharp s:
3675                       * pack("U0U*", 0xDF) =~ /ss/i,
3676                       * the 0xC3 0x9F are the UTF-8
3677                       * byte sequence for the U+00DF. */
3678
3679                      if (!(utf8_target &&
3680                            toLOWER(s[0]) == 's' &&
3681                            ln >= 2 &&
3682                            toLOWER(s[1]) == 's' &&
3683                            (U8)l[0] == 0xC3 &&
3684                            e - l >= 2 &&
3685                            (U8)l[1] == 0x9F))
3686                           sayNO;
3687                 }
3688                 locinput = e;
3689                 nextchr = UCHARAT(locinput);
3690                 break;
3691             }
3692
3693             /* Neither the target nor the pattern are utf8 */
3694             if (UCHARAT(s) != nextchr &&
3695                 UCHARAT(s) != fold_array[nextchr])
3696             {
3697                 sayNO;
3698             }
3699             if (PL_regeol - locinput < ln)
3700                 sayNO;
3701             if (ln > 1 && ! folder(s, locinput, ln))
3702                 sayNO;
3703             locinput += ln;
3704             nextchr = UCHARAT(locinput);
3705             break;
3706         }
3707
3708         /* XXX Could improve efficiency by separating these all out using a
3709          * macro or in-line function.  At that point regcomp.c would no longer
3710          * have to set the FLAGS fields of these */
3711         case BOUNDL:
3712         case NBOUNDL:
3713             PL_reg_flags |= RF_tainted;
3714             /* FALL THROUGH */
3715         case BOUND:
3716         case BOUNDU:
3717         case BOUNDA:
3718         case NBOUND:
3719         case NBOUNDU:
3720         case NBOUNDA:
3721             /* was last char in word? */
3722             if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3723                 if (locinput == PL_bostr)
3724                     ln = '\n';
3725                 else {
3726                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3727
3728                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3729                 }
3730                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3731                     ln = isALNUM_uni(ln);
3732                     LOAD_UTF8_CHARCLASS_ALNUM();
3733                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3734                 }
3735                 else {
3736                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3737                     n = isALNUM_LC_utf8((U8*)locinput);
3738                 }
3739             }
3740             else {
3741
3742                 /* Here the string isn't utf8, or is utf8 and only ascii
3743                  * characters are to match \w.  In the latter case looking at
3744                  * the byte just prior to the current one may be just the final
3745                  * byte of a multi-byte character.  This is ok.  There are two
3746                  * cases:
3747                  * 1) it is a single byte character, and then the test is doing
3748                  *      just what it's supposed to.
3749                  * 2) it is a multi-byte character, in which case the final
3750                  *      byte is never mistakable for ASCII, and so the test
3751                  *      will say it is not a word character, which is the
3752                  *      correct answer. */
3753                 ln = (locinput != PL_bostr) ?
3754                     UCHARAT(locinput - 1) : '\n';
3755                 switch (FLAGS(scan)) {
3756                     case REGEX_UNICODE_CHARSET:
3757                         ln = isWORDCHAR_L1(ln);
3758                         n = isWORDCHAR_L1(nextchr);
3759                         break;
3760                     case REGEX_LOCALE_CHARSET:
3761                         ln = isALNUM_LC(ln);
3762                         n = isALNUM_LC(nextchr);
3763                         break;
3764                     case REGEX_DEPENDS_CHARSET:
3765                         ln = isALNUM(ln);
3766                         n = isALNUM(nextchr);
3767                         break;
3768                     case REGEX_ASCII_RESTRICTED_CHARSET:
3769                         ln = isWORDCHAR_A(ln);
3770                         n = isWORDCHAR_A(nextchr);
3771                         break;
3772                     default:
3773                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3774                         break;
3775                 }
3776             }
3777             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3778              * regcomp.sym */
3779             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3780                     sayNO;
3781             break;
3782         case ANYOFV:
3783         case ANYOF:
3784             if (utf8_target || state_num == ANYOFV) {
3785                 STRLEN inclasslen = PL_regeol - locinput;
3786                 if (locinput >= PL_regeol)
3787                     sayNO;
3788
3789                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3790                     sayNO;
3791                 locinput += inclasslen;
3792                 nextchr = UCHARAT(locinput);
3793                 break;
3794             }
3795             else {
3796                 if (nextchr < 0)
3797                     nextchr = UCHARAT(locinput);
3798                 if (!nextchr && locinput >= PL_regeol)
3799                     sayNO;
3800                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3801                     sayNO;
3802                 nextchr = UCHARAT(++locinput);
3803                 break;
3804             }
3805             break;
3806         /* Special char classes - The defines start on line 129 or so */
3807         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3808                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3809                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3810                   ALNUMA, NALNUMA, isWORDCHAR_A,
3811                   perl_word, "a");
3812
3813         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3814                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3815                   SPACEU, NSPACEU, isSPACE_L1,
3816                   SPACEA, NSPACEA, isSPACE_A,
3817                   perl_space, " ");
3818
3819         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3820                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3821                 DIGITA, NDIGITA, isDIGIT_A,
3822                 posix_digit, "0");
3823
3824         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3825                        a Unicode extended Grapheme Cluster */
3826             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3827               extended Grapheme Cluster is:
3828
3829                CR LF
3830                | Prepend* Begin Extend*
3831                | .
3832
3833                Begin is (Hangul-syllable | ! Control)
3834                Extend is (Grapheme_Extend | Spacing_Mark)
3835                Control is [ GCB_Control CR LF ]
3836
3837                The discussion below shows how the code for CLUMP is derived
3838                from this regex.  Note that most of these concepts are from
3839                property values of the Grapheme Cluster Boundary (GCB) property.
3840                No code point can have multiple property values for a given
3841                property.  Thus a code point in Prepend can't be in Control, but
3842                it must be in !Control.  This is why Control above includes
3843                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3844                property separately, and so can't be in GCB_Control, even though
3845                they logically are controls.  Control is not the same as gc=cc,
3846                but includes format and other characters as well.
3847
3848                The Unicode definition of Hangul-syllable is:
3849                    L+
3850                    | (L* ( ( V | LV ) V* | LVT ) T*)
3851                    | T+
3852                   )
3853                Each of these is a value for the GCB property, and hence must be
3854                disjoint, so the order they are tested is immaterial, so the
3855                above can safely be changed to
3856                    T+
3857                    | L+
3858                    | (L* ( LVT | ( V | LV ) V*) T*)
3859
3860                The last two terms can be combined like this:
3861                    L* ( L
3862                         | (( LVT | ( V | LV ) V*) T*))
3863
3864                And refactored into this:
3865                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3866
3867                That means that if we have seen any L's at all we can quit
3868                there, but if the next character is a LVT, a V or and LV we
3869                should keep going.
3870
3871                There is a subtlety with Prepend* which showed up in testing.
3872                Note that the Begin, and only the Begin is required in:
3873                 | Prepend* Begin Extend*
3874                Also, Begin contains '! Control'.  A Prepend must be a '!
3875                Control', which means it must be a Begin.  What it comes down to
3876                is that if we match Prepend* and then find no suitable Begin
3877                afterwards, that if we backtrack the last Prepend, that one will
3878                be a suitable Begin.
3879             */
3880
3881             if (locinput >= PL_regeol)
3882                 sayNO;
3883             if  (! utf8_target) {
3884
3885                 /* Match either CR LF  or '.', as all the other possibilities
3886                  * require utf8 */
3887                 locinput++;         /* Match the . or CR */
3888                 if (nextchr == '\r'
3889                     && locinput < PL_regeol
3890                     && UCHARAT(locinput) == '\n') locinput++;
3891             }
3892             else {
3893
3894                 /* Utf8: See if is ( CR LF ); already know that locinput <
3895                  * PL_regeol, so locinput+1 is in bounds */
3896                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3897                     locinput += 2;
3898                 }
3899                 else {
3900                     /* In case have to backtrack to beginning, then match '.' */
3901                     char *starting = locinput;
3902
3903                     /* In case have to backtrack the last prepend */
3904                     char *previous_prepend = 0;
3905
3906                     LOAD_UTF8_CHARCLASS_GCB();
3907
3908                     /* Match (prepend)* */
3909                     while (locinput < PL_regeol
3910                            && swash_fetch(PL_utf8_X_prepend,
3911                                           (U8*)locinput, utf8_target))
3912                     {
3913                         previous_prepend = locinput;
3914                         locinput += UTF8SKIP(locinput);
3915                     }
3916
3917                     /* As noted above, if we matched a prepend character, but
3918                      * the next thing won't match, back off the last prepend we
3919                      * matched, as it is guaranteed to match the begin */
3920                     if (previous_prepend
3921                         && (locinput >=  PL_regeol
3922                             || ! swash_fetch(PL_utf8_X_begin,
3923                                              (U8*)locinput, utf8_target)))
3924                     {
3925                         locinput = previous_prepend;
3926                     }
3927
3928                     /* Note that here we know PL_regeol > locinput, as we
3929                      * tested that upon input to this switch case, and if we
3930                      * moved locinput forward, we tested the result just above
3931                      * and it either passed, or we backed off so that it will
3932                      * now pass */
3933                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3934
3935                         /* Here did not match the required 'Begin' in the
3936                          * second term.  So just match the very first
3937                          * character, the '.' of the final term of the regex */
3938                         locinput = starting + UTF8SKIP(starting);
3939                     } else {
3940
3941                         /* Here is the beginning of a character that can have
3942                          * an extender.  It is either a hangul syllable, or a
3943                          * non-control */
3944                         if (swash_fetch(PL_utf8_X_non_hangul,
3945                                         (U8*)locinput, utf8_target))
3946                         {
3947
3948                             /* Here not a Hangul syllable, must be a
3949                              * ('!  * Control') */
3950                             locinput += UTF8SKIP(locinput);
3951                         } else {
3952
3953                             /* Here is a Hangul syllable.  It can be composed
3954                              * of several individual characters.  One
3955                              * possibility is T+ */
3956                             if (swash_fetch(PL_utf8_X_T,
3957                                             (U8*)locinput, utf8_target))
3958                             {
3959                                 while (locinput < PL_regeol
3960                                         && swash_fetch(PL_utf8_X_T,
3961                                                         (U8*)locinput, utf8_target))
3962                                 {
3963                                     locinput += UTF8SKIP(locinput);
3964                                 }
3965                             } else {
3966
3967                                 /* Here, not T+, but is a Hangul.  That means
3968                                  * it is one of the others: L, LV, LVT or V,
3969                                  * and matches:
3970                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3971
3972                                 /* Match L*           */
3973                                 while (locinput < PL_regeol
3974                                         && swash_fetch(PL_utf8_X_L,
3975                                                         (U8*)locinput, utf8_target))
3976                                 {
3977                                     locinput += UTF8SKIP(locinput);
3978                                 }
3979
3980                                 /* Here, have exhausted L*.  If the next
3981                                  * character is not an LV, LVT nor V, it means
3982                                  * we had to have at least one L, so matches L+
3983                                  * in the original equation, we have a complete
3984                                  * hangul syllable.  Are done. */
3985
3986                                 if (locinput < PL_regeol
3987                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3988                                                     (U8*)locinput, utf8_target))
3989                                 {
3990
3991                                     /* Otherwise keep going.  Must be LV, LVT
3992                                      * or V.  See if LVT */
3993                                     if (swash_fetch(PL_utf8_X_LVT,
3994                                                     (U8*)locinput, utf8_target))
3995                                     {
3996                                         locinput += UTF8SKIP(locinput);
3997                                     } else {
3998
3999                                         /* Must be  V or LV.  Take it, then
4000                                          * match V*     */
4001                                         locinput += UTF8SKIP(locinput);
4002                                         while (locinput < PL_regeol
4003                                                 && swash_fetch(PL_utf8_X_V,
4004                                                          (U8*)locinput, utf8_target))
4005                                         {
4006                                             locinput += UTF8SKIP(locinput);
4007                                         }
4008                                     }
4009
4010                                     /* And any of LV, LVT, or V can be followed
4011                                      * by T*            */
4012                                     while (locinput < PL_regeol
4013                                            && swash_fetch(PL_utf8_X_T,
4014                                                            (U8*)locinput,
4015                                                            utf8_target))
4016                                     {
4017                                         locinput += UTF8SKIP(locinput);
4018                                     }
4019                                 }
4020                             }
4021                         }
4022
4023                         /* Match any extender */
4024                         while (locinput < PL_regeol
4025                                 && swash_fetch(PL_utf8_X_extend,
4026                                                 (U8*)locinput, utf8_target))
4027                         {
4028                             locinput += UTF8SKIP(locinput);
4029                         }
4030                     }
4031                 }
4032                 if (locinput > PL_regeol) sayNO;
4033             }
4034             nextchr = UCHARAT(locinput);
4035             break;
4036
4037         case NREFFL:
4038         {   /* The capture buffer cases.  The ones beginning with N for the
4039                named buffers just convert to the equivalent numbered and
4040                pretend they were called as the corresponding numbered buffer
4041                op.  */
4042             /* don't initialize these in the declaration, it makes C++
4043                unhappy */
4044             char *s;
4045             char type;
4046             re_fold_t folder;
4047             const U8 *fold_array;
4048             UV utf8_fold_flags;
4049
4050             PL_reg_flags |= RF_tainted;
4051             folder = foldEQ_locale;
4052             fold_array = PL_fold_locale;
4053             type = REFFL;
4054             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4055             goto do_nref;
4056
4057         case NREFFA:
4058             folder = foldEQ_latin1;
4059             fold_array = PL_fold_latin1;
4060             type = REFFA;
4061             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4062             goto do_nref;
4063
4064         case NREFFU:
4065             folder = foldEQ_latin1;
4066             fold_array = PL_fold_latin1;
4067             type = REFFU;
4068             utf8_fold_flags = 0;
4069             goto do_nref;
4070
4071         case NREFF:
4072             folder = foldEQ;
4073             fold_array = PL_fold;
4074             type = REFF;
4075             utf8_fold_flags = 0;
4076             goto do_nref;
4077
4078         case NREF:
4079             type = REF;
4080             folder = NULL;
4081             fold_array = NULL;
4082             utf8_fold_flags = 0;
4083           do_nref:
4084
4085             /* For the named back references, find the corresponding buffer
4086              * number */
4087             n = reg_check_named_buff_matched(rex,scan);
4088
4089             if ( ! n ) {
4090                 sayNO;
4091             }
4092             goto do_nref_ref_common;
4093
4094         case REFFL:
4095             PL_reg_flags |= RF_tainted;
4096             folder = foldEQ_locale;
4097             fold_array = PL_fold_locale;
4098             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4099             goto do_ref;
4100
4101         case REFFA:
4102             folder = foldEQ_latin1;
4103             fold_array = PL_fold_latin1;
4104             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4105             goto do_ref;
4106
4107         case REFFU:
4108             folder = foldEQ_latin1;
4109             fold_array = PL_fold_latin1;
4110             utf8_fold_flags = 0;
4111             goto do_ref;
4112
4113         case REFF:
4114             folder = foldEQ;
4115             fold_array = PL_fold;
4116             utf8_fold_flags = 0;
4117             goto do_ref;
4118
4119         case REF:
4120             folder = NULL;
4121             fold_array = NULL;
4122             utf8_fold_flags = 0;
4123
4124           do_ref:
4125             type = OP(scan);
4126             n = ARG(scan);  /* which paren pair */
4127
4128           do_nref_ref_common:
4129             ln = PL_regoffs[n].start;
4130             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4131             if (*PL_reglastparen < n || ln == -1)
4132                 sayNO;                  /* Do not match unless seen CLOSEn. */
4133             if (ln == PL_regoffs[n].end)
4134                 break;
4135
4136             s = PL_bostr + ln;
4137             if (type != REF     /* REF can do byte comparison */
4138                 && (utf8_target || type == REFFU))
4139             { /* XXX handle REFFL better */
4140                 char * limit = PL_regeol;
4141
4142                 /* This call case insensitively compares the entire buffer
4143                     * at s, with the current input starting at locinput, but
4144                     * not going off the end given by PL_regeol, and returns in
4145                     * limit upon success, how much of the current input was
4146                     * matched */
4147                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4148                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4149                 {
4150                     sayNO;
4151                 }
4152                 locinput = limit;
4153                 nextchr = UCHARAT(locinput);
4154                 break;
4155             }
4156
4157             /* Not utf8:  Inline the first character, for speed. */
4158             if (UCHARAT(s) != nextchr &&
4159                 (type == REF ||
4160                  UCHARAT(s) != fold_array[nextchr]))
4161                 sayNO;
4162             ln = PL_regoffs[n].end - ln;
4163             if (locinput + ln > PL_regeol)
4164                 sayNO;
4165             if (ln > 1 && (type == REF
4166                            ? memNE(s, locinput, ln)
4167                            : ! folder(s, locinput, ln)))
4168                 sayNO;
4169             locinput += ln;
4170             nextchr = UCHARAT(locinput);
4171             break;
4172         }
4173         case NOTHING:
4174         case TAIL:
4175             break;
4176         case BACK:
4177             break;
4178
4179 #undef  ST
4180 #define ST st->u.eval
4181         {
4182             SV *ret;
4183             REGEXP *re_sv;
4184             regexp *re;
4185             regexp_internal *rei;
4186             regnode *startpoint;
4187
4188         case GOSTART:
4189         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4190             if (cur_eval && cur_eval->locinput==locinput) {
4191                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4192                     Perl_croak(aTHX_ "Infinite recursion in regex");
4193                 if ( ++nochange_depth > max_nochange_depth )
4194                     Perl_croak(aTHX_
4195                         "Pattern subroutine nesting without pos change"
4196                         " exceeded limit in regex");
4197             } else {
4198                 nochange_depth = 0;
4199             }
4200             re_sv = rex_sv;
4201             re = rex;
4202             rei = rexi;
4203             (void)ReREFCNT_inc(rex_sv);
4204             if (OP(scan)==GOSUB) {
4205                 startpoint = scan + ARG2L(scan);
4206                 ST.close_paren = ARG(scan);
4207             } else {
4208                 startpoint = rei->program+1;
4209                 ST.close_paren = 0;
4210             }
4211             goto eval_recurse_doit;
4212             /* NOTREACHED */
4213         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4214             if (cur_eval && cur_eval->locinput==locinput) {
4215                 if ( ++nochange_depth > max_nochange_depth )
4216                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4217             } else {
4218                 nochange_depth = 0;
4219             }
4220             {
4221                 /* execute the code in the {...} */
4222                 dSP;
4223                 SV ** const before = SP;
4224                 OP_4tree * const oop = PL_op;
4225                 COP * const ocurcop = PL_curcop;
4226                 PAD *old_comppad;
4227                 char *saved_regeol = PL_regeol;
4228                 struct re_save_state saved_state;
4229
4230                 /* To not corrupt the existing regex state while executing the
4231                  * eval we would normally put it on the save stack, like with
4232                  * save_re_context. However, re-evals have a weird scoping so we
4233                  * can't just add ENTER/LEAVE here. With that, things like
4234                  *
4235                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4236                  *
4237                  * would break, as they expect the localisation to be unwound
4238                  * only when the re-engine backtracks through the bit that
4239                  * localised it.
4240                  *
4241                  * What we do instead is just saving the state in a local c
4242                  * variable.
4243                  */
4244                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4245
4246                 n = ARG(scan);
4247                 PL_op = (OP_4tree*)rexi->data->data[n];
4248                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4249                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4250                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4251                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4252
4253                 if (sv_yes_mark) {
4254                     SV *sv_mrk = get_sv("REGMARK", 1);
4255                     sv_setsv(sv_mrk, sv_yes_mark);
4256                 }
4257
4258                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4259                 SPAGAIN;
4260                 if (SP == before)
4261                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4262                 else {
4263                     ret = POPs;
4264                     PUTBACK;
4265                 }
4266
4267                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4268
4269                 PL_op = oop;
4270                 PAD_RESTORE_LOCAL(old_comppad);
4271                 PL_curcop = ocurcop;
4272                 PL_regeol = saved_regeol;
4273                 if (!logical) {
4274                     /* /(?{...})/ */
4275                     sv_setsv(save_scalar(PL_replgv), ret);
4276                     break;
4277                 }
4278             }
4279             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4280                 logical = 0;
4281                 {
4282                     /* extract RE object from returned value; compiling if
4283                      * necessary */
4284                     MAGIC *mg = NULL;
4285                     REGEXP *rx = NULL;
4286
4287                     if (SvROK(ret)) {
4288                         SV *const sv = SvRV(ret);
4289
4290                         if (SvTYPE(sv) == SVt_REGEXP) {
4291                             rx = (REGEXP*) sv;
4292                         } else if (SvSMAGICAL(sv)) {
4293                             mg = mg_find(sv, PERL_MAGIC_qr);
4294                             assert(mg);
4295                         }
4296                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4297                         rx = (REGEXP*) ret;
4298                     } else if (SvSMAGICAL(ret)) {
4299                         if (SvGMAGICAL(ret)) {
4300                             /* I don't believe that there is ever qr magic
4301                                here.  */
4302                             assert(!mg_find(ret, PERL_MAGIC_qr));
4303                             sv_unmagic(ret, PERL_MAGIC_qr);
4304                         }
4305                         else {
4306                             mg = mg_find(ret, PERL_MAGIC_qr);
4307                             /* testing suggests mg only ends up non-NULL for
4308                                scalars who were upgraded and compiled in the
4309                                else block below. In turn, this is only
4310                                triggered in the "postponed utf8 string" tests
4311                                in t/op/pat.t  */
4312                         }
4313                     }
4314
4315                     if (mg) {
4316                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4317                         assert(rx);
4318                     }
4319                     if (rx) {
4320                         rx = reg_temp_copy(NULL, rx);
4321                     }
4322                     else {
4323                         U32 pm_flags = 0;
4324                         const I32 osize = PL_regsize;
4325
4326                         if (DO_UTF8(ret)) {
4327                             assert (SvUTF8(ret));
4328                         } else if (SvUTF8(ret)) {
4329                             /* Not doing UTF-8, despite what the SV says. Is
4330                                this only if we're trapped in use 'bytes'?  */
4331                             /* Make a copy of the octet sequence, but without
4332                                the flag on, as the compiler now honours the
4333                                SvUTF8 flag on ret.  */
4334                             STRLEN len;
4335                             const char *const p = SvPV(ret, len);
4336                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4337                         }
4338                         rx = CALLREGCOMP(ret, pm_flags);
4339                         if (!(SvFLAGS(ret)
4340                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4341                                  | SVs_GMG))) {
4342                             /* This isn't a first class regexp. Instead, it's
4343                                caching a regexp onto an existing, Perl visible
4344                                scalar.  */
4345                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4346                         }
4347                         PL_regsize = osize;
4348                     }
4349                     re_sv = rx;
4350                     re = (struct regexp *)SvANY(rx);
4351                 }
4352                 RXp_MATCH_COPIED_off(re);
4353                 re->subbeg = rex->subbeg;
4354                 re->sublen = rex->sublen;
4355                 rei = RXi_GET(re);
4356                 DEBUG_EXECUTE_r(
4357                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4358                         "Matching embedded");
4359                 );
4360                 startpoint = rei->program + 1;
4361                 ST.close_paren = 0; /* only used for GOSUB */
4362                 /* borrowed from regtry */
4363                 if (PL_reg_start_tmpl <= re->nparens) {
4364                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4365                     if(PL_reg_start_tmp)
4366                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4367                     else
4368                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4369                 }
4370
4371         eval_recurse_doit: /* Share code with GOSUB below this line */
4372                 /* run the pattern returned from (??{...}) */
4373                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4374                 REGCP_SET(ST.lastcp);
4375
4376                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4377
4378                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4379                 PL_reglastparen = &re->lastparen;
4380                 PL_reglastcloseparen = &re->lastcloseparen;
4381                 re->lastparen = 0;
4382                 re->lastcloseparen = 0;
4383
4384                 PL_reginput = locinput;
4385                 PL_regsize = 0;
4386
4387                 /* XXXX This is too dramatic a measure... */
4388                 PL_reg_maxiter = 0;
4389
4390                 ST.toggle_reg_flags = PL_reg_flags;
4391                 if (RX_UTF8(re_sv))
4392                     PL_reg_flags |= RF_utf8;
4393                 else
4394                     PL_reg_flags &= ~RF_utf8;
4395                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4396
4397                 ST.prev_rex = rex_sv;
4398                 ST.prev_curlyx = cur_curlyx;
4399                 SETREX(rex_sv,re_sv);
4400                 rex = re;
4401                 rexi = rei;
4402                 cur_curlyx = NULL;
4403                 ST.B = next;
4404                 ST.prev_eval = cur_eval;
4405                 cur_eval = st;
4406                 /* now continue from first node in postoned RE */
4407                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4408                 /* NOTREACHED */
4409             }
4410             /* logical is 1,   /(?(?{...})X|Y)/ */
4411             sw = cBOOL(SvTRUE(ret));
4412             logical = 0;
4413             break;
4414         }
4415
4416         case EVAL_AB: /* cleanup after a successful (??{A})B */
4417             /* note: this is called twice; first after popping B, then A */
4418             PL_reg_flags ^= ST.toggle_reg_flags;
4419             ReREFCNT_dec(rex_sv);
4420             SETREX(rex_sv,ST.prev_rex);
4421             rex = (struct regexp *)SvANY(rex_sv);
4422             rexi = RXi_GET(rex);
4423             regcpblow(ST.cp);
4424             cur_eval = ST.prev_eval;
4425             cur_curlyx = ST.prev_curlyx;
4426
4427             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4428             PL_reglastparen = &rex->lastparen;
4429             PL_reglastcloseparen = &rex->lastcloseparen;
4430             /* also update PL_regoffs */
4431             PL_regoffs = rex->offs;
4432
4433             /* XXXX This is too dramatic a measure... */
4434             PL_reg_maxiter = 0;
4435             if ( nochange_depth )
4436                 nochange_depth--;
4437             sayYES;
4438
4439
4440         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4441             /* note: this is called twice; first after popping B, then A */
4442             PL_reg_flags ^= ST.toggle_reg_flags;
4443             ReREFCNT_dec(rex_sv);
4444             SETREX(rex_sv,ST.prev_rex);
4445             rex = (struct regexp *)SvANY(rex_sv);
4446             rexi = RXi_GET(rex);
4447             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4448             PL_reglastparen = &rex->lastparen;
4449             PL_reglastcloseparen = &rex->lastcloseparen;
4450
4451             PL_reginput = locinput;
4452             REGCP_UNWIND(ST.lastcp);
4453             regcppop(rex);
4454             cur_eval = ST.prev_eval;
4455             cur_curlyx = ST.prev_curlyx;
4456             /* XXXX This is too dramatic a measure... */
4457             PL_reg_maxiter = 0;
4458             if ( nochange_depth )
4459                 nochange_depth--;
4460             sayNO_SILENT;
4461 #undef ST
4462
4463         case OPEN:
4464             n = ARG(scan);  /* which paren pair */
4465             PL_reg_start_tmp[n] = locinput;
4466             if (n > PL_regsize)
4467                 PL_regsize = n;
4468             lastopen = n;
4469             break;
4470         case CLOSE:
4471             n = ARG(scan);  /* which paren pair */
4472             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4473             PL_regoffs[n].end = locinput - PL_bostr;
4474             /*if (n > PL_regsize)
4475                 PL_regsize = n;*/
4476             if (n > *PL_reglastparen)
4477                 *PL_reglastparen = n;
4478             *PL_reglastcloseparen = n;
4479             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4480                 goto fake_end;
4481             }
4482             break;
4483         case ACCEPT:
4484             if (ARG(scan)){
4485                 regnode *cursor;
4486                 for (cursor=scan;
4487                      cursor && OP(cursor)!=END;
4488                      cursor=regnext(cursor))
4489                 {
4490                     if ( OP(cursor)==CLOSE ){
4491                         n = ARG(cursor);
4492                         if ( n <= lastopen ) {
4493                             PL_regoffs[n].start
4494                                 = PL_reg_start_tmp[n] - PL_bostr;
4495                             PL_regoffs[n].end = locinput - PL_bostr;
4496                             /*if (n > PL_regsize)
4497                             PL_regsize = n;*/
4498                             if (n > *PL_reglastparen)
4499                                 *PL_reglastparen = n;
4500                             *PL_reglastcloseparen = n;
4501                             if ( n == ARG(scan) || (cur_eval &&
4502                                 cur_eval->u.eval.close_paren == n))
4503                                 break;
4504                         }
4505                     }
4506                 }
4507             }
4508             goto fake_end;
4509             /*NOTREACHED*/
4510         case GROUPP:
4511             n = ARG(scan);  /* which paren pair */
4512             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4513             break;
4514         case NGROUPP:
4515             /* reg_check_named_buff_matched returns 0 for no match */
4516             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4517             break;
4518         case INSUBP:
4519             n = ARG(scan);
4520             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4521             break;
4522         case DEFINEP:
4523             sw = 0;
4524             break;
4525         case IFTHEN:
4526             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4527             if (sw)
4528                 next = NEXTOPER(NEXTOPER(scan));
4529             else {
4530                 next = scan + ARG(scan);
4531                 if (OP(next) == IFTHEN) /* Fake one. */
4532                     next = NEXTOPER(NEXTOPER(next));
4533             }
4534             break;
4535         case LOGICAL:
4536             logical = scan->flags;
4537             break;
4538
4539 /*******************************************************************
4540
4541 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4542 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4543 STAR/PLUS/CURLY/CURLYN are used instead.)
4544
4545 A*B is compiled as <CURLYX><A><WHILEM><B>
4546
4547 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4548 state, which contains the current count, initialised to -1. It also sets
4549 cur_curlyx to point to this state, with any previous value saved in the
4550 state block.
4551
4552 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4553 since the pattern may possibly match zero times (i.e. it's a while {} loop
4554 rather than a do {} while loop).
4555
4556 Each entry to WHILEM represents a successful match of A. The count in the
4557 CURLYX block is incremented, another WHILEM state is pushed, and execution
4558 passes to A or B depending on greediness and the current count.
4559
4560 For example, if matching against the string a1a2a3b (where the aN are
4561 substrings that match /A/), then the match progresses as follows: (the
4562 pushed states are interspersed with the bits of strings matched so far):
4563
4564     <CURLYX cnt=-1>
4565     <CURLYX cnt=0><WHILEM>
4566     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4567     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4568     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4569     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4570
4571 (Contrast this with something like CURLYM, which maintains only a single
4572 backtrack state:
4573
4574     <CURLYM cnt=0> a1
4575     a1 <CURLYM cnt=1> a2
4576     a1 a2 <CURLYM cnt=2> a3
4577     a1 a2 a3 <CURLYM cnt=3> b
4578 )
4579
4580 Each WHILEM state block marks a point to backtrack to upon partial failure
4581 of A or B, and also contains some minor state data related to that
4582 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4583 overall state, such as the count, and pointers to the A and B ops.
4584
4585 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4586 must always point to the *current* CURLYX block, the rules are:
4587
4588 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4589 and set cur_curlyx to point the new block.
4590
4591 When popping the CURLYX block after a successful or unsuccessful match,
4592 restore the previous cur_curlyx.
4593
4594 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4595 to the outer one saved in the CURLYX block.
4596
4597 When popping the WHILEM block after a successful or unsuccessful B match,
4598 restore the previous cur_curlyx.
4599
4600 Here's an example for the pattern (AI* BI)*BO
4601 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4602
4603 cur_
4604 curlyx backtrack stack
4605 ------ ---------------
4606 NULL
4607 CO     <CO prev=NULL> <WO>
4608 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4609 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4610 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4611
4612 At this point the pattern succeeds, and we work back down the stack to
4613 clean up, restoring as we go:
4614
4615 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4616 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4617 CO     <CO prev=NULL> <WO>
4618 NULL
4619
4620 *******************************************************************/
4621
4622 #define ST st->u.curlyx
4623
4624         case CURLYX:    /* start of /A*B/  (for complex A) */
4625         {
4626             /* No need to save/restore up to this paren */
4627             I32 parenfloor = scan->flags;
4628
4629             assert(next); /* keep Coverity happy */
4630             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4631                 next += ARG(next);
4632
4633             /* XXXX Probably it is better to teach regpush to support
4634                parenfloor > PL_regsize... */
4635             if (parenfloor > (I32)*PL_reglastparen)
4636                 parenfloor = *PL_reglastparen; /* Pessimization... */
4637
4638             ST.prev_curlyx= cur_curlyx;
4639             cur_curlyx = st;
4640             ST.cp = PL_savestack_ix;
4641
4642             /* these fields contain the state of the current curly.
4643              * they are accessed by subsequent WHILEMs */
4644             ST.parenfloor = parenfloor;
4645             ST.me = scan;
4646             ST.B = next;
4647             ST.minmod = minmod;
4648             minmod = 0;
4649             ST.count = -1;      /* this will be updated by WHILEM */
4650             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4651
4652             PL_reginput = locinput;
4653             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4654             /* NOTREACHED */
4655         }
4656
4657         case CURLYX_end: /* just finished matching all of A*B */
4658             cur_curlyx = ST.prev_curlyx;
4659             sayYES;
4660             /* NOTREACHED */
4661
4662         case CURLYX_end_fail: /* just failed to match all of A*B */
4663             regcpblow(ST.cp);
4664             cur_curlyx = ST.prev_curlyx;
4665             sayNO;
4666             /* NOTREACHED */
4667
4668
4669 #undef ST
4670 #define ST st->u.whilem
4671
4672         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4673         {
4674             /* see the discussion above about CURLYX/WHILEM */
4675             I32 n;
4676             int min = ARG1(cur_curlyx->u.curlyx.me);
4677             int max = ARG2(cur_curlyx->u.curlyx.me);
4678             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4679
4680             assert(cur_curlyx); /* keep Coverity happy */
4681             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4682             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4683             ST.cache_offset = 0;
4684             ST.cache_mask = 0;
4685
4686             PL_reginput = locinput;
4687
4688             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4689                   "%*s  whilem: matched %ld out of %d..%d\n",
4690                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4691             );
4692
4693             /* First just match a string of min A's. */
4694
4695             if (n < min) {
4696                 cur_curlyx->u.curlyx.lastloc = locinput;
4697                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4698                 /* NOTREACHED */
4699             }
4700
4701             /* If degenerate A matches "", assume A done. */
4702
4703             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4704                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4705                    "%*s  whilem: empty match detected, trying continuation...\n",
4706                    REPORT_CODE_OFF+depth*2, "")
4707                 );
4708                 goto do_whilem_B_max;
4709             }
4710
4711             /* super-linear cache processing */
4712
4713             if (scan->flags) {
4714
4715                 if (!PL_reg_maxiter) {
4716                     /* start the countdown: Postpone detection until we
4717                      * know the match is not *that* much linear. */
4718                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4719                     /* possible overflow for long strings and many CURLYX's */
4720                     if (PL_reg_maxiter < 0)
4721                         PL_reg_maxiter = I32_MAX;
4722                     PL_reg_leftiter = PL_reg_maxiter;
4723                 }
4724
4725                 if (PL_reg_leftiter-- == 0) {
4726                     /* initialise cache */
4727                     const I32 size = (PL_reg_maxiter + 7)/8;
4728                     if (PL_reg_poscache) {
4729                         if ((I32)PL_reg_poscache_size < size) {
4730                             Renew(PL_reg_poscache, size, char);
4731                             PL_reg_poscache_size = size;
4732                         }
4733                         Zero(PL_reg_poscache, size, char);
4734                     }
4735                     else {
4736                         PL_reg_poscache_size = size;
4737                         Newxz(PL_reg_poscache, size, char);
4738                     }
4739                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4740       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4741                               PL_colors[4], PL_colors[5])
4742                     );
4743                 }
4744
4745                 if (PL_reg_leftiter < 0) {
4746                     /* have we already failed at this position? */
4747                     I32 offset, mask;
4748                     offset  = (scan->flags & 0xf) - 1
4749                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4750                     mask    = 1 << (offset % 8);
4751                     offset /= 8;
4752                     if (PL_reg_poscache[offset] & mask) {
4753                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4754                             "%*s  whilem: (cache) already tried at this position...\n",
4755                             REPORT_CODE_OFF+depth*2, "")
4756                         );
4757                         sayNO; /* cache records failure */
4758                     }
4759                     ST.cache_offset = offset;
4760                     ST.cache_mask   = mask;
4761                 }
4762             }
4763
4764             /* Prefer B over A for minimal matching. */
4765
4766             if (cur_curlyx->u.curlyx.minmod) {
4767                 ST.save_curlyx = cur_curlyx;
4768                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4769                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4770                 REGCP_SET(ST.lastcp);
4771                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4772                 /* NOTREACHED */
4773             }
4774
4775             /* Prefer A over B for maximal matching. */
4776
4777             if (n < max) { /* More greed allowed? */
4778                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4779                 cur_curlyx->u.curlyx.lastloc = locinput;
4780                 REGCP_SET(ST.lastcp);
4781                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4782                 /* NOTREACHED */
4783             }
4784             goto do_whilem_B_max;
4785         }
4786         /* NOTREACHED */
4787
4788         case WHILEM_B_min: /* just matched B in a minimal match */
4789         case WHILEM_B_max: /* just matched B in a maximal match */
4790             cur_curlyx = ST.save_curlyx;
4791             sayYES;
4792             /* NOTREACHED */
4793
4794         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4795             cur_curlyx = ST.save_curlyx;
4796             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4797             cur_curlyx->u.curlyx.count--;
4798             CACHEsayNO;
4799             /* NOTREACHED */
4800
4801         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4802             REGCP_UNWIND(ST.lastcp);
4803             regcppop(rex);
4804             /* FALL THROUGH */
4805         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4806             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4807             cur_curlyx->u.curlyx.count--;
4808             CACHEsayNO;
4809             /* NOTREACHED */
4810
4811         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4812             REGCP_UNWIND(ST.lastcp);
4813             regcppop(rex);      /* Restore some previous $<digit>s? */
4814             PL_reginput = locinput;
4815             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4816                 "%*s  whilem: failed, trying continuation...\n",
4817                 REPORT_CODE_OFF+depth*2, "")
4818             );
4819           do_whilem_B_max:
4820             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4821                 && ckWARN(WARN_REGEXP)
4822                 && !(PL_reg_flags & RF_warned))
4823             {
4824                 PL_reg_flags |= RF_warned;
4825                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4826                      "Complex regular subexpression recursion",
4827                      REG_INFTY - 1);
4828             }
4829
4830             /* now try B */
4831             ST.save_curlyx = cur_curlyx;
4832             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4833             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4834             /* NOTREACHED */
4835
4836         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4837             cur_curlyx = ST.save_curlyx;
4838             REGCP_UNWIND(ST.lastcp);
4839             regcppop(rex);
4840
4841             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4842                 /* Maximum greed exceeded */
4843                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4844                     && ckWARN(WARN_REGEXP)
4845                     && !(PL_reg_flags & RF_warned))
4846                 {
4847                     PL_reg_flags |= RF_warned;
4848                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4849                         "%s limit (%d) exceeded",
4850                         "Complex regular subexpression recursion",
4851                         REG_INFTY - 1);
4852                 }
4853                 cur_curlyx->u.curlyx.count--;
4854                 CACHEsayNO;
4855             }
4856
4857             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4858                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4859             );
4860             /* Try grabbing another A and see if it helps. */
4861             PL_reginput = locinput;
4862             cur_curlyx->u.curlyx.lastloc = locinput;
4863             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4864             REGCP_SET(ST.lastcp);
4865             PUSH_STATE_GOTO(WHILEM_A_min,
4866                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4867             /* NOTREACHED */
4868
4869 #undef  ST
4870 #define ST st->u.branch
4871
4872         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4873             next = scan + ARG(scan);
4874             if (next == scan)
4875                 next = NULL;
4876             scan = NEXTOPER(scan);
4877             /* FALL THROUGH */
4878
4879         case BRANCH:        /*  /(...|A|...)/ */
4880             scan = NEXTOPER(scan); /* scan now points to inner node */
4881             ST.lastparen = *PL_reglastparen;
4882             ST.next_branch = next;
4883             REGCP_SET(ST.cp);
4884             PL_reginput = locinput;
4885
4886             /* Now go into the branch */
4887             if (has_cutgroup) {
4888                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4889             } else {
4890                 PUSH_STATE_GOTO(BRANCH_next, scan);
4891             }
4892             /* NOTREACHED */
4893         case CUTGROUP:
4894             PL_reginput = locinput;
4895             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4896                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4897             PUSH_STATE_GOTO(CUTGROUP_next,next);
4898             /* NOTREACHED */
4899         case CUTGROUP_next_fail:
4900             do_cutgroup = 1;
4901             no_final = 1;
4902             if (st->u.mark.mark_name)
4903                 sv_commit = st->u.mark.mark_name;
4904             sayNO;
4905             /* NOTREACHED */
4906         case BRANCH_next:
4907             sayYES;
4908             /* NOTREACHED */
4909         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4910             if (do_cutgroup) {
4911                 do_cutgroup = 0;
4912                 no_final = 0;
4913             }
4914             REGCP_UNWIND(ST.cp);
4915             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4916                 PL_regoffs[n].end = -1;
4917             *PL_reglastparen = n;
4918             /*dmq: *PL_reglastcloseparen = n; */
4919             scan = ST.next_branch;
4920             /* no more branches? */
4921             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4922                 DEBUG_EXECUTE_r({
4923                     PerlIO_printf( Perl_debug_log,
4924                         "%*s  %sBRANCH failed...%s\n",
4925                         REPORT_CODE_OFF+depth*2, "",
4926                         PL_colors[4],
4927                         PL_colors[5] );
4928                 });
4929                 sayNO_SILENT;
4930             }
4931             continue; /* execute next BRANCH[J] op */
4932             /* NOTREACHED */
4933
4934         case MINMOD:
4935             minmod = 1;
4936             break;
4937
4938 #undef  ST
4939 #define ST st->u.curlym
4940
4941         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4942
4943             /* This is an optimisation of CURLYX that enables us to push
4944              * only a single backtracking state, no matter how many matches
4945              * there are in {m,n}. It relies on the pattern being constant
4946              * length, with no parens to influence future backrefs
4947              */
4948
4949             ST.me = scan;
4950             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4951
4952             /* if paren positive, emulate an OPEN/CLOSE around A */
4953             if (ST.me->flags) {
4954                 U32 paren = ST.me->flags;
4955                 if (paren > PL_regsize)
4956                     PL_regsize = paren;
4957                 if (paren > *PL_reglastparen)
4958                     *PL_reglastparen = paren;
4959                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4960             }
4961             ST.A = scan;
4962             ST.B = next;
4963             ST.alen = 0;
4964             ST.count = 0;
4965             ST.minmod = minmod;
4966             minmod = 0;
4967             ST.c1 = CHRTEST_UNINIT;
4968             REGCP_SET(ST.cp);
4969
4970             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4971                 goto curlym_do_B;
4972
4973           curlym_do_A: /* execute the A in /A{m,n}B/  */
4974             PL_reginput = locinput;
4975             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4976             /* NOTREACHED */
4977
4978         case CURLYM_A: /* we've just matched an A */
4979             locinput = st->locinput;
4980             nextchr = UCHARAT(locinput);
4981
4982             ST.count++;
4983             /* after first match, determine A's length: u.curlym.alen */
4984             if (ST.count == 1) {
4985                 if (PL_reg_match_utf8) {
4986                     char *s = locinput;
4987                     while (s < PL_reginput) {
4988                         ST.alen++;
4989                         s += UTF8SKIP(s);
4990                     }
4991                 }
4992                 else {
4993                     ST.alen = PL_reginput - locinput;
4994                 }
4995                 if (ST.alen == 0)
4996                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4997             }
4998             DEBUG_EXECUTE_r(
4999                 PerlIO_printf(Perl_debug_log,
5000                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5001                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5002                           (IV) ST.count, (IV)ST.alen)
5003             );
5004
5005             locinput = PL_reginput;
5006
5007             if (cur_eval && cur_eval->u.eval.close_paren &&
5008                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5009                 goto fake_end;
5010
5011             {
5012                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5013                 if ( max == REG_INFTY || ST.count < max )
5014                     goto curlym_do_A; /* try to match another A */
5015             }
5016             goto curlym_do_B; /* try to match B */
5017
5018         case CURLYM_A_fail: /* just failed to match an A */
5019             REGCP_UNWIND(ST.cp);
5020
5021             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5022                 || (cur_eval && cur_eval->u.eval.close_paren &&
5023                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5024                 sayNO;
5025
5026           curlym_do_B: /* execute the B in /A{m,n}B/  */
5027             PL_reginput = locinput;
5028             if (ST.c1 == CHRTEST_UNINIT) {
5029                 /* calculate c1 and c2 for possible match of 1st char
5030                  * following curly */
5031                 ST.c1 = ST.c2 = CHRTEST_VOID;
5032                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5033                     regnode *text_node = ST.B;
5034                     if (! HAS_TEXT(text_node))
5035                         FIND_NEXT_IMPT(text_node);
5036                     /* this used to be
5037
5038                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5039
5040                         But the former is redundant in light of the latter.
5041
5042                         if this changes back then the macro for
5043                         IS_TEXT and friends need to change.
5044                      */
5045                     if (PL_regkind[OP(text_node)] == EXACT)
5046                     {
5047
5048                         ST.c1 = (U8)*STRING(text_node);
5049                         switch (OP(text_node)) {
5050                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5051                             case EXACTFA:
5052                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5053                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5054                             default: ST.c2 = ST.c1;
5055                         }
5056                     }
5057                 }
5058             }
5059
5060             DEBUG_EXECUTE_r(
5061                 PerlIO_printf(Perl_debug_log,
5062                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5063                     (int)(REPORT_CODE_OFF+(depth*2)),
5064                     "", (IV)ST.count)
5065                 );
5066             if (ST.c1 != CHRTEST_VOID
5067                     && UCHARAT(PL_reginput) != ST.c1
5068                     && UCHARAT(PL_reginput) != ST.c2)
5069             {
5070                 /* simulate B failing */
5071                 DEBUG_OPTIMISE_r(
5072                     PerlIO_printf(Perl_debug_log,
5073                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5074                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5075                         (IV)ST.c1,(IV)ST.c2
5076                 ));
5077                 state_num = CURLYM_B_fail;
5078                 goto reenter_switch;
5079             }
5080
5081             if (ST.me->flags) {
5082                 /* mark current A as captured */
5083                 I32 paren = ST.me->flags;
5084                 if (ST.count) {
5085                     PL_regoffs[paren].start
5086                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5087                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5088                     /*dmq: *PL_reglastcloseparen = paren; */
5089                 }
5090                 else
5091                     PL_regoffs[paren].end = -1;
5092                 if (cur_eval && cur_eval->u.eval.close_paren &&
5093                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5094                 {
5095                     if (ST.count)
5096                         goto fake_end;
5097                     else
5098                         sayNO;
5099                 }
5100             }
5101
5102             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5103             /* NOTREACHED */
5104
5105         case CURLYM_B_fail: /* just failed to match a B */
5106             REGCP_UNWIND(ST.cp);
5107             if (ST.minmod) {
5108                 I32 max = ARG2(ST.me);
5109                 if (max != REG_INFTY && ST.count == max)
5110                     sayNO;
5111                 goto curlym_do_A; /* try to match a further A */
5112             }
5113             /* backtrack one A */
5114             if (ST.count == ARG1(ST.me) /* min */)
5115                 sayNO;
5116             ST.count--;
5117             locinput = HOPc(locinput, -ST.alen);
5118             goto curlym_do_B; /* try to match B */
5119
5120 #undef ST
5121 #define ST st->u.curly
5122
5123 #define CURLY_SETPAREN(paren, success) \
5124     if (paren) { \
5125         if (success) { \
5126             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5127             PL_regoffs[paren].end = locinput - PL_bostr; \
5128             *PL_reglastcloseparen = paren; \
5129         } \
5130         else \
5131             PL_regoffs[paren].end = -1; \
5132     }
5133
5134         case STAR:              /*  /A*B/ where A is width 1 */
5135             ST.paren = 0;
5136             ST.min = 0;
5137             ST.max = REG_INFTY;
5138             scan = NEXTOPER(scan);
5139             goto repeat;
5140         case PLUS:              /*  /A+B/ where A is width 1 */
5141             ST.paren = 0;
5142             ST.min = 1;
5143             ST.max = REG_INFTY;
5144             scan = NEXTOPER(scan);
5145             goto repeat;
5146         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5147             ST.paren = scan->flags;     /* Which paren to set */
5148             if (ST.paren > PL_regsize)
5149                 PL_regsize = ST.paren;
5150             if (ST.paren > *PL_reglastparen)
5151                 *PL_reglastparen = ST.paren;
5152             ST.min = ARG1(scan);  /* min to match */
5153             ST.max = ARG2(scan);  /* max to match */
5154             if (cur_eval && cur_eval->u.eval.close_paren &&
5155                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5156                 ST.min=1;
5157                 ST.max=1;
5158             }
5159             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5160             goto repeat;
5161         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5162             ST.paren = 0;
5163             ST.min = ARG1(scan);  /* min to match */
5164             ST.max = ARG2(scan);  /* max to match */
5165             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5166           repeat:
5167             /*
5168             * Lookahead to avoid useless match attempts
5169             * when we know what character comes next.
5170             *
5171             * Used to only do .*x and .*?x, but now it allows
5172             * for )'s, ('s and (?{ ... })'s to be in the way
5173             * of the quantifier and the EXACT-like node.  -- japhy
5174             */
5175
5176             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5177                 sayNO;
5178             if (HAS_TEXT(next) || JUMPABLE(next)) {
5179                 U8 *s;
5180                 regnode *text_node = next;
5181
5182                 if (! HAS_TEXT(text_node))
5183                     FIND_NEXT_IMPT(text_node);
5184
5185                 if (! HAS_TEXT(text_node))
5186                     ST.c1 = ST.c2 = CHRTEST_VOID;
5187                 else {
5188                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5189                         ST.c1 = ST.c2 = CHRTEST_VOID;
5190                         goto assume_ok_easy;
5191                     }
5192                     else
5193                         s = (U8*)STRING(text_node);
5194
5195                     /*  Currently we only get here when
5196
5197                         PL_rekind[OP(text_node)] == EXACT
5198
5199                         if this changes back then the macro for IS_TEXT and
5200                         friends need to change. */
5201                     if (!UTF_PATTERN) {
5202                         ST.c1 = *s;
5203                         switch (OP(text_node)) {
5204                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5205                             case EXACTFA:
5206                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5207                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5208                             default: ST.c2 = ST.c1; break;
5209                         }
5210                     }
5211                     else { /* UTF_PATTERN */
5212                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5213                              STRLEN ulen1, ulen2;
5214                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5215                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5216
5217                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5218                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5219 #ifdef EBCDIC
5220                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5221                                                     ckWARN(WARN_UTF8) ?
5222                                                     0 : UTF8_ALLOW_ANY);
5223                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5224                                                     ckWARN(WARN_UTF8) ?
5225                                                     0 : UTF8_ALLOW_ANY);
5226 #else
5227                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5228                                                     uniflags);
5229                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5230                                                     uniflags);
5231 #endif
5232                         }
5233                         else {
5234                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5235                                                      uniflags);
5236                         }
5237                     }
5238                 }
5239             }
5240             else
5241                 ST.c1 = ST.c2 = CHRTEST_VOID;
5242         assume_ok_easy:
5243
5244             ST.A = scan;
5245             ST.B = next;
5246             PL_reginput = locinput;
5247             if (minmod) {
5248                 minmod = 0;
5249                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5250                     sayNO;
5251                 ST.count = ST.min;
5252                 locinput = PL_reginput;
5253                 REGCP_SET(ST.cp);
5254                 if (ST.c1 == CHRTEST_VOID)
5255                     goto curly_try_B_min;
5256
5257                 ST.oldloc = locinput;
5258
5259                 /* set ST.maxpos to the furthest point along the
5260                  * string that could possibly match */
5261                 if  (ST.max == REG_INFTY) {
5262                     ST.maxpos = PL_regeol - 1;
5263                     if (utf8_target)
5264                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5265                             ST.maxpos--;
5266                 }
5267                 else if (utf8_target) {
5268                     int m = ST.max - ST.min;
5269                     for (ST.maxpos = locinput;
5270                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5271                         ST.maxpos += UTF8SKIP(ST.maxpos);
5272                 }
5273                 else {
5274                     ST.maxpos = locinput + ST.max - ST.min;
5275                     if (ST.maxpos >= PL_regeol)
5276                         ST.maxpos = PL_regeol - 1;
5277                 }
5278                 goto curly_try_B_min_known;
5279
5280             }
5281             else {
5282                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5283                 locinput = PL_reginput;
5284                 if (ST.count < ST.min)
5285                     sayNO;
5286                 if ((ST.count > ST.min)
5287                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5288                 {
5289                     /* A{m,n} must come at the end of the string, there's
5290                      * no point in backing off ... */
5291                     ST.min = ST.count;
5292                     /* ...except that $ and \Z can match before *and* after
5293                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5294                        We may back off by one in this case. */
5295                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5296                         ST.min--;
5297                 }
5298                 REGCP_SET(ST.cp);
5299                 goto curly_try_B_max;
5300             }
5301             /* NOTREACHED */
5302
5303
5304         case CURLY_B_min_known_fail:
5305             /* failed to find B in a non-greedy match where c1,c2 valid */
5306             if (ST.paren && ST.count)
5307                 PL_regoffs[ST.paren].end = -1;
5308
5309             PL_reginput = locinput;     /* Could be reset... */
5310             REGCP_UNWIND(ST.cp);
5311             /* Couldn't or didn't -- move forward. */
5312             ST.oldloc = locinput;
5313             if (utf8_target)
5314                 locinput += UTF8SKIP(locinput);
5315             else
5316                 locinput++;
5317             ST.count++;
5318           curly_try_B_min_known:
5319              /* find the next place where 'B' could work, then call B */
5320             {
5321                 int n;
5322                 if (utf8_target) {
5323                     n = (ST.oldloc == locinput) ? 0 : 1;
5324                     if (ST.c1 == ST.c2) {
5325                         STRLEN len;
5326                         /* set n to utf8_distance(oldloc, locinput) */
5327                         while (locinput <= ST.maxpos &&
5328                                utf8n_to_uvchr((U8*)locinput,
5329                                               UTF8_MAXBYTES, &len,
5330                                               uniflags) != (UV)ST.c1) {
5331                             locinput += len;
5332                             n++;
5333                         }
5334                     }
5335                     else {
5336                         /* set n to utf8_distance(oldloc, locinput) */
5337                         while (locinput <= ST.maxpos) {
5338                             STRLEN len;
5339                             const UV c = utf8n_to_uvchr((U8*)locinput,
5340                                                   UTF8_MAXBYTES, &len,
5341                                                   uniflags);
5342                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5343                                 break;
5344                             locinput += len;
5345                             n++;
5346                         }
5347                     }
5348                 }
5349                 else {
5350                     if (ST.c1 == ST.c2) {
5351                         while (locinput <= ST.maxpos &&
5352                                UCHARAT(locinput) != ST.c1)
5353                             locinput++;
5354                     }
5355                     else {
5356                         while (locinput <= ST.maxpos
5357                                && UCHARAT(locinput) != ST.c1
5358                                && UCHARAT(locinput) != ST.c2)
5359                             locinput++;
5360                     }
5361                     n = locinput - ST.oldloc;
5362                 }
5363                 if (locinput > ST.maxpos)
5364                     sayNO;
5365                 /* PL_reginput == oldloc now */
5366                 if (n) {
5367                     ST.count += n;
5368                     if (regrepeat(rex, ST.A, n, depth) < n)
5369                         sayNO;
5370                 }
5371                 PL_reginput = locinput;
5372                 CURLY_SETPAREN(ST.paren, ST.count);
5373                 if (cur_eval && cur_eval->u.eval.close_paren &&
5374                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5375                     goto fake_end;
5376                 }
5377                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5378             }
5379             /* NOTREACHED */
5380
5381
5382         case CURLY_B_min_fail:
5383             /* failed to find B in a non-greedy match where c1,c2 invalid */
5384             if (ST.paren && ST.count)
5385                 PL_regoffs[ST.paren].end = -1;
5386
5387             REGCP_UNWIND(ST.cp);
5388             /* failed -- move forward one */
5389             PL_reginput = locinput;
5390             if (regrepeat(rex, ST.A, 1, depth)) {
5391                 ST.count++;
5392                 locinput = PL_reginput;
5393                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5394                         ST.count > 0)) /* count overflow ? */
5395                 {
5396                   curly_try_B_min:
5397                     CURLY_SETPAREN(ST.paren, ST.count);
5398                     if (cur_eval && cur_eval->u.eval.close_paren &&
5399                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5400                         goto fake_end;
5401                     }
5402                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5403                 }
5404             }
5405             sayNO;
5406             /* NOTREACHED */
5407
5408
5409         curly_try_B_max:
5410             /* a successful greedy match: now try to match B */
5411             if (cur_eval && cur_eval->u.eval.close_paren &&
5412                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5413                 goto fake_end;
5414             }
5415             {
5416                 UV c = 0;
5417                 if (ST.c1 != CHRTEST_VOID)
5418                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5419                                            UTF8_MAXBYTES, 0, uniflags)
5420                                 : (UV) UCHARAT(PL_reginput);
5421                 /* If it could work, try it. */
5422                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5423                     CURLY_SETPAREN(ST.paren, ST.count);
5424                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5425                     /* NOTREACHED */
5426                 }
5427             }
5428             /* FALL THROUGH */
5429         case CURLY_B_max_fail:
5430             /* failed to find B in a greedy match */
5431             if (ST.paren && ST.count)
5432                 PL_regoffs[ST.paren].end = -1;
5433
5434             REGCP_UNWIND(ST.cp);
5435             /*  back up. */
5436             if (--ST.count < ST.min)
5437                 sayNO;
5438             PL_reginput = locinput = HOPc(locinput, -1);
5439             goto curly_try_B_max;
5440
5441 #undef ST
5442
5443         case END:
5444             fake_end:
5445             if (cur_eval) {
5446                 /* we've just finished A in /(??{A})B/; now continue with B */
5447                 I32 tmpix;
5448                 st->u.eval.toggle_reg_flags
5449                             = cur_eval->u.eval.toggle_reg_flags;
5450                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5451
5452                 st->u.eval.prev_rex = rex_sv;           /* inner */
5453                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5454                 rex = (struct regexp *)SvANY(rex_sv);
5455                 rexi = RXi_GET(rex);
5456                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5457                 ReREFCNT_inc(rex_sv);
5458                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5459
5460                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5461                 PL_reglastparen = &rex->lastparen;
5462                 PL_reglastcloseparen = &rex->lastcloseparen;
5463
5464                 REGCP_SET(st->u.eval.lastcp);
5465                 PL_reginput = locinput;
5466
5467                 /* Restore parens of the outer rex without popping the
5468                  * savestack */
5469                 tmpix = PL_savestack_ix;
5470                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5471                 regcppop(rex);
5472                 PL_savestack_ix = tmpix;
5473
5474                 st->u.eval.prev_eval = cur_eval;
5475                 cur_eval = cur_eval->u.eval.prev_eval;
5476                 DEBUG_EXECUTE_r(
5477                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5478                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5479                 if ( nochange_depth )
5480                     nochange_depth--;
5481
5482                 PUSH_YES_STATE_GOTO(EVAL_AB,
5483                         st->u.eval.prev_eval->u.eval.B); /* match B */
5484             }
5485
5486             if (locinput < reginfo->till) {
5487                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5488                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5489                                       PL_colors[4],
5490                                       (long)(locinput - PL_reg_starttry),
5491                                       (long)(reginfo->till - PL_reg_starttry),
5492                                       PL_colors[5]));
5493
5494                 sayNO_SILENT;           /* Cannot match: too short. */
5495             }
5496             PL_reginput = locinput;     /* put where regtry can find it */
5497             sayYES;                     /* Success! */
5498
5499         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5500             DEBUG_EXECUTE_r(
5501             PerlIO_printf(Perl_debug_log,
5502                 "%*s  %ssubpattern success...%s\n",
5503                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5504             PL_reginput = locinput;     /* put where regtry can find it */
5505             sayYES;                     /* Success! */
5506
5507 #undef  ST
5508 #define ST st->u.ifmatch
5509
5510         case SUSPEND:   /* (?>A) */
5511             ST.wanted = 1;
5512             PL_reginput = locinput;
5513             goto do_ifmatch;
5514
5515         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5516             ST.wanted = 0;
5517             goto ifmatch_trivial_fail_test;
5518
5519         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5520             ST.wanted = 1;
5521           ifmatch_trivial_fail_test:
5522             if (scan->flags) {
5523                 char * const s = HOPBACKc(locinput, scan->flags);
5524                 if (!s) {
5525                     /* trivial fail */
5526                     if (logical) {
5527                         logical = 0;
5528                         sw = 1 - cBOOL(ST.wanted);
5529                     }
5530                     else if (ST.wanted)
5531                         sayNO;
5532                     next = scan + ARG(scan);
5533                     if (next == scan)
5534                         next = NULL;
5535                     break;
5536                 }
5537                 PL_reginput = s;
5538             }
5539             else
5540                 PL_reginput = locinput;
5541
5542           do_ifmatch:
5543             ST.me = scan;
5544             ST.logical = logical;
5545             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5546
5547             /* execute body of (?...A) */
5548             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5549             /* NOTREACHED */
5550
5551         case IFMATCH_A_fail: /* body of (?...A) failed */
5552             ST.wanted = !ST.wanted;
5553             /* FALL THROUGH */
5554
5555         case IFMATCH_A: /* body of (?...A) succeeded */
5556             if (ST.logical) {
5557                 sw = cBOOL(ST.wanted);
5558             }
5559             else if (!ST.wanted)
5560                 sayNO;
5561
5562             if (OP(ST.me) == SUSPEND)
5563                 locinput = PL_reginput;
5564             else {
5565                 locinput = PL_reginput = st->locinput;
5566                 nextchr = UCHARAT(locinput);
5567             }
5568             scan = ST.me + ARG(ST.me);
5569             if (scan == ST.me)
5570                 scan = NULL;
5571             continue; /* execute B */
5572
5573 #undef ST
5574
5575         case LONGJMP:
5576             next = scan + ARG(scan);
5577             if (next == scan)
5578                 next = NULL;
5579             break;
5580         case COMMIT:
5581             reginfo->cutpoint = PL_regeol;
5582             /* FALLTHROUGH */
5583         case PRUNE:
5584             PL_reginput = locinput;
5585             if (!scan->flags)
5586                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5587             PUSH_STATE_GOTO(COMMIT_next,next);
5588             /* NOTREACHED */
5589         case COMMIT_next_fail:
5590             no_final = 1;
5591             /* FALLTHROUGH */
5592         case OPFAIL:
5593             sayNO;
5594             /* NOTREACHED */
5595
5596 #define ST st->u.mark
5597         case MARKPOINT:
5598             ST.prev_mark = mark_state;
5599             ST.mark_name = sv_commit = sv_yes_mark
5600                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5601             mark_state = st;
5602             ST.mark_loc = PL_reginput = locinput;
5603             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5604             /* NOTREACHED */
5605         case MARKPOINT_next:
5606             mark_state = ST.prev_mark;
5607             sayYES;
5608             /* NOTREACHED */
5609         case MARKPOINT_next_fail:
5610             if (popmark && sv_eq(ST.mark_name,popmark))
5611             {
5612                 if (ST.mark_loc > startpoint)
5613                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5614                 popmark = NULL; /* we found our mark */
5615                 sv_commit = ST.mark_name;
5616
5617                 DEBUG_EXECUTE_r({
5618                         PerlIO_printf(Perl_debug_log,
5619                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5620                             REPORT_CODE_OFF+depth*2, "",
5621                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5622                 });
5623             }
5624             mark_state = ST.prev_mark;
5625             sv_yes_mark = mark_state ?
5626                 mark_state->u.mark.mark_name : NULL;
5627             sayNO;
5628             /* NOTREACHED */
5629         case SKIP:
5630             PL_reginput = locinput;
5631             if (scan->flags) {
5632                 /* (*SKIP) : if we fail we cut here*/
5633                 ST.mark_name = NULL;
5634                 ST.mark_loc = locinput;
5635                 PUSH_STATE_GOTO(SKIP_next,next);
5636             } else {
5637                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5638                    otherwise do nothing.  Meaning we need to scan
5639                  */
5640                 regmatch_state *cur = mark_state;
5641                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5642
5643                 while (cur) {
5644                     if ( sv_eq( cur->u.mark.mark_name,
5645                                 find ) )
5646                     {
5647                         ST.mark_name = find;
5648                         PUSH_STATE_GOTO( SKIP_next, next );
5649                     }
5650                     cur = cur->u.mark.prev_mark;
5651                 }
5652             }
5653             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5654             break;
5655         case SKIP_next_fail:
5656             if (ST.mark_name) {
5657                 /* (*CUT:NAME) - Set up to search for the name as we
5658                    collapse the stack*/
5659                 popmark = ST.mark_name;
5660             } else {
5661                 /* (*CUT) - No name, we cut here.*/
5662                 if (ST.mark_loc > startpoint)
5663                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5664                 /* but we set sv_commit to latest mark_name if there
5665                    is one so they can test to see how things lead to this
5666                    cut */
5667                 if (mark_state)
5668                     sv_commit=mark_state->u.mark.mark_name;
5669             }
5670             no_final = 1;
5671             sayNO;
5672             /* NOTREACHED */
5673 #undef ST
5674         case FOLDCHAR:
5675             n = ARG(scan);
5676             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5677                 locinput += ln;
5678             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5679                 sayNO;
5680             } else  {
5681                 U8 folded[UTF8_MAXBYTES_CASE+1];
5682                 STRLEN foldlen;
5683                 const char * const l = locinput;
5684                 char *e = PL_regeol;
5685                 to_uni_fold(n, folded, &foldlen);
5686
5687                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5688                                l, &e, 0,  utf8_target)) {
5689                         sayNO;
5690                 }
5691                 locinput = e;
5692             }
5693             nextchr = UCHARAT(locinput);
5694             break;
5695         case LNBREAK:
5696             if ((n=is_LNBREAK(locinput,utf8_target))) {
5697                 locinput += n;
5698                 nextchr = UCHARAT(locinput);
5699             } else
5700                 sayNO;
5701             break;
5702
5703 #define CASE_CLASS(nAmE)                              \
5704         case nAmE:                                    \
5705             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5706                 locinput += n;                        \
5707                 nextchr = UCHARAT(locinput);          \
5708             } else                                    \
5709                 sayNO;                                \
5710             break;                                    \
5711         case N##nAmE:                                 \
5712             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5713                 sayNO;                                \
5714             } else {                                  \
5715                 locinput += UTF8SKIP(locinput);       \
5716                 nextchr = UCHARAT(locinput);          \
5717             }                                         \
5718             break
5719
5720         CASE_CLASS(VERTWS);
5721         CASE_CLASS(HORIZWS);
5722 #undef CASE_CLASS
5723
5724         default:
5725             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5726                           PTR2UV(scan), OP(scan));
5727             Perl_croak(aTHX_ "regexp memory corruption");
5728
5729         } /* end switch */
5730
5731         /* switch break jumps here */
5732         scan = next; /* prepare to execute the next op and ... */
5733         continue;    /* ... jump back to the top, reusing st */
5734         /* NOTREACHED */
5735
5736       push_yes_state:
5737         /* push a state that backtracks on success */
5738         st->u.yes.prev_yes_state = yes_state;
5739         yes_state = st;
5740         /* FALL THROUGH */
5741       push_state:
5742         /* push a new regex state, then continue at scan  */
5743         {
5744             regmatch_state *newst;
5745
5746             DEBUG_STACK_r({
5747                 regmatch_state *cur = st;
5748                 regmatch_state *curyes = yes_state;
5749                 int curd = depth;
5750                 regmatch_slab *slab = PL_regmatch_slab;
5751                 for (;curd > -1;cur--,curd--) {
5752                     if (cur < SLAB_FIRST(slab)) {
5753                         slab = slab->prev;
5754                         cur = SLAB_LAST(slab);
5755                     }
5756                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5757                         REPORT_CODE_OFF + 2 + depth * 2,"",
5758                         curd, PL_reg_name[cur->resume_state],
5759                         (curyes == cur) ? "yes" : ""
5760                     );
5761                     if (curyes == cur)
5762                         curyes = cur->u.yes.prev_yes_state;
5763                 }
5764             } else
5765                 DEBUG_STATE_pp("push")
5766             );
5767             depth++;
5768             st->locinput = locinput;
5769             newst = st+1;
5770             if (newst >  SLAB_LAST(PL_regmatch_slab))
5771                 newst = S_push_slab(aTHX);
5772             PL_regmatch_state = newst;
5773
5774             locinput = PL_reginput;
5775             nextchr = UCHARAT(locinput);
5776             st = newst;
5777             continue;
5778             /* NOTREACHED */
5779         }
5780     }
5781
5782     /*
5783     * We get here only if there's trouble -- normally "case END" is
5784     * the terminating point.
5785     */
5786     Perl_croak(aTHX_ "corrupted regexp pointers");
5787     /*NOTREACHED*/
5788     sayNO;
5789
5790 yes:
5791     if (yes_state) {
5792         /* we have successfully completed a subexpression, but we must now
5793          * pop to the state marked by yes_state and continue from there */
5794         assert(st != yes_state);
5795 #ifdef DEBUGGING
5796         while (st != yes_state) {
5797             st--;
5798             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5799                 PL_regmatch_slab = PL_regmatch_slab->prev;
5800                 st = SLAB_LAST(PL_regmatch_slab);
5801             }
5802             DEBUG_STATE_r({
5803                 if (no_final) {
5804                     DEBUG_STATE_pp("pop (no final)");
5805                 } else {
5806                     DEBUG_STATE_pp("pop (yes)");
5807                 }
5808             });
5809             depth--;
5810         }
5811 #else
5812         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5813             || yes_state > SLAB_LAST(PL_regmatch_slab))
5814         {
5815             /* not in this slab, pop slab */
5816             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5817             PL_regmatch_slab = PL_regmatch_slab->prev;
5818             st = SLAB_LAST(PL_regmatch_slab);
5819         }
5820         depth -= (st - yes_state);
5821 #endif
5822         st = yes_state;
5823         yes_state = st->u.yes.prev_yes_state;
5824         PL_regmatch_state = st;
5825
5826         if (no_final) {
5827             locinput= st->locinput;
5828             nextchr = UCHARAT(locinput);
5829         }
5830         state_num = st->resume_state + no_final;
5831         goto reenter_switch;
5832     }
5833
5834     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5835                           PL_colors[4], PL_colors[5]));
5836
5837     if (PL_reg_eval_set) {
5838         /* each successfully executed (?{...}) block does the equivalent of
5839          *   local $^R = do {...}
5840          * When popping the save stack, all these locals would be undone;
5841          * bypass this by setting the outermost saved $^R to the latest
5842          * value */
5843         if (oreplsv != GvSV(PL_replgv))
5844             sv_setsv(oreplsv, GvSV(PL_replgv));
5845     }
5846     result = 1;
5847     goto final_exit;
5848
5849 no:
5850     DEBUG_EXECUTE_r(
5851         PerlIO_printf(Perl_debug_log,
5852             "%*s  %sfailed...%s\n",
5853             REPORT_CODE_OFF+depth*2, "",
5854             PL_colors[4], PL_colors[5])
5855         );
5856
5857 no_silent:
5858     if (no_final) {
5859         if (yes_state) {
5860             goto yes;
5861         } else {
5862             goto final_exit;
5863         }
5864     }
5865     if (depth) {
5866         /* there's a previous state to backtrack to */
5867         st--;
5868         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5869             PL_regmatch_slab = PL_regmatch_slab->prev;
5870             st = SLAB_LAST(PL_regmatch_slab);
5871         }
5872         PL_regmatch_state = st;
5873         locinput= st->locinput;
5874         nextchr = UCHARAT(locinput);
5875
5876         DEBUG_STATE_pp("pop");
5877         depth--;
5878         if (yes_state == st)
5879             yes_state = st->u.yes.prev_yes_state;
5880
5881         state_num = st->resume_state + 1; /* failure = success + 1 */
5882         goto reenter_switch;
5883     }
5884     result = 0;
5885
5886   final_exit:
5887     if (rex->intflags & PREGf_VERBARG_SEEN) {
5888         SV *sv_err = get_sv("REGERROR", 1);
5889         SV *sv_mrk = get_sv("REGMARK", 1);
5890         if (result) {
5891             sv_commit = &PL_sv_no;
5892             if (!sv_yes_mark)
5893                 sv_yes_mark = &PL_sv_yes;
5894         } else {
5895             if (!sv_commit)
5896                 sv_commit = &PL_sv_yes;
5897             sv_yes_mark = &PL_sv_no;
5898         }
5899         sv_setsv(sv_err, sv_commit);
5900         sv_setsv(sv_mrk, sv_yes_mark);
5901     }
5902
5903     /* clean up; in particular, free all slabs above current one */
5904     LEAVE_SCOPE(oldsave);
5905
5906     return result;
5907 }
5908
5909 /*
5910  - regrepeat - repeatedly match something simple, report how many
5911  */
5912 /*
5913  * [This routine now assumes that it will only match on things of length 1.
5914  * That was true before, but now we assume scan - reginput is the count,
5915  * rather than incrementing count on every character.  [Er, except utf8.]]
5916  */
5917 STATIC I32
5918 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5919 {
5920     dVAR;
5921     register char *scan;
5922     register I32 c;
5923     register char *loceol = PL_regeol;
5924     register I32 hardcount = 0;
5925     register bool utf8_target = PL_reg_match_utf8;
5926     UV utf8_flags;
5927 #ifndef DEBUGGING
5928     PERL_UNUSED_ARG(depth);
5929 #endif
5930
5931     PERL_ARGS_ASSERT_REGREPEAT;
5932
5933     scan = PL_reginput;
5934     if (max == REG_INFTY)
5935         max = I32_MAX;
5936     else if (max < loceol - scan)
5937         loceol = scan + max;
5938     switch (OP(p)) {
5939     case REG_ANY:
5940         if (utf8_target) {
5941             loceol = PL_regeol;
5942             while (scan < loceol && hardcount < max && *scan != '\n') {
5943                 scan += UTF8SKIP(scan);
5944                 hardcount++;
5945             }
5946         } else {
5947             while (scan < loceol && *scan != '\n')
5948                 scan++;
5949         }
5950         break;
5951     case SANY:
5952         if (utf8_target) {
5953             loceol = PL_regeol;
5954             while (scan < loceol && hardcount < max) {
5955                 scan += UTF8SKIP(scan);
5956                 hardcount++;
5957             }
5958         }
5959         else
5960             scan = loceol;
5961         break;
5962     case CANY:
5963         scan = loceol;
5964         break;
5965     case EXACT:
5966         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5967          * means they match only characters in the string that can be expressed
5968          * as a single byte.  For non-utf8 strings, that means a simple match.
5969          * For utf8 strings, the character matched must be an invariant, or
5970          * downgradable to a single byte.  The pattern's utf8ness is
5971          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5972          * it is, it's an invariant */
5973
5974         c = (U8)*STRING(p);
5975         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5976
5977         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5978             while (scan < loceol && UCHARAT(scan) == c) {
5979                 scan++;
5980             }
5981         }
5982         else {
5983
5984             /* Here, the string is utf8, and the pattern char is different
5985              * in utf8 than not, so can't compare them directly.  Outside the
5986              * loop, find find the two utf8 bytes that represent c, and then
5987              * look for those in sequence in the utf8 string */
5988             U8 high = UTF8_TWO_BYTE_HI(c);
5989             U8 low = UTF8_TWO_BYTE_LO(c);
5990             loceol = PL_regeol;
5991
5992             while (hardcount < max
5993                     && scan + 1 < loceol
5994                     && UCHARAT(scan) == high
5995                     && UCHARAT(scan + 1) == low)
5996             {
5997                 scan += 2;
5998                 hardcount++;
5999             }
6000         }
6001         break;
6002     case EXACTFA:
6003         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6004         goto do_exactf;
6005
6006     case EXACTFL:
6007         PL_reg_flags |= RF_tainted;
6008         utf8_flags = FOLDEQ_UTF8_LOCALE;
6009         goto do_exactf;
6010
6011     case EXACTF:
6012     case EXACTFU:
6013         utf8_flags = 0;
6014
6015         /* The comments for the EXACT case above apply as well to these fold
6016          * ones */
6017
6018     do_exactf:
6019         c = (U8)*STRING(p);
6020         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6021
6022         if (utf8_target) { /* Use full Unicode fold matching */
6023             char *tmpeol = loceol;
6024             while (hardcount < max
6025                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6026                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6027             {
6028                 scan = tmpeol;
6029                 tmpeol = loceol;
6030                 hardcount++;
6031             }
6032
6033             /* XXX Note that the above handles properly the German sharp s in
6034              * the pattern matching ss in the string.  But it doesn't handle
6035              * properly cases where the string contains say 'LIGATURE ff' and
6036              * the pattern is 'f+'.  This would require, say, a new function or
6037              * revised interface to foldEQ_utf8(), in which the maximum number
6038              * of characters to match could be passed and it would return how
6039              * many actually did.  This is just one of many cases where
6040              * multi-char folds don't work properly, and so the fix is being
6041              * deferred */
6042         }
6043         else {
6044             U8 folded;
6045
6046             /* Here, the string isn't utf8 and c is a single byte; and either
6047              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6048              * doesn't affect c.  Can just do simple comparisons for exact or
6049              * fold matching. */
6050             switch (OP(p)) {
6051                 case EXACTF: folded = PL_fold[c]; break;
6052                 case EXACTFA:
6053                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6054                 case EXACTFL: folded = PL_fold_locale[c]; break;
6055                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6056             }
6057             while (scan < loceol &&
6058                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6059             {
6060                 scan++;
6061             }
6062         }
6063         break;
6064     case ANYOFV:
6065     case ANYOF:
6066         if (utf8_target || OP(p) == ANYOFV) {
6067             STRLEN inclasslen;
6068             loceol = PL_regeol;
6069             inclasslen = loceol - scan;
6070             while (hardcount < max
6071                    && ((inclasslen = loceol - scan) > 0)
6072                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6073             {
6074                 scan += inclasslen;
6075                 hardcount++;
6076             }
6077         } else {
6078             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6079                 scan++;
6080         }
6081         break;
6082     case ALNUMU:
6083         if (utf8_target) {
6084     utf8_wordchar:
6085             loceol = PL_regeol;
6086             LOAD_UTF8_CHARCLASS_ALNUM();
6087             while (hardcount < max && scan < loceol &&
6088                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6089             {
6090                 scan += UTF8SKIP(scan);
6091                 hardcount++;
6092             }
6093         } else {
6094             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6095                 scan++;
6096             }
6097         }
6098         break;
6099     case ALNUM:
6100         if (utf8_target)
6101             goto utf8_wordchar;
6102         while (scan < loceol && isALNUM((U8) *scan)) {
6103             scan++;
6104         }
6105         break;
6106     case ALNUMA:
6107         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6108             scan++;
6109         }
6110         break;
6111     case ALNUML:
6112         PL_reg_flags |= RF_tainted;
6113         if (utf8_target) {
6114             loceol = PL_regeol;
6115             while (hardcount < max && scan < loceol &&
6116                    isALNUM_LC_utf8((U8*)scan)) {
6117                 scan += UTF8SKIP(scan);
6118                 hardcount++;
6119             }
6120         } else {
6121             while (scan < loceol && isALNUM_LC(*scan))
6122                 scan++;
6123         }
6124         break;
6125     case NALNUMU:
6126         if (utf8_target) {
6127
6128     utf8_Nwordchar:
6129
6130             loceol = PL_regeol;
6131             LOAD_UTF8_CHARCLASS_ALNUM();
6132             while (hardcount < max && scan < loceol &&
6133                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6134             {
6135                 scan += UTF8SKIP(scan);
6136                 hardcount++;
6137             }
6138         } else {
6139             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6140                 scan++;
6141             }
6142         }
6143         break;
6144     case NALNUM:
6145         if (utf8_target)
6146             goto utf8_Nwordchar;
6147         while (scan < loceol && ! isALNUM((U8) *scan)) {
6148             scan++;
6149         }
6150         break;
6151     case NALNUMA:
6152         if (utf8_target) {
6153             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6154                 scan += UTF8SKIP(scan);
6155             }
6156         }
6157         else {
6158             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6159                 scan++;
6160             }
6161         }
6162         break;
6163     case NALNUML:
6164         PL_reg_flags |= RF_tainted;
6165         if (utf8_target) {
6166             loceol = PL_regeol;
6167             while (hardcount < max && scan < loceol &&
6168                    !isALNUM_LC_utf8((U8*)scan)) {
6169                 scan += UTF8SKIP(scan);
6170                 hardcount++;
6171             }
6172         } else {
6173             while (scan < loceol && !isALNUM_LC(*scan))
6174                 scan++;
6175         }
6176         break;
6177     case SPACEU:
6178         if (utf8_target) {
6179
6180     utf8_space:
6181
6182             loceol = PL_regeol;
6183             LOAD_UTF8_CHARCLASS_SPACE();
6184             while (hardcount < max && scan < loceol &&
6185                    (*scan == ' ' ||
6186                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6187             {
6188                 scan += UTF8SKIP(scan);
6189                 hardcount++;
6190             }
6191             break;
6192         }
6193         else {
6194             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6195                 scan++;
6196             }
6197             break;
6198         }
6199     case SPACE:
6200         if (utf8_target)
6201             goto utf8_space;
6202
6203         while (scan < loceol && isSPACE((U8) *scan)) {
6204             scan++;
6205         }
6206         break;
6207     case SPACEA:
6208         while (scan < loceol && isSPACE_A((U8) *scan)) {
6209             scan++;
6210         }
6211         break;
6212     case SPACEL:
6213         PL_reg_flags |= RF_tainted;
6214         if (utf8_target) {
6215             loceol = PL_regeol;
6216             while (hardcount < max && scan < loceol &&
6217                    isSPACE_LC_utf8((U8*)scan)) {
6218                 scan += UTF8SKIP(scan);
6219                 hardcount++;
6220             }
6221         } else {
6222             while (scan < loceol && isSPACE_LC(*scan))
6223                 scan++;
6224         }
6225         break;
6226     case NSPACEU:
6227         if (utf8_target) {
6228
6229     utf8_Nspace:
6230
6231             loceol = PL_regeol;
6232             LOAD_UTF8_CHARCLASS_SPACE();
6233             while (hardcount < max && scan < loceol &&
6234                    ! (*scan == ' ' ||
6235                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6236             {
6237                 scan += UTF8SKIP(scan);
6238                 hardcount++;
6239             }
6240             break;
6241         }
6242         else {
6243             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6244                 scan++;
6245             }
6246         }
6247         break;
6248     case NSPACE:
6249         if (utf8_target)
6250             goto utf8_Nspace;
6251
6252         while (scan < loceol && ! isSPACE((U8) *scan)) {
6253             scan++;
6254         }
6255         break;
6256     case NSPACEA:
6257         if (utf8_target) {
6258             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6259                 scan += UTF8SKIP(scan);
6260             }
6261         }
6262         else {
6263             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6264                 scan++;
6265             }
6266         }
6267         break;
6268     case NSPACEL:
6269         PL_reg_flags |= RF_tainted;
6270         if (utf8_target) {
6271             loceol = PL_regeol;
6272             while (hardcount < max && scan < loceol &&
6273                    !isSPACE_LC_utf8((U8*)scan)) {
6274                 scan += UTF8SKIP(scan);
6275                 hardcount++;
6276             }
6277         } else {
6278             while (scan < loceol && !isSPACE_LC(*scan))
6279                 scan++;
6280         }
6281         break;
6282     case DIGIT:
6283         if (utf8_target) {
6284             loceol = PL_regeol;
6285             LOAD_UTF8_CHARCLASS_DIGIT();
6286             while (hardcount < max && scan < loceol &&
6287                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6288                 scan += UTF8SKIP(scan);
6289                 hardcount++;
6290             }
6291         } else {
6292             while (scan < loceol && isDIGIT(*scan))
6293                 scan++;
6294         }
6295         break;
6296     case DIGITA:
6297         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6298             scan++;
6299         }
6300         break;
6301     case DIGITL:
6302         PL_reg_flags |= RF_tainted;
6303         if (utf8_target) {
6304             loceol = PL_regeol;
6305             while (hardcount < max && scan < loceol &&
6306                    isDIGIT_LC_utf8((U8*)scan)) {
6307                 scan += UTF8SKIP(scan);
6308                 hardcount++;
6309             }
6310         } else {
6311             while (scan < loceol && isDIGIT_LC(*scan))
6312                 scan++;
6313         }
6314         break;
6315     case NDIGIT:
6316         if (utf8_target) {
6317             loceol = PL_regeol;
6318             LOAD_UTF8_CHARCLASS_DIGIT();
6319             while (hardcount < max && scan < loceol &&
6320                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6321                 scan += UTF8SKIP(scan);
6322                 hardcount++;
6323             }
6324         } else {
6325             while (scan < loceol && !isDIGIT(*scan))
6326                 scan++;
6327         }
6328         break;
6329     case NDIGITA:
6330         if (utf8_target) {
6331             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6332                 scan += UTF8SKIP(scan);
6333             }
6334         }
6335         else {
6336             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6337                 scan++;
6338             }
6339         }
6340         break;
6341     case NDIGITL:
6342         PL_reg_flags |= RF_tainted;
6343         if (utf8_target) {
6344             loceol = PL_regeol;
6345             while (hardcount < max && scan < loceol &&
6346                    !isDIGIT_LC_utf8((U8*)scan)) {
6347                 scan += UTF8SKIP(scan);
6348                 hardcount++;
6349             }
6350         } else {
6351             while (scan < loceol && !isDIGIT_LC(*scan))
6352                 scan++;
6353         }
6354         break;
6355     case LNBREAK:
6356         if (utf8_target) {
6357             loceol = PL_regeol;
6358             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6359                 scan += c;
6360                 hardcount++;
6361             }
6362         } else {
6363             /*
6364               LNBREAK can match two latin chars, which is ok,
6365               because we have a null terminated string, but we
6366               have to use hardcount in this situation
6367             */
6368             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6369                 scan+=c;
6370                 hardcount++;
6371             }
6372         }
6373         break;
6374     case HORIZWS:
6375         if (utf8_target) {
6376             loceol = PL_regeol;
6377             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6378                 scan += c;
6379                 hardcount++;
6380             }
6381         } else {
6382             while (scan < loceol && is_HORIZWS_latin1(scan))
6383                 scan++;
6384         }
6385         break;
6386     case NHORIZWS:
6387         if (utf8_target) {
6388             loceol = PL_regeol;
6389             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6390                 scan += UTF8SKIP(scan);
6391                 hardcount++;
6392             }
6393         } else {
6394             while (scan < loceol && !is_HORIZWS_latin1(scan))
6395                 scan++;
6396
6397         }
6398         break;
6399     case VERTWS:
6400         if (utf8_target) {
6401             loceol = PL_regeol;
6402             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6403                 scan += c;
6404                 hardcount++;
6405             }
6406         } else {
6407             while (scan < loceol && is_VERTWS_latin1(scan))
6408                 scan++;
6409
6410         }
6411         break;
6412     case NVERTWS:
6413         if (utf8_target) {
6414             loceol = PL_regeol;
6415             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6416                 scan += UTF8SKIP(scan);
6417                 hardcount++;
6418             }
6419         } else {
6420             while (scan < loceol && !is_VERTWS_latin1(scan))
6421                 scan++;
6422
6423         }
6424         break;
6425
6426     default:            /* Called on something of 0 width. */
6427         break;          /* So match right here or not at all. */
6428     }
6429
6430     if (hardcount)
6431         c = hardcount;
6432     else
6433         c = scan - PL_reginput;
6434     PL_reginput = scan;
6435
6436     DEBUG_r({
6437         GET_RE_DEBUG_FLAGS_DECL;
6438         DEBUG_EXECUTE_r({
6439             SV * const prop = sv_newmortal();
6440             regprop(prog, prop, p);
6441             PerlIO_printf(Perl_debug_log,
6442                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6443                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6444         });
6445     });
6446
6447     return(c);
6448 }
6449
6450
6451 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6452 /*
6453 - regclass_swash - prepare the utf8 swash
6454 */
6455
6456 SV *
6457 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6458 {
6459     dVAR;
6460     SV *sw  = NULL;
6461     SV *si  = NULL;
6462     SV *alt = NULL;
6463     RXi_GET_DECL(prog,progi);
6464     const struct reg_data * const data = prog ? progi->data : NULL;
6465
6466     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6467
6468     if (data && data->count) {
6469         const U32 n = ARG(node);
6470
6471         if (data->what[n] == 's') {
6472             SV * const rv = MUTABLE_SV(data->data[n]);
6473             AV * const av = MUTABLE_AV(SvRV(rv));
6474             SV **const ary = AvARRAY(av);
6475             SV **a, **b;
6476
6477             /* See the end of regcomp.c:S_regclass() for
6478              * documentation of these array elements. */
6479
6480             si = *ary;
6481             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6482             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6483
6484             if (a)
6485                 sw = *a;
6486             else if (si && doinit) {
6487                 sw = swash_init("utf8", "", si, 1, 0);
6488                 (void)av_store(av, 1, sw);
6489             }
6490             if (b)
6491                 alt = *b;
6492         }
6493     }
6494
6495     if (listsvp)
6496         *listsvp = si;
6497     if (altsvp)
6498         *altsvp  = alt;
6499
6500     return sw;
6501 }
6502 #endif
6503
6504 /*
6505  - reginclass - determine if a character falls into a character class
6506
6507   n is the ANYOF regnode
6508   p is the target string
6509   lenp is pointer to the maximum number of bytes of how far to go in p
6510     (This is assumed wthout checking to always be at least the current
6511     character's size)
6512   utf8_target tells whether p is in UTF-8.
6513
6514   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6515   from a successful match, the value it points to will be updated to how many
6516   bytes in p were matched.  If there was no match, the value is undefined,
6517   possibly changed from the input.
6518
6519   Note that this can be a synthetic start class, a combination of various
6520   nodes, so things you think might be mutually exclusive, such as locale,
6521   aren't.  It can match both locale and non-locale
6522
6523  */
6524
6525 STATIC bool
6526 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6527 {
6528     dVAR;
6529     const char flags = ANYOF_FLAGS(n);
6530     bool match = FALSE;
6531     UV c = *p;
6532     STRLEN c_len = 0;
6533     STRLEN maxlen;
6534
6535     PERL_ARGS_ASSERT_REGINCLASS;
6536
6537     /* If c is not already the code point, get it */
6538     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6539         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6540                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6541                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6542                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6543                  * UTF8_ALLOW_FFFF */
6544         if (c_len == (STRLEN)-1)
6545             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6546     }
6547     else {
6548         c_len = 1;
6549     }
6550
6551     /* Use passed in max length, or one character if none passed in or less
6552      * than one character.  And assume will match just one character.  This is
6553      * overwritten later if matched more. */
6554     if (lenp) {
6555         maxlen = (*lenp > c_len) ? *lenp : c_len;
6556         *lenp = c_len;
6557
6558     }
6559     else {
6560         maxlen = c_len;
6561     }
6562
6563     /* If this character is potentially in the bitmap, check it */
6564     if (c < 256) {
6565         if (ANYOF_BITMAP_TEST(n, c))
6566             match = TRUE;
6567         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6568                 && ! utf8_target
6569                 && ! isASCII(c))
6570         {
6571             match = TRUE;
6572         }
6573
6574         else if (flags & ANYOF_LOCALE) {
6575             PL_reg_flags |= RF_tainted;
6576
6577             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6578                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6579             {
6580                 match = TRUE;
6581             }
6582             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6583                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6584                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6585                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6586                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6587                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6588                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6589                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6590                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6591                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6592                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6593                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6594                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6595                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6596                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6597                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6598                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6599                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6600                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6601                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6602                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6603                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6604                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6605                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6606                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6607                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6608                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6609                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6610                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6611                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6612                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6613                      ) /* How's that for a conditional? */
6614             ) {
6615                 match = TRUE;
6616             }
6617         }
6618     }
6619
6620     /* If the bitmap didn't (or couldn't) match, and something outside the
6621      * bitmap could match, try that.  Locale nodes specifiy completely the
6622      * behavior of code points in the bit map (otherwise, a utf8 target would
6623      * cause them to be treated as Unicode and not locale), except XXX in
6624      * the very unlikely event when this node is a synthetic start class, which
6625      * could be a combination of locale and non-locale nodes */
6626     if (!match) {
6627         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6628             match = TRUE;       /* Everything above 255 matches */
6629         }
6630         else if ((flags & ANYOF_NONBITMAP_NON_UTF8
6631                   || (utf8_target && flags & ANYOF_UTF8
6632                       && (c >=256 || ! (flags & ANYOF_LOCALE)))))
6633         {
6634             AV *av;
6635             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6636
6637             if (sw) {
6638                 U8 * utf8_p;
6639                 if (utf8_target) {
6640                     utf8_p = (U8 *) p;
6641                 } else {
6642
6643                     /* Not utf8.  Convert as much of the string as available up
6644                      * to the limit of how far the (single) character in the
6645                      * pattern can possibly match (no need to go further).  If
6646                      * the node is a straight ANYOF or not folding, it can't
6647                      * match more than one.  Otherwise, It can match up to how
6648                      * far a single char can fold to.  Since not utf8, each
6649                      * character is a single byte, so the max it can be in
6650                      * bytes is the same as the max it can be in characters */
6651                     STRLEN len = (OP(n) == ANYOF
6652                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6653                                   ? 1
6654                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6655                                     ? maxlen
6656                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6657                     utf8_p = bytes_to_utf8(p, &len);
6658                 }
6659
6660                 if (swash_fetch(sw, utf8_p, TRUE))
6661                     match = TRUE;
6662                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6663
6664                     /* Here, we need to test if the fold of the target string
6665                      * matches.  In the case of a multi-char fold that is
6666                      * caught by regcomp.c, it has stored all such folds into
6667                      * 'av'; we linearly check to see if any match the target
6668                      * string (folded).   We know that the originals were each
6669                      * one character, but we don't currently know how many
6670                      * characters/bytes each folded to, except we do know that
6671                      * there are small limits imposed by Unicode.  XXX A
6672                      * performance enhancement would be to have regcomp.c store
6673                      * the max number of chars/bytes that are in an av entry,
6674                      * as, say the 0th element.  Even better would be to have a
6675                      * hash of the few characters that can start a multi-char
6676                      * fold to the max number of chars of those folds.
6677                      *
6678                      * Further down, if there isn't a
6679                      * match in the av, we will check if there is another
6680                      * fold-type match.  For that, we also need the fold, but
6681                      * only the first character.  No sense in folding it twice,
6682                      * so we do it here, even if there isn't any multi-char
6683                      * fold, so we always fold at least the first character.
6684                      * If the node is a straight ANYOF node, or there is only
6685                      * one character available in the string, or if there isn't
6686                      * any av, that's all we have to fold.  In the case of a
6687                      * multi-char fold, we do have guarantees in Unicode that
6688                      * it can only expand up to so many characters and so many
6689                      * bytes.  We keep track so don't exceed either.
6690                      *
6691                      * If there is a match, we will need to advance (if lenp is
6692                      * specified) the match pointer in the target string.  But
6693                      * what we are comparing here isn't that string directly,
6694                      * but its fold, whose length may differ from the original.
6695                      * As we go along in constructing the fold, therefore, we
6696                      * create a map so that we know how many bytes in the
6697                      * source to advance given that we have matched a certain
6698                      * number of bytes in the fold.  This map is stored in
6699                      * 'map_fold_len_back'.  The first character in the fold
6700                      * has array element 1 contain the number of bytes in the
6701                      * source that folded to it; the 2nd is the cumulative
6702                      * number to match it; ... */
6703                     U8 map_fold_len_back[UTF8_MAX_FOLD_CHAR_EXPAND] = { 0 };
6704                     U8 folded[UTF8_MAXBYTES_CASE+1];
6705                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6706                     STRLEN foldlen_for_av; /* num bytes in fold of all chars */
6707
6708                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6709
6710                         /* Here, only need to fold the first char of the target
6711                          * string */
6712                         to_utf8_fold(utf8_p, folded, &foldlen);
6713                         foldlen_for_av = foldlen;
6714                         map_fold_len_back[1] = UTF8SKIP(utf8_p);
6715                     }
6716                     else {
6717
6718                         /* Here, need to fold more than the first char.  Do so
6719                          * up to the limits */
6720                         UV which_char = 0;
6721                         U8* source_ptr = utf8_p;    /* The source for the fold
6722                                                        is the regex target
6723                                                        string */
6724                         U8* folded_ptr = folded;
6725                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6726                                                        available byte in the
6727                                                        target string */
6728                         while (which_char < UTF8_MAX_FOLD_CHAR_EXPAND
6729                                && source_ptr < e)
6730                         {
6731
6732                             /* Fold the next character */
6733                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6734                             STRLEN this_char_foldlen;
6735                             to_utf8_fold(source_ptr,
6736                                          this_char_folded,
6737                                          &this_char_foldlen);
6738
6739                             /* Bail if it would exceed the byte limit for
6740                              * folding a single char. */
6741                             if (this_char_foldlen + folded_ptr - folded >
6742                                                             UTF8_MAXBYTES_CASE)
6743                             {
6744                                 break;
6745                             }
6746
6747                             /* Save the first character's folded length, in
6748                              * case we have to use it later */
6749                             if (! foldlen) {
6750                                 foldlen = this_char_foldlen;
6751                             }
6752
6753                             /* Here, add the fold of this character */
6754                             Copy(this_char_folded,
6755                                  folded_ptr,
6756                                  this_char_foldlen,
6757                                  U8);
6758                             which_char++;
6759                             map_fold_len_back[which_char] =
6760                                 map_fold_len_back[which_char - 1]
6761                                 + UTF8SKIP(source_ptr);
6762                             folded_ptr += this_char_foldlen;
6763                             source_ptr += UTF8SKIP(source_ptr);
6764                         }
6765                         *folded_ptr = '\0';
6766                         foldlen_for_av = folded_ptr - folded;
6767                     }
6768
6769
6770                     /* Do the linear search to see if the fold is in the list
6771                      * of multi-char folds.  (Useless to look if won't be able
6772                      * to store that it is a multi-char fold in *lenp) */
6773                     if (lenp && av) {
6774                         I32 i;
6775                         for (i = 0; i <= av_len(av); i++) {
6776                             SV* const sv = *av_fetch(av, i, FALSE);
6777                             STRLEN len;
6778                             const char * const s = SvPV_const(sv, len);
6779                             if (len <= foldlen_for_av && memEQ(s,
6780                                                                (char*)folded,
6781                                                                len))
6782                             {
6783
6784                                 /* Advance the target string ptr to account for
6785                                  * this fold, but have to translate from the
6786                                  * folded length to the corresponding source
6787                                  * length.  The array is indexed by how many
6788                                  * characters in the match */
6789                                 *lenp = map_fold_len_back[
6790                                         utf8_length(folded, folded + len)];
6791                                 match = TRUE;
6792                                 break;
6793                             }
6794                         }
6795                     }
6796 #if 0
6797                     if (!match) { /* See if the folded version matches */
6798                         SV** listp;
6799
6800                         /* Consider "k" =~ /[K]/i.  The line above would have
6801                          * just folded the 'k' to itself, and that isn't going
6802                          * to match 'K'.  So we look through the closure of
6803                          * everything that folds to 'k'.  That will find the
6804                          * 'K'.  Initialize the list, if necessary */
6805                         if (! PL_utf8_foldclosures) {
6806
6807                             /* If the folds haven't been read in, call a fold
6808                              * function to force that */
6809                             if (! PL_utf8_tofold) {
6810                                 U8 dummy[UTF8_MAXBYTES+1];
6811                                 STRLEN dummy_len;
6812                                 to_utf8_fold((U8*) "A", dummy, &dummy_len);
6813                             }
6814                             PL_utf8_foldclosures =
6815                                   _swash_inversion_hash(PL_utf8_tofold);
6816                         }
6817
6818                         /* The data structure is a hash with the keys every
6819                          * character that is folded to, like 'k', and the
6820                          * values each an array of everything that folds to its
6821                          * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
6822                         if ((listp = hv_fetch(PL_utf8_foldclosures,
6823                                       (char *) folded, foldlen, FALSE)))
6824                         {
6825                             AV* list = (AV*) *listp;
6826                             IV i;
6827                             for (i = 0; i <= av_len(list); i++) {
6828                                 SV** try_p = av_fetch(list, i, FALSE);
6829                                 char* try_c;
6830                                 if (try_p == NULL) {
6831                                     Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
6832                                 }
6833                                 /* Don't have to worry about embedded nulls
6834                                  * since NULL isn't folded or foldable */
6835                                 try_c = SvPVX(*try_p);
6836
6837                                 /* The fold in a few cases  of an above Latin1
6838                                  * char is in the Latin1 range, and hence may
6839                                  * be in the bitmap */
6840                                 if (UTF8_IS_INVARIANT(*try_c)
6841                                     && ANYOF_BITMAP_TEST(n,
6842                                                     UNI_TO_NATIVE(*try_c)))
6843                                 {
6844                                     match = TRUE;
6845                                     break;
6846                                 }
6847                                 else if
6848                                     (UTF8_IS_DOWNGRADEABLE_START(*try_c)
6849                                      && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(
6850                                                 TWO_BYTE_UTF8_TO_UNI(try_c[0],
6851                                                                     try_c[1]))))
6852                                 {
6853                                    /* Since the fold comes from internally
6854                                     * generated data, we can safely assume it
6855                                     * is valid utf8 in the test above */
6856                                     match = TRUE;
6857                                     break;
6858                                 } else if (swash_fetch(sw, (U8*) try_c, TRUE)) {
6859                                     match = TRUE;
6860                                     break;
6861                                 }
6862                             }
6863                         }
6864                     }
6865 #endif
6866                 }
6867
6868                 /* If we allocated a string above, free it */
6869                 if (! utf8_target) Safefree(utf8_p);
6870             }
6871         }
6872     }
6873
6874     return (flags & ANYOF_INVERT) ? !match : match;
6875 }
6876
6877 STATIC U8 *
6878 S_reghop3(U8 *s, I32 off, const U8* lim)
6879 {
6880     dVAR;
6881
6882     PERL_ARGS_ASSERT_REGHOP3;
6883
6884     if (off >= 0) {
6885         while (off-- && s < lim) {
6886             /* XXX could check well-formedness here */
6887             s += UTF8SKIP(s);
6888         }
6889     }
6890     else {
6891         while (off++ && s > lim) {
6892             s--;
6893             if (UTF8_IS_CONTINUED(*s)) {
6894                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6895                     s--;
6896             }
6897             /* XXX could check well-formedness here */
6898         }
6899     }
6900     return s;
6901 }
6902
6903 #ifdef XXX_dmq
6904 /* there are a bunch of places where we use two reghop3's that should
6905    be replaced with this routine. but since thats not done yet
6906    we ifdef it out - dmq
6907 */
6908 STATIC U8 *
6909 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6910 {
6911     dVAR;
6912
6913     PERL_ARGS_ASSERT_REGHOP4;
6914
6915     if (off >= 0) {
6916         while (off-- && s < rlim) {
6917             /* XXX could check well-formedness here */
6918             s += UTF8SKIP(s);
6919         }
6920     }
6921     else {
6922         while (off++ && s > llim) {
6923             s--;
6924             if (UTF8_IS_CONTINUED(*s)) {
6925                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6926                     s--;
6927             }
6928             /* XXX could check well-formedness here */
6929         }
6930     }
6931     return s;
6932 }
6933 #endif
6934
6935 STATIC U8 *
6936 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6937 {
6938     dVAR;
6939
6940     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6941
6942     if (off >= 0) {
6943         while (off-- && s < lim) {
6944             /* XXX could check well-formedness here */
6945             s += UTF8SKIP(s);
6946         }
6947         if (off >= 0)
6948             return NULL;
6949     }
6950     else {
6951         while (off++ && s > lim) {
6952             s--;
6953             if (UTF8_IS_CONTINUED(*s)) {
6954                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6955                     s--;
6956             }
6957             /* XXX could check well-formedness here */
6958         }
6959         if (off <= 0)
6960             return NULL;
6961     }
6962     return s;
6963 }
6964
6965 static void
6966 restore_pos(pTHX_ void *arg)
6967 {
6968     dVAR;
6969     regexp * const rex = (regexp *)arg;
6970     if (PL_reg_eval_set) {
6971         if (PL_reg_oldsaved) {
6972             rex->subbeg = PL_reg_oldsaved;
6973             rex->sublen = PL_reg_oldsavedlen;
6974 #ifdef PERL_OLD_COPY_ON_WRITE
6975             rex->saved_copy = PL_nrs;
6976 #endif
6977             RXp_MATCH_COPIED_on(rex);
6978         }
6979         PL_reg_magic->mg_len = PL_reg_oldpos;
6980         PL_reg_eval_set = 0;
6981         PL_curpm = PL_reg_oldcurpm;
6982     }
6983 }
6984
6985 STATIC void
6986 S_to_utf8_substr(pTHX_ register regexp *prog)
6987 {
6988     int i = 1;
6989
6990     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6991
6992     do {
6993         if (prog->substrs->data[i].substr
6994             && !prog->substrs->data[i].utf8_substr) {
6995             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6996             prog->substrs->data[i].utf8_substr = sv;
6997             sv_utf8_upgrade(sv);
6998             if (SvVALID(prog->substrs->data[i].substr)) {
6999                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
7000                 if (flags & FBMcf_TAIL) {
7001                     /* Trim the trailing \n that fbm_compile added last
7002                        time.  */
7003                     SvCUR_set(sv, SvCUR(sv) - 1);
7004                     /* Whilst this makes the SV technically "invalid" (as its
7005                        buffer is no longer followed by "\0") when fbm_compile()
7006                        adds the "\n" back, a "\0" is restored.  */
7007                 }
7008                 fbm_compile(sv, flags);
7009             }
7010             if (prog->substrs->data[i].substr == prog->check_substr)
7011                 prog->check_utf8 = sv;
7012         }
7013     } while (i--);
7014 }
7015
7016 STATIC void
7017 S_to_byte_substr(pTHX_ register regexp *prog)
7018 {
7019     dVAR;
7020     int i = 1;
7021
7022     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7023
7024     do {
7025         if (prog->substrs->data[i].utf8_substr
7026             && !prog->substrs->data[i].substr) {
7027             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7028             if (sv_utf8_downgrade(sv, TRUE)) {
7029                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7030                     const U8 flags
7031                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
7032                     if (flags & FBMcf_TAIL) {
7033                         /* Trim the trailing \n that fbm_compile added last
7034                            time.  */
7035                         SvCUR_set(sv, SvCUR(sv) - 1);
7036                     }
7037                     fbm_compile(sv, flags);
7038                 }
7039             } else {
7040                 SvREFCNT_dec(sv);
7041                 sv = &PL_sv_undef;
7042             }
7043             prog->substrs->data[i].substr = sv;
7044             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7045                 prog->check_substr = sv;
7046         }
7047     } while (i--);
7048 }
7049
7050 /*
7051  * Local variables:
7052  * c-indentation-style: bsd
7053  * c-basic-offset: 4
7054  * indent-tabs-mode: t
7055  * End:
7056  *
7057  * ex: set ts=8 sts=4 sw=4 noet:
7058  */