src/5016002/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1       /* tainted information used? e.g. locale */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  99  * call if there are no complications: i.e., if everything matchable is
 100  * straight forward in the bitmap */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102                                               : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 110
 111 #define HOPc(pos,off) \
 112         (char *)(PL_reg_match_utf8 \
 113             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 114             : (U8*)(pos + off))
 115 #define HOPBACKc(pos, off) \
 116         (char*)(PL_reg_match_utf8\
 117             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 118             : (pos - off >= PL_bostr)           \
 119                 ? (U8*)pos - off                \
 120                 : NULL)
 121
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defined */
 126 #ifdef EBCDIC
 127     /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
 128      * skip the check on EBCDIC platforms */
 129 #   define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
 130 #else
 131 #   define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 132     if (!CAT2(PL_utf8_,class)) { \
 133         bool ok; \
 134         ENTER; save_re_context(); \
 135         ok=CAT2(is_utf8_,class)((const U8*)str); \
 136         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 137 #endif
 138
 139 /* Doesn't do an assert to verify that is correct */
 140 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 141     if (!CAT2(PL_utf8_,class)) { \
 142         bool throw_away PERL_UNUSED_DECL; \
 143         ENTER; save_re_context(); \
 144         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 145         LEAVE; } } STMT_END
 146
 147 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 148 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 149 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 150
 151 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 152         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 153         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 154         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 155             * assert should likely and hopefully fail on an EBCDIC machine */ \
 156         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 157                                                                             \
 158         /* No asserts are done for these, in case called on an early        \
 159             * Unicode version in which they map to nothing */               \
 160         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 161         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 162         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 163         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 164         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 165         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 166         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 167
 168 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 169
 170 /* The actual code for CCC_TRY, which uses several variables from the routine
 171  * it's callable from.  It is designed to be the bulk of a case statement.
 172  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 173  *      nextchr matches the class.
 174  * UTF8_TEST is the whole test string to use for utf8 targets
 175  * LOAD is what to use to test, and if not present to load in the swash for the
 176  *      class
 177  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 178  *      UTF8_TEST test.
 179  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 180  * utf8 and a variant, load the swash if necessary and test using the utf8
 181  * test.  Advance to the next character if test is ok, otherwise fail; If not
 182  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 183  * fails, or advance to the next character */
 184
 185 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 186     if (locinput >= PL_regeol) {                                              \
 187         sayNO;                                                                \
 188     }                                                                         \
 189     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 190         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 191         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 192             sayNO;                                                            \
 193         }                                                                     \
 194         locinput += PL_utf8skip[nextchr];                                     \
 195         nextchr = UCHARAT(locinput);                                          \
 196         break;                                                                \
 197     }                                                                         \
 198     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 199         sayNO;                                                                \
 200     }                                                                         \
 201     nextchr = UCHARAT(++locinput);                                            \
 202     break;
 203
 204 /* Handle the non-locale cases for a character class and its complement.  It
 205  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 206  * This is because that code fails when the test succeeds, so we want to have
 207  * the test fail so that the code succeeds.  The swash is stored in a
 208  * predictable PL_ place */
 209 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 210                            CLASS, STR)                                        \
 211     case NAME:                                                                \
 212         _CCC_TRY_CODE( !, FUNC,                                               \
 213                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 214                                             (U8*)locinput, TRUE)),            \
 215                           CLASS, STR)                                         \
 216     case NNAME:                                                               \
 217         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 218                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 219                                             (U8*)locinput, TRUE)),            \
 220                           CLASS, STR)                                         \
 221
 222 /* Generate the case statements for both locale and non-locale character
 223  * classes in regmatch for classes that don't have special unicode semantics.
 224  * Locales don't use an immediate swash, but an intermediary special locale
 225  * function that is called on the pointer to the current place in the input
 226  * string.  That function will resolve to needing the same swash.  One might
 227  * think that because we don't know what the locale will match, we shouldn't
 228  * check with the swash loading function that it loaded properly; ie, that we
 229  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 230  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 231  * irrelevant here */
 232 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 233                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 234                 NAMEA, NNAMEA, FUNCA,                                         \
 235                 CLASS, STR)                                                   \
 236     case NAMEL:                                                               \
 237         PL_reg_flags |= RF_tainted;                                           \
 238         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 239     case NNAMEL:                                                              \
 240         PL_reg_flags |= RF_tainted;                                           \
 241         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 242                        CLASS, STR)                                            \
 243     case NAMEA:                                                               \
 244         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 245             sayNO;                                                            \
 246         }                                                                     \
 247         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 248         nextchr = UCHARAT(++locinput);                                        \
 249         break;                                                                \
 250     case NNAMEA:                                                              \
 251         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 252             sayNO;                                                            \
 253         }                                                                     \
 254         if (utf8_target) {                                                    \
 255             locinput += PL_utf8skip[nextchr];                                 \
 256             nextchr = UCHARAT(locinput);                                      \
 257         }                                                                     \
 258         else {                                                                \
 259             nextchr = UCHARAT(++locinput);                                    \
 260         }                                                                     \
 261         break;                                                                \
 262     /* Generate the non-locale cases */                                       \
 263     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 264
 265 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 266  * statements to handle separate Unicode semantics nodes */
 267 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 268                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 269                   NAMEU, NNAMEU, FUNCU,                                        \
 270                   NAMEA, NNAMEA, FUNCA,                                        \
 271                   CLASS, STR)                                                  \
 272     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 273             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 274             NAMEA, NNAMEA, FUNCA,                                              \
 275             CLASS, STR)                                                        \
 276     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 277
 278 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 279
 280 /* for use after a quantifier and before an EXACT-like node -- japhy */
 281 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 282  *
 283  * NOTE that *nothing* that affects backtracking should be in here, specifically
 284  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 285  * node that is in between two EXACT like nodes when ascertaining what the required
 286  * "follow" character is. This should probably be moved to regex compile time
 287  * although it may be done at run time beause of the REF possibility - more
 288  * investigation required. -- demerphq
 289 */
 290 #define JUMPABLE(rn) (      \
 291     OP(rn) == OPEN ||       \
 292     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 293     OP(rn) == EVAL ||   \
 294     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 295     OP(rn) == PLUS || OP(rn) == MINMOD || \
 296     OP(rn) == KEEPS || \
 297     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 298 )
 299 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 300
 301 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 302
 303 #if 0
 304 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 305    we don't need this definition. */
 306 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 307 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 308 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 309
 310 #else
 311 /* ... so we use this as its faster. */
 312 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 313 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 314 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 315 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 316
 317 #endif
 318
 319 /*
 320   Search for mandatory following text node; for lookahead, the text must
 321   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 322 */
 323 #define FIND_NEXT_IMPT(rn) STMT_START { \
 324     while (JUMPABLE(rn)) { \
 325         const OPCODE type = OP(rn); \
 326         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 327             rn = NEXTOPER(NEXTOPER(rn)); \
 328         else if (type == PLUS) \
 329             rn = NEXTOPER(rn); \
 330         else if (type == IFMATCH) \
 331             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 332         else rn += NEXT_OFF(rn); \
 333     } \
 334 } STMT_END
 335
 336
 337 static void restore_pos(pTHX_ void *arg);
 338
 339 #define REGCP_PAREN_ELEMS 4
 340 #define REGCP_OTHER_ELEMS 5
 341 #define REGCP_FRAME_ELEMS 1
 342 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 343  * are needed for the regexp context stack bookkeeping. */
 344
 345 STATIC CHECKPOINT
 346 S_regcppush(pTHX_ I32 parenfloor)
 347 {
 348     dVAR;
 349     const int retval = PL_savestack_ix;
 350     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 351     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 352     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 353     int p;
 354     GET_RE_DEBUG_FLAGS_DECL;
 355
 356     if (paren_elems_to_push < 0)
 357         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 358                    paren_elems_to_push);
 359
 360     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 361         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 362                    " out of range (%lu-%ld)",
 363                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 364
 365     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 366
 367     for (p = PL_regsize; p > parenfloor; p--) {
 368 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 369         SSPUSHINT(PL_regoffs[p].end);
 370         SSPUSHINT(PL_regoffs[p].start);
 371         SSPUSHPTR(PL_reg_start_tmp[p]);
 372         SSPUSHINT(p);
 373         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 374           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 375                       (UV)p, (IV)PL_regoffs[p].start,
 376                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 377                       (IV)PL_regoffs[p].end
 378         ));
 379     }
 380 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 381     SSPUSHPTR(PL_regoffs);
 382     SSPUSHINT(PL_regsize);
 383     SSPUSHINT(*PL_reglastparen);
 384     SSPUSHINT(*PL_reglastcloseparen);
 385     SSPUSHPTR(PL_reginput);
 386     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 387
 388     return retval;
 389 }
 390
 391 /* These are needed since we do not localize EVAL nodes: */
 392 #define REGCP_SET(cp)                                           \
 393     DEBUG_STATE_r(                                              \
 394             PerlIO_printf(Perl_debug_log,                       \
 395                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 396                 (IV)PL_savestack_ix));                          \
 397     cp = PL_savestack_ix
 398
 399 #define REGCP_UNWIND(cp)                                        \
 400     DEBUG_STATE_r(                                              \
 401         if (cp != PL_savestack_ix)                              \
 402             PerlIO_printf(Perl_debug_log,                       \
 403                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 404                 (IV)(cp), (IV)PL_savestack_ix));                \
 405     regcpblow(cp)
 406
 407 STATIC char *
 408 S_regcppop(pTHX_ const regexp *rex)
 409 {
 410     dVAR;
 411     UV i;
 412     char *input;
 413     GET_RE_DEBUG_FLAGS_DECL;
 414
 415     PERL_ARGS_ASSERT_REGCPPOP;
 416
 417     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 418     i = SSPOPUV;
 419     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 420     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 421     input = (char *) SSPOPPTR;
 422     *PL_reglastcloseparen = SSPOPINT;
 423     *PL_reglastparen = SSPOPINT;
 424     PL_regsize = SSPOPINT;
 425     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 426
 427     i -= REGCP_OTHER_ELEMS;
 428     /* Now restore the parentheses context. */
 429     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 430         I32 tmps;
 431         U32 paren = (U32)SSPOPINT;
 432         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 433         PL_regoffs[paren].start = SSPOPINT;
 434         tmps = SSPOPINT;
 435         if (paren <= *PL_reglastparen)
 436             PL_regoffs[paren].end = tmps;
 437         DEBUG_BUFFERS_r(
 438             PerlIO_printf(Perl_debug_log,
 439                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 440                           (UV)paren, (IV)PL_regoffs[paren].start,
 441                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 442                           (IV)PL_regoffs[paren].end,
 443                           (paren > *PL_reglastparen ? "(no)" : ""));
 444         );
 445     }
 446     DEBUG_BUFFERS_r(
 447         if (*PL_reglastparen + 1 <= rex->nparens) {
 448             PerlIO_printf(Perl_debug_log,
 449                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 450                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 451         }
 452     );
 453 #if 1
 454     /* It would seem that the similar code in regtry()
 455      * already takes care of this, and in fact it is in
 456      * a better location to since this code can #if 0-ed out
 457      * but the code in regtry() is needed or otherwise tests
 458      * requiring null fields (pat.t#187 and split.t#{13,14}
 459      * (as of patchlevel 7877)  will fail.  Then again,
 460      * this code seems to be necessary or otherwise
 461      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 462      * --jhi updated by dapm */
 463     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 464         if (i > PL_regsize)
 465             PL_regoffs[i].start = -1;
 466         PL_regoffs[i].end = -1;
 467     }
 468 #endif
 469     return input;
 470 }
 471
 472 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 473
 474 /*
 475  * pregexec and friends
 476  */
 477
 478 #ifndef PERL_IN_XSUB_RE
 479 /*
 480  - pregexec - match a regexp against a string
 481  */
 482 I32
 483 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 484          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 485 /* strend: pointer to null at end of string */
 486 /* strbeg: real beginning of string */
 487 /* minend: end of match must be >=minend after stringarg. */
 488 /* nosave: For optimizations. */
 489 {
 490     PERL_ARGS_ASSERT_PREGEXEC;
 491
 492     return
 493         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 494                       nosave ? 0 : REXEC_COPY_STR);
 495 }
 496 #endif
 497
 498 /*
 499  * Need to implement the following flags for reg_anch:
 500  *
 501  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 502  * USE_INTUIT_ML
 503  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 504  * INTUIT_AUTORITATIVE_ML
 505  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 506  * INTUIT_ONCE_ML
 507  *
 508  * Another flag for this function: SECOND_TIME (so that float substrs
 509  * with giant delta may be not rechecked).
 510  */
 511
 512 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 513
 514 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 515    Otherwise, only SvCUR(sv) is used to get strbeg. */
 516
 517 /* XXXX We assume that strpos is strbeg unless sv. */
 518
 519 /* XXXX Some places assume that there is a fixed substring.
 520         An update may be needed if optimizer marks as "INTUITable"
 521         RExen without fixed substrings.  Similarly, it is assumed that
 522         lengths of all the strings are no more than minlen, thus they
 523         cannot come from lookahead.
 524         (Or minlen should take into account lookahead.)
 525   NOTE: Some of this comment is not correct. minlen does now take account
 526   of lookahead/behind. Further research is required. -- demerphq
 527
 528 */
 529
 530 /* A failure to find a constant substring means that there is no need to make
 531    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 532    finding a substring too deep into the string means that less calls to
 533    regtry() should be needed.
 534
 535    REx compiler's optimizer found 4 possible hints:
 536         a) Anchored substring;
 537         b) Fixed substring;
 538         c) Whether we are anchored (beginning-of-line or \G);
 539         d) First node (of those at offset 0) which may distinguish positions;
 540    We use a)b)d) and multiline-part of c), and try to find a position in the
 541    string which does not contradict any of them.
 542  */
 543
 544 /* Most of decisions we do here should have been done at compile time.
 545    The nodes of the REx which we used for the search should have been
 546    deleted from the finite automaton. */
 547
 548 char *
 549 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 550                      char *strend, const U32 flags, re_scream_pos_data *data)
 551 {
 552     dVAR;
 553     struct regexp *const prog = (struct regexp *)SvANY(rx);
 554     register I32 start_shift = 0;
 555     /* Should be nonnegative! */
 556     register I32 end_shift   = 0;
 557     register char *s;
 558     register SV *check;
 559     char *strbeg;
 560     char *t;
 561     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 562     I32 ml_anch;
 563     register char *other_last = NULL;   /* other substr checked before this */
 564     char *check_at = NULL;              /* check substr found at this pos */
 565     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 566     RXi_GET_DECL(prog,progi);
 567 #ifdef DEBUGGING
 568     const char * const i_strpos = strpos;
 569 #endif
 570     GET_RE_DEBUG_FLAGS_DECL;
 571
 572     PERL_ARGS_ASSERT_RE_INTUIT_START;
 573
 574     RX_MATCH_UTF8_set(rx,utf8_target);
 575
 576     if (RX_UTF8(rx)) {
 577         PL_reg_flags |= RF_utf8;
 578     }
 579     DEBUG_EXECUTE_r(
 580         debug_start_match(rx, utf8_target, strpos, strend,
 581             sv ? "Guessing start of match in sv for"
 582                : "Guessing start of match in string for");
 583               );
 584
 585     /* CHR_DIST() would be more correct here but it makes things slow. */
 586     if (prog->minlen > strend - strpos) {
 587         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 588                               "String too short... [re_intuit_start]\n"));
 589         goto fail;
 590     }
 591
 592     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 593     PL_regeol = strend;
 594     if (utf8_target) {
 595         if (!prog->check_utf8 && prog->check_substr)
 596             to_utf8_substr(prog);
 597         check = prog->check_utf8;
 598     } else {
 599         if (!prog->check_substr && prog->check_utf8)
 600             to_byte_substr(prog);
 601         check = prog->check_substr;
 602     }
 603     if (check == &PL_sv_undef) {
 604         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 605                 "Non-utf8 string cannot match utf8 check string\n"));
 606         goto fail;
 607     }
 608     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 609         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 610                      || ( (prog->extflags & RXf_ANCH_BOL)
 611                           && !multiline ) );    /* Check after \n? */
 612
 613         if (!ml_anch) {
 614           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 615                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 616                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 617                && sv && !SvROK(sv)
 618                && (strpos != strbeg)) {
 619               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 620               goto fail;
 621           }
 622           if (prog->check_offset_min == prog->check_offset_max &&
 623               !(prog->extflags & RXf_CANY_SEEN)) {
 624             /* Substring at constant offset from beg-of-str... */
 625             I32 slen;
 626
 627             s = HOP3c(strpos, prog->check_offset_min, strend);
 628
 629             if (SvTAIL(check)) {
 630                 slen = SvCUR(check);    /* >= 1 */
 631
 632                 if ( strend - s > slen || strend - s < slen - 1
 633                      || (strend - s == slen && strend[-1] != '\n')) {
 634                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 635                     goto fail_finish;
 636                 }
 637                 /* Now should match s[0..slen-2] */
 638                 slen--;
 639                 if (slen && (*SvPVX_const(check) != *s
 640                              || (slen > 1
 641                                  && memNE(SvPVX_const(check), s, slen)))) {
 642                   report_neq:
 643                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 644                     goto fail_finish;
 645                 }
 646             }
 647             else if (*SvPVX_const(check) != *s
 648                      || ((slen = SvCUR(check)) > 1
 649                          && memNE(SvPVX_const(check), s, slen)))
 650                 goto report_neq;
 651             check_at = s;
 652             goto success_at_start;
 653           }
 654         }
 655         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 656         s = strpos;
 657         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 658         end_shift = prog->check_end_shift;
 659
 660         if (!ml_anch) {
 661             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 662                                          - (SvTAIL(check) != 0);
 663             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 664
 665             if (end_shift < eshift)
 666                 end_shift = eshift;
 667         }
 668     }
 669     else {                              /* Can match at random position */
 670         ml_anch = 0;
 671         s = strpos;
 672         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 673         end_shift = prog->check_end_shift;
 674
 675         /* end shift should be non negative here */
 676     }
 677
 678 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 679     if (end_shift < 0)
 680         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 681                    (IV)end_shift, RX_PRECOMP(prog));
 682 #endif
 683
 684   restart:
 685     /* Find a possible match in the region s..strend by looking for
 686        the "check" substring in the region corrected by start/end_shift. */
 687
 688     {
 689         I32 srch_start_shift = start_shift;
 690         I32 srch_end_shift = end_shift;
 691         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 692             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 693             srch_start_shift = strbeg - s;
 694         }
 695     DEBUG_OPTIMISE_MORE_r({
 696         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 697             (IV)prog->check_offset_min,
 698             (IV)srch_start_shift,
 699             (IV)srch_end_shift,
 700             (IV)prog->check_end_shift);
 701     });
 702
 703     if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
 704         I32 p = -1;                     /* Internal iterator of scream. */
 705         I32 * const pp = data ? data->scream_pos : &p;
 706         const MAGIC *mg;
 707         bool found = FALSE;
 708
 709         assert(SvMAGICAL(sv));
 710         mg = mg_find(sv, PERL_MAGIC_study);
 711         assert(mg);
 712
 713         if (mg->mg_private == 1) {
 714             found = ((U8 *)mg->mg_ptr)[BmRARE(check)] != (U8)~0;
 715         } else if (mg->mg_private == 2) {
 716             found = ((U16 *)mg->mg_ptr)[BmRARE(check)] != (U16)~0;
 717         } else {
 718             assert (mg->mg_private == 4);
 719             found = ((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0;
 720         }
 721
 722         if (found
 723             || ( BmRARE(check) == '\n'
 724                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 725                  && SvTAIL(check) ))
 726             s = screaminstr(sv, check,
 727                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 728         else
 729             goto fail_finish;
 730         /* we may be pointing at the wrong string */
 731         if (s && RXp_MATCH_COPIED(prog))
 732             s = strbeg + (s - SvPVX_const(sv));
 733         if (data)
 734             *data->scream_olds = s;
 735     }
 736     else {
 737         U8* start_point;
 738         U8* end_point;
 739         if (prog->extflags & RXf_CANY_SEEN) {
 740             start_point= (U8*)(s + srch_start_shift);
 741             end_point= (U8*)(strend - srch_end_shift);
 742         } else {
 743             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 744             end_point= HOP3(strend, -srch_end_shift, strbeg);
 745         }
 746         DEBUG_OPTIMISE_MORE_r({
 747             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 748                 (int)(end_point - start_point),
 749                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 750                 start_point);
 751         });
 752
 753         s = fbm_instr( start_point, end_point,
 754                       check, multiline ? FBMrf_MULTILINE : 0);
 755     }
 756     }
 757     /* Update the count-of-usability, remove useless subpatterns,
 758         unshift s.  */
 759
 760     DEBUG_EXECUTE_r({
 761         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 762             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 763         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 764                           (s ? "Found" : "Did not find"),
 765             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 766                 ? "anchored" : "floating"),
 767             quoted,
 768             RE_SV_TAIL(check),
 769             (s ? " at offset " : "...\n") );
 770     });
 771
 772     if (!s)
 773         goto fail_finish;
 774     /* Finish the diagnostic message */
 775     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 776
 777     /* XXX dmq: first branch is for positive lookbehind...
 778        Our check string is offset from the beginning of the pattern.
 779        So we need to do any stclass tests offset forward from that
 780        point. I think. :-(
 781      */
 782
 783
 784
 785     check_at=s;
 786
 787
 788     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 789        Start with the other substr.
 790        XXXX no SCREAM optimization yet - and a very coarse implementation
 791        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 792                 *always* match.  Probably should be marked during compile...
 793        Probably it is right to do no SCREAM here...
 794      */
 795
 796     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 797                 : (prog->float_substr && prog->anchored_substr))
 798     {
 799         /* Take into account the "other" substring. */
 800         /* XXXX May be hopelessly wrong for UTF... */
 801         if (!other_last)
 802             other_last = strpos;
 803         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 804           do_other_anchored:
 805             {
 806                 char * const last = HOP3c(s, -start_shift, strbeg);
 807                 char *last1, *last2;
 808                 char * const saved_s = s;
 809                 SV* must;
 810
 811                 t = s - prog->check_offset_max;
 812                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 813                     && (!utf8_target
 814                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 815                             && t > strpos)))
 816                     NOOP;
 817                 else
 818                     t = strpos;
 819                 t = HOP3c(t, prog->anchored_offset, strend);
 820                 if (t < other_last)     /* These positions already checked */
 821                     t = other_last;
 822                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 823                 if (last < last1)
 824                     last1 = last;
 825                 /* XXXX It is not documented what units *_offsets are in.
 826                    We assume bytes, but this is clearly wrong.
 827                    Meaning this code needs to be carefully reviewed for errors.
 828                    dmq.
 829                   */
 830
 831                 /* On end-of-str: see comment below. */
 832                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 833                 if (must == &PL_sv_undef) {
 834                     s = (char*)NULL;
 835                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 836                 }
 837                 else
 838                     s = fbm_instr(
 839                         (unsigned char*)t,
 840                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 841                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 842                         must,
 843                         multiline ? FBMrf_MULTILINE : 0
 844                     );
 845                 DEBUG_EXECUTE_r({
 846                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 847                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 848                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 849                         (s ? "Found" : "Contradicts"),
 850                         quoted, RE_SV_TAIL(must));
 851                 });
 852
 853
 854                 if (!s) {
 855                     if (last1 >= last2) {
 856                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 857                                                 ", giving up...\n"));
 858                         goto fail_finish;
 859                     }
 860                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 861                         ", trying floating at offset %ld...\n",
 862                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 863                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 864                     s = HOP3c(last, 1, strend);
 865                     goto restart;
 866                 }
 867                 else {
 868                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 869                           (long)(s - i_strpos)));
 870                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 871                     other_last = HOP3c(s, 1, strend);
 872                     s = saved_s;
 873                     if (t == strpos)
 874                         goto try_at_start;
 875                     goto try_at_offset;
 876                 }
 877             }
 878         }
 879         else {          /* Take into account the floating substring. */
 880             char *last, *last1;
 881             char * const saved_s = s;
 882             SV* must;
 883
 884             t = HOP3c(s, -start_shift, strbeg);
 885             last1 = last =
 886                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 887             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 888                 last = HOP3c(t, prog->float_max_offset, strend);
 889             s = HOP3c(t, prog->float_min_offset, strend);
 890             if (s < other_last)
 891                 s = other_last;
 892  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 893             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 894             /* fbm_instr() takes into account exact value of end-of-str
 895                if the check is SvTAIL(ed).  Since false positives are OK,
 896                and end-of-str is not later than strend we are OK. */
 897             if (must == &PL_sv_undef) {
 898                 s = (char*)NULL;
 899                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 900             }
 901             else
 902                 s = fbm_instr((unsigned char*)s,
 903                               (unsigned char*)last + SvCUR(must)
 904                                   - (SvTAIL(must)!=0),
 905                               must, multiline ? FBMrf_MULTILINE : 0);
 906             DEBUG_EXECUTE_r({
 907                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 908                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 909                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 910                     (s ? "Found" : "Contradicts"),
 911                     quoted, RE_SV_TAIL(must));
 912             });
 913             if (!s) {
 914                 if (last1 == last) {
 915                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 916                                             ", giving up...\n"));
 917                     goto fail_finish;
 918                 }
 919                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 920                     ", trying anchored starting at offset %ld...\n",
 921                     (long)(saved_s + 1 - i_strpos)));
 922                 other_last = last;
 923                 s = HOP3c(t, 1, strend);
 924                 goto restart;
 925             }
 926             else {
 927                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 928                       (long)(s - i_strpos)));
 929                 other_last = s; /* Fix this later. --Hugo */
 930                 s = saved_s;
 931                 if (t == strpos)
 932                     goto try_at_start;
 933                 goto try_at_offset;
 934             }
 935         }
 936     }
 937
 938
 939     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 940
 941     DEBUG_OPTIMISE_MORE_r(
 942         PerlIO_printf(Perl_debug_log,
 943             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 944             (IV)prog->check_offset_min,
 945             (IV)prog->check_offset_max,
 946             (IV)(s-strpos),
 947             (IV)(t-strpos),
 948             (IV)(t-s),
 949             (IV)(strend-strpos)
 950         )
 951     );
 952
 953     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 954         && (!utf8_target
 955             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 956                  && t > strpos)))
 957     {
 958         /* Fixed substring is found far enough so that the match
 959            cannot start at strpos. */
 960       try_at_offset:
 961         if (ml_anch && t[-1] != '\n') {
 962             /* Eventually fbm_*() should handle this, but often
 963                anchored_offset is not 0, so this check will not be wasted. */
 964             /* XXXX In the code below we prefer to look for "^" even in
 965                presence of anchored substrings.  And we search even
 966                beyond the found float position.  These pessimizations
 967                are historical artefacts only.  */
 968           find_anchor:
 969             while (t < strend - prog->minlen) {
 970                 if (*t == '\n') {
 971                     if (t < check_at - prog->check_offset_min) {
 972                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 973                             /* Since we moved from the found position,
 974                                we definitely contradict the found anchored
 975                                substr.  Due to the above check we do not
 976                                contradict "check" substr.
 977                                Thus we can arrive here only if check substr
 978                                is float.  Redo checking for "other"=="fixed".
 979                              */
 980                             strpos = t + 1;
 981                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 982                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 983                             goto do_other_anchored;
 984                         }
 985                         /* We don't contradict the found floating substring. */
 986                         /* XXXX Why not check for STCLASS? */
 987                         s = t + 1;
 988                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 989                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 990                         goto set_useful;
 991                     }
 992                     /* Position contradicts check-string */
 993                     /* XXXX probably better to look for check-string
 994                        than for "\n", so one should lower the limit for t? */
 995                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 996                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 997                     other_last = strpos = s = t + 1;
 998                     goto restart;
 999                 }
1000                 t++;
1001             }
1002             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1003                         PL_colors[0], PL_colors[1]));
1004             goto fail_finish;
1005         }
1006         else {
1007             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1008                         PL_colors[0], PL_colors[1]));
1009         }
1010         s = t;
1011       set_useful:
1012         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1013     }
1014     else {
1015         /* The found string does not prohibit matching at strpos,
1016            - no optimization of calling REx engine can be performed,
1017            unless it was an MBOL and we are not after MBOL,
1018            or a future STCLASS check will fail this. */
1019       try_at_start:
1020         /* Even in this situation we may use MBOL flag if strpos is offset
1021            wrt the start of the string. */
1022         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1023             && (strpos != strbeg) && strpos[-1] != '\n'
1024             /* May be due to an implicit anchor of m{.*foo}  */
1025             && !(prog->intflags & PREGf_IMPLICIT))
1026         {
1027             t = strpos;
1028             goto find_anchor;
1029         }
1030         DEBUG_EXECUTE_r( if (ml_anch)
1031             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1032                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1033         );
1034       success_at_start:
1035         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1036             && (utf8_target ? (
1037                 prog->check_utf8                /* Could be deleted already */
1038                 && --BmUSEFUL(prog->check_utf8) < 0
1039                 && (prog->check_utf8 == prog->float_utf8)
1040             ) : (
1041                 prog->check_substr              /* Could be deleted already */
1042                 && --BmUSEFUL(prog->check_substr) < 0
1043                 && (prog->check_substr == prog->float_substr)
1044             )))
1045         {
1046             /* If flags & SOMETHING - do not do it many times on the same match */
1047             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1048             /* XXX Does the destruction order has to change with utf8_target? */
1049             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1050             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1051             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1052             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1053             check = NULL;                       /* abort */
1054             s = strpos;
1055             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1056                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1057             if (prog->intflags & PREGf_IMPLICIT)
1058                 prog->extflags &= ~RXf_ANCH_MBOL;
1059             /* XXXX This is a remnant of the old implementation.  It
1060                     looks wasteful, since now INTUIT can use many
1061                     other heuristics. */
1062             prog->extflags &= ~RXf_USE_INTUIT;
1063             /* XXXX What other flags might need to be cleared in this branch? */
1064         }
1065         else
1066             s = strpos;
1067     }
1068
1069     /* Last resort... */
1070     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1071     /* trie stclasses are too expensive to use here, we are better off to
1072        leave it to regmatch itself */
1073     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1074         /* minlen == 0 is possible if regstclass is \b or \B,
1075            and the fixed substr is ''$.
1076            Since minlen is already taken into account, s+1 is before strend;
1077            accidentally, minlen >= 1 guaranties no false positives at s + 1
1078            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1079            regstclass does not come from lookahead...  */
1080         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1081            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1082         const U8* const str = (U8*)STRING(progi->regstclass);
1083         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1084                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1085                     : 1);
1086         char * endpos;
1087         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1088             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1089         else if (prog->float_substr || prog->float_utf8)
1090             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1091         else
1092             endpos= strend;
1093
1094         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1095                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1096
1097         t = s;
1098         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1099         if (!s) {
1100 #ifdef DEBUGGING
1101             const char *what = NULL;
1102 #endif
1103             if (endpos == strend) {
1104                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1105                                 "Could not match STCLASS...\n") );
1106                 goto fail;
1107             }
1108             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1109                                    "This position contradicts STCLASS...\n") );
1110             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1111                 goto fail;
1112             /* Contradict one of substrings */
1113             if (prog->anchored_substr || prog->anchored_utf8) {
1114                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1115                     DEBUG_EXECUTE_r( what = "anchored" );
1116                   hop_and_restart:
1117                     s = HOP3c(t, 1, strend);
1118                     if (s + start_shift + end_shift > strend) {
1119                         /* XXXX Should be taken into account earlier? */
1120                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1121                                                "Could not match STCLASS...\n") );
1122                         goto fail;
1123                     }
1124                     if (!check)
1125                         goto giveup;
1126                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1127                                 "Looking for %s substr starting at offset %ld...\n",
1128                                  what, (long)(s + start_shift - i_strpos)) );
1129                     goto restart;
1130                 }
1131                 /* Have both, check_string is floating */
1132                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1133                     goto retry_floating_check;
1134                 /* Recheck anchored substring, but not floating... */
1135                 s = check_at;
1136                 if (!check)
1137                     goto giveup;
1138                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1139                           "Looking for anchored substr starting at offset %ld...\n",
1140                           (long)(other_last - i_strpos)) );
1141                 goto do_other_anchored;
1142             }
1143             /* Another way we could have checked stclass at the
1144                current position only: */
1145             if (ml_anch) {
1146                 s = t = t + 1;
1147                 if (!check)
1148                     goto giveup;
1149                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1150                           "Looking for /%s^%s/m starting at offset %ld...\n",
1151                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1152                 goto try_at_offset;
1153             }
1154             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1155                 goto fail;
1156             /* Check is floating substring. */
1157           retry_floating_check:
1158             t = check_at - start_shift;
1159             DEBUG_EXECUTE_r( what = "floating" );
1160             goto hop_and_restart;
1161         }
1162         if (t != s) {
1163             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1164                         "By STCLASS: moving %ld --> %ld\n",
1165                                   (long)(t - i_strpos), (long)(s - i_strpos))
1166                    );
1167         }
1168         else {
1169             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1170                                   "Does not contradict STCLASS...\n");
1171                    );
1172         }
1173     }
1174   giveup:
1175     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1176                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1177                           PL_colors[5], (long)(s - i_strpos)) );
1178     return s;
1179
1180   fail_finish:                          /* Substring not found */
1181     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1182         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1183   fail:
1184     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1185                           PL_colors[4], PL_colors[5]));
1186     return NULL;
1187 }
1188
1189 #define DECL_TRIE_TYPE(scan) \
1190     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1191                     trie_type = ((scan->flags == EXACT) \
1192                               ? (utf8_target ? trie_utf8 : trie_plain) \
1193                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1194
1195 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1196 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1197     STRLEN skiplen;                                                                 \
1198     switch (trie_type) {                                                            \
1199     case trie_utf8_fold:                                                            \
1200         if ( foldlen>0 ) {                                                          \
1201             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1202             foldlen -= len;                                                         \
1203             uscan += len;                                                           \
1204             len=0;                                                                  \
1205         } else {                                                                    \
1206             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1207             len = UTF8SKIP(uc);                                                     \
1208             skiplen = UNISKIP( uvc );                                               \
1209             foldlen -= skiplen;                                                     \
1210             uscan = foldbuf + skiplen;                                              \
1211         }                                                                           \
1212         break;                                                                      \
1213     case trie_latin_utf8_fold:                                                      \
1214         if ( foldlen>0 ) {                                                          \
1215             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1216             foldlen -= len;                                                         \
1217             uscan += len;                                                           \
1218             len=0;                                                                  \
1219         } else {                                                                    \
1220             len = 1;                                                                \
1221             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1222             skiplen = UNISKIP( uvc );                                               \
1223             foldlen -= skiplen;                                                     \
1224             uscan = foldbuf + skiplen;                                              \
1225         }                                                                           \
1226         break;                                                                      \
1227     case trie_utf8:                                                                 \
1228         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1229         break;                                                                      \
1230     case trie_plain:                                                                \
1231         uvc = (UV)*uc;                                                              \
1232         len = 1;                                                                    \
1233     }                                                                               \
1234     if (uvc < 256) {                                                                \
1235         charid = trie->charmap[ uvc ];                                              \
1236     }                                                                               \
1237     else {                                                                          \
1238         charid = 0;                                                                 \
1239         if (widecharmap) {                                                          \
1240             SV** const svpp = hv_fetch(widecharmap,                                 \
1241                         (char*)&uvc, sizeof(UV), 0);                                \
1242             if (svpp)                                                               \
1243                 charid = (U16)SvIV(*svpp);                                          \
1244         }                                                                           \
1245     }                                                                               \
1246 } STMT_END
1247
1248 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1249 STMT_START {                                              \
1250     while (s <= e) {                                      \
1251         if ( (CoNd)                                       \
1252              && (ln == 1 || folder(s, pat_string, ln))    \
1253              && (!reginfo || regtry(reginfo, &s)) )       \
1254             goto got_it;                                  \
1255         s++;                                              \
1256     }                                                     \
1257 } STMT_END
1258
1259 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1260 STMT_START {                                          \
1261     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1262         CoDe                                          \
1263         s += uskip;                                   \
1264     }                                                 \
1265 } STMT_END
1266
1267 #define REXEC_FBC_SCAN(CoDe)                          \
1268 STMT_START {                                          \
1269     while (s < strend) {                              \
1270         CoDe                                          \
1271         s++;                                          \
1272     }                                                 \
1273 } STMT_END
1274
1275 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1276 REXEC_FBC_UTF8_SCAN(                                  \
1277     if (CoNd) {                                       \
1278         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1279             goto got_it;                              \
1280         else                                          \
1281             tmp = doevery;                            \
1282     }                                                 \
1283     else                                              \
1284         tmp = 1;                                      \
1285 )
1286
1287 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1288 REXEC_FBC_SCAN(                                       \
1289     if (CoNd) {                                       \
1290         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1291             goto got_it;                              \
1292         else                                          \
1293             tmp = doevery;                            \
1294     }                                                 \
1295     else                                              \
1296         tmp = 1;                                      \
1297 )
1298
1299 #define REXEC_FBC_TRYIT               \
1300 if ((!reginfo || regtry(reginfo, &s))) \
1301     goto got_it
1302
1303 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1304     if (utf8_target) {                                             \
1305         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1306     }                                                          \
1307     else {                                                     \
1308         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1309     }
1310
1311 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1312     if (utf8_target) {                                             \
1313         UtFpReLoAd;                                            \
1314         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1315     }                                                          \
1316     else {                                                     \
1317         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1318     }
1319
1320 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1321     PL_reg_flags |= RF_tainted;                                \
1322     if (utf8_target) {                                             \
1323         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1324     }                                                          \
1325     else {                                                     \
1326         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1327     }
1328
1329 #define DUMP_EXEC_POS(li,s,doutf8) \
1330     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1331
1332
1333 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1334         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1335         tmp = TEST_NON_UTF8(tmp);                                              \
1336         REXEC_FBC_UTF8_SCAN(                                                   \
1337             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1338                 tmp = !tmp;                                                    \
1339                 IF_SUCCESS;                                                    \
1340             }                                                                  \
1341             else {                                                             \
1342                 IF_FAIL;                                                       \
1343             }                                                                  \
1344         );                                                                     \
1345
1346 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1347         if (s == PL_bostr) {                                                   \
1348             tmp = '\n';                                                        \
1349         }                                                                      \
1350         else {                                                                 \
1351             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1352             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1353         }                                                                      \
1354         tmp = TeSt1_UtF8;                                                      \
1355         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1356         REXEC_FBC_UTF8_SCAN(                                                   \
1357             if (tmp == ! (TeSt2_UtF8)) { \
1358                 tmp = !tmp;                                                    \
1359                 IF_SUCCESS;                                                    \
1360             }                                                                  \
1361             else {                                                             \
1362                 IF_FAIL;                                                       \
1363             }                                                                  \
1364         );                                                                     \
1365
1366 /* The only difference between the BOUND and NBOUND cases is that
1367  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1368  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1369  * with the other one being empty */
1370 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1371     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1372
1373 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1374     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1375
1376 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1377     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1378
1379 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1380     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1381
1382
1383 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1384  * be passed in completely with the variable name being tested, which isn't
1385  * such a clean interface, but this is easier to read than it was before.  We
1386  * are looking for the boundary (or non-boundary between a word and non-word
1387  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1388  * must be different.  Find the "wordness" of the character just prior to this
1389  * one, and compare it with the wordness of this one.  If they differ, we have
1390  * a boundary.  At the beginning of the string, pretend that the previous
1391  * character was a new-line */
1392 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1393     if (utf8_target) {                                                         \
1394                 UTF8_CODE \
1395     }                                                                          \
1396     else {  /* Not utf8 */                                                     \
1397         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1398         tmp = TEST_NON_UTF8(tmp);                                              \
1399         REXEC_FBC_SCAN(                                                        \
1400             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1401                 tmp = !tmp;                                                    \
1402                 IF_SUCCESS;                                                    \
1403             }                                                                  \
1404             else {                                                             \
1405                 IF_FAIL;                                                       \
1406             }                                                                  \
1407         );                                                                     \
1408     }                                                                          \
1409     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1410         goto got_it;
1411
1412 /* We know what class REx starts with.  Try to find this position... */
1413 /* if reginfo is NULL, its a dryrun */
1414 /* annoyingly all the vars in this routine have different names from their counterparts
1415    in regmatch. /grrr */
1416
1417 STATIC char *
1418 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1419     const char *strend, regmatch_info *reginfo)
1420 {
1421         dVAR;
1422         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1423         char *pat_string;   /* The pattern's exactish string */
1424         char *pat_end;      /* ptr to end char of pat_string */
1425         re_fold_t folder;       /* Function for computing non-utf8 folds */
1426         const U8 *fold_array;   /* array for folding ords < 256 */
1427         STRLEN ln;
1428         STRLEN lnc;
1429         register STRLEN uskip;
1430         U8 c1;
1431         U8 c2;
1432         char *e;
1433         register I32 tmp = 1;   /* Scratch variable? */
1434         register const bool utf8_target = PL_reg_match_utf8;
1435         UV utf8_fold_flags = 0;
1436         RXi_GET_DECL(prog,progi);
1437
1438         PERL_ARGS_ASSERT_FIND_BYCLASS;
1439
1440         /* We know what class it must start with. */
1441         switch (OP(c)) {
1442         case ANYOFV:
1443         case ANYOF:
1444             if (utf8_target || OP(c) == ANYOFV) {
1445                 STRLEN inclasslen = strend - s;
1446                 REXEC_FBC_UTF8_CLASS_SCAN(
1447                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1448             }
1449             else {
1450                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1451             }
1452             break;
1453         case CANY:
1454             REXEC_FBC_SCAN(
1455                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1456                     goto got_it;
1457                 else
1458                     tmp = doevery;
1459             );
1460             break;
1461
1462         case EXACTFA:
1463             if (UTF_PATTERN || utf8_target) {
1464                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1465                 goto do_exactf_utf8;
1466             }
1467             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1468             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1469             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1470
1471         case EXACTF:
1472             if (utf8_target) {
1473
1474                 /* regcomp.c already folded this if pattern is in UTF-8 */
1475                 utf8_fold_flags = 0;
1476                 goto do_exactf_utf8;
1477             }
1478             fold_array = PL_fold;
1479             folder = foldEQ;
1480             goto do_exactf_non_utf8;
1481
1482         case EXACTFL:
1483             if (UTF_PATTERN || utf8_target) {
1484                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1485                 goto do_exactf_utf8;
1486             }
1487             fold_array = PL_fold_locale;
1488             folder = foldEQ_locale;
1489             goto do_exactf_non_utf8;
1490
1491         case EXACTFU_SS:
1492             if (UTF_PATTERN) {
1493                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1494             }
1495             goto do_exactf_utf8;
1496
1497         case EXACTFU_TRICKYFOLD:
1498         case EXACTFU:
1499             if (UTF_PATTERN || utf8_target) {
1500                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1501                 goto do_exactf_utf8;
1502             }
1503
1504             /* Any 'ss' in the pattern should have been replaced by regcomp,
1505              * so we don't have to worry here about this single special case
1506              * in the Latin1 range */
1507             fold_array = PL_fold_latin1;
1508             folder = foldEQ_latin1;
1509
1510             /* FALL THROUGH */
1511
1512         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1513                                are no glitches with fold-length differences
1514                                between the target string and pattern */
1515
1516             /* The idea in the non-utf8 EXACTF* cases is to first find the
1517              * first character of the EXACTF* node and then, if necessary,
1518              * case-insensitively compare the full text of the node.  c1 is the
1519              * first character.  c2 is its fold.  This logic will not work for
1520              * Unicode semantics and the german sharp ss, which hence should
1521              * not be compiled into a node that gets here. */
1522             pat_string = STRING(c);
1523             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1524
1525             /* We know that we have to match at least 'ln' bytes (which is the
1526              * same as characters, since not utf8).  If we have to match 3
1527              * characters, and there are only 2 availabe, we know without
1528              * trying that it will fail; so don't start a match past the
1529              * required minimum number from the far end */
1530             e = HOP3c(strend, -((I32)ln), s);
1531
1532             if (!reginfo && e < s) {
1533                 e = s;                  /* Due to minlen logic of intuit() */
1534             }
1535
1536             c1 = *pat_string;
1537             c2 = fold_array[c1];
1538             if (c1 == c2) { /* If char and fold are the same */
1539                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1540             }
1541             else {
1542                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1543             }
1544             break;
1545
1546         do_exactf_utf8:
1547         {
1548             unsigned expansion;
1549
1550
1551             /* If one of the operands is in utf8, we can't use the simpler
1552              * folding above, due to the fact that many different characters
1553              * can have the same fold, or portion of a fold, or different-
1554              * length fold */
1555             pat_string = STRING(c);
1556             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1557             pat_end = pat_string + ln;
1558             lnc = (UTF_PATTERN) /* length to match in characters */
1559                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1560                     : ln;
1561
1562             /* We have 'lnc' characters to match in the pattern, but because of
1563              * multi-character folding, each character in the target can match
1564              * up to 3 characters (Unicode guarantees it will never exceed
1565              * this) if it is utf8-encoded; and up to 2 if not (based on the
1566              * fact that the Latin 1 folds are already determined, and the
1567              * only multi-char fold in that range is the sharp-s folding to
1568              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1569              * string character.  Adjust lnc accordingly, rounding up, so that
1570              * if we need to match at least 4+1/3 chars, that really is 5. */
1571             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1572             lnc = (lnc + expansion - 1) / expansion;
1573
1574             /* As in the non-UTF8 case, if we have to match 3 characters, and
1575              * only 2 are left, it's guaranteed to fail, so don't start a
1576              * match that would require us to go beyond the end of the string
1577              */
1578             e = HOP3c(strend, -((I32)lnc), s);
1579
1580             if (!reginfo && e < s) {
1581                 e = s;                  /* Due to minlen logic of intuit() */
1582             }
1583
1584             /* XXX Note that we could recalculate e to stop the loop earlier,
1585              * as the worst case expansion above will rarely be met, and as we
1586              * go along we would usually find that e moves further to the left.
1587              * This would happen only after we reached the point in the loop
1588              * where if there were no expansion we should fail.  Unclear if
1589              * worth the expense */
1590
1591             while (s <= e) {
1592                 char *my_strend= (char *)strend;
1593                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1594                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1595                     && (!reginfo || regtry(reginfo, &s)) )
1596                 {
1597                     goto got_it;
1598                 }
1599                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1600             }
1601             break;
1602         }
1603         case BOUNDL:
1604             PL_reg_flags |= RF_tainted;
1605             FBC_BOUND(isALNUM_LC,
1606                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1607                       isALNUM_LC_utf8((U8*)s));
1608             break;
1609         case NBOUNDL:
1610             PL_reg_flags |= RF_tainted;
1611             FBC_NBOUND(isALNUM_LC,
1612                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1613                        isALNUM_LC_utf8((U8*)s));
1614             break;
1615         case BOUND:
1616             FBC_BOUND(isWORDCHAR,
1617                       isALNUM_uni(tmp),
1618                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1619             break;
1620         case BOUNDA:
1621             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1622                              isWORDCHAR_A(tmp),
1623                              isWORDCHAR_A((U8*)s));
1624             break;
1625         case NBOUND:
1626             FBC_NBOUND(isWORDCHAR,
1627                        isALNUM_uni(tmp),
1628                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1629             break;
1630         case NBOUNDA:
1631             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1632                               isWORDCHAR_A(tmp),
1633                               isWORDCHAR_A((U8*)s));
1634             break;
1635         case BOUNDU:
1636             FBC_BOUND(isWORDCHAR_L1,
1637                       isALNUM_uni(tmp),
1638                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1639             break;
1640         case NBOUNDU:
1641             FBC_NBOUND(isWORDCHAR_L1,
1642                        isALNUM_uni(tmp),
1643                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1644             break;
1645         case ALNUML:
1646             REXEC_FBC_CSCAN_TAINT(
1647                 isALNUM_LC_utf8((U8*)s),
1648                 isALNUM_LC(*s)
1649             );
1650             break;
1651         case ALNUMU:
1652             REXEC_FBC_CSCAN_PRELOAD(
1653                 LOAD_UTF8_CHARCLASS_ALNUM(),
1654                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1655                 isWORDCHAR_L1((U8) *s)
1656             );
1657             break;
1658         case ALNUM:
1659             REXEC_FBC_CSCAN_PRELOAD(
1660                 LOAD_UTF8_CHARCLASS_ALNUM(),
1661                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1662                 isWORDCHAR((U8) *s)
1663             );
1664             break;
1665         case ALNUMA:
1666             /* Don't need to worry about utf8, as it can match only a single
1667              * byte invariant character */
1668             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1669             break;
1670         case NALNUMU:
1671             REXEC_FBC_CSCAN_PRELOAD(
1672                 LOAD_UTF8_CHARCLASS_ALNUM(),
1673                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1674                 ! isWORDCHAR_L1((U8) *s)
1675             );
1676             break;
1677         case NALNUM:
1678             REXEC_FBC_CSCAN_PRELOAD(
1679                 LOAD_UTF8_CHARCLASS_ALNUM(),
1680                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1681                 ! isALNUM(*s)
1682             );
1683             break;
1684         case NALNUMA:
1685             REXEC_FBC_CSCAN(
1686                 !isWORDCHAR_A(*s),
1687                 !isWORDCHAR_A(*s)
1688             );
1689             break;
1690         case NALNUML:
1691             REXEC_FBC_CSCAN_TAINT(
1692                 !isALNUM_LC_utf8((U8*)s),
1693                 !isALNUM_LC(*s)
1694             );
1695             break;
1696         case SPACEU:
1697             REXEC_FBC_CSCAN_PRELOAD(
1698                 LOAD_UTF8_CHARCLASS_SPACE(),
1699                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1700                 isSPACE_L1((U8) *s)
1701             );
1702             break;
1703         case SPACE:
1704             REXEC_FBC_CSCAN_PRELOAD(
1705                 LOAD_UTF8_CHARCLASS_SPACE(),
1706                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1707                 isSPACE((U8) *s)
1708             );
1709             break;
1710         case SPACEA:
1711             /* Don't need to worry about utf8, as it can match only a single
1712              * byte invariant character */
1713             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1714             break;
1715         case SPACEL:
1716             REXEC_FBC_CSCAN_TAINT(
1717                 isSPACE_LC_utf8((U8*)s),
1718                 isSPACE_LC(*s)
1719             );
1720             break;
1721         case NSPACEU:
1722             REXEC_FBC_CSCAN_PRELOAD(
1723                 LOAD_UTF8_CHARCLASS_SPACE(),
1724                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1725                 ! isSPACE_L1((U8) *s)
1726             );
1727             break;
1728         case NSPACE:
1729             REXEC_FBC_CSCAN_PRELOAD(
1730                 LOAD_UTF8_CHARCLASS_SPACE(),
1731                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1732                 ! isSPACE((U8) *s)
1733             );
1734             break;
1735         case NSPACEA:
1736             REXEC_FBC_CSCAN(
1737                 !isSPACE_A(*s),
1738                 !isSPACE_A(*s)
1739             );
1740             break;
1741         case NSPACEL:
1742             REXEC_FBC_CSCAN_TAINT(
1743                 !isSPACE_LC_utf8((U8*)s),
1744                 !isSPACE_LC(*s)
1745             );
1746             break;
1747         case DIGIT:
1748             REXEC_FBC_CSCAN_PRELOAD(
1749                 LOAD_UTF8_CHARCLASS_DIGIT(),
1750                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1751                 isDIGIT(*s)
1752             );
1753             break;
1754         case DIGITA:
1755             /* Don't need to worry about utf8, as it can match only a single
1756              * byte invariant character */
1757             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1758             break;
1759         case DIGITL:
1760             REXEC_FBC_CSCAN_TAINT(
1761                 isDIGIT_LC_utf8((U8*)s),
1762                 isDIGIT_LC(*s)
1763             );
1764             break;
1765         case NDIGIT:
1766             REXEC_FBC_CSCAN_PRELOAD(
1767                 LOAD_UTF8_CHARCLASS_DIGIT(),
1768                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1769                 !isDIGIT(*s)
1770             );
1771             break;
1772         case NDIGITA:
1773             REXEC_FBC_CSCAN(
1774                 !isDIGIT_A(*s),
1775                 !isDIGIT_A(*s)
1776             );
1777             break;
1778         case NDIGITL:
1779             REXEC_FBC_CSCAN_TAINT(
1780                 !isDIGIT_LC_utf8((U8*)s),
1781                 !isDIGIT_LC(*s)
1782             );
1783             break;
1784         case LNBREAK:
1785             REXEC_FBC_CSCAN(
1786                 is_LNBREAK_utf8(s),
1787                 is_LNBREAK_latin1(s)
1788             );
1789             break;
1790         case VERTWS:
1791             REXEC_FBC_CSCAN(
1792                 is_VERTWS_utf8(s),
1793                 is_VERTWS_latin1(s)
1794             );
1795             break;
1796         case NVERTWS:
1797             REXEC_FBC_CSCAN(
1798                 !is_VERTWS_utf8(s),
1799                 !is_VERTWS_latin1(s)
1800             );
1801             break;
1802         case HORIZWS:
1803             REXEC_FBC_CSCAN(
1804                 is_HORIZWS_utf8(s),
1805                 is_HORIZWS_latin1(s)
1806             );
1807             break;
1808         case NHORIZWS:
1809             REXEC_FBC_CSCAN(
1810                 !is_HORIZWS_utf8(s),
1811                 !is_HORIZWS_latin1(s)
1812             );
1813             break;
1814         case AHOCORASICKC:
1815         case AHOCORASICK:
1816             {
1817                 DECL_TRIE_TYPE(c);
1818                 /* what trie are we using right now */
1819                 reg_ac_data *aho
1820                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1821                 reg_trie_data *trie
1822                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1823                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1824
1825                 const char *last_start = strend - trie->minlen;
1826 #ifdef DEBUGGING
1827                 const char *real_start = s;
1828 #endif
1829                 STRLEN maxlen = trie->maxlen;
1830                 SV *sv_points;
1831                 U8 **points; /* map of where we were in the input string
1832                                 when reading a given char. For ASCII this
1833                                 is unnecessary overhead as the relationship
1834                                 is always 1:1, but for Unicode, especially
1835                                 case folded Unicode this is not true. */
1836                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1837                 U8 *bitmap=NULL;
1838
1839
1840                 GET_RE_DEBUG_FLAGS_DECL;
1841
1842                 /* We can't just allocate points here. We need to wrap it in
1843                  * an SV so it gets freed properly if there is a croak while
1844                  * running the match */
1845                 ENTER;
1846                 SAVETMPS;
1847                 sv_points=newSV(maxlen * sizeof(U8 *));
1848                 SvCUR_set(sv_points,
1849                     maxlen * sizeof(U8 *));
1850                 SvPOK_on(sv_points);
1851                 sv_2mortal(sv_points);
1852                 points=(U8**)SvPV_nolen(sv_points );
1853                 if ( trie_type != trie_utf8_fold
1854                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1855                 {
1856                     if (trie->bitmap)
1857                         bitmap=(U8*)trie->bitmap;
1858                     else
1859                         bitmap=(U8*)ANYOF_BITMAP(c);
1860                 }
1861                 /* this is the Aho-Corasick algorithm modified a touch
1862                    to include special handling for long "unknown char"
1863                    sequences. The basic idea being that we use AC as long
1864                    as we are dealing with a possible matching char, when
1865                    we encounter an unknown char (and we have not encountered
1866                    an accepting state) we scan forward until we find a legal
1867                    starting char.
1868                    AC matching is basically that of trie matching, except
1869                    that when we encounter a failing transition, we fall back
1870                    to the current states "fail state", and try the current char
1871                    again, a process we repeat until we reach the root state,
1872                    state 1, or a legal transition. If we fail on the root state
1873                    then we can either terminate if we have reached an accepting
1874                    state previously, or restart the entire process from the beginning
1875                    if we have not.
1876
1877                  */
1878                 while (s <= last_start) {
1879                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1880                     U8 *uc = (U8*)s;
1881                     U16 charid = 0;
1882                     U32 base = 1;
1883                     U32 state = 1;
1884                     UV uvc = 0;
1885                     STRLEN len = 0;
1886                     STRLEN foldlen = 0;
1887                     U8 *uscan = (U8*)NULL;
1888                     U8 *leftmost = NULL;
1889 #ifdef DEBUGGING
1890                     U32 accepted_word= 0;
1891 #endif
1892                     U32 pointpos = 0;
1893
1894                     while ( state && uc <= (U8*)strend ) {
1895                         int failed=0;
1896                         U32 word = aho->states[ state ].wordnum;
1897
1898                         if( state==1 ) {
1899                             if ( bitmap ) {
1900                                 DEBUG_TRIE_EXECUTE_r(
1901                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1902                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1903                                             (char *)uc, utf8_target );
1904                                         PerlIO_printf( Perl_debug_log,
1905                                             " Scanning for legal start char...\n");
1906                                     }
1907                                 );
1908                                 if (utf8_target) {
1909                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1910                                         uc += UTF8SKIP(uc);
1911                                     }
1912                                 } else {
1913                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1914                                         uc++;
1915                                     }
1916                                 }
1917                                 s= (char *)uc;
1918                             }
1919                             if (uc >(U8*)last_start) break;
1920                         }
1921
1922                         if ( word ) {
1923                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1924                             if (!leftmost || lpos < leftmost) {
1925                                 DEBUG_r(accepted_word=word);
1926                                 leftmost= lpos;
1927                             }
1928                             if (base==0) break;
1929
1930                         }
1931                         points[pointpos++ % maxlen]= uc;
1932                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1933                                              uscan, len, uvc, charid, foldlen,
1934                                              foldbuf, uniflags);
1935                         DEBUG_TRIE_EXECUTE_r({
1936                             dump_exec_pos( (char *)uc, c, strend, real_start,
1937                                 s,   utf8_target );
1938                             PerlIO_printf(Perl_debug_log,
1939                                 " Charid:%3u CP:%4"UVxf" ",
1940                                  charid, uvc);
1941                         });
1942
1943                         do {
1944 #ifdef DEBUGGING
1945                             word = aho->states[ state ].wordnum;
1946 #endif
1947                             base = aho->states[ state ].trans.base;
1948
1949                             DEBUG_TRIE_EXECUTE_r({
1950                                 if (failed)
1951                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1952                                         s,   utf8_target );
1953                                 PerlIO_printf( Perl_debug_log,
1954                                     "%sState: %4"UVxf", word=%"UVxf,
1955                                     failed ? " Fail transition to " : "",
1956                                     (UV)state, (UV)word);
1957                             });
1958                             if ( base ) {
1959                                 U32 tmp;
1960                                 I32 offset;
1961                                 if (charid &&
1962                                      ( ((offset = base + charid
1963                                         - 1 - trie->uniquecharcount)) >= 0)
1964                                      && ((U32)offset < trie->lasttrans)
1965                                      && trie->trans[offset].check == state
1966                                      && (tmp=trie->trans[offset].next))
1967                                 {
1968                                     DEBUG_TRIE_EXECUTE_r(
1969                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1970                                     state = tmp;
1971                                     break;
1972                                 }
1973                                 else {
1974                                     DEBUG_TRIE_EXECUTE_r(
1975                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1976                                     failed = 1;
1977                                     state = aho->fail[state];
1978                                 }
1979                             }
1980                             else {
1981                                 /* we must be accepting here */
1982                                 DEBUG_TRIE_EXECUTE_r(
1983                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1984                                 failed = 1;
1985                                 break;
1986                             }
1987                         } while(state);
1988                         uc += len;
1989                         if (failed) {
1990                             if (leftmost)
1991                                 break;
1992                             if (!state) state = 1;
1993                         }
1994                     }
1995                     if ( aho->states[ state ].wordnum ) {
1996                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1997                         if (!leftmost || lpos < leftmost) {
1998                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1999                             leftmost = lpos;
2000                         }
2001                     }
2002                     if (leftmost) {
2003                         s = (char*)leftmost;
2004                         DEBUG_TRIE_EXECUTE_r({
2005                             PerlIO_printf(
2006                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2007                                 (UV)accepted_word, (IV)(s - real_start)
2008                             );
2009                         });
2010                         if (!reginfo || regtry(reginfo, &s)) {
2011                             FREETMPS;
2012                             LEAVE;
2013                             goto got_it;
2014                         }
2015                         s = HOPc(s,1);
2016                         DEBUG_TRIE_EXECUTE_r({
2017                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2018                         });
2019                     } else {
2020                         DEBUG_TRIE_EXECUTE_r(
2021                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2022                         break;
2023                     }
2024                 }
2025                 FREETMPS;
2026                 LEAVE;
2027             }
2028             break;
2029         default:
2030             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2031             break;
2032         }
2033         return 0;
2034       got_it:
2035         return s;
2036 }
2037
2038
2039 /*
2040  - regexec_flags - match a regexp against a string
2041  */
2042 I32
2043 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2044               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2045 /* strend: pointer to null at end of string */
2046 /* strbeg: real beginning of string */
2047 /* minend: end of match must be >=minend after stringarg. */
2048 /* data: May be used for some additional optimizations.
2049          Currently its only used, with a U32 cast, for transmitting
2050          the ganch offset when doing a /g match. This will change */
2051 /* nosave: For optimizations. */
2052 {
2053     dVAR;
2054     struct regexp *const prog = (struct regexp *)SvANY(rx);
2055     /*register*/ char *s;
2056     register regnode *c;
2057     /*register*/ char *startpos = stringarg;
2058     I32 minlen;         /* must match at least this many chars */
2059     I32 dontbother = 0; /* how many characters not to try at end */
2060     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2061     I32 scream_pos = -1;                /* Internal iterator of scream. */
2062     char *scream_olds = NULL;
2063     const bool utf8_target = cBOOL(DO_UTF8(sv));
2064     I32 multiline;
2065     RXi_GET_DECL(prog,progi);
2066     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2067     regexp_paren_pair *swap = NULL;
2068     GET_RE_DEBUG_FLAGS_DECL;
2069
2070     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2071     PERL_UNUSED_ARG(data);
2072
2073     /* Be paranoid... */
2074     if (prog == NULL || startpos == NULL) {
2075         Perl_croak(aTHX_ "NULL regexp parameter");
2076         return 0;
2077     }
2078
2079     multiline = prog->extflags & RXf_PMf_MULTILINE;
2080     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2081
2082     RX_MATCH_UTF8_set(rx, utf8_target);
2083     DEBUG_EXECUTE_r(
2084         debug_start_match(rx, utf8_target, startpos, strend,
2085         "Matching");
2086     );
2087
2088     minlen = prog->minlen;
2089
2090     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2091         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2092                               "String too short [regexec_flags]...\n"));
2093         goto phooey;
2094     }
2095
2096
2097     /* Check validity of program. */
2098     if (UCHARAT(progi->program) != REG_MAGIC) {
2099         Perl_croak(aTHX_ "corrupted regexp program");
2100     }
2101
2102     PL_reg_flags = 0;
2103     PL_reg_eval_set = 0;
2104     PL_reg_maxiter = 0;
2105
2106     if (RX_UTF8(rx))
2107         PL_reg_flags |= RF_utf8;
2108
2109     /* Mark beginning of line for ^ and lookbehind. */
2110     reginfo.bol = startpos; /* XXX not used ??? */
2111     PL_bostr  = strbeg;
2112     reginfo.sv = sv;
2113
2114     /* Mark end of line for $ (and such) */
2115     PL_regeol = strend;
2116
2117     /* see how far we have to get to not match where we matched before */
2118     reginfo.till = startpos+minend;
2119
2120     /* If there is a "must appear" string, look for it. */
2121     s = startpos;
2122
2123     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2124         MAGIC *mg;
2125         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2126             reginfo.ganch = startpos + prog->gofs;
2127             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2128               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2129         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2130                   && SvMAGIC(sv)
2131                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2132                   && mg->mg_len >= 0) {
2133             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2134             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2135                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2136
2137             if (prog->extflags & RXf_ANCH_GPOS) {
2138                 if (s > reginfo.ganch)
2139                     goto phooey;
2140                 s = reginfo.ganch - prog->gofs;
2141                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2142                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2143                 if (s < strbeg)
2144                     goto phooey;
2145             }
2146         }
2147         else if (data) {
2148             reginfo.ganch = strbeg + PTR2UV(data);
2149             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2150                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2151
2152         } else {                                /* pos() not defined */
2153             reginfo.ganch = strbeg;
2154             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2155                  "GPOS: reginfo.ganch = strbeg\n"));
2156         }
2157     }
2158     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2159         /* We have to be careful. If the previous successful match
2160            was from this regex we don't want a subsequent partially
2161            successful match to clobber the old results.
2162            So when we detect this possibility we add a swap buffer
2163            to the re, and switch the buffer each match. If we fail
2164            we switch it back, otherwise we leave it swapped.
2165         */
2166         swap = prog->offs;
2167         /* do we need a save destructor here for eval dies? */
2168         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2169     }
2170     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2171         re_scream_pos_data d;
2172
2173         d.scream_olds = &scream_olds;
2174         d.scream_pos = &scream_pos;
2175         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2176         if (!s) {
2177             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2178             goto phooey;        /* not present */
2179         }
2180     }
2181
2182
2183
2184     /* Simplest case:  anchored match need be tried only once. */
2185     /*  [unless only anchor is BOL and multiline is set] */
2186     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2187         if (s == startpos && regtry(&reginfo, &startpos))
2188             goto got_it;
2189         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2190                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2191         {
2192             char *end;
2193
2194             if (minlen)
2195                 dontbother = minlen - 1;
2196             end = HOP3c(strend, -dontbother, strbeg) - 1;
2197             /* for multiline we only have to try after newlines */
2198             if (prog->check_substr || prog->check_utf8) {
2199                 /* because of the goto we can not easily reuse the macros for bifurcating the
2200                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2201                 if (utf8_target) {
2202                     if (s == startpos)
2203                         goto after_try_utf8;
2204                     while (1) {
2205                         if (regtry(&reginfo, &s)) {
2206                             goto got_it;
2207                         }
2208                       after_try_utf8:
2209                         if (s > end) {
2210                             goto phooey;
2211                         }
2212                         if (prog->extflags & RXf_USE_INTUIT) {
2213                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2214                             if (!s) {
2215                                 goto phooey;
2216                             }
2217                         }
2218                         else {
2219                             s += UTF8SKIP(s);
2220                         }
2221                     }
2222                 } /* end search for check string in unicode */
2223                 else {
2224                     if (s == startpos) {
2225                         goto after_try_latin;
2226                     }
2227                     while (1) {
2228                         if (regtry(&reginfo, &s)) {
2229                             goto got_it;
2230                         }
2231                       after_try_latin:
2232                         if (s > end) {
2233                             goto phooey;
2234                         }
2235                         if (prog->extflags & RXf_USE_INTUIT) {
2236                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2237                             if (!s) {
2238                                 goto phooey;
2239                             }
2240                         }
2241                         else {
2242                             s++;
2243                         }
2244                     }
2245                 } /* end search for check string in latin*/
2246             } /* end search for check string */
2247             else { /* search for newline */
2248                 if (s > startpos) {
2249                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2250                     s--;
2251                 }
2252                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2253                 while (s <= end) { /* note it could be possible to match at the end of the string */
2254                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2255                         if (regtry(&reginfo, &s))
2256                             goto got_it;
2257                     }
2258                 }
2259             } /* end search for newline */
2260         } /* end anchored/multiline check string search */
2261         goto phooey;
2262     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2263     {
2264         /* the warning about reginfo.ganch being used without initialization
2265            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2266            and we only enter this block when the same bit is set. */
2267         char *tmp_s = reginfo.ganch - prog->gofs;
2268
2269         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2270             goto got_it;
2271         goto phooey;
2272     }
2273
2274     /* Messy cases:  unanchored match. */
2275     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2276         /* we have /x+whatever/ */
2277         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2278         char ch;
2279 #ifdef DEBUGGING
2280         int did_match = 0;
2281 #endif
2282         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2283             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2284         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2285
2286         if (utf8_target) {
2287             REXEC_FBC_SCAN(
2288                 if (*s == ch) {
2289                     DEBUG_EXECUTE_r( did_match = 1 );
2290                     if (regtry(&reginfo, &s)) goto got_it;
2291                     s += UTF8SKIP(s);
2292                     while (s < strend && *s == ch)
2293                         s += UTF8SKIP(s);
2294                 }
2295             );
2296         }
2297         else {
2298             REXEC_FBC_SCAN(
2299                 if (*s == ch) {
2300                     DEBUG_EXECUTE_r( did_match = 1 );
2301                     if (regtry(&reginfo, &s)) goto got_it;
2302                     s++;
2303                     while (s < strend && *s == ch)
2304                         s++;
2305                 }
2306             );
2307         }
2308         DEBUG_EXECUTE_r(if (!did_match)
2309                 PerlIO_printf(Perl_debug_log,
2310                                   "Did not find anchored character...\n")
2311                );
2312     }
2313     else if (prog->anchored_substr != NULL
2314               || prog->anchored_utf8 != NULL
2315               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2316                   && prog->float_max_offset < strend - s)) {
2317         SV *must;
2318         I32 back_max;
2319         I32 back_min;
2320         char *last;
2321         char *last1;            /* Last position checked before */
2322 #ifdef DEBUGGING
2323         int did_match = 0;
2324 #endif
2325         if (prog->anchored_substr || prog->anchored_utf8) {
2326             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2327                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2328             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2329             back_max = back_min = prog->anchored_offset;
2330         } else {
2331             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2332                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2333             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2334             back_max = prog->float_max_offset;
2335             back_min = prog->float_min_offset;
2336         }
2337
2338
2339         if (must == &PL_sv_undef)
2340             /* could not downgrade utf8 check substring, so must fail */
2341             goto phooey;
2342
2343         if (back_min<0) {
2344             last = strend;
2345         } else {
2346             last = HOP3c(strend,        /* Cannot start after this */
2347                   -(I32)(CHR_SVLEN(must)
2348                          - (SvTAIL(must) != 0) + back_min), strbeg);
2349         }
2350         if (s > PL_bostr)
2351             last1 = HOPc(s, -1);
2352         else
2353             last1 = s - 1;      /* bogus */
2354
2355         /* XXXX check_substr already used to find "s", can optimize if
2356            check_substr==must. */
2357         scream_pos = -1;
2358         dontbother = end_shift;
2359         strend = HOPc(strend, -dontbother);
2360         while ( (s <= last) &&
2361                 ((flags & REXEC_SCREAM) && SvSCREAM(sv)
2362                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2363                                     end_shift, &scream_pos, 0))
2364                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2365                                   (unsigned char*)strend, must,
2366                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2367             /* we may be pointing at the wrong string */
2368             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2369                 s = strbeg + (s - SvPVX_const(sv));
2370             DEBUG_EXECUTE_r( did_match = 1 );
2371             if (HOPc(s, -back_max) > last1) {
2372                 last1 = HOPc(s, -back_min);
2373                 s = HOPc(s, -back_max);
2374             }
2375             else {
2376                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2377
2378                 last1 = HOPc(s, -back_min);
2379                 s = t;
2380             }
2381             if (utf8_target) {
2382                 while (s <= last1) {
2383                     if (regtry(&reginfo, &s))
2384                         goto got_it;
2385                     s += UTF8SKIP(s);
2386                 }
2387             }
2388             else {
2389                 while (s <= last1) {
2390                     if (regtry(&reginfo, &s))
2391                         goto got_it;
2392                     s++;
2393                 }
2394             }
2395         }
2396         DEBUG_EXECUTE_r(if (!did_match) {
2397             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2398                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2399             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2400                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2401                                ? "anchored" : "floating"),
2402                 quoted, RE_SV_TAIL(must));
2403         });
2404         goto phooey;
2405     }
2406     else if ( (c = progi->regstclass) ) {
2407         if (minlen) {
2408             const OPCODE op = OP(progi->regstclass);
2409             /* don't bother with what can't match */
2410             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2411                 strend = HOPc(strend, -(minlen - 1));
2412         }
2413         DEBUG_EXECUTE_r({
2414             SV * const prop = sv_newmortal();
2415             regprop(prog, prop, c);
2416             {
2417                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2418                     s,strend-s,60);
2419                 PerlIO_printf(Perl_debug_log,
2420                     "Matching stclass %.*s against %s (%d bytes)\n",
2421                     (int)SvCUR(prop), SvPVX_const(prop),
2422                      quoted, (int)(strend - s));
2423             }
2424         });
2425         if (find_byclass(prog, c, s, strend, &reginfo))
2426             goto got_it;
2427         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2428     }
2429     else {
2430         dontbother = 0;
2431         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2432             /* Trim the end. */
2433             char *last= NULL;
2434             SV* float_real;
2435
2436             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2437                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2438             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2439
2440             if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
2441                 last = screaminstr(sv, float_real, s - strbeg,
2442                                    end_shift, &scream_pos, 1); /* last one */
2443                 if (!last)
2444                     last = scream_olds; /* Only one occurrence. */
2445                 /* we may be pointing at the wrong string */
2446                 else if (RXp_MATCH_COPIED(prog))
2447                     s = strbeg + (s - SvPVX_const(sv));
2448             }
2449             else {
2450                 STRLEN len;
2451                 const char * const little = SvPV_const(float_real, len);
2452                 if (SvTAIL(float_real)) {
2453                     /* This means that float_real contains an artificial \n on the end
2454                      * due to the presence of something like this: /foo$/
2455                      * where we can match both "foo" and "foo\n" at the end of the string.
2456                      * So we have to compare the end of the string first against the float_real
2457                      * without the \n and then against the full float_real with the string.
2458                      * We have to watch out for cases where the string might be smaller
2459                      * than the float_real or the float_real without the \n.
2460                      */
2461                     char *checkpos= strend - len;
2462                     DEBUG_OPTIMISE_r(
2463                         PerlIO_printf(Perl_debug_log,
2464                             "%sChecking for float_real.%s\n",
2465                             PL_colors[4], PL_colors[5]));
2466                     if (checkpos + 1 < strbeg) {
2467                         /* can't match, even if we remove the trailing \n string is too short to match */
2468                         DEBUG_EXECUTE_r(
2469                             PerlIO_printf(Perl_debug_log,
2470                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2471                                 PL_colors[4], PL_colors[5]));
2472                         goto phooey;
2473                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2474                         /* can match, the end of the string matches without the "\n" */
2475                         last = checkpos + 1;
2476                     } else if (checkpos < strbeg) {
2477                         /* cant match, string is too short when the "\n" is included */
2478                         DEBUG_EXECUTE_r(
2479                             PerlIO_printf(Perl_debug_log,
2480                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2481                                 PL_colors[4], PL_colors[5]));
2482                         goto phooey;
2483                     } else if (!multiline) {
2484                         /* non multiline match, so compare with the "\n" at the end of the string */
2485                         if (memEQ(checkpos, little, len)) {
2486                             last= checkpos;
2487                         } else {
2488                             DEBUG_EXECUTE_r(
2489                                 PerlIO_printf(Perl_debug_log,
2490                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2491                                     PL_colors[4], PL_colors[5]));
2492                             goto phooey;
2493                         }
2494                     } else {
2495                         /* multiline match, so we have to search for a place where the full string is located */
2496                         goto find_last;
2497                     }
2498                 } else {
2499                   find_last:
2500                     if (len)
2501                         last = rninstr(s, strend, little, little + len);
2502                     else
2503                         last = strend;  /* matching "$" */
2504                 }
2505             }
2506             if (!last) {
2507                 /* at one point this block contained a comment which was probably
2508                  * incorrect, which said that this was a "should not happen" case.
2509                  * Even if it was true when it was written I am pretty sure it is
2510                  * not anymore, so I have removed the comment and replaced it with
2511                  * this one. Yves */
2512                 DEBUG_EXECUTE_r(
2513                     PerlIO_printf(Perl_debug_log,
2514                         "String does not contain required substring, cannot match.\n"
2515                     ));
2516                 goto phooey;
2517             }
2518             dontbother = strend - last + prog->float_min_offset;
2519         }
2520         if (minlen && (dontbother < minlen))
2521             dontbother = minlen - 1;
2522         strend -= dontbother;              /* this one's always in bytes! */
2523         /* We don't know much -- general case. */
2524         if (utf8_target) {
2525             for (;;) {
2526                 if (regtry(&reginfo, &s))
2527                     goto got_it;
2528                 if (s >= strend)
2529                     break;
2530                 s += UTF8SKIP(s);
2531             };
2532         }
2533         else {
2534             do {
2535                 if (regtry(&reginfo, &s))
2536                     goto got_it;
2537             } while (s++ < strend);
2538         }
2539     }
2540
2541     /* Failure. */
2542     goto phooey;
2543
2544 got_it:
2545     Safefree(swap);
2546     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2547
2548     if (PL_reg_eval_set)
2549         restore_pos(aTHX_ prog);
2550     if (RXp_PAREN_NAMES(prog))
2551         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2552
2553     /* make sure $`, $&, $', and $digit will work later */
2554     if ( !(flags & REXEC_NOT_FIRST) ) {
2555         RX_MATCH_COPY_FREE(rx);
2556         if (flags & REXEC_COPY_STR) {
2557             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2558 #ifdef PERL_OLD_COPY_ON_WRITE
2559             if ((SvIsCOW(sv)
2560                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2561                 if (DEBUG_C_TEST) {
2562                     PerlIO_printf(Perl_debug_log,
2563                                   "Copy on write: regexp capture, type %d\n",
2564                                   (int) SvTYPE(sv));
2565                 }
2566                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2567                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2568                 assert (SvPOKp(prog->saved_copy));
2569             } else
2570 #endif
2571             {
2572                 RX_MATCH_COPIED_on(rx);
2573                 s = savepvn(strbeg, i);
2574                 prog->subbeg = s;
2575             }
2576             prog->sublen = i;
2577         }
2578         else {
2579             prog->subbeg = strbeg;
2580             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2581         }
2582     }
2583
2584     return 1;
2585
2586 phooey:
2587     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2588                           PL_colors[4], PL_colors[5]));
2589     if (PL_reg_eval_set)
2590         restore_pos(aTHX_ prog);
2591     if (swap) {
2592         /* we failed :-( roll it back */
2593         Safefree(prog->offs);
2594         prog->offs = swap;
2595     }
2596
2597     return 0;
2598 }
2599
2600
2601 /*
2602  - regtry - try match at specific point
2603  */
2604 STATIC I32                      /* 0 failure, 1 success */
2605 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2606 {
2607     dVAR;
2608     CHECKPOINT lastcp;
2609     REGEXP *const rx = reginfo->prog;
2610     regexp *const prog = (struct regexp *)SvANY(rx);
2611     RXi_GET_DECL(prog,progi);
2612     GET_RE_DEBUG_FLAGS_DECL;
2613
2614     PERL_ARGS_ASSERT_REGTRY;
2615
2616     reginfo->cutpoint=NULL;
2617
2618     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2619         MAGIC *mg;
2620
2621         PL_reg_eval_set = RS_init;
2622         DEBUG_EXECUTE_r(DEBUG_s(
2623             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2624                           (IV)(PL_stack_sp - PL_stack_base));
2625             ));
2626         SAVESTACK_CXPOS();
2627         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2628         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2629         SAVETMPS;
2630         /* Apparently this is not needed, judging by wantarray. */
2631         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2632            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2633
2634         if (reginfo->sv) {
2635             /* Make $_ available to executed code. */
2636             if (reginfo->sv != DEFSV) {
2637                 SAVE_DEFSV;
2638                 DEFSV_set(reginfo->sv);
2639             }
2640
2641             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2642                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2643                 /* prepare for quick setting of pos */
2644 #ifdef PERL_OLD_COPY_ON_WRITE
2645                 if (SvIsCOW(reginfo->sv))
2646                     sv_force_normal_flags(reginfo->sv, 0);
2647 #endif
2648                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2649                                  &PL_vtbl_mglob, NULL, 0);
2650                 mg->mg_len = -1;
2651             }
2652             PL_reg_magic    = mg;
2653             PL_reg_oldpos   = mg->mg_len;
2654             SAVEDESTRUCTOR_X(restore_pos, prog);
2655         }
2656         if (!PL_reg_curpm) {
2657             Newxz(PL_reg_curpm, 1, PMOP);
2658 #ifdef USE_ITHREADS
2659             {
2660                 SV* const repointer = &PL_sv_undef;
2661                 /* this regexp is also owned by the new PL_reg_curpm, which
2662                    will try to free it.  */
2663                 av_push(PL_regex_padav, repointer);
2664                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2665                 PL_regex_pad = AvARRAY(PL_regex_padav);
2666             }
2667 #endif
2668         }
2669 #ifdef USE_ITHREADS
2670         /* It seems that non-ithreads works both with and without this code.
2671            So for efficiency reasons it seems best not to have the code
2672            compiled when it is not needed.  */
2673         /* This is safe against NULLs: */
2674         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2675         /* PM_reg_curpm owns a reference to this regexp.  */
2676         (void)ReREFCNT_inc(rx);
2677 #endif
2678         PM_SETRE(PL_reg_curpm, rx);
2679         PL_reg_oldcurpm = PL_curpm;
2680         PL_curpm = PL_reg_curpm;
2681         if (RXp_MATCH_COPIED(prog)) {
2682             /*  Here is a serious problem: we cannot rewrite subbeg,
2683                 since it may be needed if this match fails.  Thus
2684                 $` inside (?{}) could fail... */
2685             PL_reg_oldsaved = prog->subbeg;
2686             PL_reg_oldsavedlen = prog->sublen;
2687 #ifdef PERL_OLD_COPY_ON_WRITE
2688             PL_nrs = prog->saved_copy;
2689 #endif
2690             RXp_MATCH_COPIED_off(prog);
2691         }
2692         else
2693             PL_reg_oldsaved = NULL;
2694         prog->subbeg = PL_bostr;
2695         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2696     }
2697     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2698     prog->offs[0].start = *startpos - PL_bostr;
2699     PL_reginput = *startpos;
2700     PL_reglastparen = &prog->lastparen;
2701     PL_reglastcloseparen = &prog->lastcloseparen;
2702     prog->lastparen = 0;
2703     prog->lastcloseparen = 0;
2704     PL_regsize = 0;
2705     PL_regoffs = prog->offs;
2706     if (PL_reg_start_tmpl <= prog->nparens) {
2707         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2708         if(PL_reg_start_tmp)
2709             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2710         else
2711             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2712     }
2713
2714     /* XXXX What this code is doing here?!!!  There should be no need
2715        to do this again and again, PL_reglastparen should take care of
2716        this!  --ilya*/
2717
2718     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2719      * Actually, the code in regcppop() (which Ilya may be meaning by
2720      * PL_reglastparen), is not needed at all by the test suite
2721      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2722      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2723      * Meanwhile, this code *is* needed for the
2724      * above-mentioned test suite tests to succeed.  The common theme
2725      * on those tests seems to be returning null fields from matches.
2726      * --jhi updated by dapm */
2727 #if 1
2728     if (prog->nparens) {
2729         regexp_paren_pair *pp = PL_regoffs;
2730         register I32 i;
2731         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2732             ++pp;
2733             pp->start = -1;
2734             pp->end = -1;
2735         }
2736     }
2737 #endif
2738     REGCP_SET(lastcp);
2739     if (regmatch(reginfo, progi->program + 1)) {
2740         PL_regoffs[0].end = PL_reginput - PL_bostr;
2741         return 1;
2742     }
2743     if (reginfo->cutpoint)
2744         *startpos= reginfo->cutpoint;
2745     REGCP_UNWIND(lastcp);
2746     return 0;
2747 }
2748
2749
2750 #define sayYES goto yes
2751 #define sayNO goto no
2752 #define sayNO_SILENT goto no_silent
2753
2754 /* we dont use STMT_START/END here because it leads to
2755    "unreachable code" warnings, which are bogus, but distracting. */
2756 #define CACHEsayNO \
2757     if (ST.cache_mask) \
2758        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2759     sayNO
2760
2761 /* this is used to determine how far from the left messages like
2762    'failed...' are printed. It should be set such that messages
2763    are inline with the regop output that created them.
2764 */
2765 #define REPORT_CODE_OFF 32
2766
2767
2768 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2769 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2770
2771 #define SLAB_FIRST(s) (&(s)->states[0])
2772 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2773
2774 /* grab a new slab and return the first slot in it */
2775
2776 STATIC regmatch_state *
2777 S_push_slab(pTHX)
2778 {
2779 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2780     dMY_CXT;
2781 #endif
2782     regmatch_slab *s = PL_regmatch_slab->next;
2783     if (!s) {
2784         Newx(s, 1, regmatch_slab);
2785         s->prev = PL_regmatch_slab;
2786         s->next = NULL;
2787         PL_regmatch_slab->next = s;
2788     }
2789     PL_regmatch_slab = s;
2790     return SLAB_FIRST(s);
2791 }
2792
2793
2794 /* push a new state then goto it */
2795
2796 #define PUSH_STATE_GOTO(state, node) \
2797     scan = node; \
2798     st->resume_state = state; \
2799     goto push_state;
2800
2801 /* push a new state with success backtracking, then goto it */
2802
2803 #define PUSH_YES_STATE_GOTO(state, node) \
2804     scan = node; \
2805     st->resume_state = state; \
2806     goto push_yes_state;
2807
2808
2809
2810 /*
2811
2812 regmatch() - main matching routine
2813
2814 This is basically one big switch statement in a loop. We execute an op,
2815 set 'next' to point the next op, and continue. If we come to a point which
2816 we may need to backtrack to on failure such as (A|B|C), we push a
2817 backtrack state onto the backtrack stack. On failure, we pop the top
2818 state, and re-enter the loop at the state indicated. If there are no more
2819 states to pop, we return failure.
2820
2821 Sometimes we also need to backtrack on success; for example /A+/, where
2822 after successfully matching one A, we need to go back and try to
2823 match another one; similarly for lookahead assertions: if the assertion
2824 completes successfully, we backtrack to the state just before the assertion
2825 and then carry on.  In these cases, the pushed state is marked as
2826 'backtrack on success too'. This marking is in fact done by a chain of
2827 pointers, each pointing to the previous 'yes' state. On success, we pop to
2828 the nearest yes state, discarding any intermediate failure-only states.
2829 Sometimes a yes state is pushed just to force some cleanup code to be
2830 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2831 it to free the inner regex.
2832
2833 Note that failure backtracking rewinds the cursor position, while
2834 success backtracking leaves it alone.
2835
2836 A pattern is complete when the END op is executed, while a subpattern
2837 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2838 ops trigger the "pop to last yes state if any, otherwise return true"
2839 behaviour.
2840
2841 A common convention in this function is to use A and B to refer to the two
2842 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2843 the subpattern to be matched possibly multiple times, while B is the entire
2844 rest of the pattern. Variable and state names reflect this convention.
2845
2846 The states in the main switch are the union of ops and failure/success of
2847 substates associated with with that op.  For example, IFMATCH is the op
2848 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2849 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2850 successfully matched A and IFMATCH_A_fail is a state saying that we have
2851 just failed to match A. Resume states always come in pairs. The backtrack
2852 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2853 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2854 on success or failure.
2855
2856 The struct that holds a backtracking state is actually a big union, with
2857 one variant for each major type of op. The variable st points to the
2858 top-most backtrack struct. To make the code clearer, within each
2859 block of code we #define ST to alias the relevant union.
2860
2861 Here's a concrete example of a (vastly oversimplified) IFMATCH
2862 implementation:
2863
2864     switch (state) {
2865     ....
2866
2867 #define ST st->u.ifmatch
2868
2869     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2870         ST.foo = ...; // some state we wish to save
2871         ...
2872         // push a yes backtrack state with a resume value of
2873         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2874         // first node of A:
2875         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2876         // NOTREACHED
2877
2878     case IFMATCH_A: // we have successfully executed A; now continue with B
2879         next = B;
2880         bar = ST.foo; // do something with the preserved value
2881         break;
2882
2883     case IFMATCH_A_fail: // A failed, so the assertion failed
2884         ...;   // do some housekeeping, then ...
2885         sayNO; // propagate the failure
2886
2887 #undef ST
2888
2889     ...
2890     }
2891
2892 For any old-timers reading this who are familiar with the old recursive
2893 approach, the code above is equivalent to:
2894
2895     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2896     {
2897         int foo = ...
2898         ...
2899         if (regmatch(A)) {
2900             next = B;
2901             bar = foo;
2902             break;
2903         }
2904         ...;   // do some housekeeping, then ...
2905         sayNO; // propagate the failure
2906     }
2907
2908 The topmost backtrack state, pointed to by st, is usually free. If you
2909 want to claim it, populate any ST.foo fields in it with values you wish to
2910 save, then do one of
2911
2912         PUSH_STATE_GOTO(resume_state, node);
2913         PUSH_YES_STATE_GOTO(resume_state, node);
2914
2915 which sets that backtrack state's resume value to 'resume_state', pushes a
2916 new free entry to the top of the backtrack stack, then goes to 'node'.
2917 On backtracking, the free slot is popped, and the saved state becomes the
2918 new free state. An ST.foo field in this new top state can be temporarily
2919 accessed to retrieve values, but once the main loop is re-entered, it
2920 becomes available for reuse.
2921
2922 Note that the depth of the backtrack stack constantly increases during the
2923 left-to-right execution of the pattern, rather than going up and down with
2924 the pattern nesting. For example the stack is at its maximum at Z at the
2925 end of the pattern, rather than at X in the following:
2926
2927     /(((X)+)+)+....(Y)+....Z/
2928
2929 The only exceptions to this are lookahead/behind assertions and the cut,
2930 (?>A), which pop all the backtrack states associated with A before
2931 continuing.
2932
2933 Backtrack state structs are allocated in slabs of about 4K in size.
2934 PL_regmatch_state and st always point to the currently active state,
2935 and PL_regmatch_slab points to the slab currently containing
2936 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2937 allocated, and is never freed until interpreter destruction. When the slab
2938 is full, a new one is allocated and chained to the end. At exit from
2939 regmatch(), slabs allocated since entry are freed.
2940
2941 */
2942
2943
2944 #define DEBUG_STATE_pp(pp)                                  \
2945     DEBUG_STATE_r({                                         \
2946         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2947         PerlIO_printf(Perl_debug_log,                       \
2948             "    %*s"pp" %s%s%s%s%s\n",                     \
2949             depth*2, "",                                    \
2950             PL_reg_name[st->resume_state],                     \
2951             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2952             ((st==yes_state) ? "Y" : ""),                   \
2953             ((st==mark_state) ? "M" : ""),                  \
2954             ((st==yes_state||st==mark_state) ? "]" : "")    \
2955         );                                                  \
2956     });
2957
2958
2959 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2960
2961 #ifdef DEBUGGING
2962
2963 STATIC void
2964 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2965     const char *start, const char *end, const char *blurb)
2966 {
2967     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2968
2969     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2970
2971     if (!PL_colorset)
2972             reginitcolors();
2973     {
2974         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2975             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2976
2977         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2978             start, end - start, 60);
2979
2980         PerlIO_printf(Perl_debug_log,
2981             "%s%s REx%s %s against %s\n",
2982                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2983
2984         if (utf8_target||utf8_pat)
2985             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2986                 utf8_pat ? "pattern" : "",
2987                 utf8_pat && utf8_target ? " and " : "",
2988                 utf8_target ? "string" : ""
2989             );
2990     }
2991 }
2992
2993 STATIC void
2994 S_dump_exec_pos(pTHX_ const char *locinput,
2995                       const regnode *scan,
2996                       const char *loc_regeol,
2997                       const char *loc_bostr,
2998                       const char *loc_reg_starttry,
2999                       const bool utf8_target)
3000 {
3001     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3002     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3003     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3004     /* The part of the string before starttry has one color
3005        (pref0_len chars), between starttry and current
3006        position another one (pref_len - pref0_len chars),
3007        after the current position the third one.
3008        We assume that pref0_len <= pref_len, otherwise we
3009        decrease pref0_len.  */
3010     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3011         ? (5 + taill) - l : locinput - loc_bostr;
3012     int pref0_len;
3013
3014     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3015
3016     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3017         pref_len++;
3018     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3019     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3020         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3021               ? (5 + taill) - pref_len : loc_regeol - locinput);
3022     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3023         l--;
3024     if (pref0_len < 0)
3025         pref0_len = 0;
3026     if (pref0_len > pref_len)
3027         pref0_len = pref_len;
3028     {
3029         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3030
3031         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3032             (locinput - pref_len),pref0_len, 60, 4, 5);
3033
3034         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3035                     (locinput - pref_len + pref0_len),
3036                     pref_len - pref0_len, 60, 2, 3);
3037
3038         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3039                     locinput, loc_regeol - locinput, 10, 0, 1);
3040
3041         const STRLEN tlen=len0+len1+len2;
3042         PerlIO_printf(Perl_debug_log,
3043                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3044                     (IV)(locinput - loc_bostr),
3045                     len0, s0,
3046                     len1, s1,
3047                     (docolor ? "" : "> <"),
3048                     len2, s2,
3049                     (int)(tlen > 19 ? 0 :  19 - tlen),
3050                     "");
3051     }
3052 }
3053
3054 #endif
3055
3056 /* reg_check_named_buff_matched()
3057  * Checks to see if a named buffer has matched. The data array of
3058  * buffer numbers corresponding to the buffer is expected to reside
3059  * in the regexp->data->data array in the slot stored in the ARG() of
3060  * node involved. Note that this routine doesn't actually care about the
3061  * name, that information is not preserved from compilation to execution.
3062  * Returns the index of the leftmost defined buffer with the given name
3063  * or 0 if non of the buffers matched.
3064  */
3065 STATIC I32
3066 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3067 {
3068     I32 n;
3069     RXi_GET_DECL(rex,rexi);
3070     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3071     I32 *nums=(I32*)SvPVX(sv_dat);
3072
3073     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3074
3075     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3076         if ((I32)*PL_reglastparen >= nums[n] &&
3077             PL_regoffs[nums[n]].end != -1)
3078         {
3079             return nums[n];
3080         }
3081     }
3082     return 0;
3083 }
3084
3085
3086 /* free all slabs above current one  - called during LEAVE_SCOPE */
3087
3088 STATIC void
3089 S_clear_backtrack_stack(pTHX_ void *p)
3090 {
3091     regmatch_slab *s = PL_regmatch_slab->next;
3092     PERL_UNUSED_ARG(p);
3093
3094     if (!s)
3095         return;
3096     PL_regmatch_slab->next = NULL;
3097     while (s) {
3098         regmatch_slab * const osl = s;
3099         s = s->next;
3100         Safefree(osl);
3101     }
3102 }
3103
3104
3105 #define SETREX(Re1,Re2) \
3106     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3107     Re1 = (Re2)
3108
3109 STATIC I32                      /* 0 failure, 1 success */
3110 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3111 {
3112 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3113     dMY_CXT;
3114 #endif
3115     dVAR;
3116     register const bool utf8_target = PL_reg_match_utf8;
3117     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3118     REGEXP *rex_sv = reginfo->prog;
3119     regexp *rex = (struct regexp *)SvANY(rex_sv);
3120     RXi_GET_DECL(rex,rexi);
3121     I32 oldsave;
3122     /* the current state. This is a cached copy of PL_regmatch_state */
3123     register regmatch_state *st;
3124     /* cache heavy used fields of st in registers */
3125     register regnode *scan;
3126     register regnode *next;
3127     register U32 n = 0; /* general value; init to avoid compiler warning */
3128     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3129     register char *locinput = PL_reginput;
3130     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3131
3132     bool result = 0;        /* return value of S_regmatch */
3133     int depth = 0;          /* depth of backtrack stack */
3134     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3135     const U32 max_nochange_depth =
3136         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3137         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3138     regmatch_state *yes_state = NULL; /* state to pop to on success of
3139                                                             subpattern */
3140     /* mark_state piggy backs on the yes_state logic so that when we unwind
3141        the stack on success we can update the mark_state as we go */
3142     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3143     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3144     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3145     U32 state_num;
3146     bool no_final = 0;      /* prevent failure from backtracking? */
3147     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3148     char *startpoint = PL_reginput;
3149     SV *popmark = NULL;     /* are we looking for a mark? */
3150     SV *sv_commit = NULL;   /* last mark name seen in failure */
3151     SV *sv_yes_mark = NULL; /* last mark name we have seen
3152                                during a successful match */
3153     U32 lastopen = 0;       /* last open we saw */
3154     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3155     SV* const oreplsv = GvSV(PL_replgv);
3156     /* these three flags are set by various ops to signal information to
3157      * the very next op. They have a useful lifetime of exactly one loop
3158      * iteration, and are not preserved or restored by state pushes/pops
3159      */
3160     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3161     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3162     int logical = 0;        /* the following EVAL is:
3163                                 0: (?{...})
3164                                 1: (?(?{...})X|Y)
3165                                 2: (??{...})
3166                                or the following IFMATCH/UNLESSM is:
3167                                 false: plain (?=foo)
3168                                 true:  used as a condition: (?(?=foo))
3169                             */
3170 #ifdef DEBUGGING
3171     GET_RE_DEBUG_FLAGS_DECL;
3172 #endif
3173
3174     PERL_ARGS_ASSERT_REGMATCH;
3175
3176     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3177             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3178     }));
3179     /* on first ever call to regmatch, allocate first slab */
3180     if (!PL_regmatch_slab) {
3181         Newx(PL_regmatch_slab, 1, regmatch_slab);
3182         PL_regmatch_slab->prev = NULL;
3183         PL_regmatch_slab->next = NULL;
3184         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3185     }
3186
3187     oldsave = PL_savestack_ix;
3188     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3189     SAVEVPTR(PL_regmatch_slab);
3190     SAVEVPTR(PL_regmatch_state);
3191
3192     /* grab next free state slot */
3193     st = ++PL_regmatch_state;
3194     if (st >  SLAB_LAST(PL_regmatch_slab))
3195         st = PL_regmatch_state = S_push_slab(aTHX);
3196
3197     /* Note that nextchr is a byte even in UTF */
3198     nextchr = UCHARAT(locinput);
3199     scan = prog;
3200     while (scan != NULL) {
3201
3202         DEBUG_EXECUTE_r( {
3203             SV * const prop = sv_newmortal();
3204             regnode *rnext=regnext(scan);
3205             DUMP_EXEC_POS( locinput, scan, utf8_target );
3206             regprop(rex, prop, scan);
3207
3208             PerlIO_printf(Perl_debug_log,
3209                     "%3"IVdf":%*s%s(%"IVdf")\n",
3210                     (IV)(scan - rexi->program), depth*2, "",
3211                     SvPVX_const(prop),
3212                     (PL_regkind[OP(scan)] == END || !rnext) ?
3213                         0 : (IV)(rnext - rexi->program));
3214         });
3215
3216         next = scan + NEXT_OFF(scan);
3217         if (next == scan)
3218             next = NULL;
3219         state_num = OP(scan);
3220
3221         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3222       reenter_switch:
3223
3224         assert(PL_reglastparen == &rex->lastparen);
3225         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3226         assert(PL_regoffs == rex->offs);
3227
3228         switch (state_num) {
3229         case BOL:
3230             if (locinput == PL_bostr)
3231             {
3232                 /* reginfo->till = reginfo->bol; */
3233                 break;
3234             }
3235             sayNO;
3236         case MBOL:
3237             if (locinput == PL_bostr ||
3238                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3239             {
3240                 break;
3241             }
3242             sayNO;
3243         case SBOL:
3244             if (locinput == PL_bostr)
3245                 break;
3246             sayNO;
3247         case GPOS:
3248             if (locinput == reginfo->ganch)
3249                 break;
3250             sayNO;
3251
3252         case KEEPS:
3253             /* update the startpoint */
3254             st->u.keeper.val = PL_regoffs[0].start;
3255             PL_reginput = locinput;
3256             PL_regoffs[0].start = locinput - PL_bostr;
3257             PUSH_STATE_GOTO(KEEPS_next, next);
3258             /*NOT-REACHED*/
3259         case KEEPS_next_fail:
3260             /* rollback the start point change */
3261             PL_regoffs[0].start = st->u.keeper.val;
3262             sayNO_SILENT;
3263             /*NOT-REACHED*/
3264         case EOL:
3265                 goto seol;
3266         case MEOL:
3267             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3268                 sayNO;
3269             break;
3270         case SEOL:
3271           seol:
3272             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3273                 sayNO;
3274             if (PL_regeol - locinput > 1)
3275                 sayNO;
3276             break;
3277         case EOS:
3278             if (PL_regeol != locinput)
3279                 sayNO;
3280             break;
3281         case SANY:
3282             if (!nextchr && locinput >= PL_regeol)
3283                 sayNO;
3284             if (utf8_target) {
3285                 locinput += PL_utf8skip[nextchr];
3286                 if (locinput > PL_regeol)
3287                     sayNO;
3288                 nextchr = UCHARAT(locinput);
3289             }
3290             else
3291                 nextchr = UCHARAT(++locinput);
3292             break;
3293         case CANY:
3294             if (!nextchr && locinput >= PL_regeol)
3295                 sayNO;
3296             nextchr = UCHARAT(++locinput);
3297             break;
3298         case REG_ANY:
3299             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3300                 sayNO;
3301             if (utf8_target) {
3302                 locinput += PL_utf8skip[nextchr];
3303                 if (locinput > PL_regeol)
3304                     sayNO;
3305                 nextchr = UCHARAT(locinput);
3306             }
3307             else
3308                 nextchr = UCHARAT(++locinput);
3309             break;
3310
3311 #undef  ST
3312 #define ST st->u.trie
3313         case TRIEC:
3314             /* In this case the charclass data is available inline so
3315                we can fail fast without a lot of extra overhead.
3316              */
3317             if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3318                 DEBUG_EXECUTE_r(
3319                     PerlIO_printf(Perl_debug_log,
3320                               "%*s  %sfailed to match trie start class...%s\n",
3321                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3322                 );
3323                 sayNO_SILENT;
3324                 /* NOTREACHED */
3325             }
3326             /* FALL THROUGH */
3327         case TRIE:
3328             /* the basic plan of execution of the trie is:
3329              * At the beginning, run though all the states, and
3330              * find the longest-matching word. Also remember the position
3331              * of the shortest matching word. For example, this pattern:
3332              *    1  2 3 4    5
3333              *    ab|a|x|abcd|abc
3334              * when matched against the string "abcde", will generate
3335              * accept states for all words except 3, with the longest
3336              * matching word being 4, and the shortest being 1 (with
3337              * the position being after char 1 of the string).
3338              *
3339              * Then for each matching word, in word order (i.e. 1,2,4,5),
3340              * we run the remainder of the pattern; on each try setting
3341              * the current position to the character following the word,
3342              * returning to try the next word on failure.
3343              *
3344              * We avoid having to build a list of words at runtime by
3345              * using a compile-time structure, wordinfo[].prev, which
3346              * gives, for each word, the previous accepting word (if any).
3347              * In the case above it would contain the mappings 1->2, 2->0,
3348              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3349              * the longest word (4 above), a list of all words, by
3350              * following the list of prev pointers; this gives us the
3351              * unordered list 4,5,1,2. Then given the current word we have
3352              * just tried, we can go through the list and find the
3353              * next-biggest word to try (so if we just failed on word 2,
3354              * the next in the list is 4).
3355              *
3356              * Since at runtime we don't record the matching position in
3357              * the string for each word, we have to work that out for
3358              * each word we're about to process. The wordinfo table holds
3359              * the character length of each word; given that we recorded
3360              * at the start: the position of the shortest word and its
3361              * length in chars, we just need to move the pointer the
3362              * difference between the two char lengths. Depending on
3363              * Unicode status and folding, that's cheap or expensive.
3364              *
3365              * This algorithm is optimised for the case where are only a
3366              * small number of accept states, i.e. 0,1, or maybe 2.
3367              * With lots of accepts states, and having to try all of them,
3368              * it becomes quadratic on number of accept states to find all
3369              * the next words.
3370              */
3371
3372             {
3373                 /* what type of TRIE am I? (utf8 makes this contextual) */
3374                 DECL_TRIE_TYPE(scan);
3375
3376                 /* what trie are we using right now */
3377                 reg_trie_data * const trie
3378                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3379                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3380                 U32 state = trie->startstate;
3381
3382                 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3383                     if (trie->states[ state ].wordnum) {
3384                          DEBUG_EXECUTE_r(
3385                             PerlIO_printf(Perl_debug_log,
3386                                           "%*s  %smatched empty string...%s\n",
3387                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3388                         );
3389                         if (!trie->jump)
3390                             break;
3391                     } else {
3392                         DEBUG_EXECUTE_r(
3393                             PerlIO_printf(Perl_debug_log,
3394                                           "%*s  %sfailed to match trie start class...%s\n",
3395                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3396                         );
3397                         sayNO_SILENT;
3398                    }
3399                 }
3400
3401             {
3402                 U8 *uc = ( U8* )locinput;
3403
3404                 STRLEN len = 0;
3405                 STRLEN foldlen = 0;
3406                 U8 *uscan = (U8*)NULL;
3407                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3408                 U32 charcount = 0; /* how many input chars we have matched */
3409                 U32 accepted = 0; /* have we seen any accepting states? */
3410
3411                 ST.B = next;
3412                 ST.jump = trie->jump;
3413                 ST.me = scan;
3414                 ST.firstpos = NULL;
3415                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3416                 ST.nextword = 0;
3417
3418                 /* fully traverse the TRIE; note the position of the
3419                    shortest accept state and the wordnum of the longest
3420                    accept state */
3421
3422                 while ( state && uc <= (U8*)PL_regeol ) {
3423                     U32 base = trie->states[ state ].trans.base;
3424                     UV uvc = 0;
3425                     U16 charid = 0;
3426                     U16 wordnum;
3427                     wordnum = trie->states[ state ].wordnum;
3428
3429                     if (wordnum) { /* it's an accept state */
3430                         if (!accepted) {
3431                             accepted = 1;
3432                             /* record first match position */
3433                             if (ST.longfold) {
3434                                 ST.firstpos = (U8*)locinput;
3435                                 ST.firstchars = 0;
3436                             }
3437                             else {
3438                                 ST.firstpos = uc;
3439                                 ST.firstchars = charcount;
3440                             }
3441                         }
3442                         if (!ST.nextword || wordnum < ST.nextword)
3443                             ST.nextword = wordnum;
3444                         ST.topword = wordnum;
3445                     }
3446
3447                     DEBUG_TRIE_EXECUTE_r({
3448                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3449                                 PerlIO_printf( Perl_debug_log,
3450                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3451                                     2+depth * 2, "", PL_colors[4],
3452                                     (UV)state, (accepted ? 'Y' : 'N'));
3453                     });
3454
3455                     /* read a char and goto next state */
3456                     if ( base ) {
3457                         I32 offset;
3458                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3459                                              uscan, len, uvc, charid, foldlen,
3460                                              foldbuf, uniflags);
3461                         charcount++;
3462                         if (foldlen>0)
3463                             ST.longfold = TRUE;
3464                         if (charid &&
3465                              ( ((offset =
3466                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3467
3468                              && ((U32)offset < trie->lasttrans)
3469                              && trie->trans[offset].check == state)
3470                         {
3471                             state = trie->trans[offset].next;
3472                         }
3473                         else {
3474                             state = 0;
3475                         }
3476                         uc += len;
3477
3478                     }
3479                     else {
3480                         state = 0;
3481                     }
3482                     DEBUG_TRIE_EXECUTE_r(
3483                         PerlIO_printf( Perl_debug_log,
3484                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3485                             charid, uvc, (UV)state, PL_colors[5] );
3486                     );
3487                 }
3488                 if (!accepted)
3489                    sayNO;
3490
3491                 /* calculate total number of accept states */
3492                 {
3493                     U16 w = ST.topword;
3494                     accepted = 0;
3495                     while (w) {
3496                         w = trie->wordinfo[w].prev;
3497                         accepted++;
3498                     }
3499                     ST.accepted = accepted;
3500                 }
3501
3502                 DEBUG_EXECUTE_r(
3503                     PerlIO_printf( Perl_debug_log,
3504                         "%*s  %sgot %"IVdf" possible matches%s\n",
3505                         REPORT_CODE_OFF + depth * 2, "",
3506                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3507                 );
3508                 goto trie_first_try; /* jump into the fail handler */
3509             }}
3510             /* NOTREACHED */
3511
3512         case TRIE_next_fail: /* we failed - try next alternative */
3513             if ( ST.jump) {
3514                 REGCP_UNWIND(ST.cp);
3515                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3516                     PL_regoffs[n].end = -1;
3517                 *PL_reglastparen = n;
3518             }
3519             if (!--ST.accepted) {
3520                 DEBUG_EXECUTE_r({
3521                     PerlIO_printf( Perl_debug_log,
3522                         "%*s  %sTRIE failed...%s\n",
3523                         REPORT_CODE_OFF+depth*2, "",
3524                         PL_colors[4],
3525                         PL_colors[5] );
3526                 });
3527                 sayNO_SILENT;
3528             }
3529             {
3530                 /* Find next-highest word to process.  Note that this code
3531                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3532                 register U16 min = 0;
3533                 register U16 word;
3534                 register U16 const nextword = ST.nextword;
3535                 register reg_trie_wordinfo * const wordinfo
3536                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3537                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3538                     if (word > nextword && (!min || word < min))
3539                         min = word;
3540                 }
3541                 ST.nextword = min;
3542             }
3543
3544           trie_first_try:
3545             if (do_cutgroup) {
3546                 do_cutgroup = 0;
3547                 no_final = 0;
3548             }
3549
3550             if ( ST.jump) {
3551                 ST.lastparen = *PL_reglastparen;
3552                 REGCP_SET(ST.cp);
3553             }
3554
3555             /* find start char of end of current word */
3556             {
3557                 U32 chars; /* how many chars to skip */
3558                 U8 *uc = ST.firstpos;
3559                 reg_trie_data * const trie
3560                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3561
3562                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3563                             >=  ST.firstchars);
3564                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3565                             - ST.firstchars;
3566
3567                 if (ST.longfold) {
3568                     /* the hard option - fold each char in turn and find
3569                      * its folded length (which may be different */
3570                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3571                     STRLEN foldlen;
3572                     STRLEN len;
3573                     UV uvc;
3574                     U8 *uscan;
3575
3576                     while (chars) {
3577                         if (utf8_target) {
3578                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3579                                                     uniflags);
3580                             uc += len;
3581                         }
3582                         else {
3583                             uvc = *uc;
3584                             uc++;
3585                         }
3586                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3587                         uscan = foldbuf;
3588                         while (foldlen) {
3589                             if (!--chars)
3590                                 break;
3591                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3592                                             uniflags);
3593                             uscan += len;
3594                             foldlen -= len;
3595                         }
3596                     }
3597                 }
3598                 else {
3599                     if (utf8_target)
3600                         while (chars--)
3601                             uc += UTF8SKIP(uc);
3602                     else
3603                         uc += chars;
3604                 }
3605                 PL_reginput = (char *)uc;
3606             }
3607
3608             scan = (ST.jump && ST.jump[ST.nextword])
3609                         ? ST.me + ST.jump[ST.nextword]
3610                         : ST.B;
3611
3612             DEBUG_EXECUTE_r({
3613                 PerlIO_printf( Perl_debug_log,
3614                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3615                     REPORT_CODE_OFF+depth*2, "",
3616                     PL_colors[4],
3617                     ST.nextword,
3618                     PL_colors[5]
3619                     );
3620             });
3621
3622             if (ST.accepted > 1 || has_cutgroup) {
3623                 PUSH_STATE_GOTO(TRIE_next, scan);
3624                 /* NOTREACHED */
3625             }
3626             /* only one choice left - just continue */
3627             DEBUG_EXECUTE_r({
3628                 AV *const trie_words
3629                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3630                 SV ** const tmp = av_fetch( trie_words,
3631                     ST.nextword-1, 0 );
3632                 SV *sv= tmp ? sv_newmortal() : NULL;
3633
3634                 PerlIO_printf( Perl_debug_log,
3635                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3636                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3637                     ST.nextword,
3638                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3639                             PL_colors[0], PL_colors[1],
3640                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3641                         )
3642                     : "not compiled under -Dr",
3643                     PL_colors[5] );
3644             });
3645
3646             locinput = PL_reginput;
3647             nextchr = UCHARAT(locinput);
3648             continue; /* execute rest of RE */
3649             /* NOTREACHED */
3650 #undef  ST
3651
3652         case EXACT: {
3653             char *s = STRING(scan);
3654             ln = STR_LEN(scan);
3655             if (utf8_target != UTF_PATTERN) {
3656                 /* The target and the pattern have differing utf8ness. */
3657                 char *l = locinput;
3658                 const char * const e = s + ln;
3659
3660                 if (utf8_target) {
3661                     /* The target is utf8, the pattern is not utf8. */
3662                     while (s < e) {
3663                         STRLEN ulen;
3664                         if (l >= PL_regeol)
3665                              sayNO;
3666                         if (NATIVE_TO_UNI(*(U8*)s) !=
3667                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3668                                             uniflags))
3669                              sayNO;
3670                         l += ulen;
3671                         s ++;
3672                     }
3673                 }
3674                 else {
3675                     /* The target is not utf8, the pattern is utf8. */
3676                     while (s < e) {
3677                         STRLEN ulen;
3678                         if (l >= PL_regeol)
3679                             sayNO;
3680                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3681                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3682                                            uniflags))
3683                             sayNO;
3684                         s += ulen;
3685                         l ++;
3686                     }
3687                 }
3688                 locinput = l;
3689                 nextchr = UCHARAT(locinput);
3690                 break;
3691             }
3692             /* The target and the pattern have the same utf8ness. */
3693             /* Inline the first character, for speed. */
3694             if (UCHARAT(s) != nextchr)
3695                 sayNO;
3696             if (PL_regeol - locinput < ln)
3697                 sayNO;
3698             if (ln > 1 && memNE(s, locinput, ln))
3699                 sayNO;
3700             locinput += ln;
3701             nextchr = UCHARAT(locinput);
3702             break;
3703             }
3704         case EXACTFL: {
3705             re_fold_t folder;
3706             const U8 * fold_array;
3707             const char * s;
3708             U32 fold_utf8_flags;
3709
3710             PL_reg_flags |= RF_tainted;
3711             folder = foldEQ_locale;
3712             fold_array = PL_fold_locale;
3713             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3714             goto do_exactf;
3715
3716         case EXACTFU_SS:
3717         case EXACTFU_TRICKYFOLD:
3718         case EXACTFU:
3719             folder = foldEQ_latin1;
3720             fold_array = PL_fold_latin1;
3721             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3722             goto do_exactf;
3723
3724         case EXACTFA:
3725             folder = foldEQ_latin1;
3726             fold_array = PL_fold_latin1;
3727             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3728             goto do_exactf;
3729
3730         case EXACTF:
3731             folder = foldEQ;
3732             fold_array = PL_fold;
3733             fold_utf8_flags = 0;
3734
3735           do_exactf:
3736             s = STRING(scan);
3737             ln = STR_LEN(scan);
3738
3739             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3740               /* Either target or the pattern are utf8, or has the issue where
3741                * the fold lengths may differ. */
3742                 const char * const l = locinput;
3743                 char *e = PL_regeol;
3744
3745                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3746                                         l, &e, 0,  utf8_target, fold_utf8_flags))
3747                 {
3748                     sayNO;
3749                 }
3750                 locinput = e;
3751                 nextchr = UCHARAT(locinput);
3752                 break;
3753             }
3754
3755             /* Neither the target nor the pattern are utf8 */
3756             if (UCHARAT(s) != nextchr &&
3757                 UCHARAT(s) != fold_array[nextchr])
3758             {
3759                 sayNO;
3760             }
3761             if (PL_regeol - locinput < ln)
3762                 sayNO;
3763             if (ln > 1 && ! folder(s, locinput, ln))
3764                 sayNO;
3765             locinput += ln;
3766             nextchr = UCHARAT(locinput);
3767             break;
3768         }
3769
3770         /* XXX Could improve efficiency by separating these all out using a
3771          * macro or in-line function.  At that point regcomp.c would no longer
3772          * have to set the FLAGS fields of these */
3773         case BOUNDL:
3774         case NBOUNDL:
3775             PL_reg_flags |= RF_tainted;
3776             /* FALL THROUGH */
3777         case BOUND:
3778         case BOUNDU:
3779         case BOUNDA:
3780         case NBOUND:
3781         case NBOUNDU:
3782         case NBOUNDA:
3783             /* was last char in word? */
3784             if (utf8_target
3785                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3786                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3787             {
3788                 if (locinput == PL_bostr)
3789                     ln = '\n';
3790                 else {
3791                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3792
3793                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3794                 }
3795                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3796                     ln = isALNUM_uni(ln);
3797                     LOAD_UTF8_CHARCLASS_ALNUM();
3798                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3799                 }
3800                 else {
3801                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3802                     n = isALNUM_LC_utf8((U8*)locinput);
3803                 }
3804             }
3805             else {
3806
3807                 /* Here the string isn't utf8, or is utf8 and only ascii
3808                  * characters are to match \w.  In the latter case looking at
3809                  * the byte just prior to the current one may be just the final
3810                  * byte of a multi-byte character.  This is ok.  There are two
3811                  * cases:
3812                  * 1) it is a single byte character, and then the test is doing
3813                  *      just what it's supposed to.
3814                  * 2) it is a multi-byte character, in which case the final
3815                  *      byte is never mistakable for ASCII, and so the test
3816                  *      will say it is not a word character, which is the
3817                  *      correct answer. */
3818                 ln = (locinput != PL_bostr) ?
3819                     UCHARAT(locinput - 1) : '\n';
3820                 switch (FLAGS(scan)) {
3821                     case REGEX_UNICODE_CHARSET:
3822                         ln = isWORDCHAR_L1(ln);
3823                         n = isWORDCHAR_L1(nextchr);
3824                         break;
3825                     case REGEX_LOCALE_CHARSET:
3826                         ln = isALNUM_LC(ln);
3827                         n = isALNUM_LC(nextchr);
3828                         break;
3829                     case REGEX_DEPENDS_CHARSET:
3830                         ln = isALNUM(ln);
3831                         n = isALNUM(nextchr);
3832                         break;
3833                     case REGEX_ASCII_RESTRICTED_CHARSET:
3834                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3835                         ln = isWORDCHAR_A(ln);
3836                         n = isWORDCHAR_A(nextchr);
3837                         break;
3838                     default:
3839                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3840                         break;
3841                 }
3842             }
3843             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3844              * regcomp.sym */
3845             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3846                     sayNO;
3847             break;
3848         case ANYOFV:
3849         case ANYOF:
3850             if (utf8_target || state_num == ANYOFV) {
3851                 STRLEN inclasslen = PL_regeol - locinput;
3852                 if (locinput >= PL_regeol)
3853                     sayNO;
3854
3855                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3856                     sayNO;
3857                 locinput += inclasslen;
3858                 nextchr = UCHARAT(locinput);
3859                 break;
3860             }
3861             else {
3862                 if (nextchr < 0)
3863                     nextchr = UCHARAT(locinput);
3864                 if (!nextchr && locinput >= PL_regeol)
3865                     sayNO;
3866                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3867                     sayNO;
3868                 nextchr = UCHARAT(++locinput);
3869                 break;
3870             }
3871             break;
3872         /* Special char classes - The defines start on line 129 or so */
3873         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3874                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3875                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3876                   ALNUMA, NALNUMA, isWORDCHAR_A,
3877                   alnum, "a");
3878
3879         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3880                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3881                   SPACEU, NSPACEU, isSPACE_L1,
3882                   SPACEA, NSPACEA, isSPACE_A,
3883                   space, " ");
3884
3885         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3886                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3887                 DIGITA, NDIGITA, isDIGIT_A,
3888                 digit, "0");
3889
3890         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3891                        a Unicode extended Grapheme Cluster */
3892             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3893               extended Grapheme Cluster is:
3894
3895                CR LF
3896                | Prepend* Begin Extend*
3897                | .
3898
3899                Begin is (Hangul-syllable | ! Control)
3900                Extend is (Grapheme_Extend | Spacing_Mark)
3901                Control is [ GCB_Control CR LF ]
3902
3903                The discussion below shows how the code for CLUMP is derived
3904                from this regex.  Note that most of these concepts are from
3905                property values of the Grapheme Cluster Boundary (GCB) property.
3906                No code point can have multiple property values for a given
3907                property.  Thus a code point in Prepend can't be in Control, but
3908                it must be in !Control.  This is why Control above includes
3909                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3910                property separately, and so can't be in GCB_Control, even though
3911                they logically are controls.  Control is not the same as gc=cc,
3912                but includes format and other characters as well.
3913
3914                The Unicode definition of Hangul-syllable is:
3915                    L+
3916                    | (L* ( ( V | LV ) V* | LVT ) T*)
3917                    | T+
3918                   )
3919                Each of these is a value for the GCB property, and hence must be
3920                disjoint, so the order they are tested is immaterial, so the
3921                above can safely be changed to
3922                    T+
3923                    | L+
3924                    | (L* ( LVT | ( V | LV ) V*) T*)
3925
3926                The last two terms can be combined like this:
3927                    L* ( L
3928                         | (( LVT | ( V | LV ) V*) T*))
3929
3930                And refactored into this:
3931                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3932
3933                That means that if we have seen any L's at all we can quit
3934                there, but if the next character is an LVT, a V, or an LV we
3935                should keep going.
3936
3937                There is a subtlety with Prepend* which showed up in testing.
3938                Note that the Begin, and only the Begin is required in:
3939                 | Prepend* Begin Extend*
3940                Also, Begin contains '! Control'.  A Prepend must be a
3941                '!  Control', which means it must also be a Begin.  What it
3942                comes down to is that if we match Prepend* and then find no
3943                suitable Begin afterwards, that if we backtrack the last
3944                Prepend, that one will be a suitable Begin.
3945             */
3946
3947             if (locinput >= PL_regeol)
3948                 sayNO;
3949             if  (! utf8_target) {
3950
3951                 /* Match either CR LF  or '.', as all the other possibilities
3952                  * require utf8 */
3953                 locinput++;         /* Match the . or CR */
3954                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3955                                        match the LF */
3956                     && locinput < PL_regeol
3957                     && UCHARAT(locinput) == '\n') locinput++;
3958             }
3959             else {
3960
3961                 /* Utf8: See if is ( CR LF ); already know that locinput <
3962                  * PL_regeol, so locinput+1 is in bounds */
3963                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3964                     locinput += 2;
3965                 }
3966                 else {
3967                     /* In case have to backtrack to beginning, then match '.' */
3968                     char *starting = locinput;
3969
3970                     /* In case have to backtrack the last prepend */
3971                     char *previous_prepend = 0;
3972
3973                     LOAD_UTF8_CHARCLASS_GCB();
3974
3975                     /* Match (prepend)* */
3976                     while (locinput < PL_regeol
3977                            && swash_fetch(PL_utf8_X_prepend,
3978                                           (U8*)locinput, utf8_target))
3979                     {
3980                         previous_prepend = locinput;
3981                         locinput += UTF8SKIP(locinput);
3982                     }
3983
3984                     /* As noted above, if we matched a prepend character, but
3985                      * the next thing won't match, back off the last prepend we
3986                      * matched, as it is guaranteed to match the begin */
3987                     if (previous_prepend
3988                         && (locinput >=  PL_regeol
3989                             || ! swash_fetch(PL_utf8_X_begin,
3990                                              (U8*)locinput, utf8_target)))
3991                     {
3992                         locinput = previous_prepend;
3993                     }
3994
3995                     /* Note that here we know PL_regeol > locinput, as we
3996                      * tested that upon input to this switch case, and if we
3997                      * moved locinput forward, we tested the result just above
3998                      * and it either passed, or we backed off so that it will
3999                      * now pass */
4000                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
4001
4002                         /* Here did not match the required 'Begin' in the
4003                          * second term.  So just match the very first
4004                          * character, the '.' of the final term of the regex */
4005                         locinput = starting + UTF8SKIP(starting);
4006                     } else {
4007
4008                         /* Here is the beginning of a character that can have
4009                          * an extender.  It is either a hangul syllable, or a
4010                          * non-control */
4011                         if (swash_fetch(PL_utf8_X_non_hangul,
4012                                         (U8*)locinput, utf8_target))
4013                         {
4014
4015                             /* Here not a Hangul syllable, must be a
4016                              * ('!  * Control') */
4017                             locinput += UTF8SKIP(locinput);
4018                         } else {
4019
4020                             /* Here is a Hangul syllable.  It can be composed
4021                              * of several individual characters.  One
4022                              * possibility is T+ */
4023                             if (swash_fetch(PL_utf8_X_T,
4024                                             (U8*)locinput, utf8_target))
4025                             {
4026                                 while (locinput < PL_regeol
4027                                         && swash_fetch(PL_utf8_X_T,
4028                                                         (U8*)locinput, utf8_target))
4029                                 {
4030                                     locinput += UTF8SKIP(locinput);
4031                                 }
4032                             } else {
4033
4034                                 /* Here, not T+, but is a Hangul.  That means
4035                                  * it is one of the others: L, LV, LVT or V,
4036                                  * and matches:
4037                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
4038
4039                                 /* Match L*           */
4040                                 while (locinput < PL_regeol
4041                                         && swash_fetch(PL_utf8_X_L,
4042                                                         (U8*)locinput, utf8_target))
4043                                 {
4044                                     locinput += UTF8SKIP(locinput);
4045                                 }
4046
4047                                 /* Here, have exhausted L*.  If the next
4048                                  * character is not an LV, LVT nor V, it means
4049                                  * we had to have at least one L, so matches L+
4050                                  * in the original equation, we have a complete
4051                                  * hangul syllable.  Are done. */
4052
4053                                 if (locinput < PL_regeol
4054                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
4055                                                     (U8*)locinput, utf8_target))
4056                                 {
4057
4058                                     /* Otherwise keep going.  Must be LV, LVT
4059                                      * or V.  See if LVT */
4060                                     if (swash_fetch(PL_utf8_X_LVT,
4061                                                     (U8*)locinput, utf8_target))
4062                                     {
4063                                         locinput += UTF8SKIP(locinput);
4064                                     } else {
4065
4066                                         /* Must be  V or LV.  Take it, then
4067                                          * match V*     */
4068                                         locinput += UTF8SKIP(locinput);
4069                                         while (locinput < PL_regeol
4070                                                 && swash_fetch(PL_utf8_X_V,
4071                                                          (U8*)locinput, utf8_target))
4072                                         {
4073                                             locinput += UTF8SKIP(locinput);
4074                                         }
4075                                     }
4076
4077                                     /* And any of LV, LVT, or V can be followed
4078                                      * by T*            */
4079                                     while (locinput < PL_regeol
4080                                            && swash_fetch(PL_utf8_X_T,
4081                                                            (U8*)locinput,
4082                                                            utf8_target))
4083                                     {
4084                                         locinput += UTF8SKIP(locinput);
4085                                     }
4086                                 }
4087                             }
4088                         }
4089
4090                         /* Match any extender */
4091                         while (locinput < PL_regeol
4092                                 && swash_fetch(PL_utf8_X_extend,
4093                                                 (U8*)locinput, utf8_target))
4094                         {
4095                             locinput += UTF8SKIP(locinput);
4096                         }
4097                     }
4098                 }
4099                 if (locinput > PL_regeol) sayNO;
4100             }
4101             nextchr = UCHARAT(locinput);
4102             break;
4103
4104         case NREFFL:
4105         {   /* The capture buffer cases.  The ones beginning with N for the
4106                named buffers just convert to the equivalent numbered and
4107                pretend they were called as the corresponding numbered buffer
4108                op.  */
4109             /* don't initialize these in the declaration, it makes C++
4110                unhappy */
4111             char *s;
4112             char type;
4113             re_fold_t folder;
4114             const U8 *fold_array;
4115             UV utf8_fold_flags;
4116
4117             PL_reg_flags |= RF_tainted;
4118             folder = foldEQ_locale;
4119             fold_array = PL_fold_locale;
4120             type = REFFL;
4121             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4122             goto do_nref;
4123
4124         case NREFFA:
4125             folder = foldEQ_latin1;
4126             fold_array = PL_fold_latin1;
4127             type = REFFA;
4128             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4129             goto do_nref;
4130
4131         case NREFFU:
4132             folder = foldEQ_latin1;
4133             fold_array = PL_fold_latin1;
4134             type = REFFU;
4135             utf8_fold_flags = 0;
4136             goto do_nref;
4137
4138         case NREFF:
4139             folder = foldEQ;
4140             fold_array = PL_fold;
4141             type = REFF;
4142             utf8_fold_flags = 0;
4143             goto do_nref;
4144
4145         case NREF:
4146             type = REF;
4147             folder = NULL;
4148             fold_array = NULL;
4149             utf8_fold_flags = 0;
4150           do_nref:
4151
4152             /* For the named back references, find the corresponding buffer
4153              * number */
4154             n = reg_check_named_buff_matched(rex,scan);
4155
4156             if ( ! n ) {
4157                 sayNO;
4158             }
4159             goto do_nref_ref_common;
4160
4161         case REFFL:
4162             PL_reg_flags |= RF_tainted;
4163             folder = foldEQ_locale;
4164             fold_array = PL_fold_locale;
4165             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4166             goto do_ref;
4167
4168         case REFFA:
4169             folder = foldEQ_latin1;
4170             fold_array = PL_fold_latin1;
4171             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4172             goto do_ref;
4173
4174         case REFFU:
4175             folder = foldEQ_latin1;
4176             fold_array = PL_fold_latin1;
4177             utf8_fold_flags = 0;
4178             goto do_ref;
4179
4180         case REFF:
4181             folder = foldEQ;
4182             fold_array = PL_fold;
4183             utf8_fold_flags = 0;
4184             goto do_ref;
4185
4186         case REF:
4187             folder = NULL;
4188             fold_array = NULL;
4189             utf8_fold_flags = 0;
4190
4191           do_ref:
4192             type = OP(scan);
4193             n = ARG(scan);  /* which paren pair */
4194
4195           do_nref_ref_common:
4196             ln = PL_regoffs[n].start;
4197             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4198             if (*PL_reglastparen < n || ln == -1)
4199                 sayNO;                  /* Do not match unless seen CLOSEn. */
4200             if (ln == PL_regoffs[n].end)
4201                 break;
4202
4203             s = PL_bostr + ln;
4204             if (type != REF     /* REF can do byte comparison */
4205                 && (utf8_target || type == REFFU))
4206             { /* XXX handle REFFL better */
4207                 char * limit = PL_regeol;
4208
4209                 /* This call case insensitively compares the entire buffer
4210                     * at s, with the current input starting at locinput, but
4211                     * not going off the end given by PL_regeol, and returns in
4212                     * limit upon success, how much of the current input was
4213                     * matched */
4214                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4215                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4216                 {
4217                     sayNO;
4218                 }
4219                 locinput = limit;
4220                 nextchr = UCHARAT(locinput);
4221                 break;
4222             }
4223
4224             /* Not utf8:  Inline the first character, for speed. */
4225             if (UCHARAT(s) != nextchr &&
4226                 (type == REF ||
4227                  UCHARAT(s) != fold_array[nextchr]))
4228                 sayNO;
4229             ln = PL_regoffs[n].end - ln;
4230             if (locinput + ln > PL_regeol)
4231                 sayNO;
4232             if (ln > 1 && (type == REF
4233                            ? memNE(s, locinput, ln)
4234                            : ! folder(s, locinput, ln)))
4235                 sayNO;
4236             locinput += ln;
4237             nextchr = UCHARAT(locinput);
4238             break;
4239         }
4240         case NOTHING:
4241         case TAIL:
4242             break;
4243         case BACK:
4244             break;
4245
4246 #undef  ST
4247 #define ST st->u.eval
4248         {
4249             SV *ret;
4250             REGEXP *re_sv;
4251             regexp *re;
4252             regexp_internal *rei;
4253             regnode *startpoint;
4254
4255         case GOSTART:
4256         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4257             if (cur_eval && cur_eval->locinput==locinput) {
4258                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4259                     Perl_croak(aTHX_ "Infinite recursion in regex");
4260                 if ( ++nochange_depth > max_nochange_depth )
4261                     Perl_croak(aTHX_
4262                         "Pattern subroutine nesting without pos change"
4263                         " exceeded limit in regex");
4264             } else {
4265                 nochange_depth = 0;
4266             }
4267             re_sv = rex_sv;
4268             re = rex;
4269             rei = rexi;
4270             (void)ReREFCNT_inc(rex_sv);
4271             if (OP(scan)==GOSUB) {
4272                 startpoint = scan + ARG2L(scan);
4273                 ST.close_paren = ARG(scan);
4274             } else {
4275                 startpoint = rei->program+1;
4276                 ST.close_paren = 0;
4277             }
4278             goto eval_recurse_doit;
4279             /* NOTREACHED */
4280         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4281             if (cur_eval && cur_eval->locinput==locinput) {
4282                 if ( ++nochange_depth > max_nochange_depth )
4283                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4284             } else {
4285                 nochange_depth = 0;
4286             }
4287             {
4288                 /* execute the code in the {...} */
4289                 dSP;
4290                 SV ** const before = SP;
4291                 OP_4tree * const oop = PL_op;
4292                 COP * const ocurcop = PL_curcop;
4293                 PAD *old_comppad;
4294                 char *saved_regeol = PL_regeol;
4295                 struct re_save_state saved_state;
4296
4297                 /* To not corrupt the existing regex state while executing the
4298                  * eval we would normally put it on the save stack, like with
4299                  * save_re_context. However, re-evals have a weird scoping so we
4300                  * can't just add ENTER/LEAVE here. With that, things like
4301                  *
4302                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4303                  *
4304                  * would break, as they expect the localisation to be unwound
4305                  * only when the re-engine backtracks through the bit that
4306                  * localised it.
4307                  *
4308                  * What we do instead is just saving the state in a local c
4309                  * variable.
4310                  */
4311                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4312
4313                 n = ARG(scan);
4314                 PL_op = (OP_4tree*)rexi->data->data[n];
4315                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4316                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4317                 /* wrap the call in two SAVECOMPPADs. This ensures that
4318                  * when the save stack is eventually unwound, all the
4319                  * accumulated SAVEt_CLEARSV's will be processed with
4320                  * interspersed SAVEt_COMPPAD's to ensure that lexicals
4321                  * are cleared in the right pad */
4322                 SAVECOMPPAD();
4323                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4324                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4325
4326                 if (sv_yes_mark) {
4327                     SV *sv_mrk = get_sv("REGMARK", 1);
4328                     sv_setsv(sv_mrk, sv_yes_mark);
4329                 }
4330
4331                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4332                 SPAGAIN;
4333                 if (SP == before)
4334                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4335                 else {
4336                     ret = POPs;
4337                     PUTBACK;
4338                 }
4339
4340                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4341
4342                 PL_op = oop;
4343                 SAVECOMPPAD();
4344                 PAD_RESTORE_LOCAL(old_comppad);
4345                 PL_curcop = ocurcop;
4346                 PL_regeol = saved_regeol;
4347                 if (!logical) {
4348                     /* /(?{...})/ */
4349                     sv_setsv(save_scalar(PL_replgv), ret);
4350                     break;
4351                 }
4352             }
4353             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4354                 logical = 0;
4355                 {
4356                     /* extract RE object from returned value; compiling if
4357                      * necessary */
4358                     MAGIC *mg = NULL;
4359                     REGEXP *rx = NULL;
4360
4361                     if (SvROK(ret)) {
4362                         SV *const sv = SvRV(ret);
4363
4364                         if (SvTYPE(sv) == SVt_REGEXP) {
4365                             rx = (REGEXP*) sv;
4366                         } else if (SvSMAGICAL(sv)) {
4367                             mg = mg_find(sv, PERL_MAGIC_qr);
4368                             assert(mg);
4369                         }
4370                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4371                         rx = (REGEXP*) ret;
4372                     } else if (SvSMAGICAL(ret)) {
4373                         if (SvGMAGICAL(ret)) {
4374                             /* I don't believe that there is ever qr magic
4375                                here.  */
4376                             assert(!mg_find(ret, PERL_MAGIC_qr));
4377                             sv_unmagic(ret, PERL_MAGIC_qr);
4378                         }
4379                         else {
4380                             mg = mg_find(ret, PERL_MAGIC_qr);
4381                             /* testing suggests mg only ends up non-NULL for
4382                                scalars who were upgraded and compiled in the
4383                                else block below. In turn, this is only
4384                                triggered in the "postponed utf8 string" tests
4385                                in t/op/pat.t  */
4386                         }
4387                     }
4388
4389                     if (mg) {
4390                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4391                         assert(rx);
4392                     }
4393                     if (rx) {
4394                         rx = reg_temp_copy(NULL, rx);
4395                     }
4396                     else {
4397                         U32 pm_flags = 0;
4398                         const I32 osize = PL_regsize;
4399
4400                         if (DO_UTF8(ret)) {
4401                             assert (SvUTF8(ret));
4402                         } else if (SvUTF8(ret)) {
4403                             /* Not doing UTF-8, despite what the SV says. Is
4404                                this only if we're trapped in use 'bytes'?  */
4405                             /* Make a copy of the octet sequence, but without
4406                                the flag on, as the compiler now honours the
4407                                SvUTF8 flag on ret.  */
4408                             STRLEN len;
4409                             const char *const p = SvPV(ret, len);
4410                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4411                         }
4412                         rx = CALLREGCOMP(ret, pm_flags);
4413                         if (!(SvFLAGS(ret)
4414                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4415                                  | SVs_GMG))) {
4416                             /* This isn't a first class regexp. Instead, it's
4417                                caching a regexp onto an existing, Perl visible
4418                                scalar.  */
4419                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4420                         }
4421                         PL_regsize = osize;
4422                     }
4423                     re_sv = rx;
4424                     re = (struct regexp *)SvANY(rx);
4425                 }
4426                 RXp_MATCH_COPIED_off(re);
4427                 re->subbeg = rex->subbeg;
4428                 re->sublen = rex->sublen;
4429                 rei = RXi_GET(re);
4430                 DEBUG_EXECUTE_r(
4431                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4432                         "Matching embedded");
4433                 );
4434                 startpoint = rei->program + 1;
4435                 ST.close_paren = 0; /* only used for GOSUB */
4436                 /* borrowed from regtry */
4437                 if (PL_reg_start_tmpl <= re->nparens) {
4438                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4439                     if(PL_reg_start_tmp)
4440                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4441                     else
4442                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4443                 }
4444
4445         eval_recurse_doit: /* Share code with GOSUB below this line */
4446                 /* run the pattern returned from (??{...}) */
4447                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4448                 REGCP_SET(ST.lastcp);
4449
4450                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4451
4452                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4453                 PL_reglastparen = &re->lastparen;
4454                 PL_reglastcloseparen = &re->lastcloseparen;
4455                 re->lastparen = 0;
4456                 re->lastcloseparen = 0;
4457
4458                 PL_reginput = locinput;
4459                 PL_regsize = 0;
4460
4461                 /* XXXX This is too dramatic a measure... */
4462                 PL_reg_maxiter = 0;
4463
4464                 ST.toggle_reg_flags = PL_reg_flags;
4465                 if (RX_UTF8(re_sv))
4466                     PL_reg_flags |= RF_utf8;
4467                 else
4468                     PL_reg_flags &= ~RF_utf8;
4469                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4470
4471                 ST.prev_rex = rex_sv;
4472                 ST.prev_curlyx = cur_curlyx;
4473                 SETREX(rex_sv,re_sv);
4474                 rex = re;
4475                 rexi = rei;
4476                 cur_curlyx = NULL;
4477                 ST.B = next;
4478                 ST.prev_eval = cur_eval;
4479                 cur_eval = st;
4480                 /* now continue from first node in postoned RE */
4481                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4482                 /* NOTREACHED */
4483             }
4484             /* logical is 1,   /(?(?{...})X|Y)/ */
4485             sw = cBOOL(SvTRUE(ret));
4486             logical = 0;
4487             break;
4488         }
4489
4490         case EVAL_AB: /* cleanup after a successful (??{A})B */
4491             /* note: this is called twice; first after popping B, then A */
4492             PL_reg_flags ^= ST.toggle_reg_flags;
4493             ReREFCNT_dec(rex_sv);
4494             SETREX(rex_sv,ST.prev_rex);
4495             rex = (struct regexp *)SvANY(rex_sv);
4496             rexi = RXi_GET(rex);
4497             regcpblow(ST.cp);
4498             cur_eval = ST.prev_eval;
4499             cur_curlyx = ST.prev_curlyx;
4500
4501             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4502             PL_reglastparen = &rex->lastparen;
4503             PL_reglastcloseparen = &rex->lastcloseparen;
4504             /* also update PL_regoffs */
4505             PL_regoffs = rex->offs;
4506
4507             /* XXXX This is too dramatic a measure... */
4508             PL_reg_maxiter = 0;
4509             if ( nochange_depth )
4510                 nochange_depth--;
4511             sayYES;
4512
4513
4514         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4515             /* note: this is called twice; first after popping B, then A */
4516             PL_reg_flags ^= ST.toggle_reg_flags;
4517             ReREFCNT_dec(rex_sv);
4518             SETREX(rex_sv,ST.prev_rex);
4519             rex = (struct regexp *)SvANY(rex_sv);
4520             rexi = RXi_GET(rex);
4521             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4522             PL_reglastparen = &rex->lastparen;
4523             PL_reglastcloseparen = &rex->lastcloseparen;
4524
4525             PL_reginput = locinput;
4526             REGCP_UNWIND(ST.lastcp);
4527             regcppop(rex);
4528             cur_eval = ST.prev_eval;
4529             cur_curlyx = ST.prev_curlyx;
4530             /* XXXX This is too dramatic a measure... */
4531             PL_reg_maxiter = 0;
4532             if ( nochange_depth )
4533                 nochange_depth--;
4534             sayNO_SILENT;
4535 #undef ST
4536
4537         case OPEN:
4538             n = ARG(scan);  /* which paren pair */
4539             PL_reg_start_tmp[n] = locinput;
4540             if (n > PL_regsize)
4541                 PL_regsize = n;
4542             lastopen = n;
4543             break;
4544         case CLOSE:
4545             n = ARG(scan);  /* which paren pair */
4546             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4547             PL_regoffs[n].end = locinput - PL_bostr;
4548             /*if (n > PL_regsize)
4549                 PL_regsize = n;*/
4550             if (n > *PL_reglastparen)
4551                 *PL_reglastparen = n;
4552             *PL_reglastcloseparen = n;
4553             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4554                 goto fake_end;
4555             }
4556             break;
4557         case ACCEPT:
4558             if (ARG(scan)){
4559                 regnode *cursor;
4560                 for (cursor=scan;
4561                      cursor && OP(cursor)!=END;
4562                      cursor=regnext(cursor))
4563                 {
4564                     if ( OP(cursor)==CLOSE ){
4565                         n = ARG(cursor);
4566                         if ( n <= lastopen ) {
4567                             PL_regoffs[n].start
4568                                 = PL_reg_start_tmp[n] - PL_bostr;
4569                             PL_regoffs[n].end = locinput - PL_bostr;
4570                             /*if (n > PL_regsize)
4571                             PL_regsize = n;*/
4572                             if (n > *PL_reglastparen)
4573                                 *PL_reglastparen = n;
4574                             *PL_reglastcloseparen = n;
4575                             if ( n == ARG(scan) || (cur_eval &&
4576                                 cur_eval->u.eval.close_paren == n))
4577                                 break;
4578                         }
4579                     }
4580                 }
4581             }
4582             goto fake_end;
4583             /*NOTREACHED*/
4584         case GROUPP:
4585             n = ARG(scan);  /* which paren pair */
4586             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4587             break;
4588         case NGROUPP:
4589             /* reg_check_named_buff_matched returns 0 for no match */
4590             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4591             break;
4592         case INSUBP:
4593             n = ARG(scan);
4594             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4595             break;
4596         case DEFINEP:
4597             sw = 0;
4598             break;
4599         case IFTHEN:
4600             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4601             if (sw)
4602                 next = NEXTOPER(NEXTOPER(scan));
4603             else {
4604                 next = scan + ARG(scan);
4605                 if (OP(next) == IFTHEN) /* Fake one. */
4606                     next = NEXTOPER(NEXTOPER(next));
4607             }
4608             break;
4609         case LOGICAL:
4610             logical = scan->flags;
4611             break;
4612
4613 /*******************************************************************
4614
4615 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4616 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4617 STAR/PLUS/CURLY/CURLYN are used instead.)
4618
4619 A*B is compiled as <CURLYX><A><WHILEM><B>
4620
4621 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4622 state, which contains the current count, initialised to -1. It also sets
4623 cur_curlyx to point to this state, with any previous value saved in the
4624 state block.
4625
4626 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4627 since the pattern may possibly match zero times (i.e. it's a while {} loop
4628 rather than a do {} while loop).
4629
4630 Each entry to WHILEM represents a successful match of A. The count in the
4631 CURLYX block is incremented, another WHILEM state is pushed, and execution
4632 passes to A or B depending on greediness and the current count.
4633
4634 For example, if matching against the string a1a2a3b (where the aN are
4635 substrings that match /A/), then the match progresses as follows: (the
4636 pushed states are interspersed with the bits of strings matched so far):
4637
4638     <CURLYX cnt=-1>
4639     <CURLYX cnt=0><WHILEM>
4640     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4641     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4642     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4643     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4644
4645 (Contrast this with something like CURLYM, which maintains only a single
4646 backtrack state:
4647
4648     <CURLYM cnt=0> a1
4649     a1 <CURLYM cnt=1> a2
4650     a1 a2 <CURLYM cnt=2> a3
4651     a1 a2 a3 <CURLYM cnt=3> b
4652 )
4653
4654 Each WHILEM state block marks a point to backtrack to upon partial failure
4655 of A or B, and also contains some minor state data related to that
4656 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4657 overall state, such as the count, and pointers to the A and B ops.
4658
4659 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4660 must always point to the *current* CURLYX block, the rules are:
4661
4662 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4663 and set cur_curlyx to point the new block.
4664
4665 When popping the CURLYX block after a successful or unsuccessful match,
4666 restore the previous cur_curlyx.
4667
4668 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4669 to the outer one saved in the CURLYX block.
4670
4671 When popping the WHILEM block after a successful or unsuccessful B match,
4672 restore the previous cur_curlyx.
4673
4674 Here's an example for the pattern (AI* BI)*BO
4675 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4676
4677 cur_
4678 curlyx backtrack stack
4679 ------ ---------------
4680 NULL
4681 CO     <CO prev=NULL> <WO>
4682 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4683 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4684 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4685
4686 At this point the pattern succeeds, and we work back down the stack to
4687 clean up, restoring as we go:
4688
4689 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4690 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4691 CO     <CO prev=NULL> <WO>
4692 NULL
4693
4694 *******************************************************************/
4695
4696 #define ST st->u.curlyx
4697
4698         case CURLYX:    /* start of /A*B/  (for complex A) */
4699         {
4700             /* No need to save/restore up to this paren */
4701             I32 parenfloor = scan->flags;
4702
4703             assert(next); /* keep Coverity happy */
4704             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4705                 next += ARG(next);
4706
4707             /* XXXX Probably it is better to teach regpush to support
4708                parenfloor > PL_regsize... */
4709             if (parenfloor > (I32)*PL_reglastparen)
4710                 parenfloor = *PL_reglastparen; /* Pessimization... */
4711
4712             ST.prev_curlyx= cur_curlyx;
4713             cur_curlyx = st;
4714             ST.cp = PL_savestack_ix;
4715
4716             /* these fields contain the state of the current curly.
4717              * they are accessed by subsequent WHILEMs */
4718             ST.parenfloor = parenfloor;
4719             ST.me = scan;
4720             ST.B = next;
4721             ST.minmod = minmod;
4722             minmod = 0;
4723             ST.count = -1;      /* this will be updated by WHILEM */
4724             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4725
4726             PL_reginput = locinput;
4727             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4728             /* NOTREACHED */
4729         }
4730
4731         case CURLYX_end: /* just finished matching all of A*B */
4732             cur_curlyx = ST.prev_curlyx;
4733             sayYES;
4734             /* NOTREACHED */
4735
4736         case CURLYX_end_fail: /* just failed to match all of A*B */
4737             regcpblow(ST.cp);
4738             cur_curlyx = ST.prev_curlyx;
4739             sayNO;
4740             /* NOTREACHED */
4741
4742
4743 #undef ST
4744 #define ST st->u.whilem
4745
4746         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4747         {
4748             /* see the discussion above about CURLYX/WHILEM */
4749             I32 n;
4750             int min = ARG1(cur_curlyx->u.curlyx.me);
4751             int max = ARG2(cur_curlyx->u.curlyx.me);
4752             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4753
4754             assert(cur_curlyx); /* keep Coverity happy */
4755             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4756             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4757             ST.cache_offset = 0;
4758             ST.cache_mask = 0;
4759
4760             PL_reginput = locinput;
4761
4762             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4763                   "%*s  whilem: matched %ld out of %d..%d\n",
4764                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4765             );
4766
4767             /* First just match a string of min A's. */
4768
4769             if (n < min) {
4770                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4771                 cur_curlyx->u.curlyx.lastloc = locinput;
4772                 REGCP_SET(ST.lastcp);
4773
4774                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4775                 /* NOTREACHED */
4776             }
4777
4778             /* If degenerate A matches "", assume A done. */
4779
4780             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4781                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4782                    "%*s  whilem: empty match detected, trying continuation...\n",
4783                    REPORT_CODE_OFF+depth*2, "")
4784                 );
4785                 goto do_whilem_B_max;
4786             }
4787
4788             /* super-linear cache processing */
4789
4790             if (scan->flags) {
4791
4792                 if (!PL_reg_maxiter) {
4793                     /* start the countdown: Postpone detection until we
4794                      * know the match is not *that* much linear. */
4795                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4796                     /* possible overflow for long strings and many CURLYX's */
4797                     if (PL_reg_maxiter < 0)
4798                         PL_reg_maxiter = I32_MAX;
4799                     PL_reg_leftiter = PL_reg_maxiter;
4800                 }
4801
4802                 if (PL_reg_leftiter-- == 0) {
4803                     /* initialise cache */
4804                     const I32 size = (PL_reg_maxiter + 7)/8;
4805                     if (PL_reg_poscache) {
4806                         if ((I32)PL_reg_poscache_size < size) {
4807                             Renew(PL_reg_poscache, size, char);
4808                             PL_reg_poscache_size = size;
4809                         }
4810                         Zero(PL_reg_poscache, size, char);
4811                     }
4812                     else {
4813                         PL_reg_poscache_size = size;
4814                         Newxz(PL_reg_poscache, size, char);
4815                     }
4816                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4817       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4818                               PL_colors[4], PL_colors[5])
4819                     );
4820                 }
4821
4822                 if (PL_reg_leftiter < 0) {
4823                     /* have we already failed at this position? */
4824                     I32 offset, mask;
4825                     offset  = (scan->flags & 0xf) - 1
4826                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4827                     mask    = 1 << (offset % 8);
4828                     offset /= 8;
4829                     if (PL_reg_poscache[offset] & mask) {
4830                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4831                             "%*s  whilem: (cache) already tried at this position...\n",
4832                             REPORT_CODE_OFF+depth*2, "")
4833                         );
4834                         sayNO; /* cache records failure */
4835                     }
4836                     ST.cache_offset = offset;
4837                     ST.cache_mask   = mask;
4838                 }
4839             }
4840
4841             /* Prefer B over A for minimal matching. */
4842
4843             if (cur_curlyx->u.curlyx.minmod) {
4844                 ST.save_curlyx = cur_curlyx;
4845                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4846                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4847                 REGCP_SET(ST.lastcp);
4848                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4849                 /* NOTREACHED */
4850             }
4851
4852             /* Prefer A over B for maximal matching. */
4853
4854             if (n < max) { /* More greed allowed? */
4855                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4856                 cur_curlyx->u.curlyx.lastloc = locinput;
4857                 REGCP_SET(ST.lastcp);
4858                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4859                 /* NOTREACHED */
4860             }
4861             goto do_whilem_B_max;
4862         }
4863         /* NOTREACHED */
4864
4865         case WHILEM_B_min: /* just matched B in a minimal match */
4866         case WHILEM_B_max: /* just matched B in a maximal match */
4867             cur_curlyx = ST.save_curlyx;
4868             sayYES;
4869             /* NOTREACHED */
4870
4871         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4872             cur_curlyx = ST.save_curlyx;
4873             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4874             cur_curlyx->u.curlyx.count--;
4875             CACHEsayNO;
4876             /* NOTREACHED */
4877
4878         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4879             /* FALL THROUGH */
4880         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4881             REGCP_UNWIND(ST.lastcp);
4882             regcppop(rex);
4883             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4884             cur_curlyx->u.curlyx.count--;
4885             CACHEsayNO;
4886             /* NOTREACHED */
4887
4888         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4889             REGCP_UNWIND(ST.lastcp);
4890             regcppop(rex);      /* Restore some previous $<digit>s? */
4891             PL_reginput = locinput;
4892             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4893                 "%*s  whilem: failed, trying continuation...\n",
4894                 REPORT_CODE_OFF+depth*2, "")
4895             );
4896           do_whilem_B_max:
4897             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4898                 && ckWARN(WARN_REGEXP)
4899                 && !(PL_reg_flags & RF_warned))
4900             {
4901                 PL_reg_flags |= RF_warned;
4902                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4903                      "Complex regular subexpression recursion limit (%d) "
4904                      "exceeded",
4905                      REG_INFTY - 1);
4906             }
4907
4908             /* now try B */
4909             ST.save_curlyx = cur_curlyx;
4910             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4911             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4912             /* NOTREACHED */
4913
4914         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4915             cur_curlyx = ST.save_curlyx;
4916             REGCP_UNWIND(ST.lastcp);
4917             regcppop(rex);
4918
4919             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4920                 /* Maximum greed exceeded */
4921                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4922                     && ckWARN(WARN_REGEXP)
4923                     && !(PL_reg_flags & RF_warned))
4924                 {
4925                     PL_reg_flags |= RF_warned;
4926                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4927                         "Complex regular subexpression recursion "
4928                         "limit (%d) exceeded",
4929                         REG_INFTY - 1);
4930                 }
4931                 cur_curlyx->u.curlyx.count--;
4932                 CACHEsayNO;
4933             }
4934
4935             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4936                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4937             );
4938             /* Try grabbing another A and see if it helps. */
4939             PL_reginput = locinput;
4940             cur_curlyx->u.curlyx.lastloc = locinput;
4941             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4942             REGCP_SET(ST.lastcp);
4943             PUSH_STATE_GOTO(WHILEM_A_min,
4944                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4945             /* NOTREACHED */
4946
4947 #undef  ST
4948 #define ST st->u.branch
4949
4950         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4951             next = scan + ARG(scan);
4952             if (next == scan)
4953                 next = NULL;
4954             scan = NEXTOPER(scan);
4955             /* FALL THROUGH */
4956
4957         case BRANCH:        /*  /(...|A|...)/ */
4958             scan = NEXTOPER(scan); /* scan now points to inner node */
4959             ST.lastparen = *PL_reglastparen;
4960             ST.next_branch = next;
4961             REGCP_SET(ST.cp);
4962             PL_reginput = locinput;
4963
4964             /* Now go into the branch */
4965             if (has_cutgroup) {
4966                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4967             } else {
4968                 PUSH_STATE_GOTO(BRANCH_next, scan);
4969             }
4970             /* NOTREACHED */
4971         case CUTGROUP:
4972             PL_reginput = locinput;
4973             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4974                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4975             PUSH_STATE_GOTO(CUTGROUP_next,next);
4976             /* NOTREACHED */
4977         case CUTGROUP_next_fail:
4978             do_cutgroup = 1;
4979             no_final = 1;
4980             if (st->u.mark.mark_name)
4981                 sv_commit = st->u.mark.mark_name;
4982             sayNO;
4983             /* NOTREACHED */
4984         case BRANCH_next:
4985             sayYES;
4986             /* NOTREACHED */
4987         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4988             if (do_cutgroup) {
4989                 do_cutgroup = 0;
4990                 no_final = 0;
4991             }
4992             REGCP_UNWIND(ST.cp);
4993             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4994                 PL_regoffs[n].end = -1;
4995             *PL_reglastparen = n;
4996             /*dmq: *PL_reglastcloseparen = n; */
4997             scan = ST.next_branch;
4998             /* no more branches? */
4999             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5000                 DEBUG_EXECUTE_r({
5001                     PerlIO_printf( Perl_debug_log,
5002                         "%*s  %sBRANCH failed...%s\n",
5003                         REPORT_CODE_OFF+depth*2, "",
5004                         PL_colors[4],
5005                         PL_colors[5] );
5006                 });
5007                 sayNO_SILENT;
5008             }
5009             continue; /* execute next BRANCH[J] op */
5010             /* NOTREACHED */
5011
5012         case MINMOD:
5013             minmod = 1;
5014             break;
5015
5016 #undef  ST
5017 #define ST st->u.curlym
5018
5019         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5020
5021             /* This is an optimisation of CURLYX that enables us to push
5022              * only a single backtracking state, no matter how many matches
5023              * there are in {m,n}. It relies on the pattern being constant
5024              * length, with no parens to influence future backrefs
5025              */
5026
5027             ST.me = scan;
5028             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5029
5030             /* if paren positive, emulate an OPEN/CLOSE around A */
5031             if (ST.me->flags) {
5032                 U32 paren = ST.me->flags;
5033                 if (paren > PL_regsize)
5034                     PL_regsize = paren;
5035                 if (paren > *PL_reglastparen)
5036                     *PL_reglastparen = paren;
5037                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5038             }
5039             ST.A = scan;
5040             ST.B = next;
5041             ST.alen = 0;
5042             ST.count = 0;
5043             ST.minmod = minmod;
5044             minmod = 0;
5045             ST.c1 = CHRTEST_UNINIT;
5046             REGCP_SET(ST.cp);
5047
5048             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5049                 goto curlym_do_B;
5050
5051           curlym_do_A: /* execute the A in /A{m,n}B/  */
5052             PL_reginput = locinput;
5053             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5054             /* NOTREACHED */
5055
5056         case CURLYM_A: /* we've just matched an A */
5057             locinput = st->locinput;
5058             nextchr = UCHARAT(locinput);
5059
5060             ST.count++;
5061             /* after first match, determine A's length: u.curlym.alen */
5062             if (ST.count == 1) {
5063                 if (PL_reg_match_utf8) {
5064                     char *s = locinput;
5065                     while (s < PL_reginput) {
5066                         ST.alen++;
5067                         s += UTF8SKIP(s);
5068                     }
5069                 }
5070                 else {
5071                     ST.alen = PL_reginput - locinput;
5072                 }
5073                 if (ST.alen == 0)
5074                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5075             }
5076             DEBUG_EXECUTE_r(
5077                 PerlIO_printf(Perl_debug_log,
5078                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5079                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5080                           (IV) ST.count, (IV)ST.alen)
5081             );
5082
5083             locinput = PL_reginput;
5084
5085             if (cur_eval && cur_eval->u.eval.close_paren &&
5086                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5087                 goto fake_end;
5088
5089             {
5090                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5091                 if ( max == REG_INFTY || ST.count < max )
5092                     goto curlym_do_A; /* try to match another A */
5093             }
5094             goto curlym_do_B; /* try to match B */
5095
5096         case CURLYM_A_fail: /* just failed to match an A */
5097             REGCP_UNWIND(ST.cp);
5098
5099             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5100                 || (cur_eval && cur_eval->u.eval.close_paren &&
5101                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5102                 sayNO;
5103
5104           curlym_do_B: /* execute the B in /A{m,n}B/  */
5105             PL_reginput = locinput;
5106             if (ST.c1 == CHRTEST_UNINIT) {
5107                 /* calculate c1 and c2 for possible match of 1st char
5108                  * following curly */
5109                 ST.c1 = ST.c2 = CHRTEST_VOID;
5110                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5111                     regnode *text_node = ST.B;
5112                     if (! HAS_TEXT(text_node))
5113                         FIND_NEXT_IMPT(text_node);
5114                     /* this used to be
5115
5116                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5117
5118                         But the former is redundant in light of the latter.
5119
5120                         if this changes back then the macro for
5121                         IS_TEXT and friends need to change.
5122                      */
5123                     if (PL_regkind[OP(text_node)] == EXACT)
5124                     {
5125
5126                         ST.c1 = (U8)*STRING(text_node);
5127                         switch (OP(text_node)) {
5128                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5129                             case EXACTFA:
5130                             case EXACTFU_SS:
5131                             case EXACTFU_TRICKYFOLD:
5132                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5133                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5134                             default: ST.c2 = ST.c1;
5135                         }
5136                     }
5137                 }
5138             }
5139
5140             DEBUG_EXECUTE_r(
5141                 PerlIO_printf(Perl_debug_log,
5142                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5143                     (int)(REPORT_CODE_OFF+(depth*2)),
5144                     "", (IV)ST.count)
5145                 );
5146             if (ST.c1 != CHRTEST_VOID
5147                     && UCHARAT(PL_reginput) != ST.c1
5148                     && UCHARAT(PL_reginput) != ST.c2)
5149             {
5150                 /* simulate B failing */
5151                 DEBUG_OPTIMISE_r(
5152                     PerlIO_printf(Perl_debug_log,
5153                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5154                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5155                         (IV)ST.c1,(IV)ST.c2
5156                 ));
5157                 state_num = CURLYM_B_fail;
5158                 goto reenter_switch;
5159             }
5160
5161             if (ST.me->flags) {
5162                 /* mark current A as captured */
5163                 I32 paren = ST.me->flags;
5164                 if (ST.count) {
5165                     PL_regoffs[paren].start
5166                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5167                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5168                     /*dmq: *PL_reglastcloseparen = paren; */
5169                 }
5170                 else
5171                     PL_regoffs[paren].end = -1;
5172                 if (cur_eval && cur_eval->u.eval.close_paren &&
5173                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5174                 {
5175                     if (ST.count)
5176                         goto fake_end;
5177                     else
5178                         sayNO;
5179                 }
5180             }
5181
5182             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5183             /* NOTREACHED */
5184
5185         case CURLYM_B_fail: /* just failed to match a B */
5186             REGCP_UNWIND(ST.cp);
5187             if (ST.minmod) {
5188                 I32 max = ARG2(ST.me);
5189                 if (max != REG_INFTY && ST.count == max)
5190                     sayNO;
5191                 goto curlym_do_A; /* try to match a further A */
5192             }
5193             /* backtrack one A */
5194             if (ST.count == ARG1(ST.me) /* min */)
5195                 sayNO;
5196             ST.count--;
5197             locinput = HOPc(locinput, -ST.alen);
5198             goto curlym_do_B; /* try to match B */
5199
5200 #undef ST
5201 #define ST st->u.curly
5202
5203 #define CURLY_SETPAREN(paren, success) \
5204     if (paren) { \
5205         if (success) { \
5206             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5207             PL_regoffs[paren].end = locinput - PL_bostr; \
5208             *PL_reglastcloseparen = paren; \
5209         } \
5210         else \
5211             PL_regoffs[paren].end = -1; \
5212     }
5213
5214         case STAR:              /*  /A*B/ where A is width 1 */
5215             ST.paren = 0;
5216             ST.min = 0;
5217             ST.max = REG_INFTY;
5218             scan = NEXTOPER(scan);
5219             goto repeat;
5220         case PLUS:              /*  /A+B/ where A is width 1 */
5221             ST.paren = 0;
5222             ST.min = 1;
5223             ST.max = REG_INFTY;
5224             scan = NEXTOPER(scan);
5225             goto repeat;
5226         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5227             ST.paren = scan->flags;     /* Which paren to set */
5228             if (ST.paren > PL_regsize)
5229                 PL_regsize = ST.paren;
5230             if (ST.paren > *PL_reglastparen)
5231                 *PL_reglastparen = ST.paren;
5232             ST.min = ARG1(scan);  /* min to match */
5233             ST.max = ARG2(scan);  /* max to match */
5234             if (cur_eval && cur_eval->u.eval.close_paren &&
5235                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5236                 ST.min=1;
5237                 ST.max=1;
5238             }
5239             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5240             goto repeat;
5241         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5242             ST.paren = 0;
5243             ST.min = ARG1(scan);  /* min to match */
5244             ST.max = ARG2(scan);  /* max to match */
5245             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5246           repeat:
5247             /*
5248             * Lookahead to avoid useless match attempts
5249             * when we know what character comes next.
5250             *
5251             * Used to only do .*x and .*?x, but now it allows
5252             * for )'s, ('s and (?{ ... })'s to be in the way
5253             * of the quantifier and the EXACT-like node.  -- japhy
5254             */
5255
5256             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5257                 sayNO;
5258             if (HAS_TEXT(next) || JUMPABLE(next)) {
5259                 U8 *s;
5260                 regnode *text_node = next;
5261
5262                 if (! HAS_TEXT(text_node))
5263                     FIND_NEXT_IMPT(text_node);
5264
5265                 if (! HAS_TEXT(text_node))
5266                     ST.c1 = ST.c2 = CHRTEST_VOID;
5267                 else {
5268                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5269                         ST.c1 = ST.c2 = CHRTEST_VOID;
5270                         goto assume_ok_easy;
5271                     }
5272                     else
5273                         s = (U8*)STRING(text_node);
5274
5275                     /*  Currently we only get here when
5276
5277                         PL_rekind[OP(text_node)] == EXACT
5278
5279                         if this changes back then the macro for IS_TEXT and
5280                         friends need to change. */
5281                     if (!UTF_PATTERN) {
5282                         ST.c1 = *s;
5283                         switch (OP(text_node)) {
5284                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5285                             case EXACTFA:
5286                             case EXACTFU_SS:
5287                             case EXACTFU_TRICKYFOLD:
5288                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5289                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5290                             default: ST.c2 = ST.c1; break;
5291                         }
5292                     }
5293                     else { /* UTF_PATTERN */
5294                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5295                              STRLEN ulen1, ulen2;
5296                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5297                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5298
5299                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5300                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5301 #ifdef EBCDIC
5302                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5303                                                     ckWARN(WARN_UTF8) ?
5304                                                     0 : UTF8_ALLOW_ANY);
5305                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5306                                                     ckWARN(WARN_UTF8) ?
5307                                                     0 : UTF8_ALLOW_ANY);
5308 #else
5309                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5310                                                     uniflags);
5311                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5312                                                     uniflags);
5313 #endif
5314                         }
5315                         else {
5316                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5317                                                      uniflags);
5318                         }
5319                     }
5320                 }
5321             }
5322             else
5323                 ST.c1 = ST.c2 = CHRTEST_VOID;
5324         assume_ok_easy:
5325
5326             ST.A = scan;
5327             ST.B = next;
5328             PL_reginput = locinput;
5329             if (minmod) {
5330                 minmod = 0;
5331                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5332                     sayNO;
5333                 ST.count = ST.min;
5334                 locinput = PL_reginput;
5335                 REGCP_SET(ST.cp);
5336                 if (ST.c1 == CHRTEST_VOID)
5337                     goto curly_try_B_min;
5338
5339                 ST.oldloc = locinput;
5340
5341                 /* set ST.maxpos to the furthest point along the
5342                  * string that could possibly match */
5343                 if  (ST.max == REG_INFTY) {
5344                     ST.maxpos = PL_regeol - 1;
5345                     if (utf8_target)
5346                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5347                             ST.maxpos--;
5348                 }
5349                 else if (utf8_target) {
5350                     int m = ST.max - ST.min;
5351                     for (ST.maxpos = locinput;
5352                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5353                         ST.maxpos += UTF8SKIP(ST.maxpos);
5354                 }
5355                 else {
5356                     ST.maxpos = locinput + ST.max - ST.min;
5357                     if (ST.maxpos >= PL_regeol)
5358                         ST.maxpos = PL_regeol - 1;
5359                 }
5360                 goto curly_try_B_min_known;
5361
5362             }
5363             else {
5364                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5365                 locinput = PL_reginput;
5366                 if (ST.count < ST.min)
5367                     sayNO;
5368                 if ((ST.count > ST.min)
5369                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5370                 {
5371                     /* A{m,n} must come at the end of the string, there's
5372                      * no point in backing off ... */
5373                     ST.min = ST.count;
5374                     /* ...except that $ and \Z can match before *and* after
5375                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5376                        We may back off by one in this case. */
5377                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5378                         ST.min--;
5379                 }
5380                 REGCP_SET(ST.cp);
5381                 goto curly_try_B_max;
5382             }
5383             /* NOTREACHED */
5384
5385
5386         case CURLY_B_min_known_fail:
5387             /* failed to find B in a non-greedy match where c1,c2 valid */
5388             if (ST.paren && ST.count)
5389                 PL_regoffs[ST.paren].end = -1;
5390
5391             PL_reginput = locinput;     /* Could be reset... */
5392             REGCP_UNWIND(ST.cp);
5393             /* Couldn't or didn't -- move forward. */
5394             ST.oldloc = locinput;
5395             if (utf8_target)
5396                 locinput += UTF8SKIP(locinput);
5397             else
5398                 locinput++;
5399             ST.count++;
5400           curly_try_B_min_known:
5401              /* find the next place where 'B' could work, then call B */
5402             {
5403                 int n;
5404                 if (utf8_target) {
5405                     n = (ST.oldloc == locinput) ? 0 : 1;
5406                     if (ST.c1 == ST.c2) {
5407                         STRLEN len;
5408                         /* set n to utf8_distance(oldloc, locinput) */
5409                         while (locinput <= ST.maxpos &&
5410                                utf8n_to_uvchr((U8*)locinput,
5411                                               UTF8_MAXBYTES, &len,
5412                                               uniflags) != (UV)ST.c1) {
5413                             locinput += len;
5414                             n++;
5415                         }
5416                     }
5417                     else {
5418                         /* set n to utf8_distance(oldloc, locinput) */
5419                         while (locinput <= ST.maxpos) {
5420                             STRLEN len;
5421                             const UV c = utf8n_to_uvchr((U8*)locinput,
5422                                                   UTF8_MAXBYTES, &len,
5423                                                   uniflags);
5424                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5425                                 break;
5426                             locinput += len;
5427                             n++;
5428                         }
5429                     }
5430                 }
5431                 else {
5432                     if (ST.c1 == ST.c2) {
5433                         while (locinput <= ST.maxpos &&
5434                                UCHARAT(locinput) != ST.c1)
5435                             locinput++;
5436                     }
5437                     else {
5438                         while (locinput <= ST.maxpos
5439                                && UCHARAT(locinput) != ST.c1
5440                                && UCHARAT(locinput) != ST.c2)
5441                             locinput++;
5442                     }
5443                     n = locinput - ST.oldloc;
5444                 }
5445                 if (locinput > ST.maxpos)
5446                     sayNO;
5447                 /* PL_reginput == oldloc now */
5448                 if (n) {
5449                     ST.count += n;
5450                     if (regrepeat(rex, ST.A, n, depth) < n)
5451                         sayNO;
5452                 }
5453                 PL_reginput = locinput;
5454                 CURLY_SETPAREN(ST.paren, ST.count);
5455                 if (cur_eval && cur_eval->u.eval.close_paren &&
5456                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5457                     goto fake_end;
5458                 }
5459                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5460             }
5461             /* NOTREACHED */
5462
5463
5464         case CURLY_B_min_fail:
5465             /* failed to find B in a non-greedy match where c1,c2 invalid */
5466             if (ST.paren && ST.count)
5467                 PL_regoffs[ST.paren].end = -1;
5468
5469             REGCP_UNWIND(ST.cp);
5470             /* failed -- move forward one */
5471             PL_reginput = locinput;
5472             if (regrepeat(rex, ST.A, 1, depth)) {
5473                 ST.count++;
5474                 locinput = PL_reginput;
5475                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5476                         ST.count > 0)) /* count overflow ? */
5477                 {
5478                   curly_try_B_min:
5479                     CURLY_SETPAREN(ST.paren, ST.count);
5480                     if (cur_eval && cur_eval->u.eval.close_paren &&
5481                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5482                         goto fake_end;
5483                     }
5484                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5485                 }
5486             }
5487             sayNO;
5488             /* NOTREACHED */
5489
5490
5491         curly_try_B_max:
5492             /* a successful greedy match: now try to match B */
5493             if (cur_eval && cur_eval->u.eval.close_paren &&
5494                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5495                 goto fake_end;
5496             }
5497             {
5498                 UV c = 0;
5499                 if (ST.c1 != CHRTEST_VOID)
5500                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5501                                            UTF8_MAXBYTES, 0, uniflags)
5502                                 : (UV) UCHARAT(PL_reginput);
5503                 /* If it could work, try it. */
5504                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5505                     CURLY_SETPAREN(ST.paren, ST.count);
5506                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5507                     /* NOTREACHED */
5508                 }
5509             }
5510             /* FALL THROUGH */
5511         case CURLY_B_max_fail:
5512             /* failed to find B in a greedy match */
5513             if (ST.paren && ST.count)
5514                 PL_regoffs[ST.paren].end = -1;
5515
5516             REGCP_UNWIND(ST.cp);
5517             /*  back up. */
5518             if (--ST.count < ST.min)
5519                 sayNO;
5520             PL_reginput = locinput = HOPc(locinput, -1);
5521             goto curly_try_B_max;
5522
5523 #undef ST
5524
5525         case END:
5526             fake_end:
5527             if (cur_eval) {
5528                 /* we've just finished A in /(??{A})B/; now continue with B */
5529                 I32 tmpix;
5530                 st->u.eval.toggle_reg_flags
5531                             = cur_eval->u.eval.toggle_reg_flags;
5532                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5533
5534                 st->u.eval.prev_rex = rex_sv;           /* inner */
5535                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5536                 rex = (struct regexp *)SvANY(rex_sv);
5537                 rexi = RXi_GET(rex);
5538                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5539                 (void)ReREFCNT_inc(rex_sv);
5540                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5541
5542                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5543                 PL_reglastparen = &rex->lastparen;
5544                 PL_reglastcloseparen = &rex->lastcloseparen;
5545
5546                 REGCP_SET(st->u.eval.lastcp);
5547                 PL_reginput = locinput;
5548
5549                 /* Restore parens of the outer rex without popping the
5550                  * savestack */
5551                 tmpix = PL_savestack_ix;
5552                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5553                 regcppop(rex);
5554                 PL_savestack_ix = tmpix;
5555
5556                 st->u.eval.prev_eval = cur_eval;
5557                 cur_eval = cur_eval->u.eval.prev_eval;
5558                 DEBUG_EXECUTE_r(
5559                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5560                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5561                 if ( nochange_depth )
5562                     nochange_depth--;
5563
5564                 PUSH_YES_STATE_GOTO(EVAL_AB,
5565                         st->u.eval.prev_eval->u.eval.B); /* match B */
5566             }
5567
5568             if (locinput < reginfo->till) {
5569                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5570                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5571                                       PL_colors[4],
5572                                       (long)(locinput - PL_reg_starttry),
5573                                       (long)(reginfo->till - PL_reg_starttry),
5574                                       PL_colors[5]));
5575
5576                 sayNO_SILENT;           /* Cannot match: too short. */
5577             }
5578             PL_reginput = locinput;     /* put where regtry can find it */
5579             sayYES;                     /* Success! */
5580
5581         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5582             DEBUG_EXECUTE_r(
5583             PerlIO_printf(Perl_debug_log,
5584                 "%*s  %ssubpattern success...%s\n",
5585                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5586             PL_reginput = locinput;     /* put where regtry can find it */
5587             sayYES;                     /* Success! */
5588
5589 #undef  ST
5590 #define ST st->u.ifmatch
5591
5592         case SUSPEND:   /* (?>A) */
5593             ST.wanted = 1;
5594             PL_reginput = locinput;
5595             goto do_ifmatch;
5596
5597         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5598             ST.wanted = 0;
5599             goto ifmatch_trivial_fail_test;
5600
5601         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5602             ST.wanted = 1;
5603           ifmatch_trivial_fail_test:
5604             if (scan->flags) {
5605                 char * const s = HOPBACKc(locinput, scan->flags);
5606                 if (!s) {
5607                     /* trivial fail */
5608                     if (logical) {
5609                         logical = 0;
5610                         sw = 1 - cBOOL(ST.wanted);
5611                     }
5612                     else if (ST.wanted)
5613                         sayNO;
5614                     next = scan + ARG(scan);
5615                     if (next == scan)
5616                         next = NULL;
5617                     break;
5618                 }
5619                 PL_reginput = s;
5620             }
5621             else
5622                 PL_reginput = locinput;
5623
5624           do_ifmatch:
5625             ST.me = scan;
5626             ST.logical = logical;
5627             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5628
5629             /* execute body of (?...A) */
5630             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5631             /* NOTREACHED */
5632
5633         case IFMATCH_A_fail: /* body of (?...A) failed */
5634             ST.wanted = !ST.wanted;
5635             /* FALL THROUGH */
5636
5637         case IFMATCH_A: /* body of (?...A) succeeded */
5638             if (ST.logical) {
5639                 sw = cBOOL(ST.wanted);
5640             }
5641             else if (!ST.wanted)
5642                 sayNO;
5643
5644             if (OP(ST.me) == SUSPEND)
5645                 locinput = PL_reginput;
5646             else {
5647                 locinput = PL_reginput = st->locinput;
5648                 nextchr = UCHARAT(locinput);
5649             }
5650             scan = ST.me + ARG(ST.me);
5651             if (scan == ST.me)
5652                 scan = NULL;
5653             continue; /* execute B */
5654
5655 #undef ST
5656
5657         case LONGJMP:
5658             next = scan + ARG(scan);
5659             if (next == scan)
5660                 next = NULL;
5661             break;
5662         case COMMIT:
5663             reginfo->cutpoint = PL_regeol;
5664             /* FALLTHROUGH */
5665         case PRUNE:
5666             PL_reginput = locinput;
5667             if (!scan->flags)
5668                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5669             PUSH_STATE_GOTO(COMMIT_next,next);
5670             /* NOTREACHED */
5671         case COMMIT_next_fail:
5672             no_final = 1;
5673             /* FALLTHROUGH */
5674         case OPFAIL:
5675             sayNO;
5676             /* NOTREACHED */
5677
5678 #define ST st->u.mark
5679         case MARKPOINT:
5680             ST.prev_mark = mark_state;
5681             ST.mark_name = sv_commit = sv_yes_mark
5682                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5683             mark_state = st;
5684             ST.mark_loc = PL_reginput = locinput;
5685             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5686             /* NOTREACHED */
5687         case MARKPOINT_next:
5688             mark_state = ST.prev_mark;
5689             sayYES;
5690             /* NOTREACHED */
5691         case MARKPOINT_next_fail:
5692             if (popmark && sv_eq(ST.mark_name,popmark))
5693             {
5694                 if (ST.mark_loc > startpoint)
5695                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5696                 popmark = NULL; /* we found our mark */
5697                 sv_commit = ST.mark_name;
5698
5699                 DEBUG_EXECUTE_r({
5700                         PerlIO_printf(Perl_debug_log,
5701                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5702                             REPORT_CODE_OFF+depth*2, "",
5703                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5704                 });
5705             }
5706             mark_state = ST.prev_mark;
5707             sv_yes_mark = mark_state ?
5708                 mark_state->u.mark.mark_name : NULL;
5709             sayNO;
5710             /* NOTREACHED */
5711         case SKIP:
5712             PL_reginput = locinput;
5713             if (scan->flags) {
5714                 /* (*SKIP) : if we fail we cut here*/
5715                 ST.mark_name = NULL;
5716                 ST.mark_loc = locinput;
5717                 PUSH_STATE_GOTO(SKIP_next,next);
5718             } else {
5719                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5720                    otherwise do nothing.  Meaning we need to scan
5721                  */
5722                 regmatch_state *cur = mark_state;
5723                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5724
5725                 while (cur) {
5726                     if ( sv_eq( cur->u.mark.mark_name,
5727                                 find ) )
5728                     {
5729                         ST.mark_name = find;
5730                         PUSH_STATE_GOTO( SKIP_next, next );
5731                     }
5732                     cur = cur->u.mark.prev_mark;
5733                 }
5734             }
5735             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5736             break;
5737         case SKIP_next_fail:
5738             if (ST.mark_name) {
5739                 /* (*CUT:NAME) - Set up to search for the name as we
5740                    collapse the stack*/
5741                 popmark = ST.mark_name;
5742             } else {
5743                 /* (*CUT) - No name, we cut here.*/
5744                 if (ST.mark_loc > startpoint)
5745                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5746                 /* but we set sv_commit to latest mark_name if there
5747                    is one so they can test to see how things lead to this
5748                    cut */
5749                 if (mark_state)
5750                     sv_commit=mark_state->u.mark.mark_name;
5751             }
5752             no_final = 1;
5753             sayNO;
5754             /* NOTREACHED */
5755 #undef ST
5756         case LNBREAK:
5757             if ((n=is_LNBREAK(locinput,utf8_target))) {
5758                 locinput += n;
5759                 nextchr = UCHARAT(locinput);
5760             } else
5761                 sayNO;
5762             break;
5763
5764 #define CASE_CLASS(nAmE)                              \
5765         case nAmE:                                    \
5766             if (locinput >= PL_regeol)                \
5767                 sayNO;                                \
5768             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5769                 locinput += n;                        \
5770                 nextchr = UCHARAT(locinput);          \
5771             } else                                    \
5772                 sayNO;                                \
5773             break;                                    \
5774         case N##nAmE:                                 \
5775             if (locinput >= PL_regeol)                \
5776                 sayNO;                                \
5777             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5778                 sayNO;                                \
5779             } else {                                  \
5780                 locinput += UTF8SKIP(locinput);       \
5781                 nextchr = UCHARAT(locinput);          \
5782             }                                         \
5783             break
5784
5785         CASE_CLASS(VERTWS);
5786         CASE_CLASS(HORIZWS);
5787 #undef CASE_CLASS
5788
5789         default:
5790             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5791                           PTR2UV(scan), OP(scan));
5792             Perl_croak(aTHX_ "regexp memory corruption");
5793
5794         } /* end switch */
5795
5796         /* switch break jumps here */
5797         scan = next; /* prepare to execute the next op and ... */
5798         continue;    /* ... jump back to the top, reusing st */
5799         /* NOTREACHED */
5800
5801       push_yes_state:
5802         /* push a state that backtracks on success */
5803         st->u.yes.prev_yes_state = yes_state;
5804         yes_state = st;
5805         /* FALL THROUGH */
5806       push_state:
5807         /* push a new regex state, then continue at scan  */
5808         {
5809             regmatch_state *newst;
5810
5811             DEBUG_STACK_r({
5812                 regmatch_state *cur = st;
5813                 regmatch_state *curyes = yes_state;
5814                 int curd = depth;
5815                 regmatch_slab *slab = PL_regmatch_slab;
5816                 for (;curd > -1;cur--,curd--) {
5817                     if (cur < SLAB_FIRST(slab)) {
5818                         slab = slab->prev;
5819                         cur = SLAB_LAST(slab);
5820                     }
5821                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5822                         REPORT_CODE_OFF + 2 + depth * 2,"",
5823                         curd, PL_reg_name[cur->resume_state],
5824                         (curyes == cur) ? "yes" : ""
5825                     );
5826                     if (curyes == cur)
5827                         curyes = cur->u.yes.prev_yes_state;
5828                 }
5829             } else
5830                 DEBUG_STATE_pp("push")
5831             );
5832             depth++;
5833             st->locinput = locinput;
5834             newst = st+1;
5835             if (newst >  SLAB_LAST(PL_regmatch_slab))
5836                 newst = S_push_slab(aTHX);
5837             PL_regmatch_state = newst;
5838
5839             locinput = PL_reginput;
5840             nextchr = UCHARAT(locinput);
5841             st = newst;
5842             continue;
5843             /* NOTREACHED */
5844         }
5845     }
5846
5847     /*
5848     * We get here only if there's trouble -- normally "case END" is
5849     * the terminating point.
5850     */
5851     Perl_croak(aTHX_ "corrupted regexp pointers");
5852     /*NOTREACHED*/
5853     sayNO;
5854
5855 yes:
5856     if (yes_state) {
5857         /* we have successfully completed a subexpression, but we must now
5858          * pop to the state marked by yes_state and continue from there */
5859         assert(st != yes_state);
5860 #ifdef DEBUGGING
5861         while (st != yes_state) {
5862             st--;
5863             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5864                 PL_regmatch_slab = PL_regmatch_slab->prev;
5865                 st = SLAB_LAST(PL_regmatch_slab);
5866             }
5867             DEBUG_STATE_r({
5868                 if (no_final) {
5869                     DEBUG_STATE_pp("pop (no final)");
5870                 } else {
5871                     DEBUG_STATE_pp("pop (yes)");
5872                 }
5873             });
5874             depth--;
5875         }
5876 #else
5877         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5878             || yes_state > SLAB_LAST(PL_regmatch_slab))
5879         {
5880             /* not in this slab, pop slab */
5881             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5882             PL_regmatch_slab = PL_regmatch_slab->prev;
5883             st = SLAB_LAST(PL_regmatch_slab);
5884         }
5885         depth -= (st - yes_state);
5886 #endif
5887         st = yes_state;
5888         yes_state = st->u.yes.prev_yes_state;
5889         PL_regmatch_state = st;
5890
5891         if (no_final) {
5892             locinput= st->locinput;
5893             nextchr = UCHARAT(locinput);
5894         }
5895         state_num = st->resume_state + no_final;
5896         goto reenter_switch;
5897     }
5898
5899     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5900                           PL_colors[4], PL_colors[5]));
5901
5902     if (PL_reg_eval_set) {
5903         /* each successfully executed (?{...}) block does the equivalent of
5904          *   local $^R = do {...}
5905          * When popping the save stack, all these locals would be undone;
5906          * bypass this by setting the outermost saved $^R to the latest
5907          * value */
5908         if (oreplsv != GvSV(PL_replgv))
5909             sv_setsv(oreplsv, GvSV(PL_replgv));
5910     }
5911     result = 1;
5912     goto final_exit;
5913
5914 no:
5915     DEBUG_EXECUTE_r(
5916         PerlIO_printf(Perl_debug_log,
5917             "%*s  %sfailed...%s\n",
5918             REPORT_CODE_OFF+depth*2, "",
5919             PL_colors[4], PL_colors[5])
5920         );
5921
5922 no_silent:
5923     if (no_final) {
5924         if (yes_state) {
5925             goto yes;
5926         } else {
5927             goto final_exit;
5928         }
5929     }
5930     if (depth) {
5931         /* there's a previous state to backtrack to */
5932         st--;
5933         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5934             PL_regmatch_slab = PL_regmatch_slab->prev;
5935             st = SLAB_LAST(PL_regmatch_slab);
5936         }
5937         PL_regmatch_state = st;
5938         locinput= st->locinput;
5939         nextchr = UCHARAT(locinput);
5940
5941         DEBUG_STATE_pp("pop");
5942         depth--;
5943         if (yes_state == st)
5944             yes_state = st->u.yes.prev_yes_state;
5945
5946         state_num = st->resume_state + 1; /* failure = success + 1 */
5947         goto reenter_switch;
5948     }
5949     result = 0;
5950
5951   final_exit:
5952     if (rex->intflags & PREGf_VERBARG_SEEN) {
5953         SV *sv_err = get_sv("REGERROR", 1);
5954         SV *sv_mrk = get_sv("REGMARK", 1);
5955         if (result) {
5956             sv_commit = &PL_sv_no;
5957             if (!sv_yes_mark)
5958                 sv_yes_mark = &PL_sv_yes;
5959         } else {
5960             if (!sv_commit)
5961                 sv_commit = &PL_sv_yes;
5962             sv_yes_mark = &PL_sv_no;
5963         }
5964         sv_setsv(sv_err, sv_commit);
5965         sv_setsv(sv_mrk, sv_yes_mark);
5966     }
5967
5968     /* clean up; in particular, free all slabs above current one */
5969     LEAVE_SCOPE(oldsave);
5970
5971     return result;
5972 }
5973
5974 /*
5975  - regrepeat - repeatedly match something simple, report how many
5976  */
5977 /*
5978  * [This routine now assumes that it will only match on things of length 1.
5979  * That was true before, but now we assume scan - reginput is the count,
5980  * rather than incrementing count on every character.  [Er, except utf8.]]
5981  */
5982 STATIC I32
5983 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5984 {
5985     dVAR;
5986     register char *scan;
5987     register I32 c;
5988     register char *loceol = PL_regeol;
5989     register I32 hardcount = 0;
5990     register bool utf8_target = PL_reg_match_utf8;
5991     UV utf8_flags;
5992 #ifndef DEBUGGING
5993     PERL_UNUSED_ARG(depth);
5994 #endif
5995
5996     PERL_ARGS_ASSERT_REGREPEAT;
5997
5998     scan = PL_reginput;
5999     if (max == REG_INFTY)
6000         max = I32_MAX;
6001     else if (max < loceol - scan)
6002         loceol = scan + max;
6003     switch (OP(p)) {
6004     case REG_ANY:
6005         if (utf8_target) {
6006             loceol = PL_regeol;
6007             while (scan < loceol && hardcount < max && *scan != '\n') {
6008                 scan += UTF8SKIP(scan);
6009                 hardcount++;
6010             }
6011         } else {
6012             while (scan < loceol && *scan != '\n')
6013                 scan++;
6014         }
6015         break;
6016     case SANY:
6017         if (utf8_target) {
6018             loceol = PL_regeol;
6019             while (scan < loceol && hardcount < max) {
6020                 scan += UTF8SKIP(scan);
6021                 hardcount++;
6022             }
6023         }
6024         else
6025             scan = loceol;
6026         break;
6027     case CANY:
6028         scan = loceol;
6029         break;
6030     case EXACT:
6031         /* To get here, EXACTish nodes must have *byte* length == 1.  That
6032          * means they match only characters in the string that can be expressed
6033          * as a single byte.  For non-utf8 strings, that means a simple match.
6034          * For utf8 strings, the character matched must be an invariant, or
6035          * downgradable to a single byte.  The pattern's utf8ness is
6036          * irrelevant, as since it's a single byte, it either isn't utf8, or if
6037          * it is, it's an invariant */
6038
6039         c = (U8)*STRING(p);
6040         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6041
6042         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6043             while (scan < loceol && UCHARAT(scan) == c) {
6044                 scan++;
6045             }
6046         }
6047         else {
6048
6049             /* Here, the string is utf8, and the pattern char is different
6050              * in utf8 than not, so can't compare them directly.  Outside the
6051              * loop, find the two utf8 bytes that represent c, and then
6052              * look for those in sequence in the utf8 string */
6053             U8 high = UTF8_TWO_BYTE_HI(c);
6054             U8 low = UTF8_TWO_BYTE_LO(c);
6055             loceol = PL_regeol;
6056
6057             while (hardcount < max
6058                     && scan + 1 < loceol
6059                     && UCHARAT(scan) == high
6060                     && UCHARAT(scan + 1) == low)
6061             {
6062                 scan += 2;
6063                 hardcount++;
6064             }
6065         }
6066         break;
6067     case EXACTFA:
6068         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6069         goto do_exactf;
6070
6071     case EXACTFL:
6072         PL_reg_flags |= RF_tainted;
6073         utf8_flags = FOLDEQ_UTF8_LOCALE;
6074         goto do_exactf;
6075
6076     case EXACTF:
6077             utf8_flags = 0;
6078             goto do_exactf;
6079
6080     case EXACTFU_SS:
6081     case EXACTFU_TRICKYFOLD:
6082     case EXACTFU:
6083         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6084
6085         /* The comments for the EXACT case above apply as well to these fold
6086          * ones */
6087
6088     do_exactf:
6089         c = (U8)*STRING(p);
6090         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6091
6092         if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6093             char *tmpeol = loceol;
6094             while (hardcount < max
6095                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6096                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6097             {
6098                 scan = tmpeol;
6099                 tmpeol = loceol;
6100                 hardcount++;
6101             }
6102
6103             /* XXX Note that the above handles properly the German sharp s in
6104              * the pattern matching ss in the string.  But it doesn't handle
6105              * properly cases where the string contains say 'LIGATURE ff' and
6106              * the pattern is 'f+'.  This would require, say, a new function or
6107              * revised interface to foldEQ_utf8(), in which the maximum number
6108              * of characters to match could be passed and it would return how
6109              * many actually did.  This is just one of many cases where
6110              * multi-char folds don't work properly, and so the fix is being
6111              * deferred */
6112         }
6113         else {
6114             U8 folded;
6115
6116             /* Here, the string isn't utf8 and c is a single byte; and either
6117              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6118              * doesn't affect c.  Can just do simple comparisons for exact or
6119              * fold matching. */
6120             switch (OP(p)) {
6121                 case EXACTF: folded = PL_fold[c]; break;
6122                 case EXACTFA:
6123                 case EXACTFU_TRICKYFOLD:
6124                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6125                 case EXACTFL: folded = PL_fold_locale[c]; break;
6126                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6127             }
6128             while (scan < loceol &&
6129                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6130             {
6131                 scan++;
6132             }
6133         }
6134         break;
6135     case ANYOFV:
6136     case ANYOF:
6137         if (utf8_target || OP(p) == ANYOFV) {
6138             STRLEN inclasslen;
6139             loceol = PL_regeol;
6140             inclasslen = loceol - scan;
6141             while (hardcount < max
6142                    && ((inclasslen = loceol - scan) > 0)
6143                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6144             {
6145                 scan += inclasslen;
6146                 hardcount++;
6147             }
6148         } else {
6149             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6150                 scan++;
6151         }
6152         break;
6153     case ALNUMU:
6154         if (utf8_target) {
6155     utf8_wordchar:
6156             loceol = PL_regeol;
6157             LOAD_UTF8_CHARCLASS_ALNUM();
6158             while (hardcount < max && scan < loceol &&
6159                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6160             {
6161                 scan += UTF8SKIP(scan);
6162                 hardcount++;
6163             }
6164         } else {
6165             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6166                 scan++;
6167             }
6168         }
6169         break;
6170     case ALNUM:
6171         if (utf8_target)
6172             goto utf8_wordchar;
6173         while (scan < loceol && isALNUM((U8) *scan)) {
6174             scan++;
6175         }
6176         break;
6177     case ALNUMA:
6178         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6179             scan++;
6180         }
6181         break;
6182     case ALNUML:
6183         PL_reg_flags |= RF_tainted;
6184         if (utf8_target) {
6185             loceol = PL_regeol;
6186             while (hardcount < max && scan < loceol &&
6187                    isALNUM_LC_utf8((U8*)scan)) {
6188                 scan += UTF8SKIP(scan);
6189                 hardcount++;
6190             }
6191         } else {
6192             while (scan < loceol && isALNUM_LC(*scan))
6193                 scan++;
6194         }
6195         break;
6196     case NALNUMU:
6197         if (utf8_target) {
6198
6199     utf8_Nwordchar:
6200
6201             loceol = PL_regeol;
6202             LOAD_UTF8_CHARCLASS_ALNUM();
6203             while (hardcount < max && scan < loceol &&
6204                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6205             {
6206                 scan += UTF8SKIP(scan);
6207                 hardcount++;
6208             }
6209         } else {
6210             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6211                 scan++;
6212             }
6213         }
6214         break;
6215     case NALNUM:
6216         if (utf8_target)
6217             goto utf8_Nwordchar;
6218         while (scan < loceol && ! isALNUM((U8) *scan)) {
6219             scan++;
6220         }
6221         break;
6222     case NALNUMA:
6223         if (utf8_target) {
6224             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6225                 scan += UTF8SKIP(scan);
6226             }
6227         }
6228         else {
6229             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6230                 scan++;
6231             }
6232         }
6233         break;
6234     case NALNUML:
6235         PL_reg_flags |= RF_tainted;
6236         if (utf8_target) {
6237             loceol = PL_regeol;
6238             while (hardcount < max && scan < loceol &&
6239                    !isALNUM_LC_utf8((U8*)scan)) {
6240                 scan += UTF8SKIP(scan);
6241                 hardcount++;
6242             }
6243         } else {
6244             while (scan < loceol && !isALNUM_LC(*scan))
6245                 scan++;
6246         }
6247         break;
6248     case SPACEU:
6249         if (utf8_target) {
6250
6251     utf8_space:
6252
6253             loceol = PL_regeol;
6254             LOAD_UTF8_CHARCLASS_SPACE();
6255             while (hardcount < max && scan < loceol &&
6256                    (*scan == ' ' ||
6257                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6258             {
6259                 scan += UTF8SKIP(scan);
6260                 hardcount++;
6261             }
6262             break;
6263         }
6264         else {
6265             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6266                 scan++;
6267             }
6268             break;
6269         }
6270     case SPACE:
6271         if (utf8_target)
6272             goto utf8_space;
6273
6274         while (scan < loceol && isSPACE((U8) *scan)) {
6275             scan++;
6276         }
6277         break;
6278     case SPACEA:
6279         while (scan < loceol && isSPACE_A((U8) *scan)) {
6280             scan++;
6281         }
6282         break;
6283     case SPACEL:
6284         PL_reg_flags |= RF_tainted;
6285         if (utf8_target) {
6286             loceol = PL_regeol;
6287             while (hardcount < max && scan < loceol &&
6288                    isSPACE_LC_utf8((U8*)scan)) {
6289                 scan += UTF8SKIP(scan);
6290                 hardcount++;
6291             }
6292         } else {
6293             while (scan < loceol && isSPACE_LC(*scan))
6294                 scan++;
6295         }
6296         break;
6297     case NSPACEU:
6298         if (utf8_target) {
6299
6300     utf8_Nspace:
6301
6302             loceol = PL_regeol;
6303             LOAD_UTF8_CHARCLASS_SPACE();
6304             while (hardcount < max && scan < loceol &&
6305                    ! (*scan == ' ' ||
6306                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6307             {
6308                 scan += UTF8SKIP(scan);
6309                 hardcount++;
6310             }
6311             break;
6312         }
6313         else {
6314             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6315                 scan++;
6316             }
6317         }
6318         break;
6319     case NSPACE:
6320         if (utf8_target)
6321             goto utf8_Nspace;
6322
6323         while (scan < loceol && ! isSPACE((U8) *scan)) {
6324             scan++;
6325         }
6326         break;
6327     case NSPACEA:
6328         if (utf8_target) {
6329             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6330                 scan += UTF8SKIP(scan);
6331             }
6332         }
6333         else {
6334             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6335                 scan++;
6336             }
6337         }
6338         break;
6339     case NSPACEL:
6340         PL_reg_flags |= RF_tainted;
6341         if (utf8_target) {
6342             loceol = PL_regeol;
6343             while (hardcount < max && scan < loceol &&
6344                    !isSPACE_LC_utf8((U8*)scan)) {
6345                 scan += UTF8SKIP(scan);
6346                 hardcount++;
6347             }
6348         } else {
6349             while (scan < loceol && !isSPACE_LC(*scan))
6350                 scan++;
6351         }
6352         break;
6353     case DIGIT:
6354         if (utf8_target) {
6355             loceol = PL_regeol;
6356             LOAD_UTF8_CHARCLASS_DIGIT();
6357             while (hardcount < max && scan < loceol &&
6358                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6359                 scan += UTF8SKIP(scan);
6360                 hardcount++;
6361             }
6362         } else {
6363             while (scan < loceol && isDIGIT(*scan))
6364                 scan++;
6365         }
6366         break;
6367     case DIGITA:
6368         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6369             scan++;
6370         }
6371         break;
6372     case DIGITL:
6373         PL_reg_flags |= RF_tainted;
6374         if (utf8_target) {
6375             loceol = PL_regeol;
6376             while (hardcount < max && scan < loceol &&
6377                    isDIGIT_LC_utf8((U8*)scan)) {
6378                 scan += UTF8SKIP(scan);
6379                 hardcount++;
6380             }
6381         } else {
6382             while (scan < loceol && isDIGIT_LC(*scan))
6383                 scan++;
6384         }
6385         break;
6386     case NDIGIT:
6387         if (utf8_target) {
6388             loceol = PL_regeol;
6389             LOAD_UTF8_CHARCLASS_DIGIT();
6390             while (hardcount < max && scan < loceol &&
6391                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6392                 scan += UTF8SKIP(scan);
6393                 hardcount++;
6394             }
6395         } else {
6396             while (scan < loceol && !isDIGIT(*scan))
6397                 scan++;
6398         }
6399         break;
6400     case NDIGITA:
6401         if (utf8_target) {
6402             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6403                 scan += UTF8SKIP(scan);
6404             }
6405         }
6406         else {
6407             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6408                 scan++;
6409             }
6410         }
6411         break;
6412     case NDIGITL:
6413         PL_reg_flags |= RF_tainted;
6414         if (utf8_target) {
6415             loceol = PL_regeol;
6416             while (hardcount < max && scan < loceol &&
6417                    !isDIGIT_LC_utf8((U8*)scan)) {
6418                 scan += UTF8SKIP(scan);
6419                 hardcount++;
6420             }
6421         } else {
6422             while (scan < loceol && !isDIGIT_LC(*scan))
6423                 scan++;
6424         }
6425         break;
6426     case LNBREAK:
6427         if (utf8_target) {
6428             loceol = PL_regeol;
6429             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6430                 scan += c;
6431                 hardcount++;
6432             }
6433         } else {
6434             /*
6435               LNBREAK can match two latin chars, which is ok,
6436               because we have a null terminated string, but we
6437               have to use hardcount in this situation
6438             */
6439             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6440                 scan+=c;
6441                 hardcount++;
6442             }
6443         }
6444         break;
6445     case HORIZWS:
6446         if (utf8_target) {
6447             loceol = PL_regeol;
6448             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6449                 scan += c;
6450                 hardcount++;
6451             }
6452         } else {
6453             while (scan < loceol && is_HORIZWS_latin1(scan))
6454                 scan++;
6455         }
6456         break;
6457     case NHORIZWS:
6458         if (utf8_target) {
6459             loceol = PL_regeol;
6460             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6461                 scan += UTF8SKIP(scan);
6462                 hardcount++;
6463             }
6464         } else {
6465             while (scan < loceol && !is_HORIZWS_latin1(scan))
6466                 scan++;
6467
6468         }
6469         break;
6470     case VERTWS:
6471         if (utf8_target) {
6472             loceol = PL_regeol;
6473             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6474                 scan += c;
6475                 hardcount++;
6476             }
6477         } else {
6478             while (scan < loceol && is_VERTWS_latin1(scan))
6479                 scan++;
6480
6481         }
6482         break;
6483     case NVERTWS:
6484         if (utf8_target) {
6485             loceol = PL_regeol;
6486             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6487                 scan += UTF8SKIP(scan);
6488                 hardcount++;
6489             }
6490         } else {
6491             while (scan < loceol && !is_VERTWS_latin1(scan))
6492                 scan++;
6493
6494         }
6495         break;
6496
6497     default:            /* Called on something of 0 width. */
6498         break;          /* So match right here or not at all. */
6499     }
6500
6501     if (hardcount)
6502         c = hardcount;
6503     else
6504         c = scan - PL_reginput;
6505     PL_reginput = scan;
6506
6507     DEBUG_r({
6508         GET_RE_DEBUG_FLAGS_DECL;
6509         DEBUG_EXECUTE_r({
6510             SV * const prop = sv_newmortal();
6511             regprop(prog, prop, p);
6512             PerlIO_printf(Perl_debug_log,
6513                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6514                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6515         });
6516     });
6517
6518     return(c);
6519 }
6520
6521
6522 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6523 /*
6524 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
6525 create a copy so that changes the caller makes won't change the shared one
6526  */
6527 SV *
6528 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6529 {
6530     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6531     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6532 }
6533 #endif
6534
6535 STATIC SV *
6536 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6537 {
6538     /* Returns the swash for the input 'node' in the regex 'prog'.
6539      * If <doinit> is true, will attempt to create the swash if not already
6540      *    done.
6541      * If <listsvp> is non-null, will return the swash initialization string in
6542      *    it.
6543      * If <altsvp> is non-null, will return the alternates to the regular swash
6544      *    in it
6545      * Tied intimately to how regcomp.c sets up the data structure */
6546
6547     dVAR;
6548     SV *sw  = NULL;
6549     SV *si  = NULL;
6550     SV *alt = NULL;
6551     SV*  invlist = NULL;
6552
6553     RXi_GET_DECL(prog,progi);
6554     const struct reg_data * const data = prog ? progi->data : NULL;
6555
6556     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6557
6558     assert(ANYOF_NONBITMAP(node));
6559
6560     if (data && data->count) {
6561         const U32 n = ARG(node);
6562
6563         if (data->what[n] == 's') {
6564             SV * const rv = MUTABLE_SV(data->data[n]);
6565             AV * const av = MUTABLE_AV(SvRV(rv));
6566             SV **const ary = AvARRAY(av);
6567             bool invlist_has_user_defined_property;
6568
6569             si = *ary;  /* ary[0] = the string to initialize the swash with */
6570
6571             /* Elements 3 and 4 are either both present or both absent. [3] is
6572              * any inversion list generated at compile time; [4] indicates if
6573              * that inversion list has any user-defined properties in it. */
6574             if (av_len(av) >= 3) {
6575                 invlist = ary[3];
6576                 invlist_has_user_defined_property = cBOOL(SvUV(ary[4]));
6577             }
6578             else {
6579                 invlist = NULL;
6580                 invlist_has_user_defined_property = FALSE;
6581             }
6582
6583             /* Element [1] is reserved for the set-up swash.  If already there,
6584              * return it; if not, create it and store it there */
6585             if (SvROK(ary[1])) {
6586                 sw = ary[1];
6587             }
6588             else if (si && doinit) {
6589
6590                 sw = _core_swash_init("utf8", /* the utf8 package */
6591                                       "", /* nameless */
6592                                       si,
6593                                       1, /* binary */
6594                                       0, /* not from tr/// */
6595                                       FALSE, /* is error if can't find
6596                                                 property */
6597                                       invlist,
6598                                       invlist_has_user_defined_property);
6599                 (void)av_store(av, 1, sw);
6600             }
6601
6602             /* Element [2] is for any multi-char folds.  Note that is a
6603              * fundamentally flawed design, because can't backtrack and try
6604              * again.  See [perl #89774] */
6605             if (SvTYPE(ary[2]) == SVt_PVAV) {
6606                 alt = ary[2];
6607             }
6608         }
6609     }
6610
6611     if (listsvp) {
6612         SV* matches_string = newSVpvn("", 0);
6613         SV** invlistsvp;
6614
6615         /* Use the swash, if any, which has to have incorporated into it all
6616          * possibilities */
6617         if (   sw
6618             && SvROK(sw)
6619             && SvTYPE(SvRV(sw)) == SVt_PVHV
6620             && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE)))
6621         {
6622             invlist = *invlistsvp;
6623         }
6624         else if (si && si != &PL_sv_undef) {
6625
6626             /* If no swash, use the input nitialization string, if available */
6627             sv_catsv(matches_string, si);
6628         }
6629
6630         /* Add the inversion list to whatever we have.  This may have come from
6631          * the swash, or from an input parameter */
6632         if (invlist) {
6633             sv_catsv(matches_string, _invlist_contents(invlist));
6634         }
6635         *listsvp = matches_string;
6636     }
6637
6638     if (altsvp)
6639         *altsvp  = alt;
6640
6641     return sw;
6642 }
6643
6644 /*
6645  - reginclass - determine if a character falls into a character class
6646
6647   n is the ANYOF regnode
6648   p is the target string
6649   lenp is pointer to the maximum number of bytes of how far to go in p
6650     (This is assumed wthout checking to always be at least the current
6651     character's size)
6652   utf8_target tells whether p is in UTF-8.
6653
6654   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6655   from a successful match, the value it points to will be updated to how many
6656   bytes in p were matched.  If there was no match, the value is undefined,
6657   possibly changed from the input.
6658
6659   Note that this can be a synthetic start class, a combination of various
6660   nodes, so things you think might be mutually exclusive, such as locale,
6661   aren't.  It can match both locale and non-locale
6662
6663  */
6664
6665 STATIC bool
6666 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6667 {
6668     dVAR;
6669     const char flags = ANYOF_FLAGS(n);
6670     bool match = FALSE;
6671     UV c = *p;
6672     STRLEN c_len = 0;
6673     STRLEN maxlen;
6674
6675     PERL_ARGS_ASSERT_REGINCLASS;
6676
6677     /* If c is not already the code point, get it */
6678     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6679         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6680                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6681                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6682                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6683                  * UTF8_ALLOW_FFFF */
6684         if (c_len == (STRLEN)-1)
6685             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6686     }
6687     else {
6688         c_len = 1;
6689     }
6690
6691     /* Use passed in max length, or one character if none passed in or less
6692      * than one character.  And assume will match just one character.  This is
6693      * overwritten later if matched more. */
6694     if (lenp) {
6695         maxlen = (*lenp > c_len) ? *lenp : c_len;
6696         *lenp = c_len;
6697
6698     }
6699     else {
6700         maxlen = c_len;
6701     }
6702
6703     /* If this character is potentially in the bitmap, check it */
6704     if (c < 256) {
6705         if (ANYOF_BITMAP_TEST(n, c))
6706             match = TRUE;
6707         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6708                 && ! utf8_target
6709                 && ! isASCII(c))
6710         {
6711             match = TRUE;
6712         }
6713
6714         else if (flags & ANYOF_LOCALE) {
6715             PL_reg_flags |= RF_tainted;
6716
6717             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6718                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6719             {
6720                 match = TRUE;
6721             }
6722             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6723                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6724                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6725                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6726                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6727                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6728                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6729                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6730                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6731                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6732                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6733                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
6734                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
6735                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6736                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6737                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6738                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6739                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6740                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6741                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6742                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6743                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6744                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6745                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6746                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6747                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6748                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6749                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6750                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6751                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
6752                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
6753                      ) /* How's that for a conditional? */
6754             ) {
6755                 match = TRUE;
6756             }
6757         }
6758     }
6759
6760     /* If the bitmap didn't (or couldn't) match, and something outside the
6761      * bitmap could match, try that.  Locale nodes specifiy completely the
6762      * behavior of code points in the bit map (otherwise, a utf8 target would
6763      * cause them to be treated as Unicode and not locale), except in
6764      * the very unlikely event when this node is a synthetic start class, which
6765      * could be a combination of locale and non-locale nodes.  So allow locale
6766      * to match for the synthetic start class, which will give a false
6767      * positive that will be resolved when the match is done again as not part
6768      * of the synthetic start class */
6769     if (!match) {
6770         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6771             match = TRUE;       /* Everything above 255 matches */
6772         }
6773         else if (ANYOF_NONBITMAP(n)
6774                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6775                      || (utf8_target
6776                          && (c >=256
6777                              || (! (flags & ANYOF_LOCALE))
6778                              || (flags & ANYOF_IS_SYNTHETIC)))))
6779         {
6780             AV *av;
6781             SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6782
6783             if (sw) {
6784                 U8 * utf8_p;
6785                 if (utf8_target) {
6786                     utf8_p = (U8 *) p;
6787                 } else {
6788
6789                     /* Not utf8.  Convert as much of the string as available up
6790                      * to the limit of how far the (single) character in the
6791                      * pattern can possibly match (no need to go further).  If
6792                      * the node is a straight ANYOF or not folding, it can't
6793                      * match more than one.  Otherwise, It can match up to how
6794                      * far a single char can fold to.  Since not utf8, each
6795                      * character is a single byte, so the max it can be in
6796                      * bytes is the same as the max it can be in characters */
6797                     STRLEN len = (OP(n) == ANYOF
6798                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6799                                   ? 1
6800                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6801                                     ? maxlen
6802                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6803                     utf8_p = bytes_to_utf8(p, &len);
6804                 }
6805
6806                 if (swash_fetch(sw, utf8_p, TRUE))
6807                     match = TRUE;
6808                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6809
6810                     /* Here, we need to test if the fold of the target string
6811                      * matches.  The non-multi char folds have all been moved to
6812                      * the compilation phase, and the multi-char folds have
6813                      * been stored by regcomp into 'av'; we linearly check to
6814                      * see if any match the target string (folded).   We know
6815                      * that the originals were each one character, but we don't
6816                      * currently know how many characters/bytes each folded to,
6817                      * except we do know that there are small limits imposed by
6818                      * Unicode.  XXX A performance enhancement would be to have
6819                      * regcomp.c store the max number of chars/bytes that are
6820                      * in an av entry, as, say the 0th element.  Even better
6821                      * would be to have a hash of the few characters that can
6822                      * start a multi-char fold to the max number of chars of
6823                      * those folds.
6824                      *
6825                      * If there is a match, we will need to advance (if lenp is
6826                      * specified) the match pointer in the target string.  But
6827                      * what we are comparing here isn't that string directly,
6828                      * but its fold, whose length may differ from the original.
6829                      * As we go along in constructing the fold, therefore, we
6830                      * create a map so that we know how many bytes in the
6831                      * source to advance given that we have matched a certain
6832                      * number of bytes in the fold.  This map is stored in
6833                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6834                      * the fold of the first character that we are folding.
6835                      * Then map_fold_len_back[n] is set to the number of bytes
6836                      * in that first character.  Similarly let m be the
6837                      * corresponding number for the second character to be
6838                      * folded.  Then map_fold_len_back[n+m] is set to the
6839                      * number of bytes occupied by the first two source
6840                      * characters. ... */
6841                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6842                     U8 folded[UTF8_MAXBYTES_CASE+1];
6843                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6844                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6845                                                   chars */
6846
6847                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6848
6849                         /* Here, only need to fold the first char of the target
6850                          * string.  It the source wasn't utf8, is 1 byte long */
6851                         to_utf8_fold(utf8_p, folded, &foldlen);
6852                         total_foldlen = foldlen;
6853                         map_fold_len_back[foldlen] = (utf8_target)
6854                                                      ? UTF8SKIP(utf8_p)
6855                                                      : 1;
6856                     }
6857                     else {
6858
6859                         /* Here, need to fold more than the first char.  Do so
6860                          * up to the limits */
6861                         U8* source_ptr = utf8_p;    /* The source for the fold
6862                                                        is the regex target
6863                                                        string */
6864                         U8* folded_ptr = folded;
6865                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6866                                                        available byte in the
6867                                                        target string */
6868                         U8 i;
6869                         for (i = 0;
6870                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6871                              i++)
6872                         {
6873
6874                             /* Fold the next character */
6875                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6876                             STRLEN this_char_foldlen;
6877                             to_utf8_fold(source_ptr,
6878                                          this_char_folded,
6879                                          &this_char_foldlen);
6880
6881                             /* Bail if it would exceed the byte limit for
6882                              * folding a single char. */
6883                             if (this_char_foldlen + folded_ptr - folded >
6884                                                             UTF8_MAXBYTES_CASE)
6885                             {
6886                                 break;
6887                             }
6888
6889                             /* Add the fold of this character */
6890                             Copy(this_char_folded,
6891                                  folded_ptr,
6892                                  this_char_foldlen,
6893                                  U8);
6894                             source_ptr += UTF8SKIP(source_ptr);
6895                             folded_ptr += this_char_foldlen;
6896                             total_foldlen = folded_ptr - folded;
6897
6898                             /* Create map from the number of bytes in the fold
6899                              * back to the number of bytes in the source.  If
6900                              * the source isn't utf8, the byte count is just
6901                              * the number of characters so far */
6902                             map_fold_len_back[total_foldlen]
6903                                                       = (utf8_target)
6904                                                         ? source_ptr - utf8_p
6905                                                         : i + 1;
6906                         }
6907                         *folded_ptr = '\0';
6908                     }
6909
6910
6911                     /* Do the linear search to see if the fold is in the list
6912                      * of multi-char folds. */
6913                     if (av) {
6914                         I32 i;
6915                         for (i = 0; i <= av_len(av); i++) {
6916                             SV* const sv = *av_fetch(av, i, FALSE);
6917                             STRLEN len;
6918                             const char * const s = SvPV_const(sv, len);
6919
6920                             if (len <= total_foldlen
6921                                 && memEQ(s, (char*)folded, len)
6922
6923                                    /* If 0, means matched a partial char. See
6924                                     * [perl #90536] */
6925                                 && map_fold_len_back[len])
6926                             {
6927
6928                                 /* Advance the target string ptr to account for
6929                                  * this fold, but have to translate from the
6930                                  * folded length to the corresponding source
6931                                  * length. */
6932                                 if (lenp) {
6933                                     *lenp = map_fold_len_back[len];
6934                                 }
6935                                 match = TRUE;
6936                                 break;
6937                             }
6938                         }
6939                     }
6940                 }
6941
6942                 /* If we allocated a string above, free it */
6943                 if (! utf8_target) Safefree(utf8_p);
6944             }
6945         }
6946     }
6947
6948     return (flags & ANYOF_INVERT) ? !match : match;
6949 }
6950
6951 STATIC U8 *
6952 S_reghop3(U8 *s, I32 off, const U8* lim)
6953 {
6954     /* return the position 'off' UTF-8 characters away from 's', forward if
6955      * 'off' >= 0, backwards if negative.  But don't go outside of position
6956      * 'lim', which better be < s  if off < 0 */
6957
6958     dVAR;
6959
6960     PERL_ARGS_ASSERT_REGHOP3;
6961
6962     if (off >= 0) {
6963         while (off-- && s < lim) {
6964             /* XXX could check well-formedness here */
6965             s += UTF8SKIP(s);
6966         }
6967     }
6968     else {
6969         while (off++ && s > lim) {
6970             s--;
6971             if (UTF8_IS_CONTINUED(*s)) {
6972                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6973                     s--;
6974             }
6975             /* XXX could check well-formedness here */
6976         }
6977     }
6978     return s;
6979 }
6980
6981 #ifdef XXX_dmq
6982 /* there are a bunch of places where we use two reghop3's that should
6983    be replaced with this routine. but since thats not done yet
6984    we ifdef it out - dmq
6985 */
6986 STATIC U8 *
6987 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6988 {
6989     dVAR;
6990
6991     PERL_ARGS_ASSERT_REGHOP4;
6992
6993     if (off >= 0) {
6994         while (off-- && s < rlim) {
6995             /* XXX could check well-formedness here */
6996             s += UTF8SKIP(s);
6997         }
6998     }
6999     else {
7000         while (off++ && s > llim) {
7001             s--;
7002             if (UTF8_IS_CONTINUED(*s)) {
7003                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7004                     s--;
7005             }
7006             /* XXX could check well-formedness here */
7007         }
7008     }
7009     return s;
7010 }
7011 #endif
7012
7013 STATIC U8 *
7014 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7015 {
7016     dVAR;
7017
7018     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7019
7020     if (off >= 0) {
7021         while (off-- && s < lim) {
7022             /* XXX could check well-formedness here */
7023             s += UTF8SKIP(s);
7024         }
7025         if (off >= 0)
7026             return NULL;
7027     }
7028     else {
7029         while (off++ && s > lim) {
7030             s--;
7031             if (UTF8_IS_CONTINUED(*s)) {
7032                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7033                     s--;
7034             }
7035             /* XXX could check well-formedness here */
7036         }
7037         if (off <= 0)
7038             return NULL;
7039     }
7040     return s;
7041 }
7042
7043 static void
7044 restore_pos(pTHX_ void *arg)
7045 {
7046     dVAR;
7047     regexp * const rex = (regexp *)arg;
7048     if (PL_reg_eval_set) {
7049         if (PL_reg_oldsaved) {
7050             rex->subbeg = PL_reg_oldsaved;
7051             rex->sublen = PL_reg_oldsavedlen;
7052 #ifdef PERL_OLD_COPY_ON_WRITE
7053             rex->saved_copy = PL_nrs;
7054 #endif
7055             RXp_MATCH_COPIED_on(rex);
7056         }
7057         PL_reg_magic->mg_len = PL_reg_oldpos;
7058         PL_reg_eval_set = 0;
7059         PL_curpm = PL_reg_oldcurpm;
7060     }
7061 }
7062
7063 STATIC void
7064 S_to_utf8_substr(pTHX_ register regexp *prog)
7065 {
7066     int i = 1;
7067
7068     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7069
7070     do {
7071         if (prog->substrs->data[i].substr
7072             && !prog->substrs->data[i].utf8_substr) {
7073             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7074             prog->substrs->data[i].utf8_substr = sv;
7075             sv_utf8_upgrade(sv);
7076             if (SvVALID(prog->substrs->data[i].substr)) {
7077                 if (SvTAIL(prog->substrs->data[i].substr)) {
7078                     /* Trim the trailing \n that fbm_compile added last
7079                        time.  */
7080                     SvCUR_set(sv, SvCUR(sv) - 1);
7081                     /* Whilst this makes the SV technically "invalid" (as its
7082                        buffer is no longer followed by "\0") when fbm_compile()
7083                        adds the "\n" back, a "\0" is restored.  */
7084                     fbm_compile(sv, FBMcf_TAIL);
7085                 } else
7086                     fbm_compile(sv, 0);
7087             }
7088             if (prog->substrs->data[i].substr == prog->check_substr)
7089                 prog->check_utf8 = sv;
7090         }
7091     } while (i--);
7092 }
7093
7094 STATIC void
7095 S_to_byte_substr(pTHX_ register regexp *prog)
7096 {
7097     dVAR;
7098     int i = 1;
7099
7100     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7101
7102     do {
7103         if (prog->substrs->data[i].utf8_substr
7104             && !prog->substrs->data[i].substr) {
7105             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7106             if (sv_utf8_downgrade(sv, TRUE)) {
7107                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7108                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7109                         /* Trim the trailing \n that fbm_compile added last
7110                            time.  */
7111                         SvCUR_set(sv, SvCUR(sv) - 1);
7112                         fbm_compile(sv, FBMcf_TAIL);
7113                     } else
7114                         fbm_compile(sv, 0);
7115                 }
7116             } else {
7117                 SvREFCNT_dec(sv);
7118                 sv = &PL_sv_undef;
7119             }
7120             prog->substrs->data[i].substr = sv;
7121             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7122                 prog->check_substr = sv;
7123         }
7124     } while (i--);
7125 }
7126
7127 /*
7128  * Local variables:
7129  * c-indentation-style: bsd
7130  * c-basic-offset: 4
7131  * indent-tabs-mode: t
7132  * End:
7133  *
7134  * ex: set ts=8 sts=4 sw=4 noet:
7135  */