src/5015009/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1       /* tainted information used? e.g. locale */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  99  * call if there are no complications: i.e., if everything matchable is
 100  * straight forward in the bitmap */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102                                               : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 110
 111 #define HOPc(pos,off) \
 112         (char *)(PL_reg_match_utf8 \
 113             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 114             : (U8*)(pos + off))
 115 #define HOPBACKc(pos, off) \
 116         (char*)(PL_reg_match_utf8\
 117             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 118             : (pos - off >= PL_bostr)           \
 119                 ? (U8*)pos - off                \
 120                 : NULL)
 121
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defined */
 126 #ifdef EBCDIC
 127     /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
 128      * skip the check on EBCDIC platforms */
 129 #   define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
 130 #else
 131 #   define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 132     if (!CAT2(PL_utf8_,class)) { \
 133         bool ok; \
 134         ENTER; save_re_context(); \
 135         ok=CAT2(is_utf8_,class)((const U8*)str); \
 136         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 137 #endif
 138
 139 /* Doesn't do an assert to verify that is correct */
 140 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 141     if (!CAT2(PL_utf8_,class)) { \
 142         bool throw_away PERL_UNUSED_DECL; \
 143         ENTER; save_re_context(); \
 144         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 145         LEAVE; } } STMT_END
 146
 147 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 148 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 149 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 150
 151 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 152         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 153         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 154         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 155             * assert should likely and hopefully fail on an EBCDIC machine */ \
 156         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 157                                                                             \
 158         /* No asserts are done for these, in case called on an early        \
 159             * Unicode version in which they map to nothing */               \
 160         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 161         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 162         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 163         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 164         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 165         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 166         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 167
 168 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 169
 170 /* The actual code for CCC_TRY, which uses several variables from the routine
 171  * it's callable from.  It is designed to be the bulk of a case statement.
 172  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 173  *      nextchr matches the class.
 174  * UTF8_TEST is the whole test string to use for utf8 targets
 175  * LOAD is what to use to test, and if not present to load in the swash for the
 176  *      class
 177  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 178  *      UTF8_TEST test.
 179  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 180  * utf8 and a variant, load the swash if necessary and test using the utf8
 181  * test.  Advance to the next character if test is ok, otherwise fail; If not
 182  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 183  * fails, or advance to the next character */
 184
 185 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 186     if (locinput >= PL_regeol) {                                              \
 187         sayNO;                                                                \
 188     }                                                                         \
 189     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 190         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 191         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 192             sayNO;                                                            \
 193         }                                                                     \
 194         locinput += PL_utf8skip[nextchr];                                     \
 195         nextchr = UCHARAT(locinput);                                          \
 196         break;                                                                \
 197     }                                                                         \
 198     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 199         sayNO;                                                                \
 200     }                                                                         \
 201     nextchr = UCHARAT(++locinput);                                            \
 202     break;
 203
 204 /* Handle the non-locale cases for a character class and its complement.  It
 205  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 206  * This is because that code fails when the test succeeds, so we want to have
 207  * the test fail so that the code succeeds.  The swash is stored in a
 208  * predictable PL_ place */
 209 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 210                            CLASS, STR)                                        \
 211     case NAME:                                                                \
 212         _CCC_TRY_CODE( !, FUNC,                                               \
 213                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 214                                             (U8*)locinput, TRUE)),            \
 215                           CLASS, STR)                                         \
 216     case NNAME:                                                               \
 217         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 218                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 219                                             (U8*)locinput, TRUE)),            \
 220                           CLASS, STR)                                         \
 221
 222 /* Generate the case statements for both locale and non-locale character
 223  * classes in regmatch for classes that don't have special unicode semantics.
 224  * Locales don't use an immediate swash, but an intermediary special locale
 225  * function that is called on the pointer to the current place in the input
 226  * string.  That function will resolve to needing the same swash.  One might
 227  * think that because we don't know what the locale will match, we shouldn't
 228  * check with the swash loading function that it loaded properly; ie, that we
 229  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 230  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 231  * irrelevant here */
 232 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 233                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 234                 NAMEA, NNAMEA, FUNCA,                                         \
 235                 CLASS, STR)                                                   \
 236     case NAMEL:                                                               \
 237         PL_reg_flags |= RF_tainted;                                           \
 238         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 239     case NNAMEL:                                                              \
 240         PL_reg_flags |= RF_tainted;                                           \
 241         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 242                        CLASS, STR)                                            \
 243     case NAMEA:                                                               \
 244         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 245             sayNO;                                                            \
 246         }                                                                     \
 247         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 248         nextchr = UCHARAT(++locinput);                                        \
 249         break;                                                                \
 250     case NNAMEA:                                                              \
 251         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 252             sayNO;                                                            \
 253         }                                                                     \
 254         if (utf8_target) {                                                    \
 255             locinput += PL_utf8skip[nextchr];                                 \
 256             nextchr = UCHARAT(locinput);                                      \
 257         }                                                                     \
 258         else {                                                                \
 259             nextchr = UCHARAT(++locinput);                                    \
 260         }                                                                     \
 261         break;                                                                \
 262     /* Generate the non-locale cases */                                       \
 263     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 264
 265 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 266  * statements to handle separate Unicode semantics nodes */
 267 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 268                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 269                   NAMEU, NNAMEU, FUNCU,                                        \
 270                   NAMEA, NNAMEA, FUNCA,                                        \
 271                   CLASS, STR)                                                  \
 272     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 273             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 274             NAMEA, NNAMEA, FUNCA,                                              \
 275             CLASS, STR)                                                        \
 276     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 277
 278 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 279
 280 /* for use after a quantifier and before an EXACT-like node -- japhy */
 281 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 282  *
 283  * NOTE that *nothing* that affects backtracking should be in here, specifically
 284  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 285  * node that is in between two EXACT like nodes when ascertaining what the required
 286  * "follow" character is. This should probably be moved to regex compile time
 287  * although it may be done at run time beause of the REF possibility - more
 288  * investigation required. -- demerphq
 289 */
 290 #define JUMPABLE(rn) (      \
 291     OP(rn) == OPEN ||       \
 292     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 293     OP(rn) == EVAL ||   \
 294     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 295     OP(rn) == PLUS || OP(rn) == MINMOD || \
 296     OP(rn) == KEEPS || \
 297     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 298 )
 299 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 300
 301 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 302
 303 #if 0
 304 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 305    we don't need this definition. */
 306 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 307 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 308 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 309
 310 #else
 311 /* ... so we use this as its faster. */
 312 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 313 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 314 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 315 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 316
 317 #endif
 318
 319 /*
 320   Search for mandatory following text node; for lookahead, the text must
 321   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 322 */
 323 #define FIND_NEXT_IMPT(rn) STMT_START { \
 324     while (JUMPABLE(rn)) { \
 325         const OPCODE type = OP(rn); \
 326         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 327             rn = NEXTOPER(NEXTOPER(rn)); \
 328         else if (type == PLUS) \
 329             rn = NEXTOPER(rn); \
 330         else if (type == IFMATCH) \
 331             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 332         else rn += NEXT_OFF(rn); \
 333     } \
 334 } STMT_END
 335
 336
 337 static void restore_pos(pTHX_ void *arg);
 338
 339 #define REGCP_PAREN_ELEMS 4
 340 #define REGCP_OTHER_ELEMS 5
 341 #define REGCP_FRAME_ELEMS 1
 342 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 343  * are needed for the regexp context stack bookkeeping. */
 344
 345 STATIC CHECKPOINT
 346 S_regcppush(pTHX_ I32 parenfloor)
 347 {
 348     dVAR;
 349     const int retval = PL_savestack_ix;
 350     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 351     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 352     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 353     int p;
 354     GET_RE_DEBUG_FLAGS_DECL;
 355
 356     if (paren_elems_to_push < 0)
 357         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 358                    paren_elems_to_push);
 359
 360     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 361         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 362                    " out of range (%lu-%ld)",
 363                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 364
 365     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 366
 367     for (p = PL_regsize; p > parenfloor; p--) {
 368 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 369         SSPUSHINT(PL_regoffs[p].end);
 370         SSPUSHINT(PL_regoffs[p].start);
 371         SSPUSHPTR(PL_reg_start_tmp[p]);
 372         SSPUSHINT(p);
 373         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 374           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 375                       (UV)p, (IV)PL_regoffs[p].start,
 376                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 377                       (IV)PL_regoffs[p].end
 378         ));
 379     }
 380 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 381     SSPUSHPTR(PL_regoffs);
 382     SSPUSHINT(PL_regsize);
 383     SSPUSHINT(*PL_reglastparen);
 384     SSPUSHINT(*PL_reglastcloseparen);
 385     SSPUSHPTR(PL_reginput);
 386     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 387
 388     return retval;
 389 }
 390
 391 /* These are needed since we do not localize EVAL nodes: */
 392 #define REGCP_SET(cp)                                           \
 393     DEBUG_STATE_r(                                              \
 394             PerlIO_printf(Perl_debug_log,                       \
 395                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 396                 (IV)PL_savestack_ix));                          \
 397     cp = PL_savestack_ix
 398
 399 #define REGCP_UNWIND(cp)                                        \
 400     DEBUG_STATE_r(                                              \
 401         if (cp != PL_savestack_ix)                              \
 402             PerlIO_printf(Perl_debug_log,                       \
 403                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 404                 (IV)(cp), (IV)PL_savestack_ix));                \
 405     regcpblow(cp)
 406
 407 STATIC char *
 408 S_regcppop(pTHX_ const regexp *rex)
 409 {
 410     dVAR;
 411     UV i;
 412     char *input;
 413     GET_RE_DEBUG_FLAGS_DECL;
 414
 415     PERL_ARGS_ASSERT_REGCPPOP;
 416
 417     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 418     i = SSPOPUV;
 419     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 420     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 421     input = (char *) SSPOPPTR;
 422     *PL_reglastcloseparen = SSPOPINT;
 423     *PL_reglastparen = SSPOPINT;
 424     PL_regsize = SSPOPINT;
 425     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 426
 427     i -= REGCP_OTHER_ELEMS;
 428     /* Now restore the parentheses context. */
 429     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 430         I32 tmps;
 431         U32 paren = (U32)SSPOPINT;
 432         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 433         PL_regoffs[paren].start = SSPOPINT;
 434         tmps = SSPOPINT;
 435         if (paren <= *PL_reglastparen)
 436             PL_regoffs[paren].end = tmps;
 437         DEBUG_BUFFERS_r(
 438             PerlIO_printf(Perl_debug_log,
 439                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 440                           (UV)paren, (IV)PL_regoffs[paren].start,
 441                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 442                           (IV)PL_regoffs[paren].end,
 443                           (paren > *PL_reglastparen ? "(no)" : ""));
 444         );
 445     }
 446     DEBUG_BUFFERS_r(
 447         if (*PL_reglastparen + 1 <= rex->nparens) {
 448             PerlIO_printf(Perl_debug_log,
 449                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 450                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 451         }
 452     );
 453 #if 1
 454     /* It would seem that the similar code in regtry()
 455      * already takes care of this, and in fact it is in
 456      * a better location to since this code can #if 0-ed out
 457      * but the code in regtry() is needed or otherwise tests
 458      * requiring null fields (pat.t#187 and split.t#{13,14}
 459      * (as of patchlevel 7877)  will fail.  Then again,
 460      * this code seems to be necessary or otherwise
 461      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 462      * --jhi updated by dapm */
 463     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 464         if (i > PL_regsize)
 465             PL_regoffs[i].start = -1;
 466         PL_regoffs[i].end = -1;
 467     }
 468 #endif
 469     return input;
 470 }
 471
 472 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 473
 474 /*
 475  * pregexec and friends
 476  */
 477
 478 #ifndef PERL_IN_XSUB_RE
 479 /*
 480  - pregexec - match a regexp against a string
 481  */
 482 I32
 483 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 484          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 485 /* strend: pointer to null at end of string */
 486 /* strbeg: real beginning of string */
 487 /* minend: end of match must be >=minend after stringarg. */
 488 /* nosave: For optimizations. */
 489 {
 490     PERL_ARGS_ASSERT_PREGEXEC;
 491
 492     return
 493         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 494                       nosave ? 0 : REXEC_COPY_STR);
 495 }
 496 #endif
 497
 498 /*
 499  * Need to implement the following flags for reg_anch:
 500  *
 501  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 502  * USE_INTUIT_ML
 503  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 504  * INTUIT_AUTORITATIVE_ML
 505  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 506  * INTUIT_ONCE_ML
 507  *
 508  * Another flag for this function: SECOND_TIME (so that float substrs
 509  * with giant delta may be not rechecked).
 510  */
 511
 512 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 513
 514 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 515    Otherwise, only SvCUR(sv) is used to get strbeg. */
 516
 517 /* XXXX We assume that strpos is strbeg unless sv. */
 518
 519 /* XXXX Some places assume that there is a fixed substring.
 520         An update may be needed if optimizer marks as "INTUITable"
 521         RExen without fixed substrings.  Similarly, it is assumed that
 522         lengths of all the strings are no more than minlen, thus they
 523         cannot come from lookahead.
 524         (Or minlen should take into account lookahead.)
 525   NOTE: Some of this comment is not correct. minlen does now take account
 526   of lookahead/behind. Further research is required. -- demerphq
 527
 528 */
 529
 530 /* A failure to find a constant substring means that there is no need to make
 531    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 532    finding a substring too deep into the string means that less calls to
 533    regtry() should be needed.
 534
 535    REx compiler's optimizer found 4 possible hints:
 536         a) Anchored substring;
 537         b) Fixed substring;
 538         c) Whether we are anchored (beginning-of-line or \G);
 539         d) First node (of those at offset 0) which may distinguish positions;
 540    We use a)b)d) and multiline-part of c), and try to find a position in the
 541    string which does not contradict any of them.
 542  */
 543
 544 /* Most of decisions we do here should have been done at compile time.
 545    The nodes of the REx which we used for the search should have been
 546    deleted from the finite automaton. */
 547
 548 char *
 549 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 550                      char *strend, const U32 flags, re_scream_pos_data *data)
 551 {
 552     dVAR;
 553     struct regexp *const prog = (struct regexp *)SvANY(rx);
 554     register I32 start_shift = 0;
 555     /* Should be nonnegative! */
 556     register I32 end_shift   = 0;
 557     register char *s;
 558     register SV *check;
 559     char *strbeg;
 560     char *t;
 561     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 562     I32 ml_anch;
 563     register char *other_last = NULL;   /* other substr checked before this */
 564     char *check_at = NULL;              /* check substr found at this pos */
 565     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 566     RXi_GET_DECL(prog,progi);
 567 #ifdef DEBUGGING
 568     const char * const i_strpos = strpos;
 569 #endif
 570     GET_RE_DEBUG_FLAGS_DECL;
 571
 572     PERL_ARGS_ASSERT_RE_INTUIT_START;
 573
 574     RX_MATCH_UTF8_set(rx,utf8_target);
 575
 576     if (RX_UTF8(rx)) {
 577         PL_reg_flags |= RF_utf8;
 578     }
 579     DEBUG_EXECUTE_r(
 580         debug_start_match(rx, utf8_target, strpos, strend,
 581             sv ? "Guessing start of match in sv for"
 582                : "Guessing start of match in string for");
 583               );
 584
 585     /* CHR_DIST() would be more correct here but it makes things slow. */
 586     if (prog->minlen > strend - strpos) {
 587         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 588                               "String too short... [re_intuit_start]\n"));
 589         goto fail;
 590     }
 591
 592     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 593     PL_regeol = strend;
 594     if (utf8_target) {
 595         if (!prog->check_utf8 && prog->check_substr)
 596             to_utf8_substr(prog);
 597         check = prog->check_utf8;
 598     } else {
 599         if (!prog->check_substr && prog->check_utf8)
 600             to_byte_substr(prog);
 601         check = prog->check_substr;
 602     }
 603     if (check == &PL_sv_undef) {
 604         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 605                 "Non-utf8 string cannot match utf8 check string\n"));
 606         goto fail;
 607     }
 608     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 609         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 610                      || ( (prog->extflags & RXf_ANCH_BOL)
 611                           && !multiline ) );    /* Check after \n? */
 612
 613         if (!ml_anch) {
 614           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 615                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 616                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 617                && sv && !SvROK(sv)
 618                && (strpos != strbeg)) {
 619               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 620               goto fail;
 621           }
 622           if (prog->check_offset_min == prog->check_offset_max &&
 623               !(prog->extflags & RXf_CANY_SEEN)) {
 624             /* Substring at constant offset from beg-of-str... */
 625             I32 slen;
 626
 627             s = HOP3c(strpos, prog->check_offset_min, strend);
 628
 629             if (SvTAIL(check)) {
 630                 slen = SvCUR(check);    /* >= 1 */
 631
 632                 if ( strend - s > slen || strend - s < slen - 1
 633                      || (strend - s == slen && strend[-1] != '\n')) {
 634                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 635                     goto fail_finish;
 636                 }
 637                 /* Now should match s[0..slen-2] */
 638                 slen--;
 639                 if (slen && (*SvPVX_const(check) != *s
 640                              || (slen > 1
 641                                  && memNE(SvPVX_const(check), s, slen)))) {
 642                   report_neq:
 643                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 644                     goto fail_finish;
 645                 }
 646             }
 647             else if (*SvPVX_const(check) != *s
 648                      || ((slen = SvCUR(check)) > 1
 649                          && memNE(SvPVX_const(check), s, slen)))
 650                 goto report_neq;
 651             check_at = s;
 652             goto success_at_start;
 653           }
 654         }
 655         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 656         s = strpos;
 657         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 658         end_shift = prog->check_end_shift;
 659
 660         if (!ml_anch) {
 661             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 662                                          - (SvTAIL(check) != 0);
 663             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 664
 665             if (end_shift < eshift)
 666                 end_shift = eshift;
 667         }
 668     }
 669     else {                              /* Can match at random position */
 670         ml_anch = 0;
 671         s = strpos;
 672         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 673         end_shift = prog->check_end_shift;
 674
 675         /* end shift should be non negative here */
 676     }
 677
 678 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 679     if (end_shift < 0)
 680         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 681                    (IV)end_shift, RX_PRECOMP(prog));
 682 #endif
 683
 684   restart:
 685     /* Find a possible match in the region s..strend by looking for
 686        the "check" substring in the region corrected by start/end_shift. */
 687
 688     {
 689         I32 srch_start_shift = start_shift;
 690         I32 srch_end_shift = end_shift;
 691         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 692             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 693             srch_start_shift = strbeg - s;
 694         }
 695     DEBUG_OPTIMISE_MORE_r({
 696         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 697             (IV)prog->check_offset_min,
 698             (IV)srch_start_shift,
 699             (IV)srch_end_shift,
 700             (IV)prog->check_end_shift);
 701     });
 702
 703     if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
 704         I32 p = -1;                     /* Internal iterator of scream. */
 705         I32 * const pp = data ? data->scream_pos : &p;
 706         const MAGIC *mg;
 707         bool found = FALSE;
 708
 709         assert(SvMAGICAL(sv));
 710         mg = mg_find(sv, PERL_MAGIC_study);
 711         assert(mg);
 712
 713         if (mg->mg_private == 1) {
 714             found = ((U8 *)mg->mg_ptr)[BmRARE(check)] != (U8)~0;
 715         } else if (mg->mg_private == 2) {
 716             found = ((U16 *)mg->mg_ptr)[BmRARE(check)] != (U16)~0;
 717         } else {
 718             assert (mg->mg_private == 4);
 719             found = ((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0;
 720         }
 721
 722         if (found
 723             || ( BmRARE(check) == '\n'
 724                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 725                  && SvTAIL(check) ))
 726             s = screaminstr(sv, check,
 727                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 728         else
 729             goto fail_finish;
 730         /* we may be pointing at the wrong string */
 731         if (s && RXp_MATCH_COPIED(prog))
 732             s = strbeg + (s - SvPVX_const(sv));
 733         if (data)
 734             *data->scream_olds = s;
 735     }
 736     else {
 737         U8* start_point;
 738         U8* end_point;
 739         if (prog->extflags & RXf_CANY_SEEN) {
 740             start_point= (U8*)(s + srch_start_shift);
 741             end_point= (U8*)(strend - srch_end_shift);
 742         } else {
 743             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 744             end_point= HOP3(strend, -srch_end_shift, strbeg);
 745         }
 746         DEBUG_OPTIMISE_MORE_r({
 747             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 748                 (int)(end_point - start_point),
 749                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 750                 start_point);
 751         });
 752
 753         s = fbm_instr( start_point, end_point,
 754                       check, multiline ? FBMrf_MULTILINE : 0);
 755     }
 756     }
 757     /* Update the count-of-usability, remove useless subpatterns,
 758         unshift s.  */
 759
 760     DEBUG_EXECUTE_r({
 761         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 762             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 763         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 764                           (s ? "Found" : "Did not find"),
 765             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 766                 ? "anchored" : "floating"),
 767             quoted,
 768             RE_SV_TAIL(check),
 769             (s ? " at offset " : "...\n") );
 770     });
 771
 772     if (!s)
 773         goto fail_finish;
 774     /* Finish the diagnostic message */
 775     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 776
 777     /* XXX dmq: first branch is for positive lookbehind...
 778        Our check string is offset from the beginning of the pattern.
 779        So we need to do any stclass tests offset forward from that
 780        point. I think. :-(
 781      */
 782
 783
 784
 785     check_at=s;
 786
 787
 788     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 789        Start with the other substr.
 790        XXXX no SCREAM optimization yet - and a very coarse implementation
 791        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 792                 *always* match.  Probably should be marked during compile...
 793        Probably it is right to do no SCREAM here...
 794      */
 795
 796     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 797                 : (prog->float_substr && prog->anchored_substr))
 798     {
 799         /* Take into account the "other" substring. */
 800         /* XXXX May be hopelessly wrong for UTF... */
 801         if (!other_last)
 802             other_last = strpos;
 803         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 804           do_other_anchored:
 805             {
 806                 char * const last = HOP3c(s, -start_shift, strbeg);
 807                 char *last1, *last2;
 808                 char * const saved_s = s;
 809                 SV* must;
 810
 811                 t = s - prog->check_offset_max;
 812                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 813                     && (!utf8_target
 814                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 815                             && t > strpos)))
 816                     NOOP;
 817                 else
 818                     t = strpos;
 819                 t = HOP3c(t, prog->anchored_offset, strend);
 820                 if (t < other_last)     /* These positions already checked */
 821                     t = other_last;
 822                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 823                 if (last < last1)
 824                     last1 = last;
 825                 /* XXXX It is not documented what units *_offsets are in.
 826                    We assume bytes, but this is clearly wrong.
 827                    Meaning this code needs to be carefully reviewed for errors.
 828                    dmq.
 829                   */
 830
 831                 /* On end-of-str: see comment below. */
 832                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 833                 if (must == &PL_sv_undef) {
 834                     s = (char*)NULL;
 835                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 836                 }
 837                 else
 838                     s = fbm_instr(
 839                         (unsigned char*)t,
 840                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 841                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 842                         must,
 843                         multiline ? FBMrf_MULTILINE : 0
 844                     );
 845                 DEBUG_EXECUTE_r({
 846                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 847                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 848                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 849                         (s ? "Found" : "Contradicts"),
 850                         quoted, RE_SV_TAIL(must));
 851                 });
 852
 853
 854                 if (!s) {
 855                     if (last1 >= last2) {
 856                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 857                                                 ", giving up...\n"));
 858                         goto fail_finish;
 859                     }
 860                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 861                         ", trying floating at offset %ld...\n",
 862                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 863                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 864                     s = HOP3c(last, 1, strend);
 865                     goto restart;
 866                 }
 867                 else {
 868                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 869                           (long)(s - i_strpos)));
 870                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 871                     other_last = HOP3c(s, 1, strend);
 872                     s = saved_s;
 873                     if (t == strpos)
 874                         goto try_at_start;
 875                     goto try_at_offset;
 876                 }
 877             }
 878         }
 879         else {          /* Take into account the floating substring. */
 880             char *last, *last1;
 881             char * const saved_s = s;
 882             SV* must;
 883
 884             t = HOP3c(s, -start_shift, strbeg);
 885             last1 = last =
 886                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 887             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 888                 last = HOP3c(t, prog->float_max_offset, strend);
 889             s = HOP3c(t, prog->float_min_offset, strend);
 890             if (s < other_last)
 891                 s = other_last;
 892  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 893             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 894             /* fbm_instr() takes into account exact value of end-of-str
 895                if the check is SvTAIL(ed).  Since false positives are OK,
 896                and end-of-str is not later than strend we are OK. */
 897             if (must == &PL_sv_undef) {
 898                 s = (char*)NULL;
 899                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 900             }
 901             else
 902                 s = fbm_instr((unsigned char*)s,
 903                               (unsigned char*)last + SvCUR(must)
 904                                   - (SvTAIL(must)!=0),
 905                               must, multiline ? FBMrf_MULTILINE : 0);
 906             DEBUG_EXECUTE_r({
 907                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 908                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 909                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 910                     (s ? "Found" : "Contradicts"),
 911                     quoted, RE_SV_TAIL(must));
 912             });
 913             if (!s) {
 914                 if (last1 == last) {
 915                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 916                                             ", giving up...\n"));
 917                     goto fail_finish;
 918                 }
 919                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 920                     ", trying anchored starting at offset %ld...\n",
 921                     (long)(saved_s + 1 - i_strpos)));
 922                 other_last = last;
 923                 s = HOP3c(t, 1, strend);
 924                 goto restart;
 925             }
 926             else {
 927                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 928                       (long)(s - i_strpos)));
 929                 other_last = s; /* Fix this later. --Hugo */
 930                 s = saved_s;
 931                 if (t == strpos)
 932                     goto try_at_start;
 933                 goto try_at_offset;
 934             }
 935         }
 936     }
 937
 938
 939     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 940
 941     DEBUG_OPTIMISE_MORE_r(
 942         PerlIO_printf(Perl_debug_log,
 943             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 944             (IV)prog->check_offset_min,
 945             (IV)prog->check_offset_max,
 946             (IV)(s-strpos),
 947             (IV)(t-strpos),
 948             (IV)(t-s),
 949             (IV)(strend-strpos)
 950         )
 951     );
 952
 953     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 954         && (!utf8_target
 955             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 956                  && t > strpos)))
 957     {
 958         /* Fixed substring is found far enough so that the match
 959            cannot start at strpos. */
 960       try_at_offset:
 961         if (ml_anch && t[-1] != '\n') {
 962             /* Eventually fbm_*() should handle this, but often
 963                anchored_offset is not 0, so this check will not be wasted. */
 964             /* XXXX In the code below we prefer to look for "^" even in
 965                presence of anchored substrings.  And we search even
 966                beyond the found float position.  These pessimizations
 967                are historical artefacts only.  */
 968           find_anchor:
 969             while (t < strend - prog->minlen) {
 970                 if (*t == '\n') {
 971                     if (t < check_at - prog->check_offset_min) {
 972                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 973                             /* Since we moved from the found position,
 974                                we definitely contradict the found anchored
 975                                substr.  Due to the above check we do not
 976                                contradict "check" substr.
 977                                Thus we can arrive here only if check substr
 978                                is float.  Redo checking for "other"=="fixed".
 979                              */
 980                             strpos = t + 1;
 981                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 982                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 983                             goto do_other_anchored;
 984                         }
 985                         /* We don't contradict the found floating substring. */
 986                         /* XXXX Why not check for STCLASS? */
 987                         s = t + 1;
 988                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 989                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 990                         goto set_useful;
 991                     }
 992                     /* Position contradicts check-string */
 993                     /* XXXX probably better to look for check-string
 994                        than for "\n", so one should lower the limit for t? */
 995                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 996                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 997                     other_last = strpos = s = t + 1;
 998                     goto restart;
 999                 }
1000                 t++;
1001             }
1002             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1003                         PL_colors[0], PL_colors[1]));
1004             goto fail_finish;
1005         }
1006         else {
1007             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1008                         PL_colors[0], PL_colors[1]));
1009         }
1010         s = t;
1011       set_useful:
1012         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1013     }
1014     else {
1015         /* The found string does not prohibit matching at strpos,
1016            - no optimization of calling REx engine can be performed,
1017            unless it was an MBOL and we are not after MBOL,
1018            or a future STCLASS check will fail this. */
1019       try_at_start:
1020         /* Even in this situation we may use MBOL flag if strpos is offset
1021            wrt the start of the string. */
1022         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1023             && (strpos != strbeg) && strpos[-1] != '\n'
1024             /* May be due to an implicit anchor of m{.*foo}  */
1025             && !(prog->intflags & PREGf_IMPLICIT))
1026         {
1027             t = strpos;
1028             goto find_anchor;
1029         }
1030         DEBUG_EXECUTE_r( if (ml_anch)
1031             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1032                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1033         );
1034       success_at_start:
1035         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1036             && (utf8_target ? (
1037                 prog->check_utf8                /* Could be deleted already */
1038                 && --BmUSEFUL(prog->check_utf8) < 0
1039                 && (prog->check_utf8 == prog->float_utf8)
1040             ) : (
1041                 prog->check_substr              /* Could be deleted already */
1042                 && --BmUSEFUL(prog->check_substr) < 0
1043                 && (prog->check_substr == prog->float_substr)
1044             )))
1045         {
1046             /* If flags & SOMETHING - do not do it many times on the same match */
1047             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1048             /* XXX Does the destruction order has to change with utf8_target? */
1049             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1050             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1051             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1052             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1053             check = NULL;                       /* abort */
1054             s = strpos;
1055             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1056                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1057             if (prog->intflags & PREGf_IMPLICIT)
1058                 prog->extflags &= ~RXf_ANCH_MBOL;
1059             /* XXXX This is a remnant of the old implementation.  It
1060                     looks wasteful, since now INTUIT can use many
1061                     other heuristics. */
1062             prog->extflags &= ~RXf_USE_INTUIT;
1063             /* XXXX What other flags might need to be cleared in this branch? */
1064         }
1065         else
1066             s = strpos;
1067     }
1068
1069     /* Last resort... */
1070     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1071     /* trie stclasses are too expensive to use here, we are better off to
1072        leave it to regmatch itself */
1073     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1074         /* minlen == 0 is possible if regstclass is \b or \B,
1075            and the fixed substr is ''$.
1076            Since minlen is already taken into account, s+1 is before strend;
1077            accidentally, minlen >= 1 guaranties no false positives at s + 1
1078            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1079            regstclass does not come from lookahead...  */
1080         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1081            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1082         const U8* const str = (U8*)STRING(progi->regstclass);
1083         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1084                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1085                     : 1);
1086         char * endpos;
1087         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1088             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1089         else if (prog->float_substr || prog->float_utf8)
1090             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1091         else
1092             endpos= strend;
1093
1094         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1095                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1096
1097         t = s;
1098         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1099         if (!s) {
1100 #ifdef DEBUGGING
1101             const char *what = NULL;
1102 #endif
1103             if (endpos == strend) {
1104                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1105                                 "Could not match STCLASS...\n") );
1106                 goto fail;
1107             }
1108             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1109                                    "This position contradicts STCLASS...\n") );
1110             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1111                 goto fail;
1112             /* Contradict one of substrings */
1113             if (prog->anchored_substr || prog->anchored_utf8) {
1114                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1115                     DEBUG_EXECUTE_r( what = "anchored" );
1116                   hop_and_restart:
1117                     s = HOP3c(t, 1, strend);
1118                     if (s + start_shift + end_shift > strend) {
1119                         /* XXXX Should be taken into account earlier? */
1120                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1121                                                "Could not match STCLASS...\n") );
1122                         goto fail;
1123                     }
1124                     if (!check)
1125                         goto giveup;
1126                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1127                                 "Looking for %s substr starting at offset %ld...\n",
1128                                  what, (long)(s + start_shift - i_strpos)) );
1129                     goto restart;
1130                 }
1131                 /* Have both, check_string is floating */
1132                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1133                     goto retry_floating_check;
1134                 /* Recheck anchored substring, but not floating... */
1135                 s = check_at;
1136                 if (!check)
1137                     goto giveup;
1138                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1139                           "Looking for anchored substr starting at offset %ld...\n",
1140                           (long)(other_last - i_strpos)) );
1141                 goto do_other_anchored;
1142             }
1143             /* Another way we could have checked stclass at the
1144                current position only: */
1145             if (ml_anch) {
1146                 s = t = t + 1;
1147                 if (!check)
1148                     goto giveup;
1149                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1150                           "Looking for /%s^%s/m starting at offset %ld...\n",
1151                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1152                 goto try_at_offset;
1153             }
1154             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1155                 goto fail;
1156             /* Check is floating substring. */
1157           retry_floating_check:
1158             t = check_at - start_shift;
1159             DEBUG_EXECUTE_r( what = "floating" );
1160             goto hop_and_restart;
1161         }
1162         if (t != s) {
1163             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1164                         "By STCLASS: moving %ld --> %ld\n",
1165                                   (long)(t - i_strpos), (long)(s - i_strpos))
1166                    );
1167         }
1168         else {
1169             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1170                                   "Does not contradict STCLASS...\n");
1171                    );
1172         }
1173     }
1174   giveup:
1175     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1176                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1177                           PL_colors[5], (long)(s - i_strpos)) );
1178     return s;
1179
1180   fail_finish:                          /* Substring not found */
1181     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1182         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1183   fail:
1184     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1185                           PL_colors[4], PL_colors[5]));
1186     return NULL;
1187 }
1188
1189 #define DECL_TRIE_TYPE(scan) \
1190     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1191                     trie_type = ((scan->flags == EXACT) \
1192                               ? (utf8_target ? trie_utf8 : trie_plain) \
1193                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1194
1195 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1196 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1197     STRLEN skiplen;                                                                 \
1198     switch (trie_type) {                                                            \
1199     case trie_utf8_fold:                                                            \
1200         if ( foldlen>0 ) {                                                          \
1201             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1202             foldlen -= len;                                                         \
1203             uscan += len;                                                           \
1204             len=0;                                                                  \
1205         } else {                                                                    \
1206             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1207             len = UTF8SKIP(uc);                                                     \
1208             skiplen = UNISKIP( uvc );                                               \
1209             foldlen -= skiplen;                                                     \
1210             uscan = foldbuf + skiplen;                                              \
1211         }                                                                           \
1212         break;                                                                      \
1213     case trie_latin_utf8_fold:                                                      \
1214         if ( foldlen>0 ) {                                                          \
1215             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1216             foldlen -= len;                                                         \
1217             uscan += len;                                                           \
1218             len=0;                                                                  \
1219         } else {                                                                    \
1220             len = 1;                                                                \
1221             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1222             skiplen = UNISKIP( uvc );                                               \
1223             foldlen -= skiplen;                                                     \
1224             uscan = foldbuf + skiplen;                                              \
1225         }                                                                           \
1226         break;                                                                      \
1227     case trie_utf8:                                                                 \
1228         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1229         break;                                                                      \
1230     case trie_plain:                                                                \
1231         uvc = (UV)*uc;                                                              \
1232         len = 1;                                                                    \
1233     }                                                                               \
1234     if (uvc < 256) {                                                                \
1235         charid = trie->charmap[ uvc ];                                              \
1236     }                                                                               \
1237     else {                                                                          \
1238         charid = 0;                                                                 \
1239         if (widecharmap) {                                                          \
1240             SV** const svpp = hv_fetch(widecharmap,                                 \
1241                         (char*)&uvc, sizeof(UV), 0);                                \
1242             if (svpp)                                                               \
1243                 charid = (U16)SvIV(*svpp);                                          \
1244         }                                                                           \
1245     }                                                                               \
1246 } STMT_END
1247
1248 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1249 STMT_START {                                              \
1250     while (s <= e) {                                      \
1251         if ( (CoNd)                                       \
1252              && (ln == 1 || folder(s, pat_string, ln))    \
1253              && (!reginfo || regtry(reginfo, &s)) )       \
1254             goto got_it;                                  \
1255         s++;                                              \
1256     }                                                     \
1257 } STMT_END
1258
1259 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1260 STMT_START {                                          \
1261     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1262         CoDe                                          \
1263         s += uskip;                                   \
1264     }                                                 \
1265 } STMT_END
1266
1267 #define REXEC_FBC_SCAN(CoDe)                          \
1268 STMT_START {                                          \
1269     while (s < strend) {                              \
1270         CoDe                                          \
1271         s++;                                          \
1272     }                                                 \
1273 } STMT_END
1274
1275 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1276 REXEC_FBC_UTF8_SCAN(                                  \
1277     if (CoNd) {                                       \
1278         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1279             goto got_it;                              \
1280         else                                          \
1281             tmp = doevery;                            \
1282     }                                                 \
1283     else                                              \
1284         tmp = 1;                                      \
1285 )
1286
1287 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1288 REXEC_FBC_SCAN(                                       \
1289     if (CoNd) {                                       \
1290         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1291             goto got_it;                              \
1292         else                                          \
1293             tmp = doevery;                            \
1294     }                                                 \
1295     else                                              \
1296         tmp = 1;                                      \
1297 )
1298
1299 #define REXEC_FBC_TRYIT               \
1300 if ((!reginfo || regtry(reginfo, &s))) \
1301     goto got_it
1302
1303 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1304     if (utf8_target) {                                             \
1305         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1306     }                                                          \
1307     else {                                                     \
1308         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1309     }
1310
1311 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1312     if (utf8_target) {                                             \
1313         UtFpReLoAd;                                            \
1314         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1315     }                                                          \
1316     else {                                                     \
1317         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1318     }
1319
1320 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1321     PL_reg_flags |= RF_tainted;                                \
1322     if (utf8_target) {                                             \
1323         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1324     }                                                          \
1325     else {                                                     \
1326         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1327     }
1328
1329 #define DUMP_EXEC_POS(li,s,doutf8) \
1330     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1331
1332
1333 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1334         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1335         tmp = TEST_NON_UTF8(tmp);                                              \
1336         REXEC_FBC_UTF8_SCAN(                                                   \
1337             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1338                 tmp = !tmp;                                                    \
1339                 IF_SUCCESS;                                                    \
1340             }                                                                  \
1341             else {                                                             \
1342                 IF_FAIL;                                                       \
1343             }                                                                  \
1344         );                                                                     \
1345
1346 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1347         if (s == PL_bostr) {                                                   \
1348             tmp = '\n';                                                        \
1349         }                                                                      \
1350         else {                                                                 \
1351             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1352             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1353         }                                                                      \
1354         tmp = TeSt1_UtF8;                                                      \
1355         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1356         REXEC_FBC_UTF8_SCAN(                                                   \
1357             if (tmp == ! (TeSt2_UtF8)) { \
1358                 tmp = !tmp;                                                    \
1359                 IF_SUCCESS;                                                    \
1360             }                                                                  \
1361             else {                                                             \
1362                 IF_FAIL;                                                       \
1363             }                                                                  \
1364         );                                                                     \
1365
1366 /* The only difference between the BOUND and NBOUND cases is that
1367  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1368  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1369  * with the other one being empty */
1370 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1371     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1372
1373 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1374     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1375
1376 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1377     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1378
1379 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1380     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1381
1382
1383 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1384  * be passed in completely with the variable name being tested, which isn't
1385  * such a clean interface, but this is easier to read than it was before.  We
1386  * are looking for the boundary (or non-boundary between a word and non-word
1387  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1388  * must be different.  Find the "wordness" of the character just prior to this
1389  * one, and compare it with the wordness of this one.  If they differ, we have
1390  * a boundary.  At the beginning of the string, pretend that the previous
1391  * character was a new-line */
1392 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1393     if (utf8_target) {                                                         \
1394                 UTF8_CODE \
1395     }                                                                          \
1396     else {  /* Not utf8 */                                                     \
1397         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1398         tmp = TEST_NON_UTF8(tmp);                                              \
1399         REXEC_FBC_SCAN(                                                        \
1400             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1401                 tmp = !tmp;                                                    \
1402                 IF_SUCCESS;                                                    \
1403             }                                                                  \
1404             else {                                                             \
1405                 IF_FAIL;                                                       \
1406             }                                                                  \
1407         );                                                                     \
1408     }                                                                          \
1409     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1410         goto got_it;
1411
1412 /* We know what class REx starts with.  Try to find this position... */
1413 /* if reginfo is NULL, its a dryrun */
1414 /* annoyingly all the vars in this routine have different names from their counterparts
1415    in regmatch. /grrr */
1416
1417 STATIC char *
1418 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1419     const char *strend, regmatch_info *reginfo)
1420 {
1421         dVAR;
1422         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1423         char *pat_string;   /* The pattern's exactish string */
1424         char *pat_end;      /* ptr to end char of pat_string */
1425         re_fold_t folder;       /* Function for computing non-utf8 folds */
1426         const U8 *fold_array;   /* array for folding ords < 256 */
1427         STRLEN ln;
1428         STRLEN lnc;
1429         register STRLEN uskip;
1430         U8 c1;
1431         U8 c2;
1432         char *e;
1433         register I32 tmp = 1;   /* Scratch variable? */
1434         register const bool utf8_target = PL_reg_match_utf8;
1435         UV utf8_fold_flags = 0;
1436         RXi_GET_DECL(prog,progi);
1437
1438         PERL_ARGS_ASSERT_FIND_BYCLASS;
1439
1440         /* We know what class it must start with. */
1441         switch (OP(c)) {
1442         case ANYOFV:
1443         case ANYOF:
1444             if (utf8_target || OP(c) == ANYOFV) {
1445                 STRLEN inclasslen = strend - s;
1446                 REXEC_FBC_UTF8_CLASS_SCAN(
1447                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1448             }
1449             else {
1450                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1451             }
1452             break;
1453         case CANY:
1454             REXEC_FBC_SCAN(
1455                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1456                     goto got_it;
1457                 else
1458                     tmp = doevery;
1459             );
1460             break;
1461
1462         case EXACTFA:
1463             if (UTF_PATTERN || utf8_target) {
1464                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1465                 goto do_exactf_utf8;
1466             }
1467             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1468             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1469             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1470
1471         case EXACTF:
1472             if (utf8_target) {
1473
1474                 /* regcomp.c already folded this if pattern is in UTF-8 */
1475                 utf8_fold_flags = 0;
1476                 goto do_exactf_utf8;
1477             }
1478             fold_array = PL_fold;
1479             folder = foldEQ;
1480             goto do_exactf_non_utf8;
1481
1482         case EXACTFL:
1483             if (UTF_PATTERN || utf8_target) {
1484                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1485                 goto do_exactf_utf8;
1486             }
1487             fold_array = PL_fold_locale;
1488             folder = foldEQ_locale;
1489             goto do_exactf_non_utf8;
1490
1491         case EXACTFU_SS:
1492             if (UTF_PATTERN) {
1493                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1494             }
1495             goto do_exactf_utf8;
1496
1497         case EXACTFU_TRICKYFOLD:
1498         case EXACTFU:
1499             if (UTF_PATTERN || utf8_target) {
1500                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1501                 goto do_exactf_utf8;
1502             }
1503
1504             /* Any 'ss' in the pattern should have been replaced by regcomp,
1505              * so we don't have to worry here about this single special case
1506              * in the Latin1 range */
1507             fold_array = PL_fold_latin1;
1508             folder = foldEQ_latin1;
1509
1510             /* FALL THROUGH */
1511
1512         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1513                                are no glitches with fold-length differences
1514                                between the target string and pattern */
1515
1516             /* The idea in the non-utf8 EXACTF* cases is to first find the
1517              * first character of the EXACTF* node and then, if necessary,
1518              * case-insensitively compare the full text of the node.  c1 is the
1519              * first character.  c2 is its fold.  This logic will not work for
1520              * Unicode semantics and the german sharp ss, which hence should
1521              * not be compiled into a node that gets here. */
1522             pat_string = STRING(c);
1523             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1524
1525             /* We know that we have to match at least 'ln' bytes (which is the
1526              * same as characters, since not utf8).  If we have to match 3
1527              * characters, and there are only 2 availabe, we know without
1528              * trying that it will fail; so don't start a match past the
1529              * required minimum number from the far end */
1530             e = HOP3c(strend, -((I32)ln), s);
1531
1532             if (!reginfo && e < s) {
1533                 e = s;                  /* Due to minlen logic of intuit() */
1534             }
1535
1536             c1 = *pat_string;
1537             c2 = fold_array[c1];
1538             if (c1 == c2) { /* If char and fold are the same */
1539                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1540             }
1541             else {
1542                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1543             }
1544             break;
1545
1546         do_exactf_utf8:
1547         {
1548             unsigned expansion;
1549
1550
1551             /* If one of the operands is in utf8, we can't use the simpler
1552              * folding above, due to the fact that many different characters
1553              * can have the same fold, or portion of a fold, or different-
1554              * length fold */
1555             pat_string = STRING(c);
1556             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1557             pat_end = pat_string + ln;
1558             lnc = (UTF_PATTERN) /* length to match in characters */
1559                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1560                     : ln;
1561
1562             /* We have 'lnc' characters to match in the pattern, but because of
1563              * multi-character folding, each character in the target can match
1564              * up to 3 characters (Unicode guarantees it will never exceed
1565              * this) if it is utf8-encoded; and up to 2 if not (based on the
1566              * fact that the Latin 1 folds are already determined, and the
1567              * only multi-char fold in that range is the sharp-s folding to
1568              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1569              * string character.  Adjust lnc accordingly, rounding up, so that
1570              * if we need to match at least 4+1/3 chars, that really is 5. */
1571             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1572             lnc = (lnc + expansion - 1) / expansion;
1573
1574             /* As in the non-UTF8 case, if we have to match 3 characters, and
1575              * only 2 are left, it's guaranteed to fail, so don't start a
1576              * match that would require us to go beyond the end of the string
1577              */
1578             e = HOP3c(strend, -((I32)lnc), s);
1579
1580             if (!reginfo && e < s) {
1581                 e = s;                  /* Due to minlen logic of intuit() */
1582             }
1583
1584             /* XXX Note that we could recalculate e to stop the loop earlier,
1585              * as the worst case expansion above will rarely be met, and as we
1586              * go along we would usually find that e moves further to the left.
1587              * This would happen only after we reached the point in the loop
1588              * where if there were no expansion we should fail.  Unclear if
1589              * worth the expense */
1590
1591             while (s <= e) {
1592                 char *my_strend= (char *)strend;
1593                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1594                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1595                     && (!reginfo || regtry(reginfo, &s)) )
1596                 {
1597                     goto got_it;
1598                 }
1599                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1600             }
1601             break;
1602         }
1603         case BOUNDL:
1604             PL_reg_flags |= RF_tainted;
1605             FBC_BOUND(isALNUM_LC,
1606                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1607                       isALNUM_LC_utf8((U8*)s));
1608             break;
1609         case NBOUNDL:
1610             PL_reg_flags |= RF_tainted;
1611             FBC_NBOUND(isALNUM_LC,
1612                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1613                        isALNUM_LC_utf8((U8*)s));
1614             break;
1615         case BOUND:
1616             FBC_BOUND(isWORDCHAR,
1617                       isALNUM_uni(tmp),
1618                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1619             break;
1620         case BOUNDA:
1621             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1622                              isWORDCHAR_A(tmp),
1623                              isWORDCHAR_A((U8*)s));
1624             break;
1625         case NBOUND:
1626             FBC_NBOUND(isWORDCHAR,
1627                        isALNUM_uni(tmp),
1628                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1629             break;
1630         case NBOUNDA:
1631             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1632                               isWORDCHAR_A(tmp),
1633                               isWORDCHAR_A((U8*)s));
1634             break;
1635         case BOUNDU:
1636             FBC_BOUND(isWORDCHAR_L1,
1637                       isALNUM_uni(tmp),
1638                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1639             break;
1640         case NBOUNDU:
1641             FBC_NBOUND(isWORDCHAR_L1,
1642                        isALNUM_uni(tmp),
1643                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1644             break;
1645         case ALNUML:
1646             REXEC_FBC_CSCAN_TAINT(
1647                 isALNUM_LC_utf8((U8*)s),
1648                 isALNUM_LC(*s)
1649             );
1650             break;
1651         case ALNUMU:
1652             REXEC_FBC_CSCAN_PRELOAD(
1653                 LOAD_UTF8_CHARCLASS_ALNUM(),
1654                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1655                 isWORDCHAR_L1((U8) *s)
1656             );
1657             break;
1658         case ALNUM:
1659             REXEC_FBC_CSCAN_PRELOAD(
1660                 LOAD_UTF8_CHARCLASS_ALNUM(),
1661                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1662                 isWORDCHAR((U8) *s)
1663             );
1664             break;
1665         case ALNUMA:
1666             /* Don't need to worry about utf8, as it can match only a single
1667              * byte invariant character */
1668             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1669             break;
1670         case NALNUMU:
1671             REXEC_FBC_CSCAN_PRELOAD(
1672                 LOAD_UTF8_CHARCLASS_ALNUM(),
1673                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1674                 ! isWORDCHAR_L1((U8) *s)
1675             );
1676             break;
1677         case NALNUM:
1678             REXEC_FBC_CSCAN_PRELOAD(
1679                 LOAD_UTF8_CHARCLASS_ALNUM(),
1680                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1681                 ! isALNUM(*s)
1682             );
1683             break;
1684         case NALNUMA:
1685             REXEC_FBC_CSCAN(
1686                 !isWORDCHAR_A(*s),
1687                 !isWORDCHAR_A(*s)
1688             );
1689             break;
1690         case NALNUML:
1691             REXEC_FBC_CSCAN_TAINT(
1692                 !isALNUM_LC_utf8((U8*)s),
1693                 !isALNUM_LC(*s)
1694             );
1695             break;
1696         case SPACEU:
1697             REXEC_FBC_CSCAN_PRELOAD(
1698                 LOAD_UTF8_CHARCLASS_SPACE(),
1699                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1700                 isSPACE_L1((U8) *s)
1701             );
1702             break;
1703         case SPACE:
1704             REXEC_FBC_CSCAN_PRELOAD(
1705                 LOAD_UTF8_CHARCLASS_SPACE(),
1706                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1707                 isSPACE((U8) *s)
1708             );
1709             break;
1710         case SPACEA:
1711             /* Don't need to worry about utf8, as it can match only a single
1712              * byte invariant character */
1713             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1714             break;
1715         case SPACEL:
1716             REXEC_FBC_CSCAN_TAINT(
1717                 isSPACE_LC_utf8((U8*)s),
1718                 isSPACE_LC(*s)
1719             );
1720             break;
1721         case NSPACEU:
1722             REXEC_FBC_CSCAN_PRELOAD(
1723                 LOAD_UTF8_CHARCLASS_SPACE(),
1724                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1725                 ! isSPACE_L1((U8) *s)
1726             );
1727             break;
1728         case NSPACE:
1729             REXEC_FBC_CSCAN_PRELOAD(
1730                 LOAD_UTF8_CHARCLASS_SPACE(),
1731                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1732                 ! isSPACE((U8) *s)
1733             );
1734             break;
1735         case NSPACEA:
1736             REXEC_FBC_CSCAN(
1737                 !isSPACE_A(*s),
1738                 !isSPACE_A(*s)
1739             );
1740             break;
1741         case NSPACEL:
1742             REXEC_FBC_CSCAN_TAINT(
1743                 !isSPACE_LC_utf8((U8*)s),
1744                 !isSPACE_LC(*s)
1745             );
1746             break;
1747         case DIGIT:
1748             REXEC_FBC_CSCAN_PRELOAD(
1749                 LOAD_UTF8_CHARCLASS_DIGIT(),
1750                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1751                 isDIGIT(*s)
1752             );
1753             break;
1754         case DIGITA:
1755             /* Don't need to worry about utf8, as it can match only a single
1756              * byte invariant character */
1757             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1758             break;
1759         case DIGITL:
1760             REXEC_FBC_CSCAN_TAINT(
1761                 isDIGIT_LC_utf8((U8*)s),
1762                 isDIGIT_LC(*s)
1763             );
1764             break;
1765         case NDIGIT:
1766             REXEC_FBC_CSCAN_PRELOAD(
1767                 LOAD_UTF8_CHARCLASS_DIGIT(),
1768                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1769                 !isDIGIT(*s)
1770             );
1771             break;
1772         case NDIGITA:
1773             REXEC_FBC_CSCAN(
1774                 !isDIGIT_A(*s),
1775                 !isDIGIT_A(*s)
1776             );
1777             break;
1778         case NDIGITL:
1779             REXEC_FBC_CSCAN_TAINT(
1780                 !isDIGIT_LC_utf8((U8*)s),
1781                 !isDIGIT_LC(*s)
1782             );
1783             break;
1784         case LNBREAK:
1785             REXEC_FBC_CSCAN(
1786                 is_LNBREAK_utf8(s),
1787                 is_LNBREAK_latin1(s)
1788             );
1789             break;
1790         case VERTWS:
1791             REXEC_FBC_CSCAN(
1792                 is_VERTWS_utf8(s),
1793                 is_VERTWS_latin1(s)
1794             );
1795             break;
1796         case NVERTWS:
1797             REXEC_FBC_CSCAN(
1798                 !is_VERTWS_utf8(s),
1799                 !is_VERTWS_latin1(s)
1800             );
1801             break;
1802         case HORIZWS:
1803             REXEC_FBC_CSCAN(
1804                 is_HORIZWS_utf8(s),
1805                 is_HORIZWS_latin1(s)
1806             );
1807             break;
1808         case NHORIZWS:
1809             REXEC_FBC_CSCAN(
1810                 !is_HORIZWS_utf8(s),
1811                 !is_HORIZWS_latin1(s)
1812             );
1813             break;
1814         case AHOCORASICKC:
1815         case AHOCORASICK:
1816             {
1817                 DECL_TRIE_TYPE(c);
1818                 /* what trie are we using right now */
1819                 reg_ac_data *aho
1820                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1821                 reg_trie_data *trie
1822                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1823                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1824
1825                 const char *last_start = strend - trie->minlen;
1826 #ifdef DEBUGGING
1827                 const char *real_start = s;
1828 #endif
1829                 STRLEN maxlen = trie->maxlen;
1830                 SV *sv_points;
1831                 U8 **points; /* map of where we were in the input string
1832                                 when reading a given char. For ASCII this
1833                                 is unnecessary overhead as the relationship
1834                                 is always 1:1, but for Unicode, especially
1835                                 case folded Unicode this is not true. */
1836                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1837                 U8 *bitmap=NULL;
1838
1839
1840                 GET_RE_DEBUG_FLAGS_DECL;
1841
1842                 /* We can't just allocate points here. We need to wrap it in
1843                  * an SV so it gets freed properly if there is a croak while
1844                  * running the match */
1845                 ENTER;
1846                 SAVETMPS;
1847                 sv_points=newSV(maxlen * sizeof(U8 *));
1848                 SvCUR_set(sv_points,
1849                     maxlen * sizeof(U8 *));
1850                 SvPOK_on(sv_points);
1851                 sv_2mortal(sv_points);
1852                 points=(U8**)SvPV_nolen(sv_points );
1853                 if ( trie_type != trie_utf8_fold
1854                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1855                 {
1856                     if (trie->bitmap)
1857                         bitmap=(U8*)trie->bitmap;
1858                     else
1859                         bitmap=(U8*)ANYOF_BITMAP(c);
1860                 }
1861                 /* this is the Aho-Corasick algorithm modified a touch
1862                    to include special handling for long "unknown char"
1863                    sequences. The basic idea being that we use AC as long
1864                    as we are dealing with a possible matching char, when
1865                    we encounter an unknown char (and we have not encountered
1866                    an accepting state) we scan forward until we find a legal
1867                    starting char.
1868                    AC matching is basically that of trie matching, except
1869                    that when we encounter a failing transition, we fall back
1870                    to the current states "fail state", and try the current char
1871                    again, a process we repeat until we reach the root state,
1872                    state 1, or a legal transition. If we fail on the root state
1873                    then we can either terminate if we have reached an accepting
1874                    state previously, or restart the entire process from the beginning
1875                    if we have not.
1876
1877                  */
1878                 while (s <= last_start) {
1879                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1880                     U8 *uc = (U8*)s;
1881                     U16 charid = 0;
1882                     U32 base = 1;
1883                     U32 state = 1;
1884                     UV uvc = 0;
1885                     STRLEN len = 0;
1886                     STRLEN foldlen = 0;
1887                     U8 *uscan = (U8*)NULL;
1888                     U8 *leftmost = NULL;
1889 #ifdef DEBUGGING
1890                     U32 accepted_word= 0;
1891 #endif
1892                     U32 pointpos = 0;
1893
1894                     while ( state && uc <= (U8*)strend ) {
1895                         int failed=0;
1896                         U32 word = aho->states[ state ].wordnum;
1897
1898                         if( state==1 ) {
1899                             if ( bitmap ) {
1900                                 DEBUG_TRIE_EXECUTE_r(
1901                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1902                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1903                                             (char *)uc, utf8_target );
1904                                         PerlIO_printf( Perl_debug_log,
1905                                             " Scanning for legal start char...\n");
1906                                     }
1907                                 );
1908                                 if (utf8_target) {
1909                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1910                                         uc += UTF8SKIP(uc);
1911                                     }
1912                                 } else {
1913                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1914                                         uc++;
1915                                     }
1916                                 }
1917                                 s= (char *)uc;
1918                             }
1919                             if (uc >(U8*)last_start) break;
1920                         }
1921
1922                         if ( word ) {
1923                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1924                             if (!leftmost || lpos < leftmost) {
1925                                 DEBUG_r(accepted_word=word);
1926                                 leftmost= lpos;
1927                             }
1928                             if (base==0) break;
1929
1930                         }
1931                         points[pointpos++ % maxlen]= uc;
1932                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1933                                              uscan, len, uvc, charid, foldlen,
1934                                              foldbuf, uniflags);
1935                         DEBUG_TRIE_EXECUTE_r({
1936                             dump_exec_pos( (char *)uc, c, strend, real_start,
1937                                 s,   utf8_target );
1938                             PerlIO_printf(Perl_debug_log,
1939                                 " Charid:%3u CP:%4"UVxf" ",
1940                                  charid, uvc);
1941                         });
1942
1943                         do {
1944 #ifdef DEBUGGING
1945                             word = aho->states[ state ].wordnum;
1946 #endif
1947                             base = aho->states[ state ].trans.base;
1948
1949                             DEBUG_TRIE_EXECUTE_r({
1950                                 if (failed)
1951                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1952                                         s,   utf8_target );
1953                                 PerlIO_printf( Perl_debug_log,
1954                                     "%sState: %4"UVxf", word=%"UVxf,
1955                                     failed ? " Fail transition to " : "",
1956                                     (UV)state, (UV)word);
1957                             });
1958                             if ( base ) {
1959                                 U32 tmp;
1960                                 I32 offset;
1961                                 if (charid &&
1962                                      ( ((offset = base + charid
1963                                         - 1 - trie->uniquecharcount)) >= 0)
1964                                      && ((U32)offset < trie->lasttrans)
1965                                      && trie->trans[offset].check == state
1966                                      && (tmp=trie->trans[offset].next))
1967                                 {
1968                                     DEBUG_TRIE_EXECUTE_r(
1969                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1970                                     state = tmp;
1971                                     break;
1972                                 }
1973                                 else {
1974                                     DEBUG_TRIE_EXECUTE_r(
1975                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1976                                     failed = 1;
1977                                     state = aho->fail[state];
1978                                 }
1979                             }
1980                             else {
1981                                 /* we must be accepting here */
1982                                 DEBUG_TRIE_EXECUTE_r(
1983                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1984                                 failed = 1;
1985                                 break;
1986                             }
1987                         } while(state);
1988                         uc += len;
1989                         if (failed) {
1990                             if (leftmost)
1991                                 break;
1992                             if (!state) state = 1;
1993                         }
1994                     }
1995                     if ( aho->states[ state ].wordnum ) {
1996                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1997                         if (!leftmost || lpos < leftmost) {
1998                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1999                             leftmost = lpos;
2000                         }
2001                     }
2002                     if (leftmost) {
2003                         s = (char*)leftmost;
2004                         DEBUG_TRIE_EXECUTE_r({
2005                             PerlIO_printf(
2006                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2007                                 (UV)accepted_word, (IV)(s - real_start)
2008                             );
2009                         });
2010                         if (!reginfo || regtry(reginfo, &s)) {
2011                             FREETMPS;
2012                             LEAVE;
2013                             goto got_it;
2014                         }
2015                         s = HOPc(s,1);
2016                         DEBUG_TRIE_EXECUTE_r({
2017                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2018                         });
2019                     } else {
2020                         DEBUG_TRIE_EXECUTE_r(
2021                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2022                         break;
2023                     }
2024                 }
2025                 FREETMPS;
2026                 LEAVE;
2027             }
2028             break;
2029         default:
2030             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2031             break;
2032         }
2033         return 0;
2034       got_it:
2035         return s;
2036 }
2037
2038
2039 /*
2040  - regexec_flags - match a regexp against a string
2041  */
2042 I32
2043 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2044               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2045 /* strend: pointer to null at end of string */
2046 /* strbeg: real beginning of string */
2047 /* minend: end of match must be >=minend after stringarg. */
2048 /* data: May be used for some additional optimizations.
2049          Currently its only used, with a U32 cast, for transmitting
2050          the ganch offset when doing a /g match. This will change */
2051 /* nosave: For optimizations. */
2052 {
2053     dVAR;
2054     struct regexp *const prog = (struct regexp *)SvANY(rx);
2055     /*register*/ char *s;
2056     register regnode *c;
2057     /*register*/ char *startpos = stringarg;
2058     I32 minlen;         /* must match at least this many chars */
2059     I32 dontbother = 0; /* how many characters not to try at end */
2060     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2061     I32 scream_pos = -1;                /* Internal iterator of scream. */
2062     char *scream_olds = NULL;
2063     const bool utf8_target = cBOOL(DO_UTF8(sv));
2064     I32 multiline;
2065     RXi_GET_DECL(prog,progi);
2066     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2067     regexp_paren_pair *swap = NULL;
2068     GET_RE_DEBUG_FLAGS_DECL;
2069
2070     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2071     PERL_UNUSED_ARG(data);
2072
2073     /* Be paranoid... */
2074     if (prog == NULL || startpos == NULL) {
2075         Perl_croak(aTHX_ "NULL regexp parameter");
2076         return 0;
2077     }
2078
2079     multiline = prog->extflags & RXf_PMf_MULTILINE;
2080     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2081
2082     RX_MATCH_UTF8_set(rx, utf8_target);
2083     DEBUG_EXECUTE_r(
2084         debug_start_match(rx, utf8_target, startpos, strend,
2085         "Matching");
2086     );
2087
2088     minlen = prog->minlen;
2089
2090     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2091         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2092                               "String too short [regexec_flags]...\n"));
2093         goto phooey;
2094     }
2095
2096
2097     /* Check validity of program. */
2098     if (UCHARAT(progi->program) != REG_MAGIC) {
2099         Perl_croak(aTHX_ "corrupted regexp program");
2100     }
2101
2102     PL_reg_flags = 0;
2103     PL_reg_eval_set = 0;
2104     PL_reg_maxiter = 0;
2105
2106     if (RX_UTF8(rx))
2107         PL_reg_flags |= RF_utf8;
2108
2109     /* Mark beginning of line for ^ and lookbehind. */
2110     reginfo.bol = startpos; /* XXX not used ??? */
2111     PL_bostr  = strbeg;
2112     reginfo.sv = sv;
2113
2114     /* Mark end of line for $ (and such) */
2115     PL_regeol = strend;
2116
2117     /* see how far we have to get to not match where we matched before */
2118     reginfo.till = startpos+minend;
2119
2120     /* If there is a "must appear" string, look for it. */
2121     s = startpos;
2122
2123     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2124         MAGIC *mg;
2125         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2126             reginfo.ganch = startpos + prog->gofs;
2127             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2128               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2129         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2130                   && SvMAGIC(sv)
2131                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2132                   && mg->mg_len >= 0) {
2133             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2134             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2135                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2136
2137             if (prog->extflags & RXf_ANCH_GPOS) {
2138                 if (s > reginfo.ganch)
2139                     goto phooey;
2140                 s = reginfo.ganch - prog->gofs;
2141                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2142                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2143                 if (s < strbeg)
2144                     goto phooey;
2145             }
2146         }
2147         else if (data) {
2148             reginfo.ganch = strbeg + PTR2UV(data);
2149             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2150                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2151
2152         } else {                                /* pos() not defined */
2153             reginfo.ganch = strbeg;
2154             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2155                  "GPOS: reginfo.ganch = strbeg\n"));
2156         }
2157     }
2158     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2159         /* We have to be careful. If the previous successful match
2160            was from this regex we don't want a subsequent partially
2161            successful match to clobber the old results.
2162            So when we detect this possibility we add a swap buffer
2163            to the re, and switch the buffer each match. If we fail
2164            we switch it back, otherwise we leave it swapped.
2165         */
2166         swap = prog->offs;
2167         /* do we need a save destructor here for eval dies? */
2168         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2169     }
2170     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2171         re_scream_pos_data d;
2172
2173         d.scream_olds = &scream_olds;
2174         d.scream_pos = &scream_pos;
2175         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2176         if (!s) {
2177             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2178             goto phooey;        /* not present */
2179         }
2180     }
2181
2182
2183
2184     /* Simplest case:  anchored match need be tried only once. */
2185     /*  [unless only anchor is BOL and multiline is set] */
2186     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2187         if (s == startpos && regtry(&reginfo, &startpos))
2188             goto got_it;
2189         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2190                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2191         {
2192             char *end;
2193
2194             if (minlen)
2195                 dontbother = minlen - 1;
2196             end = HOP3c(strend, -dontbother, strbeg) - 1;
2197             /* for multiline we only have to try after newlines */
2198             if (prog->check_substr || prog->check_utf8) {
2199                 /* because of the goto we can not easily reuse the macros for bifurcating the
2200                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2201                 if (utf8_target) {
2202                     if (s == startpos)
2203                         goto after_try_utf8;
2204                     while (1) {
2205                         if (regtry(&reginfo, &s)) {
2206                             goto got_it;
2207                         }
2208                       after_try_utf8:
2209                         if (s > end) {
2210                             goto phooey;
2211                         }
2212                         if (prog->extflags & RXf_USE_INTUIT) {
2213                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2214                             if (!s) {
2215                                 goto phooey;
2216                             }
2217                         }
2218                         else {
2219                             s += UTF8SKIP(s);
2220                         }
2221                     }
2222                 } /* end search for check string in unicode */
2223                 else {
2224                     if (s == startpos) {
2225                         goto after_try_latin;
2226                     }
2227                     while (1) {
2228                         if (regtry(&reginfo, &s)) {
2229                             goto got_it;
2230                         }
2231                       after_try_latin:
2232                         if (s > end) {
2233                             goto phooey;
2234                         }
2235                         if (prog->extflags & RXf_USE_INTUIT) {
2236                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2237                             if (!s) {
2238                                 goto phooey;
2239                             }
2240                         }
2241                         else {
2242                             s++;
2243                         }
2244                     }
2245                 } /* end search for check string in latin*/
2246             } /* end search for check string */
2247             else { /* search for newline */
2248                 if (s > startpos) {
2249                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2250                     s--;
2251                 }
2252                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2253                 while (s <= end) { /* note it could be possible to match at the end of the string */
2254                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2255                         if (regtry(&reginfo, &s))
2256                             goto got_it;
2257                     }
2258                 }
2259             } /* end search for newline */
2260         } /* end anchored/multiline check string search */
2261         goto phooey;
2262     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2263     {
2264         /* the warning about reginfo.ganch being used without initialization
2265            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2266            and we only enter this block when the same bit is set. */
2267         char *tmp_s = reginfo.ganch - prog->gofs;
2268
2269         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2270             goto got_it;
2271         goto phooey;
2272     }
2273
2274     /* Messy cases:  unanchored match. */
2275     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2276         /* we have /x+whatever/ */
2277         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2278         char ch;
2279 #ifdef DEBUGGING
2280         int did_match = 0;
2281 #endif
2282         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2283             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2284         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2285
2286         if (utf8_target) {
2287             REXEC_FBC_SCAN(
2288                 if (*s == ch) {
2289                     DEBUG_EXECUTE_r( did_match = 1 );
2290                     if (regtry(&reginfo, &s)) goto got_it;
2291                     s += UTF8SKIP(s);
2292                     while (s < strend && *s == ch)
2293                         s += UTF8SKIP(s);
2294                 }
2295             );
2296         }
2297         else {
2298             REXEC_FBC_SCAN(
2299                 if (*s == ch) {
2300                     DEBUG_EXECUTE_r( did_match = 1 );
2301                     if (regtry(&reginfo, &s)) goto got_it;
2302                     s++;
2303                     while (s < strend && *s == ch)
2304                         s++;
2305                 }
2306             );
2307         }
2308         DEBUG_EXECUTE_r(if (!did_match)
2309                 PerlIO_printf(Perl_debug_log,
2310                                   "Did not find anchored character...\n")
2311                );
2312     }
2313     else if (prog->anchored_substr != NULL
2314               || prog->anchored_utf8 != NULL
2315               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2316                   && prog->float_max_offset < strend - s)) {
2317         SV *must;
2318         I32 back_max;
2319         I32 back_min;
2320         char *last;
2321         char *last1;            /* Last position checked before */
2322 #ifdef DEBUGGING
2323         int did_match = 0;
2324 #endif
2325         if (prog->anchored_substr || prog->anchored_utf8) {
2326             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2327                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2328             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2329             back_max = back_min = prog->anchored_offset;
2330         } else {
2331             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2332                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2333             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2334             back_max = prog->float_max_offset;
2335             back_min = prog->float_min_offset;
2336         }
2337
2338
2339         if (must == &PL_sv_undef)
2340             /* could not downgrade utf8 check substring, so must fail */
2341             goto phooey;
2342
2343         if (back_min<0) {
2344             last = strend;
2345         } else {
2346             last = HOP3c(strend,        /* Cannot start after this */
2347                   -(I32)(CHR_SVLEN(must)
2348                          - (SvTAIL(must) != 0) + back_min), strbeg);
2349         }
2350         if (s > PL_bostr)
2351             last1 = HOPc(s, -1);
2352         else
2353             last1 = s - 1;      /* bogus */
2354
2355         /* XXXX check_substr already used to find "s", can optimize if
2356            check_substr==must. */
2357         scream_pos = -1;
2358         dontbother = end_shift;
2359         strend = HOPc(strend, -dontbother);
2360         while ( (s <= last) &&
2361                 ((flags & REXEC_SCREAM) && SvSCREAM(sv)
2362                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2363                                     end_shift, &scream_pos, 0))
2364                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2365                                   (unsigned char*)strend, must,
2366                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2367             /* we may be pointing at the wrong string */
2368             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2369                 s = strbeg + (s - SvPVX_const(sv));
2370             DEBUG_EXECUTE_r( did_match = 1 );
2371             if (HOPc(s, -back_max) > last1) {
2372                 last1 = HOPc(s, -back_min);
2373                 s = HOPc(s, -back_max);
2374             }
2375             else {
2376                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2377
2378                 last1 = HOPc(s, -back_min);
2379                 s = t;
2380             }
2381             if (utf8_target) {
2382                 while (s <= last1) {
2383                     if (regtry(&reginfo, &s))
2384                         goto got_it;
2385                     s += UTF8SKIP(s);
2386                 }
2387             }
2388             else {
2389                 while (s <= last1) {
2390                     if (regtry(&reginfo, &s))
2391                         goto got_it;
2392                     s++;
2393                 }
2394             }
2395         }
2396         DEBUG_EXECUTE_r(if (!did_match) {
2397             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2398                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2399             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2400                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2401                                ? "anchored" : "floating"),
2402                 quoted, RE_SV_TAIL(must));
2403         });
2404         goto phooey;
2405     }
2406     else if ( (c = progi->regstclass) ) {
2407         if (minlen) {
2408             const OPCODE op = OP(progi->regstclass);
2409             /* don't bother with what can't match */
2410             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2411                 strend = HOPc(strend, -(minlen - 1));
2412         }
2413         DEBUG_EXECUTE_r({
2414             SV * const prop = sv_newmortal();
2415             regprop(prog, prop, c);
2416             {
2417                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2418                     s,strend-s,60);
2419                 PerlIO_printf(Perl_debug_log,
2420                     "Matching stclass %.*s against %s (%d bytes)\n",
2421                     (int)SvCUR(prop), SvPVX_const(prop),
2422                      quoted, (int)(strend - s));
2423             }
2424         });
2425         if (find_byclass(prog, c, s, strend, &reginfo))
2426             goto got_it;
2427         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2428     }
2429     else {
2430         dontbother = 0;
2431         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2432             /* Trim the end. */
2433             char *last;
2434             SV* float_real;
2435
2436             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2437                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2438             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2439
2440             if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
2441                 last = screaminstr(sv, float_real, s - strbeg,
2442                                    end_shift, &scream_pos, 1); /* last one */
2443                 if (!last)
2444                     last = scream_olds; /* Only one occurrence. */
2445                 /* we may be pointing at the wrong string */
2446                 else if (RXp_MATCH_COPIED(prog))
2447                     s = strbeg + (s - SvPVX_const(sv));
2448             }
2449             else {
2450                 STRLEN len;
2451                 const char * const little = SvPV_const(float_real, len);
2452
2453                 if (SvTAIL(float_real)) {
2454                     if (memEQ(strend - len + 1, little, len - 1))
2455                         last = strend - len + 1;
2456                     else if (!multiline)
2457                         last = memEQ(strend - len, little, len)
2458                             ? strend - len : NULL;
2459                     else
2460                         goto find_last;
2461                 } else {
2462                   find_last:
2463                     if (len)
2464                         last = rninstr(s, strend, little, little + len);
2465                     else
2466                         last = strend;  /* matching "$" */
2467                 }
2468             }
2469             if (last == NULL) {
2470                 DEBUG_EXECUTE_r(
2471                     PerlIO_printf(Perl_debug_log,
2472                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2473                         PL_colors[4], PL_colors[5]));
2474                 goto phooey; /* Should not happen! */
2475             }
2476             dontbother = strend - last + prog->float_min_offset;
2477         }
2478         if (minlen && (dontbother < minlen))
2479             dontbother = minlen - 1;
2480         strend -= dontbother;              /* this one's always in bytes! */
2481         /* We don't know much -- general case. */
2482         if (utf8_target) {
2483             for (;;) {
2484                 if (regtry(&reginfo, &s))
2485                     goto got_it;
2486                 if (s >= strend)
2487                     break;
2488                 s += UTF8SKIP(s);
2489             };
2490         }
2491         else {
2492             do {
2493                 if (regtry(&reginfo, &s))
2494                     goto got_it;
2495             } while (s++ < strend);
2496         }
2497     }
2498
2499     /* Failure. */
2500     goto phooey;
2501
2502 got_it:
2503     Safefree(swap);
2504     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2505
2506     if (PL_reg_eval_set)
2507         restore_pos(aTHX_ prog);
2508     if (RXp_PAREN_NAMES(prog))
2509         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2510
2511     /* make sure $`, $&, $', and $digit will work later */
2512     if ( !(flags & REXEC_NOT_FIRST) ) {
2513         RX_MATCH_COPY_FREE(rx);
2514         if (flags & REXEC_COPY_STR) {
2515             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2516 #ifdef PERL_OLD_COPY_ON_WRITE
2517             if ((SvIsCOW(sv)
2518                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2519                 if (DEBUG_C_TEST) {
2520                     PerlIO_printf(Perl_debug_log,
2521                                   "Copy on write: regexp capture, type %d\n",
2522                                   (int) SvTYPE(sv));
2523                 }
2524                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2525                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2526                 assert (SvPOKp(prog->saved_copy));
2527             } else
2528 #endif
2529             {
2530                 RX_MATCH_COPIED_on(rx);
2531                 s = savepvn(strbeg, i);
2532                 prog->subbeg = s;
2533             }
2534             prog->sublen = i;
2535         }
2536         else {
2537             prog->subbeg = strbeg;
2538             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2539         }
2540     }
2541
2542     return 1;
2543
2544 phooey:
2545     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2546                           PL_colors[4], PL_colors[5]));
2547     if (PL_reg_eval_set)
2548         restore_pos(aTHX_ prog);
2549     if (swap) {
2550         /* we failed :-( roll it back */
2551         Safefree(prog->offs);
2552         prog->offs = swap;
2553     }
2554
2555     return 0;
2556 }
2557
2558
2559 /*
2560  - regtry - try match at specific point
2561  */
2562 STATIC I32                      /* 0 failure, 1 success */
2563 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2564 {
2565     dVAR;
2566     CHECKPOINT lastcp;
2567     REGEXP *const rx = reginfo->prog;
2568     regexp *const prog = (struct regexp *)SvANY(rx);
2569     RXi_GET_DECL(prog,progi);
2570     GET_RE_DEBUG_FLAGS_DECL;
2571
2572     PERL_ARGS_ASSERT_REGTRY;
2573
2574     reginfo->cutpoint=NULL;
2575
2576     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2577         MAGIC *mg;
2578
2579         PL_reg_eval_set = RS_init;
2580         DEBUG_EXECUTE_r(DEBUG_s(
2581             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2582                           (IV)(PL_stack_sp - PL_stack_base));
2583             ));
2584         SAVESTACK_CXPOS();
2585         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2586         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2587         SAVETMPS;
2588         /* Apparently this is not needed, judging by wantarray. */
2589         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2590            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2591
2592         if (reginfo->sv) {
2593             /* Make $_ available to executed code. */
2594             if (reginfo->sv != DEFSV) {
2595                 SAVE_DEFSV;
2596                 DEFSV_set(reginfo->sv);
2597             }
2598
2599             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2600                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2601                 /* prepare for quick setting of pos */
2602 #ifdef PERL_OLD_COPY_ON_WRITE
2603                 if (SvIsCOW(reginfo->sv))
2604                     sv_force_normal_flags(reginfo->sv, 0);
2605 #endif
2606                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2607                                  &PL_vtbl_mglob, NULL, 0);
2608                 mg->mg_len = -1;
2609             }
2610             PL_reg_magic    = mg;
2611             PL_reg_oldpos   = mg->mg_len;
2612             SAVEDESTRUCTOR_X(restore_pos, prog);
2613         }
2614         if (!PL_reg_curpm) {
2615             Newxz(PL_reg_curpm, 1, PMOP);
2616 #ifdef USE_ITHREADS
2617             {
2618                 SV* const repointer = &PL_sv_undef;
2619                 /* this regexp is also owned by the new PL_reg_curpm, which
2620                    will try to free it.  */
2621                 av_push(PL_regex_padav, repointer);
2622                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2623                 PL_regex_pad = AvARRAY(PL_regex_padav);
2624             }
2625 #endif
2626         }
2627 #ifdef USE_ITHREADS
2628         /* It seems that non-ithreads works both with and without this code.
2629            So for efficiency reasons it seems best not to have the code
2630            compiled when it is not needed.  */
2631         /* This is safe against NULLs: */
2632         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2633         /* PM_reg_curpm owns a reference to this regexp.  */
2634         (void)ReREFCNT_inc(rx);
2635 #endif
2636         PM_SETRE(PL_reg_curpm, rx);
2637         PL_reg_oldcurpm = PL_curpm;
2638         PL_curpm = PL_reg_curpm;
2639         if (RXp_MATCH_COPIED(prog)) {
2640             /*  Here is a serious problem: we cannot rewrite subbeg,
2641                 since it may be needed if this match fails.  Thus
2642                 $` inside (?{}) could fail... */
2643             PL_reg_oldsaved = prog->subbeg;
2644             PL_reg_oldsavedlen = prog->sublen;
2645 #ifdef PERL_OLD_COPY_ON_WRITE
2646             PL_nrs = prog->saved_copy;
2647 #endif
2648             RXp_MATCH_COPIED_off(prog);
2649         }
2650         else
2651             PL_reg_oldsaved = NULL;
2652         prog->subbeg = PL_bostr;
2653         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2654     }
2655     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2656     prog->offs[0].start = *startpos - PL_bostr;
2657     PL_reginput = *startpos;
2658     PL_reglastparen = &prog->lastparen;
2659     PL_reglastcloseparen = &prog->lastcloseparen;
2660     prog->lastparen = 0;
2661     prog->lastcloseparen = 0;
2662     PL_regsize = 0;
2663     PL_regoffs = prog->offs;
2664     if (PL_reg_start_tmpl <= prog->nparens) {
2665         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2666         if(PL_reg_start_tmp)
2667             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2668         else
2669             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2670     }
2671
2672     /* XXXX What this code is doing here?!!!  There should be no need
2673        to do this again and again, PL_reglastparen should take care of
2674        this!  --ilya*/
2675
2676     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2677      * Actually, the code in regcppop() (which Ilya may be meaning by
2678      * PL_reglastparen), is not needed at all by the test suite
2679      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2680      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2681      * Meanwhile, this code *is* needed for the
2682      * above-mentioned test suite tests to succeed.  The common theme
2683      * on those tests seems to be returning null fields from matches.
2684      * --jhi updated by dapm */
2685 #if 1
2686     if (prog->nparens) {
2687         regexp_paren_pair *pp = PL_regoffs;
2688         register I32 i;
2689         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2690             ++pp;
2691             pp->start = -1;
2692             pp->end = -1;
2693         }
2694     }
2695 #endif
2696     REGCP_SET(lastcp);
2697     if (regmatch(reginfo, progi->program + 1)) {
2698         PL_regoffs[0].end = PL_reginput - PL_bostr;
2699         return 1;
2700     }
2701     if (reginfo->cutpoint)
2702         *startpos= reginfo->cutpoint;
2703     REGCP_UNWIND(lastcp);
2704     return 0;
2705 }
2706
2707
2708 #define sayYES goto yes
2709 #define sayNO goto no
2710 #define sayNO_SILENT goto no_silent
2711
2712 /* we dont use STMT_START/END here because it leads to
2713    "unreachable code" warnings, which are bogus, but distracting. */
2714 #define CACHEsayNO \
2715     if (ST.cache_mask) \
2716        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2717     sayNO
2718
2719 /* this is used to determine how far from the left messages like
2720    'failed...' are printed. It should be set such that messages
2721    are inline with the regop output that created them.
2722 */
2723 #define REPORT_CODE_OFF 32
2724
2725
2726 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2727 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2728
2729 #define SLAB_FIRST(s) (&(s)->states[0])
2730 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2731
2732 /* grab a new slab and return the first slot in it */
2733
2734 STATIC regmatch_state *
2735 S_push_slab(pTHX)
2736 {
2737 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2738     dMY_CXT;
2739 #endif
2740     regmatch_slab *s = PL_regmatch_slab->next;
2741     if (!s) {
2742         Newx(s, 1, regmatch_slab);
2743         s->prev = PL_regmatch_slab;
2744         s->next = NULL;
2745         PL_regmatch_slab->next = s;
2746     }
2747     PL_regmatch_slab = s;
2748     return SLAB_FIRST(s);
2749 }
2750
2751
2752 /* push a new state then goto it */
2753
2754 #define PUSH_STATE_GOTO(state, node) \
2755     scan = node; \
2756     st->resume_state = state; \
2757     goto push_state;
2758
2759 /* push a new state with success backtracking, then goto it */
2760
2761 #define PUSH_YES_STATE_GOTO(state, node) \
2762     scan = node; \
2763     st->resume_state = state; \
2764     goto push_yes_state;
2765
2766
2767
2768 /*
2769
2770 regmatch() - main matching routine
2771
2772 This is basically one big switch statement in a loop. We execute an op,
2773 set 'next' to point the next op, and continue. If we come to a point which
2774 we may need to backtrack to on failure such as (A|B|C), we push a
2775 backtrack state onto the backtrack stack. On failure, we pop the top
2776 state, and re-enter the loop at the state indicated. If there are no more
2777 states to pop, we return failure.
2778
2779 Sometimes we also need to backtrack on success; for example /A+/, where
2780 after successfully matching one A, we need to go back and try to
2781 match another one; similarly for lookahead assertions: if the assertion
2782 completes successfully, we backtrack to the state just before the assertion
2783 and then carry on.  In these cases, the pushed state is marked as
2784 'backtrack on success too'. This marking is in fact done by a chain of
2785 pointers, each pointing to the previous 'yes' state. On success, we pop to
2786 the nearest yes state, discarding any intermediate failure-only states.
2787 Sometimes a yes state is pushed just to force some cleanup code to be
2788 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2789 it to free the inner regex.
2790
2791 Note that failure backtracking rewinds the cursor position, while
2792 success backtracking leaves it alone.
2793
2794 A pattern is complete when the END op is executed, while a subpattern
2795 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2796 ops trigger the "pop to last yes state if any, otherwise return true"
2797 behaviour.
2798
2799 A common convention in this function is to use A and B to refer to the two
2800 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2801 the subpattern to be matched possibly multiple times, while B is the entire
2802 rest of the pattern. Variable and state names reflect this convention.
2803
2804 The states in the main switch are the union of ops and failure/success of
2805 substates associated with with that op.  For example, IFMATCH is the op
2806 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2807 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2808 successfully matched A and IFMATCH_A_fail is a state saying that we have
2809 just failed to match A. Resume states always come in pairs. The backtrack
2810 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2811 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2812 on success or failure.
2813
2814 The struct that holds a backtracking state is actually a big union, with
2815 one variant for each major type of op. The variable st points to the
2816 top-most backtrack struct. To make the code clearer, within each
2817 block of code we #define ST to alias the relevant union.
2818
2819 Here's a concrete example of a (vastly oversimplified) IFMATCH
2820 implementation:
2821
2822     switch (state) {
2823     ....
2824
2825 #define ST st->u.ifmatch
2826
2827     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2828         ST.foo = ...; // some state we wish to save
2829         ...
2830         // push a yes backtrack state with a resume value of
2831         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2832         // first node of A:
2833         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2834         // NOTREACHED
2835
2836     case IFMATCH_A: // we have successfully executed A; now continue with B
2837         next = B;
2838         bar = ST.foo; // do something with the preserved value
2839         break;
2840
2841     case IFMATCH_A_fail: // A failed, so the assertion failed
2842         ...;   // do some housekeeping, then ...
2843         sayNO; // propagate the failure
2844
2845 #undef ST
2846
2847     ...
2848     }
2849
2850 For any old-timers reading this who are familiar with the old recursive
2851 approach, the code above is equivalent to:
2852
2853     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2854     {
2855         int foo = ...
2856         ...
2857         if (regmatch(A)) {
2858             next = B;
2859             bar = foo;
2860             break;
2861         }
2862         ...;   // do some housekeeping, then ...
2863         sayNO; // propagate the failure
2864     }
2865
2866 The topmost backtrack state, pointed to by st, is usually free. If you
2867 want to claim it, populate any ST.foo fields in it with values you wish to
2868 save, then do one of
2869
2870         PUSH_STATE_GOTO(resume_state, node);
2871         PUSH_YES_STATE_GOTO(resume_state, node);
2872
2873 which sets that backtrack state's resume value to 'resume_state', pushes a
2874 new free entry to the top of the backtrack stack, then goes to 'node'.
2875 On backtracking, the free slot is popped, and the saved state becomes the
2876 new free state. An ST.foo field in this new top state can be temporarily
2877 accessed to retrieve values, but once the main loop is re-entered, it
2878 becomes available for reuse.
2879
2880 Note that the depth of the backtrack stack constantly increases during the
2881 left-to-right execution of the pattern, rather than going up and down with
2882 the pattern nesting. For example the stack is at its maximum at Z at the
2883 end of the pattern, rather than at X in the following:
2884
2885     /(((X)+)+)+....(Y)+....Z/
2886
2887 The only exceptions to this are lookahead/behind assertions and the cut,
2888 (?>A), which pop all the backtrack states associated with A before
2889 continuing.
2890
2891 Backtrack state structs are allocated in slabs of about 4K in size.
2892 PL_regmatch_state and st always point to the currently active state,
2893 and PL_regmatch_slab points to the slab currently containing
2894 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2895 allocated, and is never freed until interpreter destruction. When the slab
2896 is full, a new one is allocated and chained to the end. At exit from
2897 regmatch(), slabs allocated since entry are freed.
2898
2899 */
2900
2901
2902 #define DEBUG_STATE_pp(pp)                                  \
2903     DEBUG_STATE_r({                                         \
2904         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2905         PerlIO_printf(Perl_debug_log,                       \
2906             "    %*s"pp" %s%s%s%s%s\n",                     \
2907             depth*2, "",                                    \
2908             PL_reg_name[st->resume_state],                     \
2909             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2910             ((st==yes_state) ? "Y" : ""),                   \
2911             ((st==mark_state) ? "M" : ""),                  \
2912             ((st==yes_state||st==mark_state) ? "]" : "")    \
2913         );                                                  \
2914     });
2915
2916
2917 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2918
2919 #ifdef DEBUGGING
2920
2921 STATIC void
2922 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2923     const char *start, const char *end, const char *blurb)
2924 {
2925     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2926
2927     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2928
2929     if (!PL_colorset)
2930             reginitcolors();
2931     {
2932         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2933             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2934
2935         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2936             start, end - start, 60);
2937
2938         PerlIO_printf(Perl_debug_log,
2939             "%s%s REx%s %s against %s\n",
2940                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2941
2942         if (utf8_target||utf8_pat)
2943             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2944                 utf8_pat ? "pattern" : "",
2945                 utf8_pat && utf8_target ? " and " : "",
2946                 utf8_target ? "string" : ""
2947             );
2948     }
2949 }
2950
2951 STATIC void
2952 S_dump_exec_pos(pTHX_ const char *locinput,
2953                       const regnode *scan,
2954                       const char *loc_regeol,
2955                       const char *loc_bostr,
2956                       const char *loc_reg_starttry,
2957                       const bool utf8_target)
2958 {
2959     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2960     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2961     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2962     /* The part of the string before starttry has one color
2963        (pref0_len chars), between starttry and current
2964        position another one (pref_len - pref0_len chars),
2965        after the current position the third one.
2966        We assume that pref0_len <= pref_len, otherwise we
2967        decrease pref0_len.  */
2968     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2969         ? (5 + taill) - l : locinput - loc_bostr;
2970     int pref0_len;
2971
2972     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2973
2974     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2975         pref_len++;
2976     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2977     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2978         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2979               ? (5 + taill) - pref_len : loc_regeol - locinput);
2980     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2981         l--;
2982     if (pref0_len < 0)
2983         pref0_len = 0;
2984     if (pref0_len > pref_len)
2985         pref0_len = pref_len;
2986     {
2987         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2988
2989         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2990             (locinput - pref_len),pref0_len, 60, 4, 5);
2991
2992         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2993                     (locinput - pref_len + pref0_len),
2994                     pref_len - pref0_len, 60, 2, 3);
2995
2996         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2997                     locinput, loc_regeol - locinput, 10, 0, 1);
2998
2999         const STRLEN tlen=len0+len1+len2;
3000         PerlIO_printf(Perl_debug_log,
3001                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3002                     (IV)(locinput - loc_bostr),
3003                     len0, s0,
3004                     len1, s1,
3005                     (docolor ? "" : "> <"),
3006                     len2, s2,
3007                     (int)(tlen > 19 ? 0 :  19 - tlen),
3008                     "");
3009     }
3010 }
3011
3012 #endif
3013
3014 /* reg_check_named_buff_matched()
3015  * Checks to see if a named buffer has matched. The data array of
3016  * buffer numbers corresponding to the buffer is expected to reside
3017  * in the regexp->data->data array in the slot stored in the ARG() of
3018  * node involved. Note that this routine doesn't actually care about the
3019  * name, that information is not preserved from compilation to execution.
3020  * Returns the index of the leftmost defined buffer with the given name
3021  * or 0 if non of the buffers matched.
3022  */
3023 STATIC I32
3024 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3025 {
3026     I32 n;
3027     RXi_GET_DECL(rex,rexi);
3028     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3029     I32 *nums=(I32*)SvPVX(sv_dat);
3030
3031     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3032
3033     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3034         if ((I32)*PL_reglastparen >= nums[n] &&
3035             PL_regoffs[nums[n]].end != -1)
3036         {
3037             return nums[n];
3038         }
3039     }
3040     return 0;
3041 }
3042
3043
3044 /* free all slabs above current one  - called during LEAVE_SCOPE */
3045
3046 STATIC void
3047 S_clear_backtrack_stack(pTHX_ void *p)
3048 {
3049     regmatch_slab *s = PL_regmatch_slab->next;
3050     PERL_UNUSED_ARG(p);
3051
3052     if (!s)
3053         return;
3054     PL_regmatch_slab->next = NULL;
3055     while (s) {
3056         regmatch_slab * const osl = s;
3057         s = s->next;
3058         Safefree(osl);
3059     }
3060 }
3061
3062
3063 #define SETREX(Re1,Re2) \
3064     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3065     Re1 = (Re2)
3066
3067 STATIC I32                      /* 0 failure, 1 success */
3068 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3069 {
3070 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3071     dMY_CXT;
3072 #endif
3073     dVAR;
3074     register const bool utf8_target = PL_reg_match_utf8;
3075     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3076     REGEXP *rex_sv = reginfo->prog;
3077     regexp *rex = (struct regexp *)SvANY(rex_sv);
3078     RXi_GET_DECL(rex,rexi);
3079     I32 oldsave;
3080     /* the current state. This is a cached copy of PL_regmatch_state */
3081     register regmatch_state *st;
3082     /* cache heavy used fields of st in registers */
3083     register regnode *scan;
3084     register regnode *next;
3085     register U32 n = 0; /* general value; init to avoid compiler warning */
3086     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3087     register char *locinput = PL_reginput;
3088     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3089
3090     bool result = 0;        /* return value of S_regmatch */
3091     int depth = 0;          /* depth of backtrack stack */
3092     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3093     const U32 max_nochange_depth =
3094         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3095         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3096     regmatch_state *yes_state = NULL; /* state to pop to on success of
3097                                                             subpattern */
3098     /* mark_state piggy backs on the yes_state logic so that when we unwind
3099        the stack on success we can update the mark_state as we go */
3100     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3101     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3102     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3103     U32 state_num;
3104     bool no_final = 0;      /* prevent failure from backtracking? */
3105     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3106     char *startpoint = PL_reginput;
3107     SV *popmark = NULL;     /* are we looking for a mark? */
3108     SV *sv_commit = NULL;   /* last mark name seen in failure */
3109     SV *sv_yes_mark = NULL; /* last mark name we have seen
3110                                during a successful match */
3111     U32 lastopen = 0;       /* last open we saw */
3112     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3113     SV* const oreplsv = GvSV(PL_replgv);
3114     /* these three flags are set by various ops to signal information to
3115      * the very next op. They have a useful lifetime of exactly one loop
3116      * iteration, and are not preserved or restored by state pushes/pops
3117      */
3118     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3119     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3120     int logical = 0;        /* the following EVAL is:
3121                                 0: (?{...})
3122                                 1: (?(?{...})X|Y)
3123                                 2: (??{...})
3124                                or the following IFMATCH/UNLESSM is:
3125                                 false: plain (?=foo)
3126                                 true:  used as a condition: (?(?=foo))
3127                             */
3128 #ifdef DEBUGGING
3129     GET_RE_DEBUG_FLAGS_DECL;
3130 #endif
3131
3132     PERL_ARGS_ASSERT_REGMATCH;
3133
3134     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3135             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3136     }));
3137     /* on first ever call to regmatch, allocate first slab */
3138     if (!PL_regmatch_slab) {
3139         Newx(PL_regmatch_slab, 1, regmatch_slab);
3140         PL_regmatch_slab->prev = NULL;
3141         PL_regmatch_slab->next = NULL;
3142         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3143     }
3144
3145     oldsave = PL_savestack_ix;
3146     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3147     SAVEVPTR(PL_regmatch_slab);
3148     SAVEVPTR(PL_regmatch_state);
3149
3150     /* grab next free state slot */
3151     st = ++PL_regmatch_state;
3152     if (st >  SLAB_LAST(PL_regmatch_slab))
3153         st = PL_regmatch_state = S_push_slab(aTHX);
3154
3155     /* Note that nextchr is a byte even in UTF */
3156     nextchr = UCHARAT(locinput);
3157     scan = prog;
3158     while (scan != NULL) {
3159
3160         DEBUG_EXECUTE_r( {
3161             SV * const prop = sv_newmortal();
3162             regnode *rnext=regnext(scan);
3163             DUMP_EXEC_POS( locinput, scan, utf8_target );
3164             regprop(rex, prop, scan);
3165
3166             PerlIO_printf(Perl_debug_log,
3167                     "%3"IVdf":%*s%s(%"IVdf")\n",
3168                     (IV)(scan - rexi->program), depth*2, "",
3169                     SvPVX_const(prop),
3170                     (PL_regkind[OP(scan)] == END || !rnext) ?
3171                         0 : (IV)(rnext - rexi->program));
3172         });
3173
3174         next = scan + NEXT_OFF(scan);
3175         if (next == scan)
3176             next = NULL;
3177         state_num = OP(scan);
3178
3179         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3180       reenter_switch:
3181
3182         assert(PL_reglastparen == &rex->lastparen);
3183         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3184         assert(PL_regoffs == rex->offs);
3185
3186         switch (state_num) {
3187         case BOL:
3188             if (locinput == PL_bostr)
3189             {
3190                 /* reginfo->till = reginfo->bol; */
3191                 break;
3192             }
3193             sayNO;
3194         case MBOL:
3195             if (locinput == PL_bostr ||
3196                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3197             {
3198                 break;
3199             }
3200             sayNO;
3201         case SBOL:
3202             if (locinput == PL_bostr)
3203                 break;
3204             sayNO;
3205         case GPOS:
3206             if (locinput == reginfo->ganch)
3207                 break;
3208             sayNO;
3209
3210         case KEEPS:
3211             /* update the startpoint */
3212             st->u.keeper.val = PL_regoffs[0].start;
3213             PL_reginput = locinput;
3214             PL_regoffs[0].start = locinput - PL_bostr;
3215             PUSH_STATE_GOTO(KEEPS_next, next);
3216             /*NOT-REACHED*/
3217         case KEEPS_next_fail:
3218             /* rollback the start point change */
3219             PL_regoffs[0].start = st->u.keeper.val;
3220             sayNO_SILENT;
3221             /*NOT-REACHED*/
3222         case EOL:
3223                 goto seol;
3224         case MEOL:
3225             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3226                 sayNO;
3227             break;
3228         case SEOL:
3229           seol:
3230             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3231                 sayNO;
3232             if (PL_regeol - locinput > 1)
3233                 sayNO;
3234             break;
3235         case EOS:
3236             if (PL_regeol != locinput)
3237                 sayNO;
3238             break;
3239         case SANY:
3240             if (!nextchr && locinput >= PL_regeol)
3241                 sayNO;
3242             if (utf8_target) {
3243                 locinput += PL_utf8skip[nextchr];
3244                 if (locinput > PL_regeol)
3245                     sayNO;
3246                 nextchr = UCHARAT(locinput);
3247             }
3248             else
3249                 nextchr = UCHARAT(++locinput);
3250             break;
3251         case CANY:
3252             if (!nextchr && locinput >= PL_regeol)
3253                 sayNO;
3254             nextchr = UCHARAT(++locinput);
3255             break;
3256         case REG_ANY:
3257             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3258                 sayNO;
3259             if (utf8_target) {
3260                 locinput += PL_utf8skip[nextchr];
3261                 if (locinput > PL_regeol)
3262                     sayNO;
3263                 nextchr = UCHARAT(locinput);
3264             }
3265             else
3266                 nextchr = UCHARAT(++locinput);
3267             break;
3268
3269 #undef  ST
3270 #define ST st->u.trie
3271         case TRIEC:
3272             /* In this case the charclass data is available inline so
3273                we can fail fast without a lot of extra overhead.
3274              */
3275             if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3276                 DEBUG_EXECUTE_r(
3277                     PerlIO_printf(Perl_debug_log,
3278                               "%*s  %sfailed to match trie start class...%s\n",
3279                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3280                 );
3281                 sayNO_SILENT;
3282                 /* NOTREACHED */
3283             }
3284             /* FALL THROUGH */
3285         case TRIE:
3286             /* the basic plan of execution of the trie is:
3287              * At the beginning, run though all the states, and
3288              * find the longest-matching word. Also remember the position
3289              * of the shortest matching word. For example, this pattern:
3290              *    1  2 3 4    5
3291              *    ab|a|x|abcd|abc
3292              * when matched against the string "abcde", will generate
3293              * accept states for all words except 3, with the longest
3294              * matching word being 4, and the shortest being 1 (with
3295              * the position being after char 1 of the string).
3296              *
3297              * Then for each matching word, in word order (i.e. 1,2,4,5),
3298              * we run the remainder of the pattern; on each try setting
3299              * the current position to the character following the word,
3300              * returning to try the next word on failure.
3301              *
3302              * We avoid having to build a list of words at runtime by
3303              * using a compile-time structure, wordinfo[].prev, which
3304              * gives, for each word, the previous accepting word (if any).
3305              * In the case above it would contain the mappings 1->2, 2->0,
3306              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3307              * the longest word (4 above), a list of all words, by
3308              * following the list of prev pointers; this gives us the
3309              * unordered list 4,5,1,2. Then given the current word we have
3310              * just tried, we can go through the list and find the
3311              * next-biggest word to try (so if we just failed on word 2,
3312              * the next in the list is 4).
3313              *
3314              * Since at runtime we don't record the matching position in
3315              * the string for each word, we have to work that out for
3316              * each word we're about to process. The wordinfo table holds
3317              * the character length of each word; given that we recorded
3318              * at the start: the position of the shortest word and its
3319              * length in chars, we just need to move the pointer the
3320              * difference between the two char lengths. Depending on
3321              * Unicode status and folding, that's cheap or expensive.
3322              *
3323              * This algorithm is optimised for the case where are only a
3324              * small number of accept states, i.e. 0,1, or maybe 2.
3325              * With lots of accepts states, and having to try all of them,
3326              * it becomes quadratic on number of accept states to find all
3327              * the next words.
3328              */
3329
3330             {
3331                 /* what type of TRIE am I? (utf8 makes this contextual) */
3332                 DECL_TRIE_TYPE(scan);
3333
3334                 /* what trie are we using right now */
3335                 reg_trie_data * const trie
3336                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3337                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3338                 U32 state = trie->startstate;
3339
3340                 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3341                     if (trie->states[ state ].wordnum) {
3342                          DEBUG_EXECUTE_r(
3343                             PerlIO_printf(Perl_debug_log,
3344                                           "%*s  %smatched empty string...%s\n",
3345                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3346                         );
3347                         if (!trie->jump)
3348                             break;
3349                     } else {
3350                         DEBUG_EXECUTE_r(
3351                             PerlIO_printf(Perl_debug_log,
3352                                           "%*s  %sfailed to match trie start class...%s\n",
3353                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3354                         );
3355                         sayNO_SILENT;
3356                    }
3357                 }
3358
3359             {
3360                 U8 *uc = ( U8* )locinput;
3361
3362                 STRLEN len = 0;
3363                 STRLEN foldlen = 0;
3364                 U8 *uscan = (U8*)NULL;
3365                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3366                 U32 charcount = 0; /* how many input chars we have matched */
3367                 U32 accepted = 0; /* have we seen any accepting states? */
3368
3369                 ST.B = next;
3370                 ST.jump = trie->jump;
3371                 ST.me = scan;
3372                 ST.firstpos = NULL;
3373                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3374                 ST.nextword = 0;
3375
3376                 /* fully traverse the TRIE; note the position of the
3377                    shortest accept state and the wordnum of the longest
3378                    accept state */
3379
3380                 while ( state && uc <= (U8*)PL_regeol ) {
3381                     U32 base = trie->states[ state ].trans.base;
3382                     UV uvc = 0;
3383                     U16 charid = 0;
3384                     U16 wordnum;
3385                     wordnum = trie->states[ state ].wordnum;
3386
3387                     if (wordnum) { /* it's an accept state */
3388                         if (!accepted) {
3389                             accepted = 1;
3390                             /* record first match position */
3391                             if (ST.longfold) {
3392                                 ST.firstpos = (U8*)locinput;
3393                                 ST.firstchars = 0;
3394                             }
3395                             else {
3396                                 ST.firstpos = uc;
3397                                 ST.firstchars = charcount;
3398                             }
3399                         }
3400                         if (!ST.nextword || wordnum < ST.nextword)
3401                             ST.nextword = wordnum;
3402                         ST.topword = wordnum;
3403                     }
3404
3405                     DEBUG_TRIE_EXECUTE_r({
3406                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3407                                 PerlIO_printf( Perl_debug_log,
3408                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3409                                     2+depth * 2, "", PL_colors[4],
3410                                     (UV)state, (accepted ? 'Y' : 'N'));
3411                     });
3412
3413                     /* read a char and goto next state */
3414                     if ( base ) {
3415                         I32 offset;
3416                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3417                                              uscan, len, uvc, charid, foldlen,
3418                                              foldbuf, uniflags);
3419                         charcount++;
3420                         if (foldlen>0)
3421                             ST.longfold = TRUE;
3422                         if (charid &&
3423                              ( ((offset =
3424                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3425
3426                              && ((U32)offset < trie->lasttrans)
3427                              && trie->trans[offset].check == state)
3428                         {
3429                             state = trie->trans[offset].next;
3430                         }
3431                         else {
3432                             state = 0;
3433                         }
3434                         uc += len;
3435
3436                     }
3437                     else {
3438                         state = 0;
3439                     }
3440                     DEBUG_TRIE_EXECUTE_r(
3441                         PerlIO_printf( Perl_debug_log,
3442                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3443                             charid, uvc, (UV)state, PL_colors[5] );
3444                     );
3445                 }
3446                 if (!accepted)
3447                    sayNO;
3448
3449                 /* calculate total number of accept states */
3450                 {
3451                     U16 w = ST.topword;
3452                     accepted = 0;
3453                     while (w) {
3454                         w = trie->wordinfo[w].prev;
3455                         accepted++;
3456                     }
3457                     ST.accepted = accepted;
3458                 }
3459
3460                 DEBUG_EXECUTE_r(
3461                     PerlIO_printf( Perl_debug_log,
3462                         "%*s  %sgot %"IVdf" possible matches%s\n",
3463                         REPORT_CODE_OFF + depth * 2, "",
3464                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3465                 );
3466                 goto trie_first_try; /* jump into the fail handler */
3467             }}
3468             /* NOTREACHED */
3469
3470         case TRIE_next_fail: /* we failed - try next alternative */
3471             if ( ST.jump) {
3472                 REGCP_UNWIND(ST.cp);
3473                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3474                     PL_regoffs[n].end = -1;
3475                 *PL_reglastparen = n;
3476             }
3477             if (!--ST.accepted) {
3478                 DEBUG_EXECUTE_r({
3479                     PerlIO_printf( Perl_debug_log,
3480                         "%*s  %sTRIE failed...%s\n",
3481                         REPORT_CODE_OFF+depth*2, "",
3482                         PL_colors[4],
3483                         PL_colors[5] );
3484                 });
3485                 sayNO_SILENT;
3486             }
3487             {
3488                 /* Find next-highest word to process.  Note that this code
3489                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3490                 register U16 min = 0;
3491                 register U16 word;
3492                 register U16 const nextword = ST.nextword;
3493                 register reg_trie_wordinfo * const wordinfo
3494                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3495                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3496                     if (word > nextword && (!min || word < min))
3497                         min = word;
3498                 }
3499                 ST.nextword = min;
3500             }
3501
3502           trie_first_try:
3503             if (do_cutgroup) {
3504                 do_cutgroup = 0;
3505                 no_final = 0;
3506             }
3507
3508             if ( ST.jump) {
3509                 ST.lastparen = *PL_reglastparen;
3510                 REGCP_SET(ST.cp);
3511             }
3512
3513             /* find start char of end of current word */
3514             {
3515                 U32 chars; /* how many chars to skip */
3516                 U8 *uc = ST.firstpos;
3517                 reg_trie_data * const trie
3518                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3519
3520                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3521                             >=  ST.firstchars);
3522                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3523                             - ST.firstchars;
3524
3525                 if (ST.longfold) {
3526                     /* the hard option - fold each char in turn and find
3527                      * its folded length (which may be different */
3528                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3529                     STRLEN foldlen;
3530                     STRLEN len;
3531                     UV uvc;
3532                     U8 *uscan;
3533
3534                     while (chars) {
3535                         if (utf8_target) {
3536                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3537                                                     uniflags);
3538                             uc += len;
3539                         }
3540                         else {
3541                             uvc = *uc;
3542                             uc++;
3543                         }
3544                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3545                         uscan = foldbuf;
3546                         while (foldlen) {
3547                             if (!--chars)
3548                                 break;
3549                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3550                                             uniflags);
3551                             uscan += len;
3552                             foldlen -= len;
3553                         }
3554                     }
3555                 }
3556                 else {
3557                     if (utf8_target)
3558                         while (chars--)
3559                             uc += UTF8SKIP(uc);
3560                     else
3561                         uc += chars;
3562                 }
3563                 PL_reginput = (char *)uc;
3564             }
3565
3566             scan = (ST.jump && ST.jump[ST.nextword])
3567                         ? ST.me + ST.jump[ST.nextword]
3568                         : ST.B;
3569
3570             DEBUG_EXECUTE_r({
3571                 PerlIO_printf( Perl_debug_log,
3572                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3573                     REPORT_CODE_OFF+depth*2, "",
3574                     PL_colors[4],
3575                     ST.nextword,
3576                     PL_colors[5]
3577                     );
3578             });
3579
3580             if (ST.accepted > 1 || has_cutgroup) {
3581                 PUSH_STATE_GOTO(TRIE_next, scan);
3582                 /* NOTREACHED */
3583             }
3584             /* only one choice left - just continue */
3585             DEBUG_EXECUTE_r({
3586                 AV *const trie_words
3587                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3588                 SV ** const tmp = av_fetch( trie_words,
3589                     ST.nextword-1, 0 );
3590                 SV *sv= tmp ? sv_newmortal() : NULL;
3591
3592                 PerlIO_printf( Perl_debug_log,
3593                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3594                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3595                     ST.nextword,
3596                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3597                             PL_colors[0], PL_colors[1],
3598                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3599                         )
3600                     : "not compiled under -Dr",
3601                     PL_colors[5] );
3602             });
3603
3604             locinput = PL_reginput;
3605             nextchr = UCHARAT(locinput);
3606             continue; /* execute rest of RE */
3607             /* NOTREACHED */
3608 #undef  ST
3609
3610         case EXACT: {
3611             char *s = STRING(scan);
3612             ln = STR_LEN(scan);
3613             if (utf8_target != UTF_PATTERN) {
3614                 /* The target and the pattern have differing utf8ness. */
3615                 char *l = locinput;
3616                 const char * const e = s + ln;
3617
3618                 if (utf8_target) {
3619                     /* The target is utf8, the pattern is not utf8. */
3620                     while (s < e) {
3621                         STRLEN ulen;
3622                         if (l >= PL_regeol)
3623                              sayNO;
3624                         if (NATIVE_TO_UNI(*(U8*)s) !=
3625                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3626                                             uniflags))
3627                              sayNO;
3628                         l += ulen;
3629                         s ++;
3630                     }
3631                 }
3632                 else {
3633                     /* The target is not utf8, the pattern is utf8. */
3634                     while (s < e) {
3635                         STRLEN ulen;
3636                         if (l >= PL_regeol)
3637                             sayNO;
3638                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3639                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3640                                            uniflags))
3641                             sayNO;
3642                         s += ulen;
3643                         l ++;
3644                     }
3645                 }
3646                 locinput = l;
3647                 nextchr = UCHARAT(locinput);
3648                 break;
3649             }
3650             /* The target and the pattern have the same utf8ness. */
3651             /* Inline the first character, for speed. */
3652             if (UCHARAT(s) != nextchr)
3653                 sayNO;
3654             if (PL_regeol - locinput < ln)
3655                 sayNO;
3656             if (ln > 1 && memNE(s, locinput, ln))
3657                 sayNO;
3658             locinput += ln;
3659             nextchr = UCHARAT(locinput);
3660             break;
3661             }
3662         case EXACTFL: {
3663             re_fold_t folder;
3664             const U8 * fold_array;
3665             const char * s;
3666             U32 fold_utf8_flags;
3667
3668             PL_reg_flags |= RF_tainted;
3669             folder = foldEQ_locale;
3670             fold_array = PL_fold_locale;
3671             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3672             goto do_exactf;
3673
3674         case EXACTFU_SS:
3675         case EXACTFU_TRICKYFOLD:
3676         case EXACTFU:
3677             folder = foldEQ_latin1;
3678             fold_array = PL_fold_latin1;
3679             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3680             goto do_exactf;
3681
3682         case EXACTFA:
3683             folder = foldEQ_latin1;
3684             fold_array = PL_fold_latin1;
3685             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3686             goto do_exactf;
3687
3688         case EXACTF:
3689             folder = foldEQ;
3690             fold_array = PL_fold;
3691             fold_utf8_flags = 0;
3692
3693           do_exactf:
3694             s = STRING(scan);
3695             ln = STR_LEN(scan);
3696
3697             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3698               /* Either target or the pattern are utf8, or has the issue where
3699                * the fold lengths may differ. */
3700                 const char * const l = locinput;
3701                 char *e = PL_regeol;
3702
3703                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3704                                         l, &e, 0,  utf8_target, fold_utf8_flags))
3705                 {
3706                     sayNO;
3707                 }
3708                 locinput = e;
3709                 nextchr = UCHARAT(locinput);
3710                 break;
3711             }
3712
3713             /* Neither the target nor the pattern are utf8 */
3714             if (UCHARAT(s) != nextchr &&
3715                 UCHARAT(s) != fold_array[nextchr])
3716             {
3717                 sayNO;
3718             }
3719             if (PL_regeol - locinput < ln)
3720                 sayNO;
3721             if (ln > 1 && ! folder(s, locinput, ln))
3722                 sayNO;
3723             locinput += ln;
3724             nextchr = UCHARAT(locinput);
3725             break;
3726         }
3727
3728         /* XXX Could improve efficiency by separating these all out using a
3729          * macro or in-line function.  At that point regcomp.c would no longer
3730          * have to set the FLAGS fields of these */
3731         case BOUNDL:
3732         case NBOUNDL:
3733             PL_reg_flags |= RF_tainted;
3734             /* FALL THROUGH */
3735         case BOUND:
3736         case BOUNDU:
3737         case BOUNDA:
3738         case NBOUND:
3739         case NBOUNDU:
3740         case NBOUNDA:
3741             /* was last char in word? */
3742             if (utf8_target
3743                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3744                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3745             {
3746                 if (locinput == PL_bostr)
3747                     ln = '\n';
3748                 else {
3749                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3750
3751                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3752                 }
3753                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3754                     ln = isALNUM_uni(ln);
3755                     LOAD_UTF8_CHARCLASS_ALNUM();
3756                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3757                 }
3758                 else {
3759                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3760                     n = isALNUM_LC_utf8((U8*)locinput);
3761                 }
3762             }
3763             else {
3764
3765                 /* Here the string isn't utf8, or is utf8 and only ascii
3766                  * characters are to match \w.  In the latter case looking at
3767                  * the byte just prior to the current one may be just the final
3768                  * byte of a multi-byte character.  This is ok.  There are two
3769                  * cases:
3770                  * 1) it is a single byte character, and then the test is doing
3771                  *      just what it's supposed to.
3772                  * 2) it is a multi-byte character, in which case the final
3773                  *      byte is never mistakable for ASCII, and so the test
3774                  *      will say it is not a word character, which is the
3775                  *      correct answer. */
3776                 ln = (locinput != PL_bostr) ?
3777                     UCHARAT(locinput - 1) : '\n';
3778                 switch (FLAGS(scan)) {
3779                     case REGEX_UNICODE_CHARSET:
3780                         ln = isWORDCHAR_L1(ln);
3781                         n = isWORDCHAR_L1(nextchr);
3782                         break;
3783                     case REGEX_LOCALE_CHARSET:
3784                         ln = isALNUM_LC(ln);
3785                         n = isALNUM_LC(nextchr);
3786                         break;
3787                     case REGEX_DEPENDS_CHARSET:
3788                         ln = isALNUM(ln);
3789                         n = isALNUM(nextchr);
3790                         break;
3791                     case REGEX_ASCII_RESTRICTED_CHARSET:
3792                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3793                         ln = isWORDCHAR_A(ln);
3794                         n = isWORDCHAR_A(nextchr);
3795                         break;
3796                     default:
3797                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3798                         break;
3799                 }
3800             }
3801             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3802              * regcomp.sym */
3803             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3804                     sayNO;
3805             break;
3806         case ANYOFV:
3807         case ANYOF:
3808             if (utf8_target || state_num == ANYOFV) {
3809                 STRLEN inclasslen = PL_regeol - locinput;
3810                 if (locinput >= PL_regeol)
3811                     sayNO;
3812
3813                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3814                     sayNO;
3815                 locinput += inclasslen;
3816                 nextchr = UCHARAT(locinput);
3817                 break;
3818             }
3819             else {
3820                 if (nextchr < 0)
3821                     nextchr = UCHARAT(locinput);
3822                 if (!nextchr && locinput >= PL_regeol)
3823                     sayNO;
3824                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3825                     sayNO;
3826                 nextchr = UCHARAT(++locinput);
3827                 break;
3828             }
3829             break;
3830         /* Special char classes - The defines start on line 129 or so */
3831         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3832                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3833                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3834                   ALNUMA, NALNUMA, isWORDCHAR_A,
3835                   alnum, "a");
3836
3837         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3838                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3839                   SPACEU, NSPACEU, isSPACE_L1,
3840                   SPACEA, NSPACEA, isSPACE_A,
3841                   space, " ");
3842
3843         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3844                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3845                 DIGITA, NDIGITA, isDIGIT_A,
3846                 digit, "0");
3847
3848         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3849                        a Unicode extended Grapheme Cluster */
3850             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3851               extended Grapheme Cluster is:
3852
3853                CR LF
3854                | Prepend* Begin Extend*
3855                | .
3856
3857                Begin is (Hangul-syllable | ! Control)
3858                Extend is (Grapheme_Extend | Spacing_Mark)
3859                Control is [ GCB_Control CR LF ]
3860
3861                The discussion below shows how the code for CLUMP is derived
3862                from this regex.  Note that most of these concepts are from
3863                property values of the Grapheme Cluster Boundary (GCB) property.
3864                No code point can have multiple property values for a given
3865                property.  Thus a code point in Prepend can't be in Control, but
3866                it must be in !Control.  This is why Control above includes
3867                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3868                property separately, and so can't be in GCB_Control, even though
3869                they logically are controls.  Control is not the same as gc=cc,
3870                but includes format and other characters as well.
3871
3872                The Unicode definition of Hangul-syllable is:
3873                    L+
3874                    | (L* ( ( V | LV ) V* | LVT ) T*)
3875                    | T+
3876                   )
3877                Each of these is a value for the GCB property, and hence must be
3878                disjoint, so the order they are tested is immaterial, so the
3879                above can safely be changed to
3880                    T+
3881                    | L+
3882                    | (L* ( LVT | ( V | LV ) V*) T*)
3883
3884                The last two terms can be combined like this:
3885                    L* ( L
3886                         | (( LVT | ( V | LV ) V*) T*))
3887
3888                And refactored into this:
3889                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3890
3891                That means that if we have seen any L's at all we can quit
3892                there, but if the next character is an LVT, a V, or an LV we
3893                should keep going.
3894
3895                There is a subtlety with Prepend* which showed up in testing.
3896                Note that the Begin, and only the Begin is required in:
3897                 | Prepend* Begin Extend*
3898                Also, Begin contains '! Control'.  A Prepend must be a
3899                '!  Control', which means it must also be a Begin.  What it
3900                comes down to is that if we match Prepend* and then find no
3901                suitable Begin afterwards, that if we backtrack the last
3902                Prepend, that one will be a suitable Begin.
3903             */
3904
3905             if (locinput >= PL_regeol)
3906                 sayNO;
3907             if  (! utf8_target) {
3908
3909                 /* Match either CR LF  or '.', as all the other possibilities
3910                  * require utf8 */
3911                 locinput++;         /* Match the . or CR */
3912                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3913                                        match the LF */
3914                     && locinput < PL_regeol
3915                     && UCHARAT(locinput) == '\n') locinput++;
3916             }
3917             else {
3918
3919                 /* Utf8: See if is ( CR LF ); already know that locinput <
3920                  * PL_regeol, so locinput+1 is in bounds */
3921                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3922                     locinput += 2;
3923                 }
3924                 else {
3925                     /* In case have to backtrack to beginning, then match '.' */
3926                     char *starting = locinput;
3927
3928                     /* In case have to backtrack the last prepend */
3929                     char *previous_prepend = 0;
3930
3931                     LOAD_UTF8_CHARCLASS_GCB();
3932
3933                     /* Match (prepend)* */
3934                     while (locinput < PL_regeol
3935                            && swash_fetch(PL_utf8_X_prepend,
3936                                           (U8*)locinput, utf8_target))
3937                     {
3938                         previous_prepend = locinput;
3939                         locinput += UTF8SKIP(locinput);
3940                     }
3941
3942                     /* As noted above, if we matched a prepend character, but
3943                      * the next thing won't match, back off the last prepend we
3944                      * matched, as it is guaranteed to match the begin */
3945                     if (previous_prepend
3946                         && (locinput >=  PL_regeol
3947                             || ! swash_fetch(PL_utf8_X_begin,
3948                                              (U8*)locinput, utf8_target)))
3949                     {
3950                         locinput = previous_prepend;
3951                     }
3952
3953                     /* Note that here we know PL_regeol > locinput, as we
3954                      * tested that upon input to this switch case, and if we
3955                      * moved locinput forward, we tested the result just above
3956                      * and it either passed, or we backed off so that it will
3957                      * now pass */
3958                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3959
3960                         /* Here did not match the required 'Begin' in the
3961                          * second term.  So just match the very first
3962                          * character, the '.' of the final term of the regex */
3963                         locinput = starting + UTF8SKIP(starting);
3964                     } else {
3965
3966                         /* Here is the beginning of a character that can have
3967                          * an extender.  It is either a hangul syllable, or a
3968                          * non-control */
3969                         if (swash_fetch(PL_utf8_X_non_hangul,
3970                                         (U8*)locinput, utf8_target))
3971                         {
3972
3973                             /* Here not a Hangul syllable, must be a
3974                              * ('!  * Control') */
3975                             locinput += UTF8SKIP(locinput);
3976                         } else {
3977
3978                             /* Here is a Hangul syllable.  It can be composed
3979                              * of several individual characters.  One
3980                              * possibility is T+ */
3981                             if (swash_fetch(PL_utf8_X_T,
3982                                             (U8*)locinput, utf8_target))
3983                             {
3984                                 while (locinput < PL_regeol
3985                                         && swash_fetch(PL_utf8_X_T,
3986                                                         (U8*)locinput, utf8_target))
3987                                 {
3988                                     locinput += UTF8SKIP(locinput);
3989                                 }
3990                             } else {
3991
3992                                 /* Here, not T+, but is a Hangul.  That means
3993                                  * it is one of the others: L, LV, LVT or V,
3994                                  * and matches:
3995                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3996
3997                                 /* Match L*           */
3998                                 while (locinput < PL_regeol
3999                                         && swash_fetch(PL_utf8_X_L,
4000                                                         (U8*)locinput, utf8_target))
4001                                 {
4002                                     locinput += UTF8SKIP(locinput);
4003                                 }
4004
4005                                 /* Here, have exhausted L*.  If the next
4006                                  * character is not an LV, LVT nor V, it means
4007                                  * we had to have at least one L, so matches L+
4008                                  * in the original equation, we have a complete
4009                                  * hangul syllable.  Are done. */
4010
4011                                 if (locinput < PL_regeol
4012                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
4013                                                     (U8*)locinput, utf8_target))
4014                                 {
4015
4016                                     /* Otherwise keep going.  Must be LV, LVT
4017                                      * or V.  See if LVT */
4018                                     if (swash_fetch(PL_utf8_X_LVT,
4019                                                     (U8*)locinput, utf8_target))
4020                                     {
4021                                         locinput += UTF8SKIP(locinput);
4022                                     } else {
4023
4024                                         /* Must be  V or LV.  Take it, then
4025                                          * match V*     */
4026                                         locinput += UTF8SKIP(locinput);
4027                                         while (locinput < PL_regeol
4028                                                 && swash_fetch(PL_utf8_X_V,
4029                                                          (U8*)locinput, utf8_target))
4030                                         {
4031                                             locinput += UTF8SKIP(locinput);
4032                                         }
4033                                     }
4034
4035                                     /* And any of LV, LVT, or V can be followed
4036                                      * by T*            */
4037                                     while (locinput < PL_regeol
4038                                            && swash_fetch(PL_utf8_X_T,
4039                                                            (U8*)locinput,
4040                                                            utf8_target))
4041                                     {
4042                                         locinput += UTF8SKIP(locinput);
4043                                     }
4044                                 }
4045                             }
4046                         }
4047
4048                         /* Match any extender */
4049                         while (locinput < PL_regeol
4050                                 && swash_fetch(PL_utf8_X_extend,
4051                                                 (U8*)locinput, utf8_target))
4052                         {
4053                             locinput += UTF8SKIP(locinput);
4054                         }
4055                     }
4056                 }
4057                 if (locinput > PL_regeol) sayNO;
4058             }
4059             nextchr = UCHARAT(locinput);
4060             break;
4061
4062         case NREFFL:
4063         {   /* The capture buffer cases.  The ones beginning with N for the
4064                named buffers just convert to the equivalent numbered and
4065                pretend they were called as the corresponding numbered buffer
4066                op.  */
4067             /* don't initialize these in the declaration, it makes C++
4068                unhappy */
4069             char *s;
4070             char type;
4071             re_fold_t folder;
4072             const U8 *fold_array;
4073             UV utf8_fold_flags;
4074
4075             PL_reg_flags |= RF_tainted;
4076             folder = foldEQ_locale;
4077             fold_array = PL_fold_locale;
4078             type = REFFL;
4079             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4080             goto do_nref;
4081
4082         case NREFFA:
4083             folder = foldEQ_latin1;
4084             fold_array = PL_fold_latin1;
4085             type = REFFA;
4086             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4087             goto do_nref;
4088
4089         case NREFFU:
4090             folder = foldEQ_latin1;
4091             fold_array = PL_fold_latin1;
4092             type = REFFU;
4093             utf8_fold_flags = 0;
4094             goto do_nref;
4095
4096         case NREFF:
4097             folder = foldEQ;
4098             fold_array = PL_fold;
4099             type = REFF;
4100             utf8_fold_flags = 0;
4101             goto do_nref;
4102
4103         case NREF:
4104             type = REF;
4105             folder = NULL;
4106             fold_array = NULL;
4107             utf8_fold_flags = 0;
4108           do_nref:
4109
4110             /* For the named back references, find the corresponding buffer
4111              * number */
4112             n = reg_check_named_buff_matched(rex,scan);
4113
4114             if ( ! n ) {
4115                 sayNO;
4116             }
4117             goto do_nref_ref_common;
4118
4119         case REFFL:
4120             PL_reg_flags |= RF_tainted;
4121             folder = foldEQ_locale;
4122             fold_array = PL_fold_locale;
4123             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4124             goto do_ref;
4125
4126         case REFFA:
4127             folder = foldEQ_latin1;
4128             fold_array = PL_fold_latin1;
4129             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4130             goto do_ref;
4131
4132         case REFFU:
4133             folder = foldEQ_latin1;
4134             fold_array = PL_fold_latin1;
4135             utf8_fold_flags = 0;
4136             goto do_ref;
4137
4138         case REFF:
4139             folder = foldEQ;
4140             fold_array = PL_fold;
4141             utf8_fold_flags = 0;
4142             goto do_ref;
4143
4144         case REF:
4145             folder = NULL;
4146             fold_array = NULL;
4147             utf8_fold_flags = 0;
4148
4149           do_ref:
4150             type = OP(scan);
4151             n = ARG(scan);  /* which paren pair */
4152
4153           do_nref_ref_common:
4154             ln = PL_regoffs[n].start;
4155             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4156             if (*PL_reglastparen < n || ln == -1)
4157                 sayNO;                  /* Do not match unless seen CLOSEn. */
4158             if (ln == PL_regoffs[n].end)
4159                 break;
4160
4161             s = PL_bostr + ln;
4162             if (type != REF     /* REF can do byte comparison */
4163                 && (utf8_target || type == REFFU))
4164             { /* XXX handle REFFL better */
4165                 char * limit = PL_regeol;
4166
4167                 /* This call case insensitively compares the entire buffer
4168                     * at s, with the current input starting at locinput, but
4169                     * not going off the end given by PL_regeol, and returns in
4170                     * limit upon success, how much of the current input was
4171                     * matched */
4172                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4173                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4174                 {
4175                     sayNO;
4176                 }
4177                 locinput = limit;
4178                 nextchr = UCHARAT(locinput);
4179                 break;
4180             }
4181
4182             /* Not utf8:  Inline the first character, for speed. */
4183             if (UCHARAT(s) != nextchr &&
4184                 (type == REF ||
4185                  UCHARAT(s) != fold_array[nextchr]))
4186                 sayNO;
4187             ln = PL_regoffs[n].end - ln;
4188             if (locinput + ln > PL_regeol)
4189                 sayNO;
4190             if (ln > 1 && (type == REF
4191                            ? memNE(s, locinput, ln)
4192                            : ! folder(s, locinput, ln)))
4193                 sayNO;
4194             locinput += ln;
4195             nextchr = UCHARAT(locinput);
4196             break;
4197         }
4198         case NOTHING:
4199         case TAIL:
4200             break;
4201         case BACK:
4202             break;
4203
4204 #undef  ST
4205 #define ST st->u.eval
4206         {
4207             SV *ret;
4208             REGEXP *re_sv;
4209             regexp *re;
4210             regexp_internal *rei;
4211             regnode *startpoint;
4212
4213         case GOSTART:
4214         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4215             if (cur_eval && cur_eval->locinput==locinput) {
4216                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4217                     Perl_croak(aTHX_ "Infinite recursion in regex");
4218                 if ( ++nochange_depth > max_nochange_depth )
4219                     Perl_croak(aTHX_
4220                         "Pattern subroutine nesting without pos change"
4221                         " exceeded limit in regex");
4222             } else {
4223                 nochange_depth = 0;
4224             }
4225             re_sv = rex_sv;
4226             re = rex;
4227             rei = rexi;
4228             (void)ReREFCNT_inc(rex_sv);
4229             if (OP(scan)==GOSUB) {
4230                 startpoint = scan + ARG2L(scan);
4231                 ST.close_paren = ARG(scan);
4232             } else {
4233                 startpoint = rei->program+1;
4234                 ST.close_paren = 0;
4235             }
4236             goto eval_recurse_doit;
4237             /* NOTREACHED */
4238         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4239             if (cur_eval && cur_eval->locinput==locinput) {
4240                 if ( ++nochange_depth > max_nochange_depth )
4241                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4242             } else {
4243                 nochange_depth = 0;
4244             }
4245             {
4246                 /* execute the code in the {...} */
4247                 dSP;
4248                 SV ** const before = SP;
4249                 OP_4tree * const oop = PL_op;
4250                 COP * const ocurcop = PL_curcop;
4251                 PAD *old_comppad;
4252                 char *saved_regeol = PL_regeol;
4253                 struct re_save_state saved_state;
4254
4255                 /* To not corrupt the existing regex state while executing the
4256                  * eval we would normally put it on the save stack, like with
4257                  * save_re_context. However, re-evals have a weird scoping so we
4258                  * can't just add ENTER/LEAVE here. With that, things like
4259                  *
4260                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4261                  *
4262                  * would break, as they expect the localisation to be unwound
4263                  * only when the re-engine backtracks through the bit that
4264                  * localised it.
4265                  *
4266                  * What we do instead is just saving the state in a local c
4267                  * variable.
4268                  */
4269                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4270
4271                 n = ARG(scan);
4272                 PL_op = (OP_4tree*)rexi->data->data[n];
4273                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4274                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4275                 /* wrap the call in two SAVECOMPPADs. This ensures that
4276                  * when the save stack is eventually unwound, all the
4277                  * accumulated SAVEt_CLEARSV's will be processed with
4278                  * interspersed SAVEt_COMPPAD's to ensure that lexicals
4279                  * are cleared in the right pad */
4280                 SAVECOMPPAD();
4281                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4282                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4283
4284                 if (sv_yes_mark) {
4285                     SV *sv_mrk = get_sv("REGMARK", 1);
4286                     sv_setsv(sv_mrk, sv_yes_mark);
4287                 }
4288
4289                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4290                 SPAGAIN;
4291                 if (SP == before)
4292                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4293                 else {
4294                     ret = POPs;
4295                     PUTBACK;
4296                 }
4297
4298                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4299
4300                 PL_op = oop;
4301                 SAVECOMPPAD();
4302                 PAD_RESTORE_LOCAL(old_comppad);
4303                 PL_curcop = ocurcop;
4304                 PL_regeol = saved_regeol;
4305                 if (!logical) {
4306                     /* /(?{...})/ */
4307                     sv_setsv(save_scalar(PL_replgv), ret);
4308                     break;
4309                 }
4310             }
4311             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4312                 logical = 0;
4313                 {
4314                     /* extract RE object from returned value; compiling if
4315                      * necessary */
4316                     MAGIC *mg = NULL;
4317                     REGEXP *rx = NULL;
4318
4319                     if (SvROK(ret)) {
4320                         SV *const sv = SvRV(ret);
4321
4322                         if (SvTYPE(sv) == SVt_REGEXP) {
4323                             rx = (REGEXP*) sv;
4324                         } else if (SvSMAGICAL(sv)) {
4325                             mg = mg_find(sv, PERL_MAGIC_qr);
4326                             assert(mg);
4327                         }
4328                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4329                         rx = (REGEXP*) ret;
4330                     } else if (SvSMAGICAL(ret)) {
4331                         if (SvGMAGICAL(ret)) {
4332                             /* I don't believe that there is ever qr magic
4333                                here.  */
4334                             assert(!mg_find(ret, PERL_MAGIC_qr));
4335                             sv_unmagic(ret, PERL_MAGIC_qr);
4336                         }
4337                         else {
4338                             mg = mg_find(ret, PERL_MAGIC_qr);
4339                             /* testing suggests mg only ends up non-NULL for
4340                                scalars who were upgraded and compiled in the
4341                                else block below. In turn, this is only
4342                                triggered in the "postponed utf8 string" tests
4343                                in t/op/pat.t  */
4344                         }
4345                     }
4346
4347                     if (mg) {
4348                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4349                         assert(rx);
4350                     }
4351                     if (rx) {
4352                         rx = reg_temp_copy(NULL, rx);
4353                     }
4354                     else {
4355                         U32 pm_flags = 0;
4356                         const I32 osize = PL_regsize;
4357
4358                         if (DO_UTF8(ret)) {
4359                             assert (SvUTF8(ret));
4360                         } else if (SvUTF8(ret)) {
4361                             /* Not doing UTF-8, despite what the SV says. Is
4362                                this only if we're trapped in use 'bytes'?  */
4363                             /* Make a copy of the octet sequence, but without
4364                                the flag on, as the compiler now honours the
4365                                SvUTF8 flag on ret.  */
4366                             STRLEN len;
4367                             const char *const p = SvPV(ret, len);
4368                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4369                         }
4370                         rx = CALLREGCOMP(ret, pm_flags);
4371                         if (!(SvFLAGS(ret)
4372                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4373                                  | SVs_GMG))) {
4374                             /* This isn't a first class regexp. Instead, it's
4375                                caching a regexp onto an existing, Perl visible
4376                                scalar.  */
4377                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4378                         }
4379                         PL_regsize = osize;
4380                     }
4381                     re_sv = rx;
4382                     re = (struct regexp *)SvANY(rx);
4383                 }
4384                 RXp_MATCH_COPIED_off(re);
4385                 re->subbeg = rex->subbeg;
4386                 re->sublen = rex->sublen;
4387                 rei = RXi_GET(re);
4388                 DEBUG_EXECUTE_r(
4389                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4390                         "Matching embedded");
4391                 );
4392                 startpoint = rei->program + 1;
4393                 ST.close_paren = 0; /* only used for GOSUB */
4394                 /* borrowed from regtry */
4395                 if (PL_reg_start_tmpl <= re->nparens) {
4396                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4397                     if(PL_reg_start_tmp)
4398                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4399                     else
4400                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4401                 }
4402
4403         eval_recurse_doit: /* Share code with GOSUB below this line */
4404                 /* run the pattern returned from (??{...}) */
4405                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4406                 REGCP_SET(ST.lastcp);
4407
4408                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4409
4410                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4411                 PL_reglastparen = &re->lastparen;
4412                 PL_reglastcloseparen = &re->lastcloseparen;
4413                 re->lastparen = 0;
4414                 re->lastcloseparen = 0;
4415
4416                 PL_reginput = locinput;
4417                 PL_regsize = 0;
4418
4419                 /* XXXX This is too dramatic a measure... */
4420                 PL_reg_maxiter = 0;
4421
4422                 ST.toggle_reg_flags = PL_reg_flags;
4423                 if (RX_UTF8(re_sv))
4424                     PL_reg_flags |= RF_utf8;
4425                 else
4426                     PL_reg_flags &= ~RF_utf8;
4427                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4428
4429                 ST.prev_rex = rex_sv;
4430                 ST.prev_curlyx = cur_curlyx;
4431                 SETREX(rex_sv,re_sv);
4432                 rex = re;
4433                 rexi = rei;
4434                 cur_curlyx = NULL;
4435                 ST.B = next;
4436                 ST.prev_eval = cur_eval;
4437                 cur_eval = st;
4438                 /* now continue from first node in postoned RE */
4439                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4440                 /* NOTREACHED */
4441             }
4442             /* logical is 1,   /(?(?{...})X|Y)/ */
4443             sw = cBOOL(SvTRUE(ret));
4444             logical = 0;
4445             break;
4446         }
4447
4448         case EVAL_AB: /* cleanup after a successful (??{A})B */
4449             /* note: this is called twice; first after popping B, then A */
4450             PL_reg_flags ^= ST.toggle_reg_flags;
4451             ReREFCNT_dec(rex_sv);
4452             SETREX(rex_sv,ST.prev_rex);
4453             rex = (struct regexp *)SvANY(rex_sv);
4454             rexi = RXi_GET(rex);
4455             regcpblow(ST.cp);
4456             cur_eval = ST.prev_eval;
4457             cur_curlyx = ST.prev_curlyx;
4458
4459             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4460             PL_reglastparen = &rex->lastparen;
4461             PL_reglastcloseparen = &rex->lastcloseparen;
4462             /* also update PL_regoffs */
4463             PL_regoffs = rex->offs;
4464
4465             /* XXXX This is too dramatic a measure... */
4466             PL_reg_maxiter = 0;
4467             if ( nochange_depth )
4468                 nochange_depth--;
4469             sayYES;
4470
4471
4472         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4473             /* note: this is called twice; first after popping B, then A */
4474             PL_reg_flags ^= ST.toggle_reg_flags;
4475             ReREFCNT_dec(rex_sv);
4476             SETREX(rex_sv,ST.prev_rex);
4477             rex = (struct regexp *)SvANY(rex_sv);
4478             rexi = RXi_GET(rex);
4479             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4480             PL_reglastparen = &rex->lastparen;
4481             PL_reglastcloseparen = &rex->lastcloseparen;
4482
4483             PL_reginput = locinput;
4484             REGCP_UNWIND(ST.lastcp);
4485             regcppop(rex);
4486             cur_eval = ST.prev_eval;
4487             cur_curlyx = ST.prev_curlyx;
4488             /* XXXX This is too dramatic a measure... */
4489             PL_reg_maxiter = 0;
4490             if ( nochange_depth )
4491                 nochange_depth--;
4492             sayNO_SILENT;
4493 #undef ST
4494
4495         case OPEN:
4496             n = ARG(scan);  /* which paren pair */
4497             PL_reg_start_tmp[n] = locinput;
4498             if (n > PL_regsize)
4499                 PL_regsize = n;
4500             lastopen = n;
4501             break;
4502         case CLOSE:
4503             n = ARG(scan);  /* which paren pair */
4504             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4505             PL_regoffs[n].end = locinput - PL_bostr;
4506             /*if (n > PL_regsize)
4507                 PL_regsize = n;*/
4508             if (n > *PL_reglastparen)
4509                 *PL_reglastparen = n;
4510             *PL_reglastcloseparen = n;
4511             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4512                 goto fake_end;
4513             }
4514             break;
4515         case ACCEPT:
4516             if (ARG(scan)){
4517                 regnode *cursor;
4518                 for (cursor=scan;
4519                      cursor && OP(cursor)!=END;
4520                      cursor=regnext(cursor))
4521                 {
4522                     if ( OP(cursor)==CLOSE ){
4523                         n = ARG(cursor);
4524                         if ( n <= lastopen ) {
4525                             PL_regoffs[n].start
4526                                 = PL_reg_start_tmp[n] - PL_bostr;
4527                             PL_regoffs[n].end = locinput - PL_bostr;
4528                             /*if (n > PL_regsize)
4529                             PL_regsize = n;*/
4530                             if (n > *PL_reglastparen)
4531                                 *PL_reglastparen = n;
4532                             *PL_reglastcloseparen = n;
4533                             if ( n == ARG(scan) || (cur_eval &&
4534                                 cur_eval->u.eval.close_paren == n))
4535                                 break;
4536                         }
4537                     }
4538                 }
4539             }
4540             goto fake_end;
4541             /*NOTREACHED*/
4542         case GROUPP:
4543             n = ARG(scan);  /* which paren pair */
4544             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4545             break;
4546         case NGROUPP:
4547             /* reg_check_named_buff_matched returns 0 for no match */
4548             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4549             break;
4550         case INSUBP:
4551             n = ARG(scan);
4552             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4553             break;
4554         case DEFINEP:
4555             sw = 0;
4556             break;
4557         case IFTHEN:
4558             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4559             if (sw)
4560                 next = NEXTOPER(NEXTOPER(scan));
4561             else {
4562                 next = scan + ARG(scan);
4563                 if (OP(next) == IFTHEN) /* Fake one. */
4564                     next = NEXTOPER(NEXTOPER(next));
4565             }
4566             break;
4567         case LOGICAL:
4568             logical = scan->flags;
4569             break;
4570
4571 /*******************************************************************
4572
4573 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4574 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4575 STAR/PLUS/CURLY/CURLYN are used instead.)
4576
4577 A*B is compiled as <CURLYX><A><WHILEM><B>
4578
4579 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4580 state, which contains the current count, initialised to -1. It also sets
4581 cur_curlyx to point to this state, with any previous value saved in the
4582 state block.
4583
4584 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4585 since the pattern may possibly match zero times (i.e. it's a while {} loop
4586 rather than a do {} while loop).
4587
4588 Each entry to WHILEM represents a successful match of A. The count in the
4589 CURLYX block is incremented, another WHILEM state is pushed, and execution
4590 passes to A or B depending on greediness and the current count.
4591
4592 For example, if matching against the string a1a2a3b (where the aN are
4593 substrings that match /A/), then the match progresses as follows: (the
4594 pushed states are interspersed with the bits of strings matched so far):
4595
4596     <CURLYX cnt=-1>
4597     <CURLYX cnt=0><WHILEM>
4598     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4599     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4600     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4601     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4602
4603 (Contrast this with something like CURLYM, which maintains only a single
4604 backtrack state:
4605
4606     <CURLYM cnt=0> a1
4607     a1 <CURLYM cnt=1> a2
4608     a1 a2 <CURLYM cnt=2> a3
4609     a1 a2 a3 <CURLYM cnt=3> b
4610 )
4611
4612 Each WHILEM state block marks a point to backtrack to upon partial failure
4613 of A or B, and also contains some minor state data related to that
4614 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4615 overall state, such as the count, and pointers to the A and B ops.
4616
4617 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4618 must always point to the *current* CURLYX block, the rules are:
4619
4620 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4621 and set cur_curlyx to point the new block.
4622
4623 When popping the CURLYX block after a successful or unsuccessful match,
4624 restore the previous cur_curlyx.
4625
4626 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4627 to the outer one saved in the CURLYX block.
4628
4629 When popping the WHILEM block after a successful or unsuccessful B match,
4630 restore the previous cur_curlyx.
4631
4632 Here's an example for the pattern (AI* BI)*BO
4633 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4634
4635 cur_
4636 curlyx backtrack stack
4637 ------ ---------------
4638 NULL
4639 CO     <CO prev=NULL> <WO>
4640 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4641 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4642 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4643
4644 At this point the pattern succeeds, and we work back down the stack to
4645 clean up, restoring as we go:
4646
4647 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4648 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4649 CO     <CO prev=NULL> <WO>
4650 NULL
4651
4652 *******************************************************************/
4653
4654 #define ST st->u.curlyx
4655
4656         case CURLYX:    /* start of /A*B/  (for complex A) */
4657         {
4658             /* No need to save/restore up to this paren */
4659             I32 parenfloor = scan->flags;
4660
4661             assert(next); /* keep Coverity happy */
4662             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4663                 next += ARG(next);
4664
4665             /* XXXX Probably it is better to teach regpush to support
4666                parenfloor > PL_regsize... */
4667             if (parenfloor > (I32)*PL_reglastparen)
4668                 parenfloor = *PL_reglastparen; /* Pessimization... */
4669
4670             ST.prev_curlyx= cur_curlyx;
4671             cur_curlyx = st;
4672             ST.cp = PL_savestack_ix;
4673
4674             /* these fields contain the state of the current curly.
4675              * they are accessed by subsequent WHILEMs */
4676             ST.parenfloor = parenfloor;
4677             ST.me = scan;
4678             ST.B = next;
4679             ST.minmod = minmod;
4680             minmod = 0;
4681             ST.count = -1;      /* this will be updated by WHILEM */
4682             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4683
4684             PL_reginput = locinput;
4685             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4686             /* NOTREACHED */
4687         }
4688
4689         case CURLYX_end: /* just finished matching all of A*B */
4690             cur_curlyx = ST.prev_curlyx;
4691             sayYES;
4692             /* NOTREACHED */
4693
4694         case CURLYX_end_fail: /* just failed to match all of A*B */
4695             regcpblow(ST.cp);
4696             cur_curlyx = ST.prev_curlyx;
4697             sayNO;
4698             /* NOTREACHED */
4699
4700
4701 #undef ST
4702 #define ST st->u.whilem
4703
4704         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4705         {
4706             /* see the discussion above about CURLYX/WHILEM */
4707             I32 n;
4708             int min = ARG1(cur_curlyx->u.curlyx.me);
4709             int max = ARG2(cur_curlyx->u.curlyx.me);
4710             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4711
4712             assert(cur_curlyx); /* keep Coverity happy */
4713             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4714             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4715             ST.cache_offset = 0;
4716             ST.cache_mask = 0;
4717
4718             PL_reginput = locinput;
4719
4720             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4721                   "%*s  whilem: matched %ld out of %d..%d\n",
4722                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4723             );
4724
4725             /* First just match a string of min A's. */
4726
4727             if (n < min) {
4728                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4729                 cur_curlyx->u.curlyx.lastloc = locinput;
4730                 REGCP_SET(ST.lastcp);
4731
4732                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4733                 /* NOTREACHED */
4734             }
4735
4736             /* If degenerate A matches "", assume A done. */
4737
4738             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4739                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4740                    "%*s  whilem: empty match detected, trying continuation...\n",
4741                    REPORT_CODE_OFF+depth*2, "")
4742                 );
4743                 goto do_whilem_B_max;
4744             }
4745
4746             /* super-linear cache processing */
4747
4748             if (scan->flags) {
4749
4750                 if (!PL_reg_maxiter) {
4751                     /* start the countdown: Postpone detection until we
4752                      * know the match is not *that* much linear. */
4753                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4754                     /* possible overflow for long strings and many CURLYX's */
4755                     if (PL_reg_maxiter < 0)
4756                         PL_reg_maxiter = I32_MAX;
4757                     PL_reg_leftiter = PL_reg_maxiter;
4758                 }
4759
4760                 if (PL_reg_leftiter-- == 0) {
4761                     /* initialise cache */
4762                     const I32 size = (PL_reg_maxiter + 7)/8;
4763                     if (PL_reg_poscache) {
4764                         if ((I32)PL_reg_poscache_size < size) {
4765                             Renew(PL_reg_poscache, size, char);
4766                             PL_reg_poscache_size = size;
4767                         }
4768                         Zero(PL_reg_poscache, size, char);
4769                     }
4770                     else {
4771                         PL_reg_poscache_size = size;
4772                         Newxz(PL_reg_poscache, size, char);
4773                     }
4774                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4775       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4776                               PL_colors[4], PL_colors[5])
4777                     );
4778                 }
4779
4780                 if (PL_reg_leftiter < 0) {
4781                     /* have we already failed at this position? */
4782                     I32 offset, mask;
4783                     offset  = (scan->flags & 0xf) - 1
4784                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4785                     mask    = 1 << (offset % 8);
4786                     offset /= 8;
4787                     if (PL_reg_poscache[offset] & mask) {
4788                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4789                             "%*s  whilem: (cache) already tried at this position...\n",
4790                             REPORT_CODE_OFF+depth*2, "")
4791                         );
4792                         sayNO; /* cache records failure */
4793                     }
4794                     ST.cache_offset = offset;
4795                     ST.cache_mask   = mask;
4796                 }
4797             }
4798
4799             /* Prefer B over A for minimal matching. */
4800
4801             if (cur_curlyx->u.curlyx.minmod) {
4802                 ST.save_curlyx = cur_curlyx;
4803                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4804                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4805                 REGCP_SET(ST.lastcp);
4806                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4807                 /* NOTREACHED */
4808             }
4809
4810             /* Prefer A over B for maximal matching. */
4811
4812             if (n < max) { /* More greed allowed? */
4813                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4814                 cur_curlyx->u.curlyx.lastloc = locinput;
4815                 REGCP_SET(ST.lastcp);
4816                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4817                 /* NOTREACHED */
4818             }
4819             goto do_whilem_B_max;
4820         }
4821         /* NOTREACHED */
4822
4823         case WHILEM_B_min: /* just matched B in a minimal match */
4824         case WHILEM_B_max: /* just matched B in a maximal match */
4825             cur_curlyx = ST.save_curlyx;
4826             sayYES;
4827             /* NOTREACHED */
4828
4829         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4830             cur_curlyx = ST.save_curlyx;
4831             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4832             cur_curlyx->u.curlyx.count--;
4833             CACHEsayNO;
4834             /* NOTREACHED */
4835
4836         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4837             /* FALL THROUGH */
4838         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4839             REGCP_UNWIND(ST.lastcp);
4840             regcppop(rex);
4841             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4842             cur_curlyx->u.curlyx.count--;
4843             CACHEsayNO;
4844             /* NOTREACHED */
4845
4846         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4847             REGCP_UNWIND(ST.lastcp);
4848             regcppop(rex);      /* Restore some previous $<digit>s? */
4849             PL_reginput = locinput;
4850             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4851                 "%*s  whilem: failed, trying continuation...\n",
4852                 REPORT_CODE_OFF+depth*2, "")
4853             );
4854           do_whilem_B_max:
4855             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4856                 && ckWARN(WARN_REGEXP)
4857                 && !(PL_reg_flags & RF_warned))
4858             {
4859                 PL_reg_flags |= RF_warned;
4860                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4861                      "Complex regular subexpression recursion limit (%d) "
4862                      "exceeded",
4863                      REG_INFTY - 1);
4864             }
4865
4866             /* now try B */
4867             ST.save_curlyx = cur_curlyx;
4868             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4869             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4870             /* NOTREACHED */
4871
4872         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4873             cur_curlyx = ST.save_curlyx;
4874             REGCP_UNWIND(ST.lastcp);
4875             regcppop(rex);
4876
4877             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4878                 /* Maximum greed exceeded */
4879                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4880                     && ckWARN(WARN_REGEXP)
4881                     && !(PL_reg_flags & RF_warned))
4882                 {
4883                     PL_reg_flags |= RF_warned;
4884                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4885                         "Complex regular subexpression recursion "
4886                         "limit (%d) exceeded",
4887                         REG_INFTY - 1);
4888                 }
4889                 cur_curlyx->u.curlyx.count--;
4890                 CACHEsayNO;
4891             }
4892
4893             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4894                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4895             );
4896             /* Try grabbing another A and see if it helps. */
4897             PL_reginput = locinput;
4898             cur_curlyx->u.curlyx.lastloc = locinput;
4899             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4900             REGCP_SET(ST.lastcp);
4901             PUSH_STATE_GOTO(WHILEM_A_min,
4902                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4903             /* NOTREACHED */
4904
4905 #undef  ST
4906 #define ST st->u.branch
4907
4908         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4909             next = scan + ARG(scan);
4910             if (next == scan)
4911                 next = NULL;
4912             scan = NEXTOPER(scan);
4913             /* FALL THROUGH */
4914
4915         case BRANCH:        /*  /(...|A|...)/ */
4916             scan = NEXTOPER(scan); /* scan now points to inner node */
4917             ST.lastparen = *PL_reglastparen;
4918             ST.next_branch = next;
4919             REGCP_SET(ST.cp);
4920             PL_reginput = locinput;
4921
4922             /* Now go into the branch */
4923             if (has_cutgroup) {
4924                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4925             } else {
4926                 PUSH_STATE_GOTO(BRANCH_next, scan);
4927             }
4928             /* NOTREACHED */
4929         case CUTGROUP:
4930             PL_reginput = locinput;
4931             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4932                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4933             PUSH_STATE_GOTO(CUTGROUP_next,next);
4934             /* NOTREACHED */
4935         case CUTGROUP_next_fail:
4936             do_cutgroup = 1;
4937             no_final = 1;
4938             if (st->u.mark.mark_name)
4939                 sv_commit = st->u.mark.mark_name;
4940             sayNO;
4941             /* NOTREACHED */
4942         case BRANCH_next:
4943             sayYES;
4944             /* NOTREACHED */
4945         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4946             if (do_cutgroup) {
4947                 do_cutgroup = 0;
4948                 no_final = 0;
4949             }
4950             REGCP_UNWIND(ST.cp);
4951             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4952                 PL_regoffs[n].end = -1;
4953             *PL_reglastparen = n;
4954             /*dmq: *PL_reglastcloseparen = n; */
4955             scan = ST.next_branch;
4956             /* no more branches? */
4957             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4958                 DEBUG_EXECUTE_r({
4959                     PerlIO_printf( Perl_debug_log,
4960                         "%*s  %sBRANCH failed...%s\n",
4961                         REPORT_CODE_OFF+depth*2, "",
4962                         PL_colors[4],
4963                         PL_colors[5] );
4964                 });
4965                 sayNO_SILENT;
4966             }
4967             continue; /* execute next BRANCH[J] op */
4968             /* NOTREACHED */
4969
4970         case MINMOD:
4971             minmod = 1;
4972             break;
4973
4974 #undef  ST
4975 #define ST st->u.curlym
4976
4977         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4978
4979             /* This is an optimisation of CURLYX that enables us to push
4980              * only a single backtracking state, no matter how many matches
4981              * there are in {m,n}. It relies on the pattern being constant
4982              * length, with no parens to influence future backrefs
4983              */
4984
4985             ST.me = scan;
4986             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4987
4988             /* if paren positive, emulate an OPEN/CLOSE around A */
4989             if (ST.me->flags) {
4990                 U32 paren = ST.me->flags;
4991                 if (paren > PL_regsize)
4992                     PL_regsize = paren;
4993                 if (paren > *PL_reglastparen)
4994                     *PL_reglastparen = paren;
4995                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4996             }
4997             ST.A = scan;
4998             ST.B = next;
4999             ST.alen = 0;
5000             ST.count = 0;
5001             ST.minmod = minmod;
5002             minmod = 0;
5003             ST.c1 = CHRTEST_UNINIT;
5004             REGCP_SET(ST.cp);
5005
5006             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5007                 goto curlym_do_B;
5008
5009           curlym_do_A: /* execute the A in /A{m,n}B/  */
5010             PL_reginput = locinput;
5011             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5012             /* NOTREACHED */
5013
5014         case CURLYM_A: /* we've just matched an A */
5015             locinput = st->locinput;
5016             nextchr = UCHARAT(locinput);
5017
5018             ST.count++;
5019             /* after first match, determine A's length: u.curlym.alen */
5020             if (ST.count == 1) {
5021                 if (PL_reg_match_utf8) {
5022                     char *s = locinput;
5023                     while (s < PL_reginput) {
5024                         ST.alen++;
5025                         s += UTF8SKIP(s);
5026                     }
5027                 }
5028                 else {
5029                     ST.alen = PL_reginput - locinput;
5030                 }
5031                 if (ST.alen == 0)
5032                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5033             }
5034             DEBUG_EXECUTE_r(
5035                 PerlIO_printf(Perl_debug_log,
5036                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5037                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5038                           (IV) ST.count, (IV)ST.alen)
5039             );
5040
5041             locinput = PL_reginput;
5042
5043             if (cur_eval && cur_eval->u.eval.close_paren &&
5044                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5045                 goto fake_end;
5046
5047             {
5048                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5049                 if ( max == REG_INFTY || ST.count < max )
5050                     goto curlym_do_A; /* try to match another A */
5051             }
5052             goto curlym_do_B; /* try to match B */
5053
5054         case CURLYM_A_fail: /* just failed to match an A */
5055             REGCP_UNWIND(ST.cp);
5056
5057             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5058                 || (cur_eval && cur_eval->u.eval.close_paren &&
5059                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5060                 sayNO;
5061
5062           curlym_do_B: /* execute the B in /A{m,n}B/  */
5063             PL_reginput = locinput;
5064             if (ST.c1 == CHRTEST_UNINIT) {
5065                 /* calculate c1 and c2 for possible match of 1st char
5066                  * following curly */
5067                 ST.c1 = ST.c2 = CHRTEST_VOID;
5068                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5069                     regnode *text_node = ST.B;
5070                     if (! HAS_TEXT(text_node))
5071                         FIND_NEXT_IMPT(text_node);
5072                     /* this used to be
5073
5074                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5075
5076                         But the former is redundant in light of the latter.
5077
5078                         if this changes back then the macro for
5079                         IS_TEXT and friends need to change.
5080                      */
5081                     if (PL_regkind[OP(text_node)] == EXACT)
5082                     {
5083
5084                         ST.c1 = (U8)*STRING(text_node);
5085                         switch (OP(text_node)) {
5086                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5087                             case EXACTFA:
5088                             case EXACTFU_SS:
5089                             case EXACTFU_TRICKYFOLD:
5090                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5091                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5092                             default: ST.c2 = ST.c1;
5093                         }
5094                     }
5095                 }
5096             }
5097
5098             DEBUG_EXECUTE_r(
5099                 PerlIO_printf(Perl_debug_log,
5100                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5101                     (int)(REPORT_CODE_OFF+(depth*2)),
5102                     "", (IV)ST.count)
5103                 );
5104             if (ST.c1 != CHRTEST_VOID
5105                     && UCHARAT(PL_reginput) != ST.c1
5106                     && UCHARAT(PL_reginput) != ST.c2)
5107             {
5108                 /* simulate B failing */
5109                 DEBUG_OPTIMISE_r(
5110                     PerlIO_printf(Perl_debug_log,
5111                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5112                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5113                         (IV)ST.c1,(IV)ST.c2
5114                 ));
5115                 state_num = CURLYM_B_fail;
5116                 goto reenter_switch;
5117             }
5118
5119             if (ST.me->flags) {
5120                 /* mark current A as captured */
5121                 I32 paren = ST.me->flags;
5122                 if (ST.count) {
5123                     PL_regoffs[paren].start
5124                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5125                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5126                     /*dmq: *PL_reglastcloseparen = paren; */
5127                 }
5128                 else
5129                     PL_regoffs[paren].end = -1;
5130                 if (cur_eval && cur_eval->u.eval.close_paren &&
5131                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5132                 {
5133                     if (ST.count)
5134                         goto fake_end;
5135                     else
5136                         sayNO;
5137                 }
5138             }
5139
5140             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5141             /* NOTREACHED */
5142
5143         case CURLYM_B_fail: /* just failed to match a B */
5144             REGCP_UNWIND(ST.cp);
5145             if (ST.minmod) {
5146                 I32 max = ARG2(ST.me);
5147                 if (max != REG_INFTY && ST.count == max)
5148                     sayNO;
5149                 goto curlym_do_A; /* try to match a further A */
5150             }
5151             /* backtrack one A */
5152             if (ST.count == ARG1(ST.me) /* min */)
5153                 sayNO;
5154             ST.count--;
5155             locinput = HOPc(locinput, -ST.alen);
5156             goto curlym_do_B; /* try to match B */
5157
5158 #undef ST
5159 #define ST st->u.curly
5160
5161 #define CURLY_SETPAREN(paren, success) \
5162     if (paren) { \
5163         if (success) { \
5164             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5165             PL_regoffs[paren].end = locinput - PL_bostr; \
5166             *PL_reglastcloseparen = paren; \
5167         } \
5168         else \
5169             PL_regoffs[paren].end = -1; \
5170     }
5171
5172         case STAR:              /*  /A*B/ where A is width 1 */
5173             ST.paren = 0;
5174             ST.min = 0;
5175             ST.max = REG_INFTY;
5176             scan = NEXTOPER(scan);
5177             goto repeat;
5178         case PLUS:              /*  /A+B/ where A is width 1 */
5179             ST.paren = 0;
5180             ST.min = 1;
5181             ST.max = REG_INFTY;
5182             scan = NEXTOPER(scan);
5183             goto repeat;
5184         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5185             ST.paren = scan->flags;     /* Which paren to set */
5186             if (ST.paren > PL_regsize)
5187                 PL_regsize = ST.paren;
5188             if (ST.paren > *PL_reglastparen)
5189                 *PL_reglastparen = ST.paren;
5190             ST.min = ARG1(scan);  /* min to match */
5191             ST.max = ARG2(scan);  /* max to match */
5192             if (cur_eval && cur_eval->u.eval.close_paren &&
5193                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5194                 ST.min=1;
5195                 ST.max=1;
5196             }
5197             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5198             goto repeat;
5199         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5200             ST.paren = 0;
5201             ST.min = ARG1(scan);  /* min to match */
5202             ST.max = ARG2(scan);  /* max to match */
5203             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5204           repeat:
5205             /*
5206             * Lookahead to avoid useless match attempts
5207             * when we know what character comes next.
5208             *
5209             * Used to only do .*x and .*?x, but now it allows
5210             * for )'s, ('s and (?{ ... })'s to be in the way
5211             * of the quantifier and the EXACT-like node.  -- japhy
5212             */
5213
5214             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5215                 sayNO;
5216             if (HAS_TEXT(next) || JUMPABLE(next)) {
5217                 U8 *s;
5218                 regnode *text_node = next;
5219
5220                 if (! HAS_TEXT(text_node))
5221                     FIND_NEXT_IMPT(text_node);
5222
5223                 if (! HAS_TEXT(text_node))
5224                     ST.c1 = ST.c2 = CHRTEST_VOID;
5225                 else {
5226                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5227                         ST.c1 = ST.c2 = CHRTEST_VOID;
5228                         goto assume_ok_easy;
5229                     }
5230                     else
5231                         s = (U8*)STRING(text_node);
5232
5233                     /*  Currently we only get here when
5234
5235                         PL_rekind[OP(text_node)] == EXACT
5236
5237                         if this changes back then the macro for IS_TEXT and
5238                         friends need to change. */
5239                     if (!UTF_PATTERN) {
5240                         ST.c1 = *s;
5241                         switch (OP(text_node)) {
5242                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5243                             case EXACTFA:
5244                             case EXACTFU_SS:
5245                             case EXACTFU_TRICKYFOLD:
5246                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5247                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5248                             default: ST.c2 = ST.c1; break;
5249                         }
5250                     }
5251                     else { /* UTF_PATTERN */
5252                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5253                              STRLEN ulen1, ulen2;
5254                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5255                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5256
5257                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5258                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5259 #ifdef EBCDIC
5260                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5261                                                     ckWARN(WARN_UTF8) ?
5262                                                     0 : UTF8_ALLOW_ANY);
5263                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5264                                                     ckWARN(WARN_UTF8) ?
5265                                                     0 : UTF8_ALLOW_ANY);
5266 #else
5267                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5268                                                     uniflags);
5269                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5270                                                     uniflags);
5271 #endif
5272                         }
5273                         else {
5274                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5275                                                      uniflags);
5276                         }
5277                     }
5278                 }
5279             }
5280             else
5281                 ST.c1 = ST.c2 = CHRTEST_VOID;
5282         assume_ok_easy:
5283
5284             ST.A = scan;
5285             ST.B = next;
5286             PL_reginput = locinput;
5287             if (minmod) {
5288                 minmod = 0;
5289                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5290                     sayNO;
5291                 ST.count = ST.min;
5292                 locinput = PL_reginput;
5293                 REGCP_SET(ST.cp);
5294                 if (ST.c1 == CHRTEST_VOID)
5295                     goto curly_try_B_min;
5296
5297                 ST.oldloc = locinput;
5298
5299                 /* set ST.maxpos to the furthest point along the
5300                  * string that could possibly match */
5301                 if  (ST.max == REG_INFTY) {
5302                     ST.maxpos = PL_regeol - 1;
5303                     if (utf8_target)
5304                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5305                             ST.maxpos--;
5306                 }
5307                 else if (utf8_target) {
5308                     int m = ST.max - ST.min;
5309                     for (ST.maxpos = locinput;
5310                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5311                         ST.maxpos += UTF8SKIP(ST.maxpos);
5312                 }
5313                 else {
5314                     ST.maxpos = locinput + ST.max - ST.min;
5315                     if (ST.maxpos >= PL_regeol)
5316                         ST.maxpos = PL_regeol - 1;
5317                 }
5318                 goto curly_try_B_min_known;
5319
5320             }
5321             else {
5322                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5323                 locinput = PL_reginput;
5324                 if (ST.count < ST.min)
5325                     sayNO;
5326                 if ((ST.count > ST.min)
5327                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5328                 {
5329                     /* A{m,n} must come at the end of the string, there's
5330                      * no point in backing off ... */
5331                     ST.min = ST.count;
5332                     /* ...except that $ and \Z can match before *and* after
5333                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5334                        We may back off by one in this case. */
5335                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5336                         ST.min--;
5337                 }
5338                 REGCP_SET(ST.cp);
5339                 goto curly_try_B_max;
5340             }
5341             /* NOTREACHED */
5342
5343
5344         case CURLY_B_min_known_fail:
5345             /* failed to find B in a non-greedy match where c1,c2 valid */
5346             if (ST.paren && ST.count)
5347                 PL_regoffs[ST.paren].end = -1;
5348
5349             PL_reginput = locinput;     /* Could be reset... */
5350             REGCP_UNWIND(ST.cp);
5351             /* Couldn't or didn't -- move forward. */
5352             ST.oldloc = locinput;
5353             if (utf8_target)
5354                 locinput += UTF8SKIP(locinput);
5355             else
5356                 locinput++;
5357             ST.count++;
5358           curly_try_B_min_known:
5359              /* find the next place where 'B' could work, then call B */
5360             {
5361                 int n;
5362                 if (utf8_target) {
5363                     n = (ST.oldloc == locinput) ? 0 : 1;
5364                     if (ST.c1 == ST.c2) {
5365                         STRLEN len;
5366                         /* set n to utf8_distance(oldloc, locinput) */
5367                         while (locinput <= ST.maxpos &&
5368                                utf8n_to_uvchr((U8*)locinput,
5369                                               UTF8_MAXBYTES, &len,
5370                                               uniflags) != (UV)ST.c1) {
5371                             locinput += len;
5372                             n++;
5373                         }
5374                     }
5375                     else {
5376                         /* set n to utf8_distance(oldloc, locinput) */
5377                         while (locinput <= ST.maxpos) {
5378                             STRLEN len;
5379                             const UV c = utf8n_to_uvchr((U8*)locinput,
5380                                                   UTF8_MAXBYTES, &len,
5381                                                   uniflags);
5382                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5383                                 break;
5384                             locinput += len;
5385                             n++;
5386                         }
5387                     }
5388                 }
5389                 else {
5390                     if (ST.c1 == ST.c2) {
5391                         while (locinput <= ST.maxpos &&
5392                                UCHARAT(locinput) != ST.c1)
5393                             locinput++;
5394                     }
5395                     else {
5396                         while (locinput <= ST.maxpos
5397                                && UCHARAT(locinput) != ST.c1
5398                                && UCHARAT(locinput) != ST.c2)
5399                             locinput++;
5400                     }
5401                     n = locinput - ST.oldloc;
5402                 }
5403                 if (locinput > ST.maxpos)
5404                     sayNO;
5405                 /* PL_reginput == oldloc now */
5406                 if (n) {
5407                     ST.count += n;
5408                     if (regrepeat(rex, ST.A, n, depth) < n)
5409                         sayNO;
5410                 }
5411                 PL_reginput = locinput;
5412                 CURLY_SETPAREN(ST.paren, ST.count);
5413                 if (cur_eval && cur_eval->u.eval.close_paren &&
5414                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5415                     goto fake_end;
5416                 }
5417                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5418             }
5419             /* NOTREACHED */
5420
5421
5422         case CURLY_B_min_fail:
5423             /* failed to find B in a non-greedy match where c1,c2 invalid */
5424             if (ST.paren && ST.count)
5425                 PL_regoffs[ST.paren].end = -1;
5426
5427             REGCP_UNWIND(ST.cp);
5428             /* failed -- move forward one */
5429             PL_reginput = locinput;
5430             if (regrepeat(rex, ST.A, 1, depth)) {
5431                 ST.count++;
5432                 locinput = PL_reginput;
5433                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5434                         ST.count > 0)) /* count overflow ? */
5435                 {
5436                   curly_try_B_min:
5437                     CURLY_SETPAREN(ST.paren, ST.count);
5438                     if (cur_eval && cur_eval->u.eval.close_paren &&
5439                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5440                         goto fake_end;
5441                     }
5442                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5443                 }
5444             }
5445             sayNO;
5446             /* NOTREACHED */
5447
5448
5449         curly_try_B_max:
5450             /* a successful greedy match: now try to match B */
5451             if (cur_eval && cur_eval->u.eval.close_paren &&
5452                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5453                 goto fake_end;
5454             }
5455             {
5456                 UV c = 0;
5457                 if (ST.c1 != CHRTEST_VOID)
5458                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5459                                            UTF8_MAXBYTES, 0, uniflags)
5460                                 : (UV) UCHARAT(PL_reginput);
5461                 /* If it could work, try it. */
5462                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5463                     CURLY_SETPAREN(ST.paren, ST.count);
5464                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5465                     /* NOTREACHED */
5466                 }
5467             }
5468             /* FALL THROUGH */
5469         case CURLY_B_max_fail:
5470             /* failed to find B in a greedy match */
5471             if (ST.paren && ST.count)
5472                 PL_regoffs[ST.paren].end = -1;
5473
5474             REGCP_UNWIND(ST.cp);
5475             /*  back up. */
5476             if (--ST.count < ST.min)
5477                 sayNO;
5478             PL_reginput = locinput = HOPc(locinput, -1);
5479             goto curly_try_B_max;
5480
5481 #undef ST
5482
5483         case END:
5484             fake_end:
5485             if (cur_eval) {
5486                 /* we've just finished A in /(??{A})B/; now continue with B */
5487                 I32 tmpix;
5488                 st->u.eval.toggle_reg_flags
5489                             = cur_eval->u.eval.toggle_reg_flags;
5490                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5491
5492                 st->u.eval.prev_rex = rex_sv;           /* inner */
5493                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5494                 rex = (struct regexp *)SvANY(rex_sv);
5495                 rexi = RXi_GET(rex);
5496                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5497                 (void)ReREFCNT_inc(rex_sv);
5498                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5499
5500                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5501                 PL_reglastparen = &rex->lastparen;
5502                 PL_reglastcloseparen = &rex->lastcloseparen;
5503
5504                 REGCP_SET(st->u.eval.lastcp);
5505                 PL_reginput = locinput;
5506
5507                 /* Restore parens of the outer rex without popping the
5508                  * savestack */
5509                 tmpix = PL_savestack_ix;
5510                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5511                 regcppop(rex);
5512                 PL_savestack_ix = tmpix;
5513
5514                 st->u.eval.prev_eval = cur_eval;
5515                 cur_eval = cur_eval->u.eval.prev_eval;
5516                 DEBUG_EXECUTE_r(
5517                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5518                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5519                 if ( nochange_depth )
5520                     nochange_depth--;
5521
5522                 PUSH_YES_STATE_GOTO(EVAL_AB,
5523                         st->u.eval.prev_eval->u.eval.B); /* match B */
5524             }
5525
5526             if (locinput < reginfo->till) {
5527                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5528                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5529                                       PL_colors[4],
5530                                       (long)(locinput - PL_reg_starttry),
5531                                       (long)(reginfo->till - PL_reg_starttry),
5532                                       PL_colors[5]));
5533
5534                 sayNO_SILENT;           /* Cannot match: too short. */
5535             }
5536             PL_reginput = locinput;     /* put where regtry can find it */
5537             sayYES;                     /* Success! */
5538
5539         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5540             DEBUG_EXECUTE_r(
5541             PerlIO_printf(Perl_debug_log,
5542                 "%*s  %ssubpattern success...%s\n",
5543                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5544             PL_reginput = locinput;     /* put where regtry can find it */
5545             sayYES;                     /* Success! */
5546
5547 #undef  ST
5548 #define ST st->u.ifmatch
5549
5550         case SUSPEND:   /* (?>A) */
5551             ST.wanted = 1;
5552             PL_reginput = locinput;
5553             goto do_ifmatch;
5554
5555         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5556             ST.wanted = 0;
5557             goto ifmatch_trivial_fail_test;
5558
5559         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5560             ST.wanted = 1;
5561           ifmatch_trivial_fail_test:
5562             if (scan->flags) {
5563                 char * const s = HOPBACKc(locinput, scan->flags);
5564                 if (!s) {
5565                     /* trivial fail */
5566                     if (logical) {
5567                         logical = 0;
5568                         sw = 1 - cBOOL(ST.wanted);
5569                     }
5570                     else if (ST.wanted)
5571                         sayNO;
5572                     next = scan + ARG(scan);
5573                     if (next == scan)
5574                         next = NULL;
5575                     break;
5576                 }
5577                 PL_reginput = s;
5578             }
5579             else
5580                 PL_reginput = locinput;
5581
5582           do_ifmatch:
5583             ST.me = scan;
5584             ST.logical = logical;
5585             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5586
5587             /* execute body of (?...A) */
5588             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5589             /* NOTREACHED */
5590
5591         case IFMATCH_A_fail: /* body of (?...A) failed */
5592             ST.wanted = !ST.wanted;
5593             /* FALL THROUGH */
5594
5595         case IFMATCH_A: /* body of (?...A) succeeded */
5596             if (ST.logical) {
5597                 sw = cBOOL(ST.wanted);
5598             }
5599             else if (!ST.wanted)
5600                 sayNO;
5601
5602             if (OP(ST.me) == SUSPEND)
5603                 locinput = PL_reginput;
5604             else {
5605                 locinput = PL_reginput = st->locinput;
5606                 nextchr = UCHARAT(locinput);
5607             }
5608             scan = ST.me + ARG(ST.me);
5609             if (scan == ST.me)
5610                 scan = NULL;
5611             continue; /* execute B */
5612
5613 #undef ST
5614
5615         case LONGJMP:
5616             next = scan + ARG(scan);
5617             if (next == scan)
5618                 next = NULL;
5619             break;
5620         case COMMIT:
5621             reginfo->cutpoint = PL_regeol;
5622             /* FALLTHROUGH */
5623         case PRUNE:
5624             PL_reginput = locinput;
5625             if (!scan->flags)
5626                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5627             PUSH_STATE_GOTO(COMMIT_next,next);
5628             /* NOTREACHED */
5629         case COMMIT_next_fail:
5630             no_final = 1;
5631             /* FALLTHROUGH */
5632         case OPFAIL:
5633             sayNO;
5634             /* NOTREACHED */
5635
5636 #define ST st->u.mark
5637         case MARKPOINT:
5638             ST.prev_mark = mark_state;
5639             ST.mark_name = sv_commit = sv_yes_mark
5640                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5641             mark_state = st;
5642             ST.mark_loc = PL_reginput = locinput;
5643             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5644             /* NOTREACHED */
5645         case MARKPOINT_next:
5646             mark_state = ST.prev_mark;
5647             sayYES;
5648             /* NOTREACHED */
5649         case MARKPOINT_next_fail:
5650             if (popmark && sv_eq(ST.mark_name,popmark))
5651             {
5652                 if (ST.mark_loc > startpoint)
5653                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5654                 popmark = NULL; /* we found our mark */
5655                 sv_commit = ST.mark_name;
5656
5657                 DEBUG_EXECUTE_r({
5658                         PerlIO_printf(Perl_debug_log,
5659                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5660                             REPORT_CODE_OFF+depth*2, "",
5661                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5662                 });
5663             }
5664             mark_state = ST.prev_mark;
5665             sv_yes_mark = mark_state ?
5666                 mark_state->u.mark.mark_name : NULL;
5667             sayNO;
5668             /* NOTREACHED */
5669         case SKIP:
5670             PL_reginput = locinput;
5671             if (scan->flags) {
5672                 /* (*SKIP) : if we fail we cut here*/
5673                 ST.mark_name = NULL;
5674                 ST.mark_loc = locinput;
5675                 PUSH_STATE_GOTO(SKIP_next,next);
5676             } else {
5677                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5678                    otherwise do nothing.  Meaning we need to scan
5679                  */
5680                 regmatch_state *cur = mark_state;
5681                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5682
5683                 while (cur) {
5684                     if ( sv_eq( cur->u.mark.mark_name,
5685                                 find ) )
5686                     {
5687                         ST.mark_name = find;
5688                         PUSH_STATE_GOTO( SKIP_next, next );
5689                     }
5690                     cur = cur->u.mark.prev_mark;
5691                 }
5692             }
5693             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5694             break;
5695         case SKIP_next_fail:
5696             if (ST.mark_name) {
5697                 /* (*CUT:NAME) - Set up to search for the name as we
5698                    collapse the stack*/
5699                 popmark = ST.mark_name;
5700             } else {
5701                 /* (*CUT) - No name, we cut here.*/
5702                 if (ST.mark_loc > startpoint)
5703                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5704                 /* but we set sv_commit to latest mark_name if there
5705                    is one so they can test to see how things lead to this
5706                    cut */
5707                 if (mark_state)
5708                     sv_commit=mark_state->u.mark.mark_name;
5709             }
5710             no_final = 1;
5711             sayNO;
5712             /* NOTREACHED */
5713 #undef ST
5714         case LNBREAK:
5715             if ((n=is_LNBREAK(locinput,utf8_target))) {
5716                 locinput += n;
5717                 nextchr = UCHARAT(locinput);
5718             } else
5719                 sayNO;
5720             break;
5721
5722 #define CASE_CLASS(nAmE)                              \
5723         case nAmE:                                    \
5724             if (locinput >= PL_regeol)                \
5725                 sayNO;                                \
5726             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5727                 locinput += n;                        \
5728                 nextchr = UCHARAT(locinput);          \
5729             } else                                    \
5730                 sayNO;                                \
5731             break;                                    \
5732         case N##nAmE:                                 \
5733             if (locinput >= PL_regeol)                \
5734                 sayNO;                                \
5735             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5736                 sayNO;                                \
5737             } else {                                  \
5738                 locinput += UTF8SKIP(locinput);       \
5739                 nextchr = UCHARAT(locinput);          \
5740             }                                         \
5741             break
5742
5743         CASE_CLASS(VERTWS);
5744         CASE_CLASS(HORIZWS);
5745 #undef CASE_CLASS
5746
5747         default:
5748             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5749                           PTR2UV(scan), OP(scan));
5750             Perl_croak(aTHX_ "regexp memory corruption");
5751
5752         } /* end switch */
5753
5754         /* switch break jumps here */
5755         scan = next; /* prepare to execute the next op and ... */
5756         continue;    /* ... jump back to the top, reusing st */
5757         /* NOTREACHED */
5758
5759       push_yes_state:
5760         /* push a state that backtracks on success */
5761         st->u.yes.prev_yes_state = yes_state;
5762         yes_state = st;
5763         /* FALL THROUGH */
5764       push_state:
5765         /* push a new regex state, then continue at scan  */
5766         {
5767             regmatch_state *newst;
5768
5769             DEBUG_STACK_r({
5770                 regmatch_state *cur = st;
5771                 regmatch_state *curyes = yes_state;
5772                 int curd = depth;
5773                 regmatch_slab *slab = PL_regmatch_slab;
5774                 for (;curd > -1;cur--,curd--) {
5775                     if (cur < SLAB_FIRST(slab)) {
5776                         slab = slab->prev;
5777                         cur = SLAB_LAST(slab);
5778                     }
5779                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5780                         REPORT_CODE_OFF + 2 + depth * 2,"",
5781                         curd, PL_reg_name[cur->resume_state],
5782                         (curyes == cur) ? "yes" : ""
5783                     );
5784                     if (curyes == cur)
5785                         curyes = cur->u.yes.prev_yes_state;
5786                 }
5787             } else
5788                 DEBUG_STATE_pp("push")
5789             );
5790             depth++;
5791             st->locinput = locinput;
5792             newst = st+1;
5793             if (newst >  SLAB_LAST(PL_regmatch_slab))
5794                 newst = S_push_slab(aTHX);
5795             PL_regmatch_state = newst;
5796
5797             locinput = PL_reginput;
5798             nextchr = UCHARAT(locinput);
5799             st = newst;
5800             continue;
5801             /* NOTREACHED */
5802         }
5803     }
5804
5805     /*
5806     * We get here only if there's trouble -- normally "case END" is
5807     * the terminating point.
5808     */
5809     Perl_croak(aTHX_ "corrupted regexp pointers");
5810     /*NOTREACHED*/
5811     sayNO;
5812
5813 yes:
5814     if (yes_state) {
5815         /* we have successfully completed a subexpression, but we must now
5816          * pop to the state marked by yes_state and continue from there */
5817         assert(st != yes_state);
5818 #ifdef DEBUGGING
5819         while (st != yes_state) {
5820             st--;
5821             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5822                 PL_regmatch_slab = PL_regmatch_slab->prev;
5823                 st = SLAB_LAST(PL_regmatch_slab);
5824             }
5825             DEBUG_STATE_r({
5826                 if (no_final) {
5827                     DEBUG_STATE_pp("pop (no final)");
5828                 } else {
5829                     DEBUG_STATE_pp("pop (yes)");
5830                 }
5831             });
5832             depth--;
5833         }
5834 #else
5835         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5836             || yes_state > SLAB_LAST(PL_regmatch_slab))
5837         {
5838             /* not in this slab, pop slab */
5839             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5840             PL_regmatch_slab = PL_regmatch_slab->prev;
5841             st = SLAB_LAST(PL_regmatch_slab);
5842         }
5843         depth -= (st - yes_state);
5844 #endif
5845         st = yes_state;
5846         yes_state = st->u.yes.prev_yes_state;
5847         PL_regmatch_state = st;
5848
5849         if (no_final) {
5850             locinput= st->locinput;
5851             nextchr = UCHARAT(locinput);
5852         }
5853         state_num = st->resume_state + no_final;
5854         goto reenter_switch;
5855     }
5856
5857     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5858                           PL_colors[4], PL_colors[5]));
5859
5860     if (PL_reg_eval_set) {
5861         /* each successfully executed (?{...}) block does the equivalent of
5862          *   local $^R = do {...}
5863          * When popping the save stack, all these locals would be undone;
5864          * bypass this by setting the outermost saved $^R to the latest
5865          * value */
5866         if (oreplsv != GvSV(PL_replgv))
5867             sv_setsv(oreplsv, GvSV(PL_replgv));
5868     }
5869     result = 1;
5870     goto final_exit;
5871
5872 no:
5873     DEBUG_EXECUTE_r(
5874         PerlIO_printf(Perl_debug_log,
5875             "%*s  %sfailed...%s\n",
5876             REPORT_CODE_OFF+depth*2, "",
5877             PL_colors[4], PL_colors[5])
5878         );
5879
5880 no_silent:
5881     if (no_final) {
5882         if (yes_state) {
5883             goto yes;
5884         } else {
5885             goto final_exit;
5886         }
5887     }
5888     if (depth) {
5889         /* there's a previous state to backtrack to */
5890         st--;
5891         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5892             PL_regmatch_slab = PL_regmatch_slab->prev;
5893             st = SLAB_LAST(PL_regmatch_slab);
5894         }
5895         PL_regmatch_state = st;
5896         locinput= st->locinput;
5897         nextchr = UCHARAT(locinput);
5898
5899         DEBUG_STATE_pp("pop");
5900         depth--;
5901         if (yes_state == st)
5902             yes_state = st->u.yes.prev_yes_state;
5903
5904         state_num = st->resume_state + 1; /* failure = success + 1 */
5905         goto reenter_switch;
5906     }
5907     result = 0;
5908
5909   final_exit:
5910     if (rex->intflags & PREGf_VERBARG_SEEN) {
5911         SV *sv_err = get_sv("REGERROR", 1);
5912         SV *sv_mrk = get_sv("REGMARK", 1);
5913         if (result) {
5914             sv_commit = &PL_sv_no;
5915             if (!sv_yes_mark)
5916                 sv_yes_mark = &PL_sv_yes;
5917         } else {
5918             if (!sv_commit)
5919                 sv_commit = &PL_sv_yes;
5920             sv_yes_mark = &PL_sv_no;
5921         }
5922         sv_setsv(sv_err, sv_commit);
5923         sv_setsv(sv_mrk, sv_yes_mark);
5924     }
5925
5926     /* clean up; in particular, free all slabs above current one */
5927     LEAVE_SCOPE(oldsave);
5928
5929     return result;
5930 }
5931
5932 /*
5933  - regrepeat - repeatedly match something simple, report how many
5934  */
5935 /*
5936  * [This routine now assumes that it will only match on things of length 1.
5937  * That was true before, but now we assume scan - reginput is the count,
5938  * rather than incrementing count on every character.  [Er, except utf8.]]
5939  */
5940 STATIC I32
5941 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5942 {
5943     dVAR;
5944     register char *scan;
5945     register I32 c;
5946     register char *loceol = PL_regeol;
5947     register I32 hardcount = 0;
5948     register bool utf8_target = PL_reg_match_utf8;
5949     UV utf8_flags;
5950 #ifndef DEBUGGING
5951     PERL_UNUSED_ARG(depth);
5952 #endif
5953
5954     PERL_ARGS_ASSERT_REGREPEAT;
5955
5956     scan = PL_reginput;
5957     if (max == REG_INFTY)
5958         max = I32_MAX;
5959     else if (max < loceol - scan)
5960         loceol = scan + max;
5961     switch (OP(p)) {
5962     case REG_ANY:
5963         if (utf8_target) {
5964             loceol = PL_regeol;
5965             while (scan < loceol && hardcount < max && *scan != '\n') {
5966                 scan += UTF8SKIP(scan);
5967                 hardcount++;
5968             }
5969         } else {
5970             while (scan < loceol && *scan != '\n')
5971                 scan++;
5972         }
5973         break;
5974     case SANY:
5975         if (utf8_target) {
5976             loceol = PL_regeol;
5977             while (scan < loceol && hardcount < max) {
5978                 scan += UTF8SKIP(scan);
5979                 hardcount++;
5980             }
5981         }
5982         else
5983             scan = loceol;
5984         break;
5985     case CANY:
5986         scan = loceol;
5987         break;
5988     case EXACT:
5989         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5990          * means they match only characters in the string that can be expressed
5991          * as a single byte.  For non-utf8 strings, that means a simple match.
5992          * For utf8 strings, the character matched must be an invariant, or
5993          * downgradable to a single byte.  The pattern's utf8ness is
5994          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5995          * it is, it's an invariant */
5996
5997         c = (U8)*STRING(p);
5998         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5999
6000         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6001             while (scan < loceol && UCHARAT(scan) == c) {
6002                 scan++;
6003             }
6004         }
6005         else {
6006
6007             /* Here, the string is utf8, and the pattern char is different
6008              * in utf8 than not, so can't compare them directly.  Outside the
6009              * loop, find the two utf8 bytes that represent c, and then
6010              * look for those in sequence in the utf8 string */
6011             U8 high = UTF8_TWO_BYTE_HI(c);
6012             U8 low = UTF8_TWO_BYTE_LO(c);
6013             loceol = PL_regeol;
6014
6015             while (hardcount < max
6016                     && scan + 1 < loceol
6017                     && UCHARAT(scan) == high
6018                     && UCHARAT(scan + 1) == low)
6019             {
6020                 scan += 2;
6021                 hardcount++;
6022             }
6023         }
6024         break;
6025     case EXACTFA:
6026         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6027         goto do_exactf;
6028
6029     case EXACTFL:
6030         PL_reg_flags |= RF_tainted;
6031         utf8_flags = FOLDEQ_UTF8_LOCALE;
6032         goto do_exactf;
6033
6034     case EXACTF:
6035             utf8_flags = 0;
6036             goto do_exactf;
6037
6038     case EXACTFU_SS:
6039     case EXACTFU_TRICKYFOLD:
6040     case EXACTFU:
6041         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6042
6043         /* The comments for the EXACT case above apply as well to these fold
6044          * ones */
6045
6046     do_exactf:
6047         c = (U8)*STRING(p);
6048         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6049
6050         if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6051             char *tmpeol = loceol;
6052             while (hardcount < max
6053                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6054                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6055             {
6056                 scan = tmpeol;
6057                 tmpeol = loceol;
6058                 hardcount++;
6059             }
6060
6061             /* XXX Note that the above handles properly the German sharp s in
6062              * the pattern matching ss in the string.  But it doesn't handle
6063              * properly cases where the string contains say 'LIGATURE ff' and
6064              * the pattern is 'f+'.  This would require, say, a new function or
6065              * revised interface to foldEQ_utf8(), in which the maximum number
6066              * of characters to match could be passed and it would return how
6067              * many actually did.  This is just one of many cases where
6068              * multi-char folds don't work properly, and so the fix is being
6069              * deferred */
6070         }
6071         else {
6072             U8 folded;
6073
6074             /* Here, the string isn't utf8 and c is a single byte; and either
6075              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6076              * doesn't affect c.  Can just do simple comparisons for exact or
6077              * fold matching. */
6078             switch (OP(p)) {
6079                 case EXACTF: folded = PL_fold[c]; break;
6080                 case EXACTFA:
6081                 case EXACTFU_TRICKYFOLD:
6082                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6083                 case EXACTFL: folded = PL_fold_locale[c]; break;
6084                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6085             }
6086             while (scan < loceol &&
6087                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6088             {
6089                 scan++;
6090             }
6091         }
6092         break;
6093     case ANYOFV:
6094     case ANYOF:
6095         if (utf8_target || OP(p) == ANYOFV) {
6096             STRLEN inclasslen;
6097             loceol = PL_regeol;
6098             inclasslen = loceol - scan;
6099             while (hardcount < max
6100                    && ((inclasslen = loceol - scan) > 0)
6101                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6102             {
6103                 scan += inclasslen;
6104                 hardcount++;
6105             }
6106         } else {
6107             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6108                 scan++;
6109         }
6110         break;
6111     case ALNUMU:
6112         if (utf8_target) {
6113     utf8_wordchar:
6114             loceol = PL_regeol;
6115             LOAD_UTF8_CHARCLASS_ALNUM();
6116             while (hardcount < max && scan < loceol &&
6117                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6118             {
6119                 scan += UTF8SKIP(scan);
6120                 hardcount++;
6121             }
6122         } else {
6123             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6124                 scan++;
6125             }
6126         }
6127         break;
6128     case ALNUM:
6129         if (utf8_target)
6130             goto utf8_wordchar;
6131         while (scan < loceol && isALNUM((U8) *scan)) {
6132             scan++;
6133         }
6134         break;
6135     case ALNUMA:
6136         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6137             scan++;
6138         }
6139         break;
6140     case ALNUML:
6141         PL_reg_flags |= RF_tainted;
6142         if (utf8_target) {
6143             loceol = PL_regeol;
6144             while (hardcount < max && scan < loceol &&
6145                    isALNUM_LC_utf8((U8*)scan)) {
6146                 scan += UTF8SKIP(scan);
6147                 hardcount++;
6148             }
6149         } else {
6150             while (scan < loceol && isALNUM_LC(*scan))
6151                 scan++;
6152         }
6153         break;
6154     case NALNUMU:
6155         if (utf8_target) {
6156
6157     utf8_Nwordchar:
6158
6159             loceol = PL_regeol;
6160             LOAD_UTF8_CHARCLASS_ALNUM();
6161             while (hardcount < max && scan < loceol &&
6162                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6163             {
6164                 scan += UTF8SKIP(scan);
6165                 hardcount++;
6166             }
6167         } else {
6168             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6169                 scan++;
6170             }
6171         }
6172         break;
6173     case NALNUM:
6174         if (utf8_target)
6175             goto utf8_Nwordchar;
6176         while (scan < loceol && ! isALNUM((U8) *scan)) {
6177             scan++;
6178         }
6179         break;
6180     case NALNUMA:
6181         if (utf8_target) {
6182             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6183                 scan += UTF8SKIP(scan);
6184             }
6185         }
6186         else {
6187             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6188                 scan++;
6189             }
6190         }
6191         break;
6192     case NALNUML:
6193         PL_reg_flags |= RF_tainted;
6194         if (utf8_target) {
6195             loceol = PL_regeol;
6196             while (hardcount < max && scan < loceol &&
6197                    !isALNUM_LC_utf8((U8*)scan)) {
6198                 scan += UTF8SKIP(scan);
6199                 hardcount++;
6200             }
6201         } else {
6202             while (scan < loceol && !isALNUM_LC(*scan))
6203                 scan++;
6204         }
6205         break;
6206     case SPACEU:
6207         if (utf8_target) {
6208
6209     utf8_space:
6210
6211             loceol = PL_regeol;
6212             LOAD_UTF8_CHARCLASS_SPACE();
6213             while (hardcount < max && scan < loceol &&
6214                    (*scan == ' ' ||
6215                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6216             {
6217                 scan += UTF8SKIP(scan);
6218                 hardcount++;
6219             }
6220             break;
6221         }
6222         else {
6223             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6224                 scan++;
6225             }
6226             break;
6227         }
6228     case SPACE:
6229         if (utf8_target)
6230             goto utf8_space;
6231
6232         while (scan < loceol && isSPACE((U8) *scan)) {
6233             scan++;
6234         }
6235         break;
6236     case SPACEA:
6237         while (scan < loceol && isSPACE_A((U8) *scan)) {
6238             scan++;
6239         }
6240         break;
6241     case SPACEL:
6242         PL_reg_flags |= RF_tainted;
6243         if (utf8_target) {
6244             loceol = PL_regeol;
6245             while (hardcount < max && scan < loceol &&
6246                    isSPACE_LC_utf8((U8*)scan)) {
6247                 scan += UTF8SKIP(scan);
6248                 hardcount++;
6249             }
6250         } else {
6251             while (scan < loceol && isSPACE_LC(*scan))
6252                 scan++;
6253         }
6254         break;
6255     case NSPACEU:
6256         if (utf8_target) {
6257
6258     utf8_Nspace:
6259
6260             loceol = PL_regeol;
6261             LOAD_UTF8_CHARCLASS_SPACE();
6262             while (hardcount < max && scan < loceol &&
6263                    ! (*scan == ' ' ||
6264                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6265             {
6266                 scan += UTF8SKIP(scan);
6267                 hardcount++;
6268             }
6269             break;
6270         }
6271         else {
6272             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6273                 scan++;
6274             }
6275         }
6276         break;
6277     case NSPACE:
6278         if (utf8_target)
6279             goto utf8_Nspace;
6280
6281         while (scan < loceol && ! isSPACE((U8) *scan)) {
6282             scan++;
6283         }
6284         break;
6285     case NSPACEA:
6286         if (utf8_target) {
6287             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6288                 scan += UTF8SKIP(scan);
6289             }
6290         }
6291         else {
6292             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6293                 scan++;
6294             }
6295         }
6296         break;
6297     case NSPACEL:
6298         PL_reg_flags |= RF_tainted;
6299         if (utf8_target) {
6300             loceol = PL_regeol;
6301             while (hardcount < max && scan < loceol &&
6302                    !isSPACE_LC_utf8((U8*)scan)) {
6303                 scan += UTF8SKIP(scan);
6304                 hardcount++;
6305             }
6306         } else {
6307             while (scan < loceol && !isSPACE_LC(*scan))
6308                 scan++;
6309         }
6310         break;
6311     case DIGIT:
6312         if (utf8_target) {
6313             loceol = PL_regeol;
6314             LOAD_UTF8_CHARCLASS_DIGIT();
6315             while (hardcount < max && scan < loceol &&
6316                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6317                 scan += UTF8SKIP(scan);
6318                 hardcount++;
6319             }
6320         } else {
6321             while (scan < loceol && isDIGIT(*scan))
6322                 scan++;
6323         }
6324         break;
6325     case DIGITA:
6326         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6327             scan++;
6328         }
6329         break;
6330     case DIGITL:
6331         PL_reg_flags |= RF_tainted;
6332         if (utf8_target) {
6333             loceol = PL_regeol;
6334             while (hardcount < max && scan < loceol &&
6335                    isDIGIT_LC_utf8((U8*)scan)) {
6336                 scan += UTF8SKIP(scan);
6337                 hardcount++;
6338             }
6339         } else {
6340             while (scan < loceol && isDIGIT_LC(*scan))
6341                 scan++;
6342         }
6343         break;
6344     case NDIGIT:
6345         if (utf8_target) {
6346             loceol = PL_regeol;
6347             LOAD_UTF8_CHARCLASS_DIGIT();
6348             while (hardcount < max && scan < loceol &&
6349                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6350                 scan += UTF8SKIP(scan);
6351                 hardcount++;
6352             }
6353         } else {
6354             while (scan < loceol && !isDIGIT(*scan))
6355                 scan++;
6356         }
6357         break;
6358     case NDIGITA:
6359         if (utf8_target) {
6360             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6361                 scan += UTF8SKIP(scan);
6362             }
6363         }
6364         else {
6365             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6366                 scan++;
6367             }
6368         }
6369         break;
6370     case NDIGITL:
6371         PL_reg_flags |= RF_tainted;
6372         if (utf8_target) {
6373             loceol = PL_regeol;
6374             while (hardcount < max && scan < loceol &&
6375                    !isDIGIT_LC_utf8((U8*)scan)) {
6376                 scan += UTF8SKIP(scan);
6377                 hardcount++;
6378             }
6379         } else {
6380             while (scan < loceol && !isDIGIT_LC(*scan))
6381                 scan++;
6382         }
6383         break;
6384     case LNBREAK:
6385         if (utf8_target) {
6386             loceol = PL_regeol;
6387             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6388                 scan += c;
6389                 hardcount++;
6390             }
6391         } else {
6392             /*
6393               LNBREAK can match two latin chars, which is ok,
6394               because we have a null terminated string, but we
6395               have to use hardcount in this situation
6396             */
6397             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6398                 scan+=c;
6399                 hardcount++;
6400             }
6401         }
6402         break;
6403     case HORIZWS:
6404         if (utf8_target) {
6405             loceol = PL_regeol;
6406             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6407                 scan += c;
6408                 hardcount++;
6409             }
6410         } else {
6411             while (scan < loceol && is_HORIZWS_latin1(scan))
6412                 scan++;
6413         }
6414         break;
6415     case NHORIZWS:
6416         if (utf8_target) {
6417             loceol = PL_regeol;
6418             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6419                 scan += UTF8SKIP(scan);
6420                 hardcount++;
6421             }
6422         } else {
6423             while (scan < loceol && !is_HORIZWS_latin1(scan))
6424                 scan++;
6425
6426         }
6427         break;
6428     case VERTWS:
6429         if (utf8_target) {
6430             loceol = PL_regeol;
6431             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6432                 scan += c;
6433                 hardcount++;
6434             }
6435         } else {
6436             while (scan < loceol && is_VERTWS_latin1(scan))
6437                 scan++;
6438
6439         }
6440         break;
6441     case NVERTWS:
6442         if (utf8_target) {
6443             loceol = PL_regeol;
6444             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6445                 scan += UTF8SKIP(scan);
6446                 hardcount++;
6447             }
6448         } else {
6449             while (scan < loceol && !is_VERTWS_latin1(scan))
6450                 scan++;
6451
6452         }
6453         break;
6454
6455     default:            /* Called on something of 0 width. */
6456         break;          /* So match right here or not at all. */
6457     }
6458
6459     if (hardcount)
6460         c = hardcount;
6461     else
6462         c = scan - PL_reginput;
6463     PL_reginput = scan;
6464
6465     DEBUG_r({
6466         GET_RE_DEBUG_FLAGS_DECL;
6467         DEBUG_EXECUTE_r({
6468             SV * const prop = sv_newmortal();
6469             regprop(prog, prop, p);
6470             PerlIO_printf(Perl_debug_log,
6471                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6472                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6473         });
6474     });
6475
6476     return(c);
6477 }
6478
6479
6480 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6481 /*
6482 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
6483 create a copy so that changes the caller makes won't change the shared one
6484  */
6485 SV *
6486 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6487 {
6488     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6489     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6490 }
6491 #endif
6492
6493 STATIC SV *
6494 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6495 {
6496     /* Returns the swash for the input 'node' in the regex 'prog'.
6497      * If <doinit> is true, will attempt to create the swash if not already
6498      *    done.
6499      * If <listsvp> is non-null, will return the swash initialization string in
6500      *    it.
6501      * If <altsvp> is non-null, will return the alternates to the regular swash
6502      *    in it
6503      * Tied intimately to how regcomp.c sets up the data structure */
6504
6505     dVAR;
6506     SV *sw  = NULL;
6507     SV *si  = NULL;
6508     SV *alt = NULL;
6509     SV*  invlist = NULL;
6510
6511     RXi_GET_DECL(prog,progi);
6512     const struct reg_data * const data = prog ? progi->data : NULL;
6513
6514     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6515
6516     assert(ANYOF_NONBITMAP(node));
6517
6518     if (data && data->count) {
6519         const U32 n = ARG(node);
6520
6521         if (data->what[n] == 's') {
6522             SV * const rv = MUTABLE_SV(data->data[n]);
6523             AV * const av = MUTABLE_AV(SvRV(rv));
6524             SV **const ary = AvARRAY(av);
6525             bool invlist_has_user_defined_property;
6526
6527             si = *ary;  /* ary[0] = the string to initialize the swash with */
6528
6529             /* Elements 3 and 4 are either both present or both absent. [3] is
6530              * any inversion list generated at compile time; [4] indicates if
6531              * that inversion list has any user-defined properties in it. */
6532             if (av_len(av) >= 3) {
6533                 invlist = ary[3];
6534                 invlist_has_user_defined_property = cBOOL(SvUV(ary[4]));
6535             }
6536             else {
6537                 invlist = NULL;
6538                 invlist_has_user_defined_property = FALSE;
6539             }
6540
6541             /* Element [1] is reserved for the set-up swash.  If already there,
6542              * return it; if not, create it and store it there */
6543             if (SvROK(ary[1])) {
6544                 sw = ary[1];
6545             }
6546             else if (si && doinit) {
6547
6548                 sw = _core_swash_init("utf8", /* the utf8 package */
6549                                       "", /* nameless */
6550                                       si,
6551                                       1, /* binary */
6552                                       0, /* not from tr/// */
6553                                       FALSE, /* is error if can't find
6554                                                 property */
6555                                       invlist,
6556                                       invlist_has_user_defined_property);
6557                 (void)av_store(av, 1, sw);
6558             }
6559
6560             /* Element [2] is for any multi-char folds.  Note that is a
6561              * fundamentally flawed design, because can't backtrack and try
6562              * again.  See [perl #89774] */
6563             if (SvTYPE(ary[2]) == SVt_PVAV) {
6564                 alt = ary[2];
6565             }
6566         }
6567     }
6568
6569     if (listsvp) {
6570         SV* matches_string = newSVpvn("", 0);
6571         SV** invlistsvp;
6572
6573         /* Use the swash, if any, which has to have incorporated into it all
6574          * possibilities */
6575         if (   sw
6576             && SvROK(sw)
6577             && SvTYPE(SvRV(sw)) == SVt_PVHV
6578             && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE)))
6579         {
6580             invlist = *invlistsvp;
6581         }
6582         else if (si && si != &PL_sv_undef) {
6583
6584             /* If no swash, use the input nitialization string, if available */
6585             sv_catsv(matches_string, si);
6586         }
6587
6588         /* Add the inversion list to whatever we have.  This may have come from
6589          * the swash, or from an input parameter */
6590         if (invlist) {
6591             sv_catsv(matches_string, _invlist_contents(invlist));
6592         }
6593         *listsvp = matches_string;
6594     }
6595
6596     if (altsvp)
6597         *altsvp  = alt;
6598
6599     return sw;
6600 }
6601
6602 /*
6603  - reginclass - determine if a character falls into a character class
6604
6605   n is the ANYOF regnode
6606   p is the target string
6607   lenp is pointer to the maximum number of bytes of how far to go in p
6608     (This is assumed wthout checking to always be at least the current
6609     character's size)
6610   utf8_target tells whether p is in UTF-8.
6611
6612   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6613   from a successful match, the value it points to will be updated to how many
6614   bytes in p were matched.  If there was no match, the value is undefined,
6615   possibly changed from the input.
6616
6617   Note that this can be a synthetic start class, a combination of various
6618   nodes, so things you think might be mutually exclusive, such as locale,
6619   aren't.  It can match both locale and non-locale
6620
6621  */
6622
6623 STATIC bool
6624 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6625 {
6626     dVAR;
6627     const char flags = ANYOF_FLAGS(n);
6628     bool match = FALSE;
6629     UV c = *p;
6630     STRLEN c_len = 0;
6631     STRLEN maxlen;
6632
6633     PERL_ARGS_ASSERT_REGINCLASS;
6634
6635     /* If c is not already the code point, get it */
6636     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6637         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6638                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6639                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6640                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6641                  * UTF8_ALLOW_FFFF */
6642         if (c_len == (STRLEN)-1)
6643             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6644     }
6645     else {
6646         c_len = 1;
6647     }
6648
6649     /* Use passed in max length, or one character if none passed in or less
6650      * than one character.  And assume will match just one character.  This is
6651      * overwritten later if matched more. */
6652     if (lenp) {
6653         maxlen = (*lenp > c_len) ? *lenp : c_len;
6654         *lenp = c_len;
6655
6656     }
6657     else {
6658         maxlen = c_len;
6659     }
6660
6661     /* If this character is potentially in the bitmap, check it */
6662     if (c < 256) {
6663         if (ANYOF_BITMAP_TEST(n, c))
6664             match = TRUE;
6665         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6666                 && ! utf8_target
6667                 && ! isASCII(c))
6668         {
6669             match = TRUE;
6670         }
6671
6672         else if (flags & ANYOF_LOCALE) {
6673             PL_reg_flags |= RF_tainted;
6674
6675             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6676                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6677             {
6678                 match = TRUE;
6679             }
6680             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6681                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6682                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6683                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6684                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6685                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6686                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6687                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6688                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6689                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6690                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6691                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
6692                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
6693                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6694                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6695                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6696                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6697                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6698                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6699                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6700                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6701                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6702                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6703                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6704                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6705                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6706                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6707                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6708                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6709                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
6710                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
6711                      ) /* How's that for a conditional? */
6712             ) {
6713                 match = TRUE;
6714             }
6715         }
6716     }
6717
6718     /* If the bitmap didn't (or couldn't) match, and something outside the
6719      * bitmap could match, try that.  Locale nodes specifiy completely the
6720      * behavior of code points in the bit map (otherwise, a utf8 target would
6721      * cause them to be treated as Unicode and not locale), except in
6722      * the very unlikely event when this node is a synthetic start class, which
6723      * could be a combination of locale and non-locale nodes.  So allow locale
6724      * to match for the synthetic start class, which will give a false
6725      * positive that will be resolved when the match is done again as not part
6726      * of the synthetic start class */
6727     if (!match) {
6728         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6729             match = TRUE;       /* Everything above 255 matches */
6730         }
6731         else if (ANYOF_NONBITMAP(n)
6732                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6733                      || (utf8_target
6734                          && (c >=256
6735                              || (! (flags & ANYOF_LOCALE))
6736                              || (flags & ANYOF_IS_SYNTHETIC)))))
6737         {
6738             AV *av;
6739             SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6740
6741             if (sw) {
6742                 U8 * utf8_p;
6743                 if (utf8_target) {
6744                     utf8_p = (U8 *) p;
6745                 } else {
6746
6747                     /* Not utf8.  Convert as much of the string as available up
6748                      * to the limit of how far the (single) character in the
6749                      * pattern can possibly match (no need to go further).  If
6750                      * the node is a straight ANYOF or not folding, it can't
6751                      * match more than one.  Otherwise, It can match up to how
6752                      * far a single char can fold to.  Since not utf8, each
6753                      * character is a single byte, so the max it can be in
6754                      * bytes is the same as the max it can be in characters */
6755                     STRLEN len = (OP(n) == ANYOF
6756                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6757                                   ? 1
6758                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6759                                     ? maxlen
6760                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6761                     utf8_p = bytes_to_utf8(p, &len);
6762                 }
6763
6764                 if (swash_fetch(sw, utf8_p, TRUE))
6765                     match = TRUE;
6766                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6767
6768                     /* Here, we need to test if the fold of the target string
6769                      * matches.  The non-multi char folds have all been moved to
6770                      * the compilation phase, and the multi-char folds have
6771                      * been stored by regcomp into 'av'; we linearly check to
6772                      * see if any match the target string (folded).   We know
6773                      * that the originals were each one character, but we don't
6774                      * currently know how many characters/bytes each folded to,
6775                      * except we do know that there are small limits imposed by
6776                      * Unicode.  XXX A performance enhancement would be to have
6777                      * regcomp.c store the max number of chars/bytes that are
6778                      * in an av entry, as, say the 0th element.  Even better
6779                      * would be to have a hash of the few characters that can
6780                      * start a multi-char fold to the max number of chars of
6781                      * those folds.
6782                      *
6783                      * If there is a match, we will need to advance (if lenp is
6784                      * specified) the match pointer in the target string.  But
6785                      * what we are comparing here isn't that string directly,
6786                      * but its fold, whose length may differ from the original.
6787                      * As we go along in constructing the fold, therefore, we
6788                      * create a map so that we know how many bytes in the
6789                      * source to advance given that we have matched a certain
6790                      * number of bytes in the fold.  This map is stored in
6791                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6792                      * the fold of the first character that we are folding.
6793                      * Then map_fold_len_back[n] is set to the number of bytes
6794                      * in that first character.  Similarly let m be the
6795                      * corresponding number for the second character to be
6796                      * folded.  Then map_fold_len_back[n+m] is set to the
6797                      * number of bytes occupied by the first two source
6798                      * characters. ... */
6799                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6800                     U8 folded[UTF8_MAXBYTES_CASE+1];
6801                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6802                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6803                                                   chars */
6804
6805                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6806
6807                         /* Here, only need to fold the first char of the target
6808                          * string.  It the source wasn't utf8, is 1 byte long */
6809                         to_utf8_fold(utf8_p, folded, &foldlen);
6810                         total_foldlen = foldlen;
6811                         map_fold_len_back[foldlen] = (utf8_target)
6812                                                      ? UTF8SKIP(utf8_p)
6813                                                      : 1;
6814                     }
6815                     else {
6816
6817                         /* Here, need to fold more than the first char.  Do so
6818                          * up to the limits */
6819                         U8* source_ptr = utf8_p;    /* The source for the fold
6820                                                        is the regex target
6821                                                        string */
6822                         U8* folded_ptr = folded;
6823                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6824                                                        available byte in the
6825                                                        target string */
6826                         U8 i;
6827                         for (i = 0;
6828                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6829                              i++)
6830                         {
6831
6832                             /* Fold the next character */
6833                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6834                             STRLEN this_char_foldlen;
6835                             to_utf8_fold(source_ptr,
6836                                          this_char_folded,
6837                                          &this_char_foldlen);
6838
6839                             /* Bail if it would exceed the byte limit for
6840                              * folding a single char. */
6841                             if (this_char_foldlen + folded_ptr - folded >
6842                                                             UTF8_MAXBYTES_CASE)
6843                             {
6844                                 break;
6845                             }
6846
6847                             /* Add the fold of this character */
6848                             Copy(this_char_folded,
6849                                  folded_ptr,
6850                                  this_char_foldlen,
6851                                  U8);
6852                             source_ptr += UTF8SKIP(source_ptr);
6853                             folded_ptr += this_char_foldlen;
6854                             total_foldlen = folded_ptr - folded;
6855
6856                             /* Create map from the number of bytes in the fold
6857                              * back to the number of bytes in the source.  If
6858                              * the source isn't utf8, the byte count is just
6859                              * the number of characters so far */
6860                             map_fold_len_back[total_foldlen]
6861                                                       = (utf8_target)
6862                                                         ? source_ptr - utf8_p
6863                                                         : i + 1;
6864                         }
6865                         *folded_ptr = '\0';
6866                     }
6867
6868
6869                     /* Do the linear search to see if the fold is in the list
6870                      * of multi-char folds. */
6871                     if (av) {
6872                         I32 i;
6873                         for (i = 0; i <= av_len(av); i++) {
6874                             SV* const sv = *av_fetch(av, i, FALSE);
6875                             STRLEN len;
6876                             const char * const s = SvPV_const(sv, len);
6877
6878                             if (len <= total_foldlen
6879                                 && memEQ(s, (char*)folded, len)
6880
6881                                    /* If 0, means matched a partial char. See
6882                                     * [perl #90536] */
6883                                 && map_fold_len_back[len])
6884                             {
6885
6886                                 /* Advance the target string ptr to account for
6887                                  * this fold, but have to translate from the
6888                                  * folded length to the corresponding source
6889                                  * length. */
6890                                 if (lenp) {
6891                                     *lenp = map_fold_len_back[len];
6892                                 }
6893                                 match = TRUE;
6894                                 break;
6895                             }
6896                         }
6897                     }
6898                 }
6899
6900                 /* If we allocated a string above, free it */
6901                 if (! utf8_target) Safefree(utf8_p);
6902             }
6903         }
6904     }
6905
6906     return (flags & ANYOF_INVERT) ? !match : match;
6907 }
6908
6909 STATIC U8 *
6910 S_reghop3(U8 *s, I32 off, const U8* lim)
6911 {
6912     /* return the position 'off' UTF-8 characters away from 's', forward if
6913      * 'off' >= 0, backwards if negative.  But don't go outside of position
6914      * 'lim', which better be < s  if off < 0 */
6915
6916     dVAR;
6917
6918     PERL_ARGS_ASSERT_REGHOP3;
6919
6920     if (off >= 0) {
6921         while (off-- && s < lim) {
6922             /* XXX could check well-formedness here */
6923             s += UTF8SKIP(s);
6924         }
6925     }
6926     else {
6927         while (off++ && s > lim) {
6928             s--;
6929             if (UTF8_IS_CONTINUED(*s)) {
6930                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6931                     s--;
6932             }
6933             /* XXX could check well-formedness here */
6934         }
6935     }
6936     return s;
6937 }
6938
6939 #ifdef XXX_dmq
6940 /* there are a bunch of places where we use two reghop3's that should
6941    be replaced with this routine. but since thats not done yet
6942    we ifdef it out - dmq
6943 */
6944 STATIC U8 *
6945 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6946 {
6947     dVAR;
6948
6949     PERL_ARGS_ASSERT_REGHOP4;
6950
6951     if (off >= 0) {
6952         while (off-- && s < rlim) {
6953             /* XXX could check well-formedness here */
6954             s += UTF8SKIP(s);
6955         }
6956     }
6957     else {
6958         while (off++ && s > llim) {
6959             s--;
6960             if (UTF8_IS_CONTINUED(*s)) {
6961                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6962                     s--;
6963             }
6964             /* XXX could check well-formedness here */
6965         }
6966     }
6967     return s;
6968 }
6969 #endif
6970
6971 STATIC U8 *
6972 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6973 {
6974     dVAR;
6975
6976     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6977
6978     if (off >= 0) {
6979         while (off-- && s < lim) {
6980             /* XXX could check well-formedness here */
6981             s += UTF8SKIP(s);
6982         }
6983         if (off >= 0)
6984             return NULL;
6985     }
6986     else {
6987         while (off++ && s > lim) {
6988             s--;
6989             if (UTF8_IS_CONTINUED(*s)) {
6990                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6991                     s--;
6992             }
6993             /* XXX could check well-formedness here */
6994         }
6995         if (off <= 0)
6996             return NULL;
6997     }
6998     return s;
6999 }
7000
7001 static void
7002 restore_pos(pTHX_ void *arg)
7003 {
7004     dVAR;
7005     regexp * const rex = (regexp *)arg;
7006     if (PL_reg_eval_set) {
7007         if (PL_reg_oldsaved) {
7008             rex->subbeg = PL_reg_oldsaved;
7009             rex->sublen = PL_reg_oldsavedlen;
7010 #ifdef PERL_OLD_COPY_ON_WRITE
7011             rex->saved_copy = PL_nrs;
7012 #endif
7013             RXp_MATCH_COPIED_on(rex);
7014         }
7015         PL_reg_magic->mg_len = PL_reg_oldpos;
7016         PL_reg_eval_set = 0;
7017         PL_curpm = PL_reg_oldcurpm;
7018     }
7019 }
7020
7021 STATIC void
7022 S_to_utf8_substr(pTHX_ register regexp *prog)
7023 {
7024     int i = 1;
7025
7026     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7027
7028     do {
7029         if (prog->substrs->data[i].substr
7030             && !prog->substrs->data[i].utf8_substr) {
7031             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7032             prog->substrs->data[i].utf8_substr = sv;
7033             sv_utf8_upgrade(sv);
7034             if (SvVALID(prog->substrs->data[i].substr)) {
7035                 if (SvTAIL(prog->substrs->data[i].substr)) {
7036                     /* Trim the trailing \n that fbm_compile added last
7037                        time.  */
7038                     SvCUR_set(sv, SvCUR(sv) - 1);
7039                     /* Whilst this makes the SV technically "invalid" (as its
7040                        buffer is no longer followed by "\0") when fbm_compile()
7041                        adds the "\n" back, a "\0" is restored.  */
7042                     fbm_compile(sv, FBMcf_TAIL);
7043                 } else
7044                     fbm_compile(sv, 0);
7045             }
7046             if (prog->substrs->data[i].substr == prog->check_substr)
7047                 prog->check_utf8 = sv;
7048         }
7049     } while (i--);
7050 }
7051
7052 STATIC void
7053 S_to_byte_substr(pTHX_ register regexp *prog)
7054 {
7055     dVAR;
7056     int i = 1;
7057
7058     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7059
7060     do {
7061         if (prog->substrs->data[i].utf8_substr
7062             && !prog->substrs->data[i].substr) {
7063             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7064             if (sv_utf8_downgrade(sv, TRUE)) {
7065                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7066                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7067                         /* Trim the trailing \n that fbm_compile added last
7068                            time.  */
7069                         SvCUR_set(sv, SvCUR(sv) - 1);
7070                         fbm_compile(sv, FBMcf_TAIL);
7071                     } else
7072                         fbm_compile(sv, 0);
7073                 }
7074             } else {
7075                 SvREFCNT_dec(sv);
7076                 sv = &PL_sv_undef;
7077             }
7078             prog->substrs->data[i].substr = sv;
7079             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7080                 prog->check_substr = sv;
7081         }
7082     } while (i--);
7083 }
7084
7085 /*
7086  * Local variables:
7087  * c-indentation-style: bsd
7088  * c-basic-offset: 4
7089  * indent-tabs-mode: t
7090  * End:
7091  *
7092  * ex: set ts=8 sts=4 sw=4 noet:
7093  */