regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 /*
  46  * pregcomp and pregexec -- regsub and regerror are not used in perl
  47  *
  48  *      Copyright (c) 1986 by University of Toronto.
  49  *      Written by Henry Spencer.  Not derived from licensed software.
  50  *
  51  *      Permission is granted to anyone to use this software for any
  52  *      purpose on any computer system, and to redistribute it freely,
  53  *      subject to the following restrictions:
  54  *
  55  *      1. The author is not responsible for the consequences of use of
  56  *              this software, no matter how awful, even if they arise
  57  *              from defects in it.
  58  *
  59  *      2. The origin of this software must not be misrepresented, either
  60  *              by explicit claim or by omission.
  61  *
  62  *      3. Altered versions must be plainly marked as such, and must not
  63  *              be misrepresented as being the original software.
  64  *
  65  ****    Alterations to Henry's code are...
  66  ****
  67  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  68  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  69  ****    by Larry Wall and others
  70  ****
  71  ****    You may distribute under the terms of either the GNU General Public
  72  ****    License or the Artistic License, as specified in the README file.
  73  *
  74  * Beware that some of this code is subtly aware of the way operator
  75  * precedence is structured in regular expressions.  Serious changes in
  76  * regular-expression syntax might require a total rethink.
  77  */
  78 #include "EXTERN.h"
  79 #define PERL_IN_REGEXEC_C
  80 #include "perl.h"
  81
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "inline_invlist.c"
  89 #include "unicode_constants.h"
  90
  91 #define RF_tainted      1       /* tainted information used? e.g. locale */
  92 #define RF_warned       2               /* warned about big count? */
  93
  94 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  95
  96 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  97
  98 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  99
 100 #ifndef STATIC
 101 #define STATIC  static
 102 #endif
 103
 104 /* Valid for non-utf8 strings: avoids the reginclass
 105  * call if there are no complications: i.e., if everything matchable is
 106  * straight forward in the bitmap */
 107 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 108                                               : ANYOF_BITMAP_TEST(p,*(c)))
 109
 110 /*
 111  * Forwards.
 112  */
 113
 114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 115 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 116
 117 #define HOPc(pos,off) \
 118         (char *)(PL_reg_match_utf8 \
 119             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 120             : (U8*)(pos + off))
 121 #define HOPBACKc(pos, off) \
 122         (char*)(PL_reg_match_utf8\
 123             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 124             : (pos - off >= PL_bostr)           \
 125                 ? (U8*)pos - off                \
 126                 : NULL)
 127
 128 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 129 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 130
 131
 132 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 133 #define NEXTCHR_IS_EOS (nextchr < 0)
 134
 135 #define SET_nextchr \
 136     nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
 137
 138 #define SET_locinput(p) \
 139     locinput = (p);  \
 140     SET_nextchr
 141
 142
 143 /* these are unrolled below in the CCC_TRY_XXX defined */
 144 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 145     if (!CAT2(PL_utf8_,class)) { \
 146         bool ok; \
 147         ENTER; save_re_context(); \
 148         ok=CAT2(is_utf8_,class)((const U8*)str); \
 149         PERL_UNUSED_VAR(ok); \
 150         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 151 /* Doesn't do an assert to verify that is correct */
 152 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 153     if (!CAT2(PL_utf8_,class)) { \
 154         bool throw_away PERL_UNUSED_DECL; \
 155         ENTER; save_re_context(); \
 156         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 157         LEAVE; } } STMT_END
 158
 159 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 160 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 161 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 162
 163 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 164         /* No asserts are done for some of these, in case called on a   */  \
 165         /* Unicode version in which they map to nothing */                  \
 166         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
 167         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 168
 169 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 170
 171 /* The actual code for CCC_TRY, which uses several variables from the routine
 172  * it's callable from.  It is designed to be the bulk of a case statement.
 173  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 174  *      nextchr matches the class.
 175  * UTF8_TEST is the whole test string to use for utf8 targets
 176  * LOAD is what to use to test, and if not present to load in the swash for the
 177  *      class
 178  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 179  *      UTF8_TEST test.
 180  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 181  * utf8 and a variant, load the swash if necessary and test using the utf8
 182  * test.  Advance to the next character if test is ok, otherwise fail; If not
 183  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 184  * fails, or advance to the next character */
 185
 186 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 187     if (NEXTCHR_IS_EOS) {                                              \
 188         sayNO;                                                                \
 189     }                                                                         \
 190     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 191         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 192         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 193             sayNO;                                                            \
 194         }                                                                     \
 195     }                                                                         \
 196     else if (POS_OR_NEG (FUNC(nextchr))) {                                    \
 197             sayNO;                                                            \
 198     }                                                                         \
 199     goto increment_locinput;
 200
 201 /* Handle the non-locale cases for a character class and its complement.  It
 202  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 203  * This is because that code fails when the test succeeds, so we want to have
 204  * the test fail so that the code succeeds.  The swash is stored in a
 205  * predictable PL_ place */
 206 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 207                            CLASS, STR)                                        \
 208     case NAME:                                                                \
 209         _CCC_TRY_CODE( !, FUNC,                                               \
 210                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 211                                             (U8*)locinput, TRUE)),            \
 212                           CLASS, STR)                                         \
 213     case NNAME:                                                               \
 214         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 215                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 216                                             (U8*)locinput, TRUE)),            \
 217                           CLASS, STR)                                         \
 218
 219 /* Generate the case statements for both locale and non-locale character
 220  * classes in regmatch for classes that don't have special unicode semantics.
 221  * Locales don't use an immediate swash, but an intermediary special locale
 222  * function that is called on the pointer to the current place in the input
 223  * string.  That function will resolve to needing the same swash.  One might
 224  * think that because we don't know what the locale will match, we shouldn't
 225  * check with the swash loading function that it loaded properly; ie, that we
 226  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 227  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 228  * irrelevant here */
 229 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 230                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 231                 NAMEA, NNAMEA, FUNCA,                                         \
 232                 CLASS, STR)                                                   \
 233     case NAMEL:                                                               \
 234         PL_reg_flags |= RF_tainted;                                           \
 235         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 236     case NNAMEL:                                                              \
 237         PL_reg_flags |= RF_tainted;                                           \
 238         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 239                        CLASS, STR)                                            \
 240     case NAMEA:                                                               \
 241         if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 242             sayNO;                                                            \
 243         }                                                                     \
 244         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 245         locinput++;                                        \
 246         break;                                                                \
 247     case NNAMEA:                                                              \
 248         if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 249             sayNO;                                                            \
 250         }                                                                     \
 251         goto increment_locinput;                                              \
 252     /* Generate the non-locale cases */                                       \
 253     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 254
 255 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 256  * statements to handle separate Unicode semantics nodes */
 257 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 258                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 259                   NAMEU, NNAMEU, FUNCU,                                        \
 260                   NAMEA, NNAMEA, FUNCA,                                        \
 261                   CLASS, STR)                                                  \
 262     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 263             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 264             NAMEA, NNAMEA, FUNCA,                                              \
 265             CLASS, STR)                                                        \
 266     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 267
 268 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 269
 270 /* for use after a quantifier and before an EXACT-like node -- japhy */
 271 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 272  *
 273  * NOTE that *nothing* that affects backtracking should be in here, specifically
 274  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 275  * node that is in between two EXACT like nodes when ascertaining what the required
 276  * "follow" character is. This should probably be moved to regex compile time
 277  * although it may be done at run time beause of the REF possibility - more
 278  * investigation required. -- demerphq
 279 */
 280 #define JUMPABLE(rn) (      \
 281     OP(rn) == OPEN ||       \
 282     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 283     OP(rn) == EVAL ||   \
 284     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 285     OP(rn) == PLUS || OP(rn) == MINMOD || \
 286     OP(rn) == KEEPS || \
 287     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 288 )
 289 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 290
 291 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 292
 293 #if 0
 294 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 295    we don't need this definition. */
 296 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 297 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 298 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 299
 300 #else
 301 /* ... so we use this as its faster. */
 302 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 303 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 304 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 305 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 306
 307 #endif
 308
 309 /*
 310   Search for mandatory following text node; for lookahead, the text must
 311   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 312 */
 313 #define FIND_NEXT_IMPT(rn) STMT_START { \
 314     while (JUMPABLE(rn)) { \
 315         const OPCODE type = OP(rn); \
 316         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 317             rn = NEXTOPER(NEXTOPER(rn)); \
 318         else if (type == PLUS) \
 319             rn = NEXTOPER(rn); \
 320         else if (type == IFMATCH) \
 321             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 322         else rn += NEXT_OFF(rn); \
 323     } \
 324 } STMT_END
 325
 326
 327 static void restore_pos(pTHX_ void *arg);
 328
 329 #define REGCP_PAREN_ELEMS 3
 330 #define REGCP_OTHER_ELEMS 3
 331 #define REGCP_FRAME_ELEMS 1
 332 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 333  * are needed for the regexp context stack bookkeeping. */
 334
 335 STATIC CHECKPOINT
 336 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 337 {
 338     dVAR;
 339     const int retval = PL_savestack_ix;
 340     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 341     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 342     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 343     I32 p;
 344     GET_RE_DEBUG_FLAGS_DECL;
 345
 346     PERL_ARGS_ASSERT_REGCPPUSH;
 347
 348     if (paren_elems_to_push < 0)
 349         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 350                    paren_elems_to_push);
 351
 352     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 353         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 354                    " out of range (%lu-%ld)",
 355                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 356
 357     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 358
 359     DEBUG_BUFFERS_r(
 360         if ((int)PL_regsize > (int)parenfloor)
 361             PerlIO_printf(Perl_debug_log,
 362                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 363                 PTR2UV(rex),
 364                 PTR2UV(rex->offs)
 365             );
 366     );
 367     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 368 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 369         SSPUSHINT(rex->offs[p].end);
 370         SSPUSHINT(rex->offs[p].start);
 371         SSPUSHINT(rex->offs[p].start_tmp);
 372         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 373             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 374             (UV)p,
 375             (IV)rex->offs[p].start,
 376             (IV)rex->offs[p].start_tmp,
 377             (IV)rex->offs[p].end
 378         ));
 379     }
 380 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 381     SSPUSHINT(PL_regsize);
 382     SSPUSHINT(rex->lastparen);
 383     SSPUSHINT(rex->lastcloseparen);
 384     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 385
 386     return retval;
 387 }
 388
 389 /* These are needed since we do not localize EVAL nodes: */
 390 #define REGCP_SET(cp)                                           \
 391     DEBUG_STATE_r(                                              \
 392             PerlIO_printf(Perl_debug_log,                       \
 393                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 394                 (IV)PL_savestack_ix));                          \
 395     cp = PL_savestack_ix
 396
 397 #define REGCP_UNWIND(cp)                                        \
 398     DEBUG_STATE_r(                                              \
 399         if (cp != PL_savestack_ix)                              \
 400             PerlIO_printf(Perl_debug_log,                       \
 401                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 402                 (IV)(cp), (IV)PL_savestack_ix));                \
 403     regcpblow(cp)
 404
 405 #define UNWIND_PAREN(lp, lcp)               \
 406     for (n = rex->lastparen; n > lp; n--)   \
 407         rex->offs[n].end = -1;              \
 408     rex->lastparen = n;                     \
 409     rex->lastcloseparen = lcp;
 410
 411
 412 STATIC void
 413 S_regcppop(pTHX_ regexp *rex)
 414 {
 415     dVAR;
 416     UV i;
 417     U32 paren;
 418     GET_RE_DEBUG_FLAGS_DECL;
 419
 420     PERL_ARGS_ASSERT_REGCPPOP;
 421
 422     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 423     i = SSPOPUV;
 424     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 425     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 426     rex->lastcloseparen = SSPOPINT;
 427     rex->lastparen = SSPOPINT;
 428     PL_regsize = SSPOPINT;
 429
 430     i -= REGCP_OTHER_ELEMS;
 431     /* Now restore the parentheses context. */
 432     DEBUG_BUFFERS_r(
 433         if (i || rex->lastparen + 1 <= rex->nparens)
 434             PerlIO_printf(Perl_debug_log,
 435                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 436                 PTR2UV(rex),
 437                 PTR2UV(rex->offs)
 438             );
 439     );
 440     paren = PL_regsize;
 441     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 442         I32 tmps;
 443         rex->offs[paren].start_tmp = SSPOPINT;
 444         rex->offs[paren].start = SSPOPINT;
 445         tmps = SSPOPINT;
 446         if (paren <= rex->lastparen)
 447             rex->offs[paren].end = tmps;
 448         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 449             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 450             (UV)paren,
 451             (IV)rex->offs[paren].start,
 452             (IV)rex->offs[paren].start_tmp,
 453             (IV)rex->offs[paren].end,
 454             (paren > rex->lastparen ? "(skipped)" : ""));
 455         );
 456         paren--;
 457     }
 458 #if 1
 459     /* It would seem that the similar code in regtry()
 460      * already takes care of this, and in fact it is in
 461      * a better location to since this code can #if 0-ed out
 462      * but the code in regtry() is needed or otherwise tests
 463      * requiring null fields (pat.t#187 and split.t#{13,14}
 464      * (as of patchlevel 7877)  will fail.  Then again,
 465      * this code seems to be necessary or otherwise
 466      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 467      * --jhi updated by dapm */
 468     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 469         if (i > PL_regsize)
 470             rex->offs[i].start = -1;
 471         rex->offs[i].end = -1;
 472         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 473             "    \\%"UVuf": %s   ..-1 undeffing\n",
 474             (UV)i,
 475             (i > PL_regsize) ? "-1" : "  "
 476         ));
 477     }
 478 #endif
 479 }
 480
 481 /* restore the parens and associated vars at savestack position ix,
 482  * but without popping the stack */
 483
 484 STATIC void
 485 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 486 {
 487     I32 tmpix = PL_savestack_ix;
 488     PL_savestack_ix = ix;
 489     regcppop(rex);
 490     PL_savestack_ix = tmpix;
 491 }
 492
 493 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 494
 495 /*
 496  * pregexec and friends
 497  */
 498
 499 #ifndef PERL_IN_XSUB_RE
 500 /*
 501  - pregexec - match a regexp against a string
 502  */
 503 I32
 504 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 505          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 506 /* stringarg: the point in the string at which to begin matching */
 507 /* strend:    pointer to null at end of string */
 508 /* strbeg:    real beginning of string */
 509 /* minend:    end of match must be >= minend bytes after stringarg. */
 510 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 511  *            itself is accessed via the pointers above */
 512 /* nosave:    For optimizations. */
 513 {
 514     PERL_ARGS_ASSERT_PREGEXEC;
 515
 516     return
 517         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 518                       nosave ? 0 : REXEC_COPY_STR);
 519 }
 520 #endif
 521
 522 /*
 523  * Need to implement the following flags for reg_anch:
 524  *
 525  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 526  * USE_INTUIT_ML
 527  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 528  * INTUIT_AUTORITATIVE_ML
 529  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 530  * INTUIT_ONCE_ML
 531  *
 532  * Another flag for this function: SECOND_TIME (so that float substrs
 533  * with giant delta may be not rechecked).
 534  */
 535
 536 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 537
 538 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 539    Otherwise, only SvCUR(sv) is used to get strbeg. */
 540
 541 /* XXXX We assume that strpos is strbeg unless sv. */
 542
 543 /* XXXX Some places assume that there is a fixed substring.
 544         An update may be needed if optimizer marks as "INTUITable"
 545         RExen without fixed substrings.  Similarly, it is assumed that
 546         lengths of all the strings are no more than minlen, thus they
 547         cannot come from lookahead.
 548         (Or minlen should take into account lookahead.)
 549   NOTE: Some of this comment is not correct. minlen does now take account
 550   of lookahead/behind. Further research is required. -- demerphq
 551
 552 */
 553
 554 /* A failure to find a constant substring means that there is no need to make
 555    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 556    finding a substring too deep into the string means that less calls to
 557    regtry() should be needed.
 558
 559    REx compiler's optimizer found 4 possible hints:
 560         a) Anchored substring;
 561         b) Fixed substring;
 562         c) Whether we are anchored (beginning-of-line or \G);
 563         d) First node (of those at offset 0) which may distinguish positions;
 564    We use a)b)d) and multiline-part of c), and try to find a position in the
 565    string which does not contradict any of them.
 566  */
 567
 568 /* Most of decisions we do here should have been done at compile time.
 569    The nodes of the REx which we used for the search should have been
 570    deleted from the finite automaton. */
 571
 572 char *
 573 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 574                      char *strend, const U32 flags, re_scream_pos_data *data)
 575 {
 576     dVAR;
 577     struct regexp *const prog = (struct regexp *)SvANY(rx);
 578     I32 start_shift = 0;
 579     /* Should be nonnegative! */
 580     I32 end_shift   = 0;
 581     char *s;
 582     SV *check;
 583     char *strbeg;
 584     char *t;
 585     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 586     I32 ml_anch;
 587     char *other_last = NULL;    /* other substr checked before this */
 588     char *check_at = NULL;              /* check substr found at this pos */
 589     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 590     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 591     RXi_GET_DECL(prog,progi);
 592 #ifdef DEBUGGING
 593     const char * const i_strpos = strpos;
 594 #endif
 595     GET_RE_DEBUG_FLAGS_DECL;
 596
 597     PERL_ARGS_ASSERT_RE_INTUIT_START;
 598     PERL_UNUSED_ARG(flags);
 599     PERL_UNUSED_ARG(data);
 600
 601     RX_MATCH_UTF8_set(rx,utf8_target);
 602
 603     if (RX_UTF8(rx)) {
 604         PL_reg_flags |= RF_utf8;
 605     }
 606     DEBUG_EXECUTE_r(
 607         debug_start_match(rx, utf8_target, strpos, strend,
 608             sv ? "Guessing start of match in sv for"
 609                : "Guessing start of match in string for");
 610               );
 611
 612     /* CHR_DIST() would be more correct here but it makes things slow. */
 613     if (prog->minlen > strend - strpos) {
 614         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 615                               "String too short... [re_intuit_start]\n"));
 616         goto fail;
 617     }
 618
 619     /* XXX we need to pass strbeg as a separate arg: the following is
 620      * guesswork and can be wrong... */
 621     if (sv && SvPOK(sv)) {
 622         char * p   = SvPVX(sv);
 623         STRLEN cur = SvCUR(sv);
 624         if (p <= strpos && strpos < p + cur) {
 625             strbeg = p;
 626             assert(p <= strend && strend <= p + cur);
 627         }
 628         else
 629             strbeg = strend - cur;
 630     }
 631     else
 632         strbeg = strpos;
 633
 634     PL_regeol = strend;
 635     if (utf8_target) {
 636         if (!prog->check_utf8 && prog->check_substr)
 637             to_utf8_substr(prog);
 638         check = prog->check_utf8;
 639     } else {
 640         if (!prog->check_substr && prog->check_utf8) {
 641             if (! to_byte_substr(prog)) {
 642                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 643                                         non_utf8_target_but_utf8_required));
 644                 goto fail;
 645             }
 646         }
 647         check = prog->check_substr;
 648     }
 649     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 650         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 651                      || ( (prog->extflags & RXf_ANCH_BOL)
 652                           && !multiline ) );    /* Check after \n? */
 653
 654         if (!ml_anch) {
 655           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 656                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 657                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 658                && sv && !SvROK(sv)
 659                && (strpos != strbeg)) {
 660               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 661               goto fail;
 662           }
 663           if (prog->check_offset_min == prog->check_offset_max &&
 664               !(prog->extflags & RXf_CANY_SEEN)) {
 665             /* Substring at constant offset from beg-of-str... */
 666             I32 slen;
 667
 668             s = HOP3c(strpos, prog->check_offset_min, strend);
 669
 670             if (SvTAIL(check)) {
 671                 slen = SvCUR(check);    /* >= 1 */
 672
 673                 if ( strend - s > slen || strend - s < slen - 1
 674                      || (strend - s == slen && strend[-1] != '\n')) {
 675                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 676                     goto fail_finish;
 677                 }
 678                 /* Now should match s[0..slen-2] */
 679                 slen--;
 680                 if (slen && (*SvPVX_const(check) != *s
 681                              || (slen > 1
 682                                  && memNE(SvPVX_const(check), s, slen)))) {
 683                   report_neq:
 684                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 685                     goto fail_finish;
 686                 }
 687             }
 688             else if (*SvPVX_const(check) != *s
 689                      || ((slen = SvCUR(check)) > 1
 690                          && memNE(SvPVX_const(check), s, slen)))
 691                 goto report_neq;
 692             check_at = s;
 693             goto success_at_start;
 694           }
 695         }
 696         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 697         s = strpos;
 698         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 699         end_shift = prog->check_end_shift;
 700
 701         if (!ml_anch) {
 702             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 703                                          - (SvTAIL(check) != 0);
 704             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 705
 706             if (end_shift < eshift)
 707                 end_shift = eshift;
 708         }
 709     }
 710     else {                              /* Can match at random position */
 711         ml_anch = 0;
 712         s = strpos;
 713         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 714         end_shift = prog->check_end_shift;
 715
 716         /* end shift should be non negative here */
 717     }
 718
 719 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 720     if (end_shift < 0)
 721         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 722                    (IV)end_shift, RX_PRECOMP(prog));
 723 #endif
 724
 725   restart:
 726     /* Find a possible match in the region s..strend by looking for
 727        the "check" substring in the region corrected by start/end_shift. */
 728
 729     {
 730         I32 srch_start_shift = start_shift;
 731         I32 srch_end_shift = end_shift;
 732         U8* start_point;
 733         U8* end_point;
 734         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 735             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 736             srch_start_shift = strbeg - s;
 737         }
 738     DEBUG_OPTIMISE_MORE_r({
 739         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 740             (IV)prog->check_offset_min,
 741             (IV)srch_start_shift,
 742             (IV)srch_end_shift,
 743             (IV)prog->check_end_shift);
 744     });
 745
 746         if (prog->extflags & RXf_CANY_SEEN) {
 747             start_point= (U8*)(s + srch_start_shift);
 748             end_point= (U8*)(strend - srch_end_shift);
 749         } else {
 750             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 751             end_point= HOP3(strend, -srch_end_shift, strbeg);
 752         }
 753         DEBUG_OPTIMISE_MORE_r({
 754             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 755                 (int)(end_point - start_point),
 756                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 757                 start_point);
 758         });
 759
 760         s = fbm_instr( start_point, end_point,
 761                       check, multiline ? FBMrf_MULTILINE : 0);
 762     }
 763     /* Update the count-of-usability, remove useless subpatterns,
 764         unshift s.  */
 765
 766     DEBUG_EXECUTE_r({
 767         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 768             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 769         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 770                           (s ? "Found" : "Did not find"),
 771             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 772                 ? "anchored" : "floating"),
 773             quoted,
 774             RE_SV_TAIL(check),
 775             (s ? " at offset " : "...\n") );
 776     });
 777
 778     if (!s)
 779         goto fail_finish;
 780     /* Finish the diagnostic message */
 781     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 782
 783     /* XXX dmq: first branch is for positive lookbehind...
 784        Our check string is offset from the beginning of the pattern.
 785        So we need to do any stclass tests offset forward from that
 786        point. I think. :-(
 787      */
 788
 789
 790
 791     check_at=s;
 792
 793
 794     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 795        Start with the other substr.
 796        XXXX no SCREAM optimization yet - and a very coarse implementation
 797        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 798                 *always* match.  Probably should be marked during compile...
 799        Probably it is right to do no SCREAM here...
 800      */
 801
 802     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 803                 : (prog->float_substr && prog->anchored_substr))
 804     {
 805         /* Take into account the "other" substring. */
 806         /* XXXX May be hopelessly wrong for UTF... */
 807         if (!other_last)
 808             other_last = strpos;
 809         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 810           do_other_anchored:
 811             {
 812                 char * const last = HOP3c(s, -start_shift, strbeg);
 813                 char *last1, *last2;
 814                 char * const saved_s = s;
 815                 SV* must;
 816
 817                 t = s - prog->check_offset_max;
 818                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 819                     && (!utf8_target
 820                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 821                             && t > strpos)))
 822                     NOOP;
 823                 else
 824                     t = strpos;
 825                 t = HOP3c(t, prog->anchored_offset, strend);
 826                 if (t < other_last)     /* These positions already checked */
 827                     t = other_last;
 828                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 829                 if (last < last1)
 830                     last1 = last;
 831                 /* XXXX It is not documented what units *_offsets are in.
 832                    We assume bytes, but this is clearly wrong.
 833                    Meaning this code needs to be carefully reviewed for errors.
 834                    dmq.
 835                   */
 836
 837                 /* On end-of-str: see comment below. */
 838                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 839                 if (must == &PL_sv_undef) {
 840                     s = (char*)NULL;
 841                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 842                 }
 843                 else
 844                     s = fbm_instr(
 845                         (unsigned char*)t,
 846                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 847                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 848                         must,
 849                         multiline ? FBMrf_MULTILINE : 0
 850                     );
 851                 DEBUG_EXECUTE_r({
 852                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 853                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 854                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 855                         (s ? "Found" : "Contradicts"),
 856                         quoted, RE_SV_TAIL(must));
 857                 });
 858
 859
 860                 if (!s) {
 861                     if (last1 >= last2) {
 862                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 863                                                 ", giving up...\n"));
 864                         goto fail_finish;
 865                     }
 866                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 867                         ", trying floating at offset %ld...\n",
 868                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 869                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 870                     s = HOP3c(last, 1, strend);
 871                     goto restart;
 872                 }
 873                 else {
 874                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 875                           (long)(s - i_strpos)));
 876                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 877                     other_last = HOP3c(s, 1, strend);
 878                     s = saved_s;
 879                     if (t == strpos)
 880                         goto try_at_start;
 881                     goto try_at_offset;
 882                 }
 883             }
 884         }
 885         else {          /* Take into account the floating substring. */
 886             char *last, *last1;
 887             char * const saved_s = s;
 888             SV* must;
 889
 890             t = HOP3c(s, -start_shift, strbeg);
 891             last1 = last =
 892                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 893             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 894                 last = HOP3c(t, prog->float_max_offset, strend);
 895             s = HOP3c(t, prog->float_min_offset, strend);
 896             if (s < other_last)
 897                 s = other_last;
 898  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 899             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 900             /* fbm_instr() takes into account exact value of end-of-str
 901                if the check is SvTAIL(ed).  Since false positives are OK,
 902                and end-of-str is not later than strend we are OK. */
 903             if (must == &PL_sv_undef) {
 904                 s = (char*)NULL;
 905                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 906             }
 907             else
 908                 s = fbm_instr((unsigned char*)s,
 909                               (unsigned char*)last + SvCUR(must)
 910                                   - (SvTAIL(must)!=0),
 911                               must, multiline ? FBMrf_MULTILINE : 0);
 912             DEBUG_EXECUTE_r({
 913                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 914                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 915                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 916                     (s ? "Found" : "Contradicts"),
 917                     quoted, RE_SV_TAIL(must));
 918             });
 919             if (!s) {
 920                 if (last1 == last) {
 921                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 922                                             ", giving up...\n"));
 923                     goto fail_finish;
 924                 }
 925                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 926                     ", trying anchored starting at offset %ld...\n",
 927                     (long)(saved_s + 1 - i_strpos)));
 928                 other_last = last;
 929                 s = HOP3c(t, 1, strend);
 930                 goto restart;
 931             }
 932             else {
 933                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 934                       (long)(s - i_strpos)));
 935                 other_last = s; /* Fix this later. --Hugo */
 936                 s = saved_s;
 937                 if (t == strpos)
 938                     goto try_at_start;
 939                 goto try_at_offset;
 940             }
 941         }
 942     }
 943
 944
 945     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 946
 947     DEBUG_OPTIMISE_MORE_r(
 948         PerlIO_printf(Perl_debug_log,
 949             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 950             (IV)prog->check_offset_min,
 951             (IV)prog->check_offset_max,
 952             (IV)(s-strpos),
 953             (IV)(t-strpos),
 954             (IV)(t-s),
 955             (IV)(strend-strpos)
 956         )
 957     );
 958
 959     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 960         && (!utf8_target
 961             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 962                  && t > strpos)))
 963     {
 964         /* Fixed substring is found far enough so that the match
 965            cannot start at strpos. */
 966       try_at_offset:
 967         if (ml_anch && t[-1] != '\n') {
 968             /* Eventually fbm_*() should handle this, but often
 969                anchored_offset is not 0, so this check will not be wasted. */
 970             /* XXXX In the code below we prefer to look for "^" even in
 971                presence of anchored substrings.  And we search even
 972                beyond the found float position.  These pessimizations
 973                are historical artefacts only.  */
 974           find_anchor:
 975             while (t < strend - prog->minlen) {
 976                 if (*t == '\n') {
 977                     if (t < check_at - prog->check_offset_min) {
 978                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 979                             /* Since we moved from the found position,
 980                                we definitely contradict the found anchored
 981                                substr.  Due to the above check we do not
 982                                contradict "check" substr.
 983                                Thus we can arrive here only if check substr
 984                                is float.  Redo checking for "other"=="fixed".
 985                              */
 986                             strpos = t + 1;
 987                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 988                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 989                             goto do_other_anchored;
 990                         }
 991                         /* We don't contradict the found floating substring. */
 992                         /* XXXX Why not check for STCLASS? */
 993                         s = t + 1;
 994                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 995                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 996                         goto set_useful;
 997                     }
 998                     /* Position contradicts check-string */
 999                     /* XXXX probably better to look for check-string
1000                        than for "\n", so one should lower the limit for t? */
1001                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1002                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1003                     other_last = strpos = s = t + 1;
1004                     goto restart;
1005                 }
1006                 t++;
1007             }
1008             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1009                         PL_colors[0], PL_colors[1]));
1010             goto fail_finish;
1011         }
1012         else {
1013             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1014                         PL_colors[0], PL_colors[1]));
1015         }
1016         s = t;
1017       set_useful:
1018         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1019     }
1020     else {
1021         /* The found string does not prohibit matching at strpos,
1022            - no optimization of calling REx engine can be performed,
1023            unless it was an MBOL and we are not after MBOL,
1024            or a future STCLASS check will fail this. */
1025       try_at_start:
1026         /* Even in this situation we may use MBOL flag if strpos is offset
1027            wrt the start of the string. */
1028         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1029             && (strpos != strbeg) && strpos[-1] != '\n'
1030             /* May be due to an implicit anchor of m{.*foo}  */
1031             && !(prog->intflags & PREGf_IMPLICIT))
1032         {
1033             t = strpos;
1034             goto find_anchor;
1035         }
1036         DEBUG_EXECUTE_r( if (ml_anch)
1037             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1038                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1039         );
1040       success_at_start:
1041         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1042             && (utf8_target ? (
1043                 prog->check_utf8                /* Could be deleted already */
1044                 && --BmUSEFUL(prog->check_utf8) < 0
1045                 && (prog->check_utf8 == prog->float_utf8)
1046             ) : (
1047                 prog->check_substr              /* Could be deleted already */
1048                 && --BmUSEFUL(prog->check_substr) < 0
1049                 && (prog->check_substr == prog->float_substr)
1050             )))
1051         {
1052             /* If flags & SOMETHING - do not do it many times on the same match */
1053             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1054             /* XXX Does the destruction order has to change with utf8_target? */
1055             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1056             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1057             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1058             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1059             check = NULL;                       /* abort */
1060             s = strpos;
1061             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1062                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1063             if (prog->intflags & PREGf_IMPLICIT)
1064                 prog->extflags &= ~RXf_ANCH_MBOL;
1065             /* XXXX This is a remnant of the old implementation.  It
1066                     looks wasteful, since now INTUIT can use many
1067                     other heuristics. */
1068             prog->extflags &= ~RXf_USE_INTUIT;
1069             /* XXXX What other flags might need to be cleared in this branch? */
1070         }
1071         else
1072             s = strpos;
1073     }
1074
1075     /* Last resort... */
1076     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1077     /* trie stclasses are too expensive to use here, we are better off to
1078        leave it to regmatch itself */
1079     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1080         /* minlen == 0 is possible if regstclass is \b or \B,
1081            and the fixed substr is ''$.
1082            Since minlen is already taken into account, s+1 is before strend;
1083            accidentally, minlen >= 1 guaranties no false positives at s + 1
1084            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1085            regstclass does not come from lookahead...  */
1086         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1087            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1088         const U8* const str = (U8*)STRING(progi->regstclass);
1089         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1090                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1091                     : 1);
1092         char * endpos;
1093         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1094             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1095         else if (prog->float_substr || prog->float_utf8)
1096             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1097         else
1098             endpos= strend;
1099
1100         if (checked_upto < s)
1101            checked_upto = s;
1102         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1103                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1104
1105         t = s;
1106         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1107         if (s) {
1108             checked_upto = s;
1109         } else {
1110 #ifdef DEBUGGING
1111             const char *what = NULL;
1112 #endif
1113             if (endpos == strend) {
1114                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1115                                 "Could not match STCLASS...\n") );
1116                 goto fail;
1117             }
1118             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119                                    "This position contradicts STCLASS...\n") );
1120             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1121                 goto fail;
1122             checked_upto = HOPBACKc(endpos, start_shift);
1123             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1124                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1125             /* Contradict one of substrings */
1126             if (prog->anchored_substr || prog->anchored_utf8) {
1127                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1128                     DEBUG_EXECUTE_r( what = "anchored" );
1129                   hop_and_restart:
1130                     s = HOP3c(t, 1, strend);
1131                     if (s + start_shift + end_shift > strend) {
1132                         /* XXXX Should be taken into account earlier? */
1133                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1134                                                "Could not match STCLASS...\n") );
1135                         goto fail;
1136                     }
1137                     if (!check)
1138                         goto giveup;
1139                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1140                                 "Looking for %s substr starting at offset %ld...\n",
1141                                  what, (long)(s + start_shift - i_strpos)) );
1142                     goto restart;
1143                 }
1144                 /* Have both, check_string is floating */
1145                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1146                     goto retry_floating_check;
1147                 /* Recheck anchored substring, but not floating... */
1148                 s = check_at;
1149                 if (!check)
1150                     goto giveup;
1151                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1152                           "Looking for anchored substr starting at offset %ld...\n",
1153                           (long)(other_last - i_strpos)) );
1154                 goto do_other_anchored;
1155             }
1156             /* Another way we could have checked stclass at the
1157                current position only: */
1158             if (ml_anch) {
1159                 s = t = t + 1;
1160                 if (!check)
1161                     goto giveup;
1162                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1163                           "Looking for /%s^%s/m starting at offset %ld...\n",
1164                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1165                 goto try_at_offset;
1166             }
1167             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1168                 goto fail;
1169             /* Check is floating substring. */
1170           retry_floating_check:
1171             t = check_at - start_shift;
1172             DEBUG_EXECUTE_r( what = "floating" );
1173             goto hop_and_restart;
1174         }
1175         if (t != s) {
1176             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1177                         "By STCLASS: moving %ld --> %ld\n",
1178                                   (long)(t - i_strpos), (long)(s - i_strpos))
1179                    );
1180         }
1181         else {
1182             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1183                                   "Does not contradict STCLASS...\n");
1184                    );
1185         }
1186     }
1187   giveup:
1188     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1189                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1190                           PL_colors[5], (long)(s - i_strpos)) );
1191     return s;
1192
1193   fail_finish:                          /* Substring not found */
1194     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1195         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1196   fail:
1197     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1198                           PL_colors[4], PL_colors[5]));
1199     return NULL;
1200 }
1201
1202 #define DECL_TRIE_TYPE(scan) \
1203     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1204                     trie_type = ((scan->flags == EXACT) \
1205                               ? (utf8_target ? trie_utf8 : trie_plain) \
1206                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1207
1208 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1209 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1210     STRLEN skiplen;                                                                 \
1211     switch (trie_type) {                                                            \
1212     case trie_utf8_fold:                                                            \
1213         if ( foldlen>0 ) {                                                          \
1214             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1215             foldlen -= len;                                                         \
1216             uscan += len;                                                           \
1217             len=0;                                                                  \
1218         } else {                                                                    \
1219             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1220             len = UTF8SKIP(uc);                                                     \
1221             skiplen = UNISKIP( uvc );                                               \
1222             foldlen -= skiplen;                                                     \
1223             uscan = foldbuf + skiplen;                                              \
1224         }                                                                           \
1225         break;                                                                      \
1226     case trie_latin_utf8_fold:                                                      \
1227         if ( foldlen>0 ) {                                                          \
1228             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1229             foldlen -= len;                                                         \
1230             uscan += len;                                                           \
1231             len=0;                                                                  \
1232         } else {                                                                    \
1233             len = 1;                                                                \
1234             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1235             skiplen = UNISKIP( uvc );                                               \
1236             foldlen -= skiplen;                                                     \
1237             uscan = foldbuf + skiplen;                                              \
1238         }                                                                           \
1239         break;                                                                      \
1240     case trie_utf8:                                                                 \
1241         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1242         break;                                                                      \
1243     case trie_plain:                                                                \
1244         uvc = (UV)*uc;                                                              \
1245         len = 1;                                                                    \
1246     }                                                                               \
1247     if (uvc < 256) {                                                                \
1248         charid = trie->charmap[ uvc ];                                              \
1249     }                                                                               \
1250     else {                                                                          \
1251         charid = 0;                                                                 \
1252         if (widecharmap) {                                                          \
1253             SV** const svpp = hv_fetch(widecharmap,                                 \
1254                         (char*)&uvc, sizeof(UV), 0);                                \
1255             if (svpp)                                                               \
1256                 charid = (U16)SvIV(*svpp);                                          \
1257         }                                                                           \
1258     }                                                                               \
1259 } STMT_END
1260
1261 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1262 STMT_START {                                              \
1263     while (s <= e) {                                      \
1264         if ( (CoNd)                                       \
1265              && (ln == 1 || folder(s, pat_string, ln))    \
1266              && (!reginfo || regtry(reginfo, &s)) )       \
1267             goto got_it;                                  \
1268         s++;                                              \
1269     }                                                     \
1270 } STMT_END
1271
1272 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1273 STMT_START {                                          \
1274     while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
1275         CoDe                                          \
1276         s += uskip;                                   \
1277     }                                                 \
1278 } STMT_END
1279
1280 #define REXEC_FBC_SCAN(CoDe)                          \
1281 STMT_START {                                          \
1282     while (s < strend) {                              \
1283         CoDe                                          \
1284         s++;                                          \
1285     }                                                 \
1286 } STMT_END
1287
1288 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1289 REXEC_FBC_UTF8_SCAN(                                  \
1290     if (CoNd) {                                       \
1291         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1292             goto got_it;                              \
1293         else                                          \
1294             tmp = doevery;                            \
1295     }                                                 \
1296     else                                              \
1297         tmp = 1;                                      \
1298 )
1299
1300 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1301 REXEC_FBC_SCAN(                                       \
1302     if (CoNd) {                                       \
1303         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1304             goto got_it;                              \
1305         else                                          \
1306             tmp = doevery;                            \
1307     }                                                 \
1308     else                                              \
1309         tmp = 1;                                      \
1310 )
1311
1312 #define REXEC_FBC_TRYIT               \
1313 if ((!reginfo || regtry(reginfo, &s))) \
1314     goto got_it
1315
1316 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1317     if (utf8_target) {                                             \
1318         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1319     }                                                          \
1320     else {                                                     \
1321         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1322     }
1323
1324 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1325     if (utf8_target) {                                             \
1326         UtFpReLoAd;                                            \
1327         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1328     }                                                          \
1329     else {                                                     \
1330         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1331     }
1332
1333 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1334     PL_reg_flags |= RF_tainted;                                \
1335     if (utf8_target) {                                             \
1336         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1337     }                                                          \
1338     else {                                                     \
1339         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1340     }
1341
1342 #define DUMP_EXEC_POS(li,s,doutf8) \
1343     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1344
1345
1346 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1347         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1348         tmp = TEST_NON_UTF8(tmp);                                              \
1349         REXEC_FBC_UTF8_SCAN(                                                   \
1350             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1351                 tmp = !tmp;                                                    \
1352                 IF_SUCCESS;                                                    \
1353             }                                                                  \
1354             else {                                                             \
1355                 IF_FAIL;                                                       \
1356             }                                                                  \
1357         );                                                                     \
1358
1359 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1360         if (s == PL_bostr) {                                                   \
1361             tmp = '\n';                                                        \
1362         }                                                                      \
1363         else {                                                                 \
1364             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1365             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1366         }                                                                      \
1367         tmp = TeSt1_UtF8;                                                      \
1368         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1369         REXEC_FBC_UTF8_SCAN(                                                   \
1370             if (tmp == ! (TeSt2_UtF8)) { \
1371                 tmp = !tmp;                                                    \
1372                 IF_SUCCESS;                                                    \
1373             }                                                                  \
1374             else {                                                             \
1375                 IF_FAIL;                                                       \
1376             }                                                                  \
1377         );                                                                     \
1378
1379 /* The only difference between the BOUND and NBOUND cases is that
1380  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1381  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1382  * with the other one being empty */
1383 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1384     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1385
1386 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1387     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1388
1389 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1390     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1391
1392 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1393     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1394
1395
1396 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1397  * be passed in completely with the variable name being tested, which isn't
1398  * such a clean interface, but this is easier to read than it was before.  We
1399  * are looking for the boundary (or non-boundary between a word and non-word
1400  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1401  * must be different.  Find the "wordness" of the character just prior to this
1402  * one, and compare it with the wordness of this one.  If they differ, we have
1403  * a boundary.  At the beginning of the string, pretend that the previous
1404  * character was a new-line */
1405 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1406     if (utf8_target) {                                                         \
1407                 UTF8_CODE \
1408     }                                                                          \
1409     else {  /* Not utf8 */                                                     \
1410         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1411         tmp = TEST_NON_UTF8(tmp);                                              \
1412         REXEC_FBC_SCAN(                                                        \
1413             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1414                 tmp = !tmp;                                                    \
1415                 IF_SUCCESS;                                                    \
1416             }                                                                  \
1417             else {                                                             \
1418                 IF_FAIL;                                                       \
1419             }                                                                  \
1420         );                                                                     \
1421     }                                                                          \
1422     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1423         goto got_it;
1424
1425 /* We know what class REx starts with.  Try to find this position... */
1426 /* if reginfo is NULL, its a dryrun */
1427 /* annoyingly all the vars in this routine have different names from their counterparts
1428    in regmatch. /grrr */
1429
1430 STATIC char *
1431 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1432     const char *strend, regmatch_info *reginfo)
1433 {
1434         dVAR;
1435         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1436         char *pat_string;   /* The pattern's exactish string */
1437         char *pat_end;      /* ptr to end char of pat_string */
1438         re_fold_t folder;       /* Function for computing non-utf8 folds */
1439         const U8 *fold_array;   /* array for folding ords < 256 */
1440         STRLEN ln;
1441         STRLEN lnc;
1442         STRLEN uskip;
1443         U8 c1;
1444         U8 c2;
1445         char *e;
1446         I32 tmp = 1;    /* Scratch variable? */
1447         const bool utf8_target = PL_reg_match_utf8;
1448         UV utf8_fold_flags = 0;
1449         RXi_GET_DECL(prog,progi);
1450
1451         PERL_ARGS_ASSERT_FIND_BYCLASS;
1452
1453         /* We know what class it must start with. */
1454         switch (OP(c)) {
1455         case ANYOF:
1456             if (utf8_target) {
1457                 STRLEN inclasslen = strend - s;
1458                 REXEC_FBC_UTF8_CLASS_SCAN(
1459                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1460             }
1461             else {
1462                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1463             }
1464             break;
1465         case CANY:
1466             REXEC_FBC_SCAN(
1467                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1468                     goto got_it;
1469                 else
1470                     tmp = doevery;
1471             );
1472             break;
1473
1474         case EXACTFA:
1475             if (UTF_PATTERN || utf8_target) {
1476                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1477                 goto do_exactf_utf8;
1478             }
1479             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1480             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1481             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1482
1483         case EXACTF:
1484             if (utf8_target) {
1485
1486                 /* regcomp.c already folded this if pattern is in UTF-8 */
1487                 utf8_fold_flags = 0;
1488                 goto do_exactf_utf8;
1489             }
1490             fold_array = PL_fold;
1491             folder = foldEQ;
1492             goto do_exactf_non_utf8;
1493
1494         case EXACTFL:
1495             if (UTF_PATTERN || utf8_target) {
1496                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1497                 goto do_exactf_utf8;
1498             }
1499             fold_array = PL_fold_locale;
1500             folder = foldEQ_locale;
1501             goto do_exactf_non_utf8;
1502
1503         case EXACTFU_SS:
1504             if (UTF_PATTERN) {
1505                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1506             }
1507             goto do_exactf_utf8;
1508
1509         case EXACTFU_TRICKYFOLD:
1510         case EXACTFU:
1511             if (UTF_PATTERN || utf8_target) {
1512                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1513                 goto do_exactf_utf8;
1514             }
1515
1516             /* Any 'ss' in the pattern should have been replaced by regcomp,
1517              * so we don't have to worry here about this single special case
1518              * in the Latin1 range */
1519             fold_array = PL_fold_latin1;
1520             folder = foldEQ_latin1;
1521
1522             /* FALL THROUGH */
1523
1524         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1525                                are no glitches with fold-length differences
1526                                between the target string and pattern */
1527
1528             /* The idea in the non-utf8 EXACTF* cases is to first find the
1529              * first character of the EXACTF* node and then, if necessary,
1530              * case-insensitively compare the full text of the node.  c1 is the
1531              * first character.  c2 is its fold.  This logic will not work for
1532              * Unicode semantics and the german sharp ss, which hence should
1533              * not be compiled into a node that gets here. */
1534             pat_string = STRING(c);
1535             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1536
1537             /* We know that we have to match at least 'ln' bytes (which is the
1538              * same as characters, since not utf8).  If we have to match 3
1539              * characters, and there are only 2 availabe, we know without
1540              * trying that it will fail; so don't start a match past the
1541              * required minimum number from the far end */
1542             e = HOP3c(strend, -((I32)ln), s);
1543
1544             if (!reginfo && e < s) {
1545                 e = s;                  /* Due to minlen logic of intuit() */
1546             }
1547
1548             c1 = *pat_string;
1549             c2 = fold_array[c1];
1550             if (c1 == c2) { /* If char and fold are the same */
1551                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1552             }
1553             else {
1554                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1555             }
1556             break;
1557
1558         do_exactf_utf8:
1559         {
1560             unsigned expansion;
1561
1562
1563             /* If one of the operands is in utf8, we can't use the simpler
1564              * folding above, due to the fact that many different characters
1565              * can have the same fold, or portion of a fold, or different-
1566              * length fold */
1567             pat_string = STRING(c);
1568             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1569             pat_end = pat_string + ln;
1570             lnc = (UTF_PATTERN) /* length to match in characters */
1571                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1572                     : ln;
1573
1574             /* We have 'lnc' characters to match in the pattern, but because of
1575              * multi-character folding, each character in the target can match
1576              * up to 3 characters (Unicode guarantees it will never exceed
1577              * this) if it is utf8-encoded; and up to 2 if not (based on the
1578              * fact that the Latin 1 folds are already determined, and the
1579              * only multi-char fold in that range is the sharp-s folding to
1580              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1581              * string character.  Adjust lnc accordingly, rounding up, so that
1582              * if we need to match at least 4+1/3 chars, that really is 5. */
1583             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1584             lnc = (lnc + expansion - 1) / expansion;
1585
1586             /* As in the non-UTF8 case, if we have to match 3 characters, and
1587              * only 2 are left, it's guaranteed to fail, so don't start a
1588              * match that would require us to go beyond the end of the string
1589              */
1590             e = HOP3c(strend, -((I32)lnc), s);
1591
1592             if (!reginfo && e < s) {
1593                 e = s;                  /* Due to minlen logic of intuit() */
1594             }
1595
1596             /* XXX Note that we could recalculate e to stop the loop earlier,
1597              * as the worst case expansion above will rarely be met, and as we
1598              * go along we would usually find that e moves further to the left.
1599              * This would happen only after we reached the point in the loop
1600              * where if there were no expansion we should fail.  Unclear if
1601              * worth the expense */
1602
1603             while (s <= e) {
1604                 char *my_strend= (char *)strend;
1605                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1606                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1607                     && (!reginfo || regtry(reginfo, &s)) )
1608                 {
1609                     goto got_it;
1610                 }
1611                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1612             }
1613             break;
1614         }
1615         case BOUNDL:
1616             PL_reg_flags |= RF_tainted;
1617             FBC_BOUND(isALNUM_LC,
1618                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1619                       isALNUM_LC_utf8((U8*)s));
1620             break;
1621         case NBOUNDL:
1622             PL_reg_flags |= RF_tainted;
1623             FBC_NBOUND(isALNUM_LC,
1624                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625                        isALNUM_LC_utf8((U8*)s));
1626             break;
1627         case BOUND:
1628             FBC_BOUND(isWORDCHAR,
1629                       isALNUM_uni(tmp),
1630                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1631             break;
1632         case BOUNDA:
1633             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1634                              isWORDCHAR_A(tmp),
1635                              isWORDCHAR_A((U8*)s));
1636             break;
1637         case NBOUND:
1638             FBC_NBOUND(isWORDCHAR,
1639                        isALNUM_uni(tmp),
1640                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1641             break;
1642         case NBOUNDA:
1643             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1644                               isWORDCHAR_A(tmp),
1645                               isWORDCHAR_A((U8*)s));
1646             break;
1647         case BOUNDU:
1648             FBC_BOUND(isWORDCHAR_L1,
1649                       isALNUM_uni(tmp),
1650                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1651             break;
1652         case NBOUNDU:
1653             FBC_NBOUND(isWORDCHAR_L1,
1654                        isALNUM_uni(tmp),
1655                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1656             break;
1657         case ALNUML:
1658             REXEC_FBC_CSCAN_TAINT(
1659                 isALNUM_LC_utf8((U8*)s),
1660                 isALNUM_LC(*s)
1661             );
1662             break;
1663         case ALNUMU:
1664             REXEC_FBC_CSCAN_PRELOAD(
1665                 LOAD_UTF8_CHARCLASS_ALNUM(),
1666                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1667                 isWORDCHAR_L1((U8) *s)
1668             );
1669             break;
1670         case ALNUM:
1671             REXEC_FBC_CSCAN_PRELOAD(
1672                 LOAD_UTF8_CHARCLASS_ALNUM(),
1673                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1674                 isWORDCHAR((U8) *s)
1675             );
1676             break;
1677         case ALNUMA:
1678             /* Don't need to worry about utf8, as it can match only a single
1679              * byte invariant character */
1680             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1681             break;
1682         case NALNUMU:
1683             REXEC_FBC_CSCAN_PRELOAD(
1684                 LOAD_UTF8_CHARCLASS_ALNUM(),
1685                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1686                 ! isWORDCHAR_L1((U8) *s)
1687             );
1688             break;
1689         case NALNUM:
1690             REXEC_FBC_CSCAN_PRELOAD(
1691                 LOAD_UTF8_CHARCLASS_ALNUM(),
1692                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1693                 ! isALNUM(*s)
1694             );
1695             break;
1696         case NALNUMA:
1697             REXEC_FBC_CSCAN(
1698                 !isWORDCHAR_A(*s),
1699                 !isWORDCHAR_A(*s)
1700             );
1701             break;
1702         case NALNUML:
1703             REXEC_FBC_CSCAN_TAINT(
1704                 !isALNUM_LC_utf8((U8*)s),
1705                 !isALNUM_LC(*s)
1706             );
1707             break;
1708         case SPACEU:
1709             REXEC_FBC_CSCAN_PRELOAD(
1710                 LOAD_UTF8_CHARCLASS_SPACE(),
1711                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1712                 isSPACE_L1((U8) *s)
1713             );
1714             break;
1715         case SPACE:
1716             REXEC_FBC_CSCAN_PRELOAD(
1717                 LOAD_UTF8_CHARCLASS_SPACE(),
1718                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1719                 isSPACE((U8) *s)
1720             );
1721             break;
1722         case SPACEA:
1723             /* Don't need to worry about utf8, as it can match only a single
1724              * byte invariant character */
1725             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1726             break;
1727         case SPACEL:
1728             REXEC_FBC_CSCAN_TAINT(
1729                 isSPACE_LC_utf8((U8*)s),
1730                 isSPACE_LC(*s)
1731             );
1732             break;
1733         case NSPACEU:
1734             REXEC_FBC_CSCAN_PRELOAD(
1735                 LOAD_UTF8_CHARCLASS_SPACE(),
1736                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1737                 ! isSPACE_L1((U8) *s)
1738             );
1739             break;
1740         case NSPACE:
1741             REXEC_FBC_CSCAN_PRELOAD(
1742                 LOAD_UTF8_CHARCLASS_SPACE(),
1743                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1744                 ! isSPACE((U8) *s)
1745             );
1746             break;
1747         case NSPACEA:
1748             REXEC_FBC_CSCAN(
1749                 !isSPACE_A(*s),
1750                 !isSPACE_A(*s)
1751             );
1752             break;
1753         case NSPACEL:
1754             REXEC_FBC_CSCAN_TAINT(
1755                 !isSPACE_LC_utf8((U8*)s),
1756                 !isSPACE_LC(*s)
1757             );
1758             break;
1759         case DIGIT:
1760             REXEC_FBC_CSCAN_PRELOAD(
1761                 LOAD_UTF8_CHARCLASS_DIGIT(),
1762                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1763                 isDIGIT(*s)
1764             );
1765             break;
1766         case DIGITA:
1767             /* Don't need to worry about utf8, as it can match only a single
1768              * byte invariant character */
1769             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1770             break;
1771         case DIGITL:
1772             REXEC_FBC_CSCAN_TAINT(
1773                 isDIGIT_LC_utf8((U8*)s),
1774                 isDIGIT_LC(*s)
1775             );
1776             break;
1777         case NDIGIT:
1778             REXEC_FBC_CSCAN_PRELOAD(
1779                 LOAD_UTF8_CHARCLASS_DIGIT(),
1780                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1781                 !isDIGIT(*s)
1782             );
1783             break;
1784         case NDIGITA:
1785             REXEC_FBC_CSCAN(
1786                 !isDIGIT_A(*s),
1787                 !isDIGIT_A(*s)
1788             );
1789             break;
1790         case NDIGITL:
1791             REXEC_FBC_CSCAN_TAINT(
1792                 !isDIGIT_LC_utf8((U8*)s),
1793                 !isDIGIT_LC(*s)
1794             );
1795             break;
1796         case LNBREAK:
1797             REXEC_FBC_CSCAN(
1798                 is_LNBREAK_utf8_safe(s, strend),
1799                 is_LNBREAK_latin1_safe(s, strend)
1800             );
1801             break;
1802         case VERTWS:
1803             REXEC_FBC_CSCAN(
1804                 is_VERTWS_utf8_safe(s, strend),
1805                 is_VERTWS_latin1_safe(s, strend)
1806             );
1807             break;
1808         case NVERTWS:
1809             REXEC_FBC_CSCAN(
1810                 !is_VERTWS_utf8_safe(s, strend),
1811                 !is_VERTWS_latin1_safe(s, strend)
1812             );
1813             break;
1814         case HORIZWS:
1815             REXEC_FBC_CSCAN(
1816                 is_HORIZWS_utf8_safe(s, strend),
1817                 is_HORIZWS_latin1_safe(s, strend)
1818             );
1819             break;
1820         case NHORIZWS:
1821             REXEC_FBC_CSCAN(
1822                 !is_HORIZWS_utf8_safe(s, strend),
1823                 !is_HORIZWS_latin1_safe(s, strend)
1824             );
1825             break;
1826         case POSIXA:
1827             /* Don't need to worry about utf8, as it can match only a single
1828             * byte invariant character.  The flag in this node type is the
1829             * class number to pass to _generic_isCC() to build a mask for
1830             * searching in PL_charclass[] */
1831             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1832             break;
1833         case NPOSIXA:
1834             REXEC_FBC_CSCAN(
1835                 !_generic_isCC_A(*s, FLAGS(c)),
1836                 !_generic_isCC_A(*s, FLAGS(c))
1837             );
1838             break;
1839
1840         case AHOCORASICKC:
1841         case AHOCORASICK:
1842             {
1843                 DECL_TRIE_TYPE(c);
1844                 /* what trie are we using right now */
1845                 reg_ac_data *aho
1846                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1847                 reg_trie_data *trie
1848                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1849                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1850
1851                 const char *last_start = strend - trie->minlen;
1852 #ifdef DEBUGGING
1853                 const char *real_start = s;
1854 #endif
1855                 STRLEN maxlen = trie->maxlen;
1856                 SV *sv_points;
1857                 U8 **points; /* map of where we were in the input string
1858                                 when reading a given char. For ASCII this
1859                                 is unnecessary overhead as the relationship
1860                                 is always 1:1, but for Unicode, especially
1861                                 case folded Unicode this is not true. */
1862                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1863                 U8 *bitmap=NULL;
1864
1865
1866                 GET_RE_DEBUG_FLAGS_DECL;
1867
1868                 /* We can't just allocate points here. We need to wrap it in
1869                  * an SV so it gets freed properly if there is a croak while
1870                  * running the match */
1871                 ENTER;
1872                 SAVETMPS;
1873                 sv_points=newSV(maxlen * sizeof(U8 *));
1874                 SvCUR_set(sv_points,
1875                     maxlen * sizeof(U8 *));
1876                 SvPOK_on(sv_points);
1877                 sv_2mortal(sv_points);
1878                 points=(U8**)SvPV_nolen(sv_points );
1879                 if ( trie_type != trie_utf8_fold
1880                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1881                 {
1882                     if (trie->bitmap)
1883                         bitmap=(U8*)trie->bitmap;
1884                     else
1885                         bitmap=(U8*)ANYOF_BITMAP(c);
1886                 }
1887                 /* this is the Aho-Corasick algorithm modified a touch
1888                    to include special handling for long "unknown char"
1889                    sequences. The basic idea being that we use AC as long
1890                    as we are dealing with a possible matching char, when
1891                    we encounter an unknown char (and we have not encountered
1892                    an accepting state) we scan forward until we find a legal
1893                    starting char.
1894                    AC matching is basically that of trie matching, except
1895                    that when we encounter a failing transition, we fall back
1896                    to the current states "fail state", and try the current char
1897                    again, a process we repeat until we reach the root state,
1898                    state 1, or a legal transition. If we fail on the root state
1899                    then we can either terminate if we have reached an accepting
1900                    state previously, or restart the entire process from the beginning
1901                    if we have not.
1902
1903                  */
1904                 while (s <= last_start) {
1905                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1906                     U8 *uc = (U8*)s;
1907                     U16 charid = 0;
1908                     U32 base = 1;
1909                     U32 state = 1;
1910                     UV uvc = 0;
1911                     STRLEN len = 0;
1912                     STRLEN foldlen = 0;
1913                     U8 *uscan = (U8*)NULL;
1914                     U8 *leftmost = NULL;
1915 #ifdef DEBUGGING
1916                     U32 accepted_word= 0;
1917 #endif
1918                     U32 pointpos = 0;
1919
1920                     while ( state && uc <= (U8*)strend ) {
1921                         int failed=0;
1922                         U32 word = aho->states[ state ].wordnum;
1923
1924                         if( state==1 ) {
1925                             if ( bitmap ) {
1926                                 DEBUG_TRIE_EXECUTE_r(
1927                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1928                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1929                                             (char *)uc, utf8_target );
1930                                         PerlIO_printf( Perl_debug_log,
1931                                             " Scanning for legal start char...\n");
1932                                     }
1933                                 );
1934                                 if (utf8_target) {
1935                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1936                                         uc += UTF8SKIP(uc);
1937                                     }
1938                                 } else {
1939                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1940                                         uc++;
1941                                     }
1942                                 }
1943                                 s= (char *)uc;
1944                             }
1945                             if (uc >(U8*)last_start) break;
1946                         }
1947
1948                         if ( word ) {
1949                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1950                             if (!leftmost || lpos < leftmost) {
1951                                 DEBUG_r(accepted_word=word);
1952                                 leftmost= lpos;
1953                             }
1954                             if (base==0) break;
1955
1956                         }
1957                         points[pointpos++ % maxlen]= uc;
1958                         if (foldlen || uc < (U8*)strend) {
1959                             REXEC_TRIE_READ_CHAR(trie_type, trie,
1960                                              widecharmap, uc,
1961                                              uscan, len, uvc, charid, foldlen,
1962                                              foldbuf, uniflags);
1963                             DEBUG_TRIE_EXECUTE_r({
1964                                 dump_exec_pos( (char *)uc, c, strend,
1965                                             real_start, s, utf8_target);
1966                                 PerlIO_printf(Perl_debug_log,
1967                                     " Charid:%3u CP:%4"UVxf" ",
1968                                      charid, uvc);
1969                             });
1970                         }
1971                         else {
1972                             len = 0;
1973                             charid = 0;
1974                         }
1975
1976
1977                         do {
1978 #ifdef DEBUGGING
1979                             word = aho->states[ state ].wordnum;
1980 #endif
1981                             base = aho->states[ state ].trans.base;
1982
1983                             DEBUG_TRIE_EXECUTE_r({
1984                                 if (failed)
1985                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1986                                         s,   utf8_target );
1987                                 PerlIO_printf( Perl_debug_log,
1988                                     "%sState: %4"UVxf", word=%"UVxf,
1989                                     failed ? " Fail transition to " : "",
1990                                     (UV)state, (UV)word);
1991                             });
1992                             if ( base ) {
1993                                 U32 tmp;
1994                                 I32 offset;
1995                                 if (charid &&
1996                                      ( ((offset = base + charid
1997                                         - 1 - trie->uniquecharcount)) >= 0)
1998                                      && ((U32)offset < trie->lasttrans)
1999                                      && trie->trans[offset].check == state
2000                                      && (tmp=trie->trans[offset].next))
2001                                 {
2002                                     DEBUG_TRIE_EXECUTE_r(
2003                                         PerlIO_printf( Perl_debug_log," - legal\n"));
2004                                     state = tmp;
2005                                     break;
2006                                 }
2007                                 else {
2008                                     DEBUG_TRIE_EXECUTE_r(
2009                                         PerlIO_printf( Perl_debug_log," - fail\n"));
2010                                     failed = 1;
2011                                     state = aho->fail[state];
2012                                 }
2013                             }
2014                             else {
2015                                 /* we must be accepting here */
2016                                 DEBUG_TRIE_EXECUTE_r(
2017                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
2018                                 failed = 1;
2019                                 break;
2020                             }
2021                         } while(state);
2022                         uc += len;
2023                         if (failed) {
2024                             if (leftmost)
2025                                 break;
2026                             if (!state) state = 1;
2027                         }
2028                     }
2029                     if ( aho->states[ state ].wordnum ) {
2030                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2031                         if (!leftmost || lpos < leftmost) {
2032                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2033                             leftmost = lpos;
2034                         }
2035                     }
2036                     if (leftmost) {
2037                         s = (char*)leftmost;
2038                         DEBUG_TRIE_EXECUTE_r({
2039                             PerlIO_printf(
2040                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2041                                 (UV)accepted_word, (IV)(s - real_start)
2042                             );
2043                         });
2044                         if (!reginfo || regtry(reginfo, &s)) {
2045                             FREETMPS;
2046                             LEAVE;
2047                             goto got_it;
2048                         }
2049                         s = HOPc(s,1);
2050                         DEBUG_TRIE_EXECUTE_r({
2051                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2052                         });
2053                     } else {
2054                         DEBUG_TRIE_EXECUTE_r(
2055                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2056                         break;
2057                     }
2058                 }
2059                 FREETMPS;
2060                 LEAVE;
2061             }
2062             break;
2063         default:
2064             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2065             break;
2066         }
2067         return 0;
2068       got_it:
2069         return s;
2070 }
2071
2072
2073 /*
2074  - regexec_flags - match a regexp against a string
2075  */
2076 I32
2077 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2078               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2079 /* stringarg: the point in the string at which to begin matching */
2080 /* strend:    pointer to null at end of string */
2081 /* strbeg:    real beginning of string */
2082 /* minend:    end of match must be >= minend bytes after stringarg. */
2083 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2084  *            itself is accessed via the pointers above */
2085 /* data:      May be used for some additional optimizations.
2086               Currently its only used, with a U32 cast, for transmitting
2087               the ganch offset when doing a /g match. This will change */
2088 /* nosave:    For optimizations. */
2089
2090 {
2091     dVAR;
2092     struct regexp *const prog = (struct regexp *)SvANY(rx);
2093     /*register*/ char *s;
2094     regnode *c;
2095     /*register*/ char *startpos = stringarg;
2096     I32 minlen;         /* must match at least this many chars */
2097     I32 dontbother = 0; /* how many characters not to try at end */
2098     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2099     I32 scream_pos = -1;                /* Internal iterator of scream. */
2100     char *scream_olds = NULL;
2101     const bool utf8_target = cBOOL(DO_UTF8(sv));
2102     I32 multiline;
2103     RXi_GET_DECL(prog,progi);
2104     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2105     regexp_paren_pair *swap = NULL;
2106     GET_RE_DEBUG_FLAGS_DECL;
2107
2108     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2109     PERL_UNUSED_ARG(data);
2110
2111     /* Be paranoid... */
2112     if (prog == NULL || startpos == NULL) {
2113         Perl_croak(aTHX_ "NULL regexp parameter");
2114         return 0;
2115     }
2116
2117     multiline = prog->extflags & RXf_PMf_MULTILINE;
2118     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2119
2120     RX_MATCH_UTF8_set(rx, utf8_target);
2121     DEBUG_EXECUTE_r(
2122         debug_start_match(rx, utf8_target, startpos, strend,
2123         "Matching");
2124     );
2125
2126     minlen = prog->minlen;
2127
2128     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2129         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2130                               "String too short [regexec_flags]...\n"));
2131         goto phooey;
2132     }
2133
2134
2135     /* Check validity of program. */
2136     if (UCHARAT(progi->program) != REG_MAGIC) {
2137         Perl_croak(aTHX_ "corrupted regexp program");
2138     }
2139
2140     PL_reg_flags = 0;
2141     PL_reg_state.re_state_eval_setup_done = FALSE;
2142     PL_reg_maxiter = 0;
2143
2144     if (RX_UTF8(rx))
2145         PL_reg_flags |= RF_utf8;
2146
2147     /* Mark beginning of line for ^ and lookbehind. */
2148     reginfo.bol = startpos; /* XXX not used ??? */
2149     PL_bostr  = strbeg;
2150     reginfo.sv = sv;
2151
2152     /* Mark end of line for $ (and such) */
2153     PL_regeol = strend;
2154
2155     /* see how far we have to get to not match where we matched before */
2156     reginfo.till = startpos+minend;
2157
2158     /* If there is a "must appear" string, look for it. */
2159     s = startpos;
2160
2161     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2162         MAGIC *mg;
2163         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2164             reginfo.ganch = startpos + prog->gofs;
2165             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2166               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2167         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2168                   && SvMAGIC(sv)
2169                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2170                   && mg->mg_len >= 0) {
2171             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2172             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2173                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2174
2175             if (prog->extflags & RXf_ANCH_GPOS) {
2176                 if (s > reginfo.ganch)
2177                     goto phooey;
2178                 s = reginfo.ganch - prog->gofs;
2179                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2180                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2181                 if (s < strbeg)
2182                     goto phooey;
2183             }
2184         }
2185         else if (data) {
2186             reginfo.ganch = strbeg + PTR2UV(data);
2187             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2188                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2189
2190         } else {                                /* pos() not defined */
2191             reginfo.ganch = strbeg;
2192             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2193                  "GPOS: reginfo.ganch = strbeg\n"));
2194         }
2195     }
2196     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2197         /* We have to be careful. If the previous successful match
2198            was from this regex we don't want a subsequent partially
2199            successful match to clobber the old results.
2200            So when we detect this possibility we add a swap buffer
2201            to the re, and switch the buffer each match. If we fail
2202            we switch it back, otherwise we leave it swapped.
2203         */
2204         swap = prog->offs;
2205         /* do we need a save destructor here for eval dies? */
2206         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2207         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2208             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2209             PTR2UV(prog),
2210             PTR2UV(swap),
2211             PTR2UV(prog->offs)
2212         ));
2213     }
2214     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2215         re_scream_pos_data d;
2216
2217         d.scream_olds = &scream_olds;
2218         d.scream_pos = &scream_pos;
2219         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2220         if (!s) {
2221             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2222             goto phooey;        /* not present */
2223         }
2224     }
2225
2226
2227
2228     /* Simplest case:  anchored match need be tried only once. */
2229     /*  [unless only anchor is BOL and multiline is set] */
2230     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2231         if (s == startpos && regtry(&reginfo, &startpos))
2232             goto got_it;
2233         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2234                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2235         {
2236             char *end;
2237
2238             if (minlen)
2239                 dontbother = minlen - 1;
2240             end = HOP3c(strend, -dontbother, strbeg) - 1;
2241             /* for multiline we only have to try after newlines */
2242             if (prog->check_substr || prog->check_utf8) {
2243                 /* because of the goto we can not easily reuse the macros for bifurcating the
2244                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2245                 if (utf8_target) {
2246                     if (s == startpos)
2247                         goto after_try_utf8;
2248                     while (1) {
2249                         if (regtry(&reginfo, &s)) {
2250                             goto got_it;
2251                         }
2252                       after_try_utf8:
2253                         if (s > end) {
2254                             goto phooey;
2255                         }
2256                         if (prog->extflags & RXf_USE_INTUIT) {
2257                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2258                             if (!s) {
2259                                 goto phooey;
2260                             }
2261                         }
2262                         else {
2263                             s += UTF8SKIP(s);
2264                         }
2265                     }
2266                 } /* end search for check string in unicode */
2267                 else {
2268                     if (s == startpos) {
2269                         goto after_try_latin;
2270                     }
2271                     while (1) {
2272                         if (regtry(&reginfo, &s)) {
2273                             goto got_it;
2274                         }
2275                       after_try_latin:
2276                         if (s > end) {
2277                             goto phooey;
2278                         }
2279                         if (prog->extflags & RXf_USE_INTUIT) {
2280                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2281                             if (!s) {
2282                                 goto phooey;
2283                             }
2284                         }
2285                         else {
2286                             s++;
2287                         }
2288                     }
2289                 } /* end search for check string in latin*/
2290             } /* end search for check string */
2291             else { /* search for newline */
2292                 if (s > startpos) {
2293                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2294                     s--;
2295                 }
2296                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2297                 while (s <= end) { /* note it could be possible to match at the end of the string */
2298                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2299                         if (regtry(&reginfo, &s))
2300                             goto got_it;
2301                     }
2302                 }
2303             } /* end search for newline */
2304         } /* end anchored/multiline check string search */
2305         goto phooey;
2306     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2307     {
2308         /* the warning about reginfo.ganch being used without initialization
2309            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2310            and we only enter this block when the same bit is set. */
2311         char *tmp_s = reginfo.ganch - prog->gofs;
2312
2313         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2314             goto got_it;
2315         goto phooey;
2316     }
2317
2318     /* Messy cases:  unanchored match. */
2319     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2320         /* we have /x+whatever/ */
2321         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2322         char ch;
2323 #ifdef DEBUGGING
2324         int did_match = 0;
2325 #endif
2326         if (utf8_target) {
2327             if (! prog->anchored_utf8) {
2328                 to_utf8_substr(prog);
2329             }
2330             ch = SvPVX_const(prog->anchored_utf8)[0];
2331             REXEC_FBC_SCAN(
2332                 if (*s == ch) {
2333                     DEBUG_EXECUTE_r( did_match = 1 );
2334                     if (regtry(&reginfo, &s)) goto got_it;
2335                     s += UTF8SKIP(s);
2336                     while (s < strend && *s == ch)
2337                         s += UTF8SKIP(s);
2338                 }
2339             );
2340
2341         }
2342         else {
2343             if (! prog->anchored_substr) {
2344                 if (! to_byte_substr(prog)) {
2345                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2346                                             non_utf8_target_but_utf8_required));
2347                     goto phooey;
2348                 }
2349             }
2350             ch = SvPVX_const(prog->anchored_substr)[0];
2351             REXEC_FBC_SCAN(
2352                 if (*s == ch) {
2353                     DEBUG_EXECUTE_r( did_match = 1 );
2354                     if (regtry(&reginfo, &s)) goto got_it;
2355                     s++;
2356                     while (s < strend && *s == ch)
2357                         s++;
2358                 }
2359             );
2360         }
2361         DEBUG_EXECUTE_r(if (!did_match)
2362                 PerlIO_printf(Perl_debug_log,
2363                                   "Did not find anchored character...\n")
2364                );
2365     }
2366     else if (prog->anchored_substr != NULL
2367               || prog->anchored_utf8 != NULL
2368               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2369                   && prog->float_max_offset < strend - s)) {
2370         SV *must;
2371         I32 back_max;
2372         I32 back_min;
2373         char *last;
2374         char *last1;            /* Last position checked before */
2375 #ifdef DEBUGGING
2376         int did_match = 0;
2377 #endif
2378         if (prog->anchored_substr || prog->anchored_utf8) {
2379             if (utf8_target) {
2380                 if (! prog->anchored_utf8) {
2381                     to_utf8_substr(prog);
2382                 }
2383                 must = prog->anchored_utf8;
2384             }
2385             else {
2386                 if (! prog->anchored_substr) {
2387                     if (! to_byte_substr(prog)) {
2388                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2389                                             non_utf8_target_but_utf8_required));
2390                         goto phooey;
2391                     }
2392                 }
2393                 must = prog->anchored_substr;
2394             }
2395             back_max = back_min = prog->anchored_offset;
2396         } else {
2397             if (utf8_target) {
2398                 if (! prog->float_utf8) {
2399                     to_utf8_substr(prog);
2400                 }
2401                 must = prog->float_utf8;
2402             }
2403             else {
2404                 if (! prog->float_substr) {
2405                     if (! to_byte_substr(prog)) {
2406                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2407                                             non_utf8_target_but_utf8_required));
2408                         goto phooey;
2409                     }
2410                 }
2411                 must = prog->float_substr;
2412             }
2413             back_max = prog->float_max_offset;
2414             back_min = prog->float_min_offset;
2415         }
2416
2417         if (back_min<0) {
2418             last = strend;
2419         } else {
2420             last = HOP3c(strend,        /* Cannot start after this */
2421                   -(I32)(CHR_SVLEN(must)
2422                          - (SvTAIL(must) != 0) + back_min), strbeg);
2423         }
2424         if (s > PL_bostr)
2425             last1 = HOPc(s, -1);
2426         else
2427             last1 = s - 1;      /* bogus */
2428
2429         /* XXXX check_substr already used to find "s", can optimize if
2430            check_substr==must. */
2431         scream_pos = -1;
2432         dontbother = end_shift;
2433         strend = HOPc(strend, -dontbother);
2434         while ( (s <= last) &&
2435                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2436                                   (unsigned char*)strend, must,
2437                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2438             DEBUG_EXECUTE_r( did_match = 1 );
2439             if (HOPc(s, -back_max) > last1) {
2440                 last1 = HOPc(s, -back_min);
2441                 s = HOPc(s, -back_max);
2442             }
2443             else {
2444                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2445
2446                 last1 = HOPc(s, -back_min);
2447                 s = t;
2448             }
2449             if (utf8_target) {
2450                 while (s <= last1) {
2451                     if (regtry(&reginfo, &s))
2452                         goto got_it;
2453                     if (s >= last1) {
2454                         s++; /* to break out of outer loop */
2455                         break;
2456                     }
2457                     s += UTF8SKIP(s);
2458                 }
2459             }
2460             else {
2461                 while (s <= last1) {
2462                     if (regtry(&reginfo, &s))
2463                         goto got_it;
2464                     s++;
2465                 }
2466             }
2467         }
2468         DEBUG_EXECUTE_r(if (!did_match) {
2469             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2470                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2471             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2472                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2473                                ? "anchored" : "floating"),
2474                 quoted, RE_SV_TAIL(must));
2475         });
2476         goto phooey;
2477     }
2478     else if ( (c = progi->regstclass) ) {
2479         if (minlen) {
2480             const OPCODE op = OP(progi->regstclass);
2481             /* don't bother with what can't match */
2482             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2483                 strend = HOPc(strend, -(minlen - 1));
2484         }
2485         DEBUG_EXECUTE_r({
2486             SV * const prop = sv_newmortal();
2487             regprop(prog, prop, c);
2488             {
2489                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2490                     s,strend-s,60);
2491                 PerlIO_printf(Perl_debug_log,
2492                     "Matching stclass %.*s against %s (%d bytes)\n",
2493                     (int)SvCUR(prop), SvPVX_const(prop),
2494                      quoted, (int)(strend - s));
2495             }
2496         });
2497         if (find_byclass(prog, c, s, strend, &reginfo))
2498             goto got_it;
2499         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2500     }
2501     else {
2502         dontbother = 0;
2503         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2504             /* Trim the end. */
2505             char *last= NULL;
2506             SV* float_real;
2507             STRLEN len;
2508             const char *little;
2509
2510             if (utf8_target) {
2511                 if (! prog->float_utf8) {
2512                     to_utf8_substr(prog);
2513                 }
2514                 float_real = prog->float_utf8;
2515             }
2516             else {
2517                 if (! prog->float_substr) {
2518                     if (! to_byte_substr(prog)) {
2519                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2520                                             non_utf8_target_but_utf8_required));
2521                         goto phooey;
2522                     }
2523                 }
2524                 float_real = prog->float_substr;
2525             }
2526
2527             little = SvPV_const(float_real, len);
2528             if (SvTAIL(float_real)) {
2529                     /* This means that float_real contains an artificial \n on
2530                      * the end due to the presence of something like this:
2531                      * /foo$/ where we can match both "foo" and "foo\n" at the
2532                      * end of the string.  So we have to compare the end of the
2533                      * string first against the float_real without the \n and
2534                      * then against the full float_real with the string.  We
2535                      * have to watch out for cases where the string might be
2536                      * smaller than the float_real or the float_real without
2537                      * the \n. */
2538                     char *checkpos= strend - len;
2539                     DEBUG_OPTIMISE_r(
2540                         PerlIO_printf(Perl_debug_log,
2541                             "%sChecking for float_real.%s\n",
2542                             PL_colors[4], PL_colors[5]));
2543                     if (checkpos + 1 < strbeg) {
2544                         /* can't match, even if we remove the trailing \n
2545                          * string is too short to match */
2546                         DEBUG_EXECUTE_r(
2547                             PerlIO_printf(Perl_debug_log,
2548                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2549                                 PL_colors[4], PL_colors[5]));
2550                         goto phooey;
2551                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2552                         /* can match, the end of the string matches without the
2553                          * "\n" */
2554                         last = checkpos + 1;
2555                     } else if (checkpos < strbeg) {
2556                         /* cant match, string is too short when the "\n" is
2557                          * included */
2558                         DEBUG_EXECUTE_r(
2559                             PerlIO_printf(Perl_debug_log,
2560                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2561                                 PL_colors[4], PL_colors[5]));
2562                         goto phooey;
2563                     } else if (!multiline) {
2564                         /* non multiline match, so compare with the "\n" at the
2565                          * end of the string */
2566                         if (memEQ(checkpos, little, len)) {
2567                             last= checkpos;
2568                         } else {
2569                             DEBUG_EXECUTE_r(
2570                                 PerlIO_printf(Perl_debug_log,
2571                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2572                                     PL_colors[4], PL_colors[5]));
2573                             goto phooey;
2574                         }
2575                     } else {
2576                         /* multiline match, so we have to search for a place
2577                          * where the full string is located */
2578                         goto find_last;
2579                     }
2580             } else {
2581                   find_last:
2582                     if (len)
2583                         last = rninstr(s, strend, little, little + len);
2584                     else
2585                         last = strend;  /* matching "$" */
2586             }
2587             if (!last) {
2588                 /* at one point this block contained a comment which was
2589                  * probably incorrect, which said that this was a "should not
2590                  * happen" case.  Even if it was true when it was written I am
2591                  * pretty sure it is not anymore, so I have removed the comment
2592                  * and replaced it with this one. Yves */
2593                 DEBUG_EXECUTE_r(
2594                     PerlIO_printf(Perl_debug_log,
2595                         "String does not contain required substring, cannot match.\n"
2596                     ));
2597                 goto phooey;
2598             }
2599             dontbother = strend - last + prog->float_min_offset;
2600         }
2601         if (minlen && (dontbother < minlen))
2602             dontbother = minlen - 1;
2603         strend -= dontbother;              /* this one's always in bytes! */
2604         /* We don't know much -- general case. */
2605         if (utf8_target) {
2606             for (;;) {
2607                 if (regtry(&reginfo, &s))
2608                     goto got_it;
2609                 if (s >= strend)
2610                     break;
2611                 s += UTF8SKIP(s);
2612             };
2613         }
2614         else {
2615             do {
2616                 if (regtry(&reginfo, &s))
2617                     goto got_it;
2618             } while (s++ < strend);
2619         }
2620     }
2621
2622     /* Failure. */
2623     goto phooey;
2624
2625 got_it:
2626     DEBUG_BUFFERS_r(
2627         if (swap)
2628             PerlIO_printf(Perl_debug_log,
2629                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2630                 PTR2UV(prog),
2631                 PTR2UV(swap)
2632             );
2633     );
2634     Safefree(swap);
2635     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2636
2637     if (PL_reg_state.re_state_eval_setup_done)
2638         restore_pos(aTHX_ prog);
2639     if (RXp_PAREN_NAMES(prog))
2640         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2641
2642     /* make sure $`, $&, $', and $digit will work later */
2643     if ( !(flags & REXEC_NOT_FIRST) ) {
2644         if (flags & REXEC_COPY_STR) {
2645 #ifdef PERL_OLD_COPY_ON_WRITE
2646             if ((SvIsCOW(sv)
2647                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2648                 if (DEBUG_C_TEST) {
2649                     PerlIO_printf(Perl_debug_log,
2650                                   "Copy on write: regexp capture, type %d\n",
2651                                   (int) SvTYPE(sv));
2652                 }
2653                 RX_MATCH_COPY_FREE(rx);
2654                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2655                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2656                 assert (SvPOKp(prog->saved_copy));
2657                 prog->sublen  = PL_regeol - strbeg;
2658                 prog->suboffset = 0;
2659                 prog->subcoffset = 0;
2660             } else
2661 #endif
2662             {
2663                 I32 min = 0;
2664                 I32 max = PL_regeol - strbeg;
2665                 I32 sublen;
2666
2667                 if (    (flags & REXEC_COPY_SKIP_POST)
2668                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2669                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2670                 ) { /* don't copy $' part of string */
2671                     U32 n = 0;
2672                     max = -1;
2673                     /* calculate the right-most part of the string covered
2674                      * by a capture. Due to look-ahead, this may be to
2675                      * the right of $&, so we have to scan all captures */
2676                     while (n <= prog->lastparen) {
2677                         if (prog->offs[n].end > max)
2678                             max = prog->offs[n].end;
2679                         n++;
2680                     }
2681                     if (max == -1)
2682                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2683                                 ? prog->offs[0].start
2684                                 : 0;
2685                     assert(max >= 0 && max <= PL_regeol - strbeg);
2686                 }
2687
2688                 if (    (flags & REXEC_COPY_SKIP_PRE)
2689                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2690                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2691                 ) { /* don't copy $` part of string */
2692                     U32 n = 0;
2693                     min = max;
2694                     /* calculate the left-most part of the string covered
2695                      * by a capture. Due to look-behind, this may be to
2696                      * the left of $&, so we have to scan all captures */
2697                     while (min && n <= prog->lastparen) {
2698                         if (   prog->offs[n].start != -1
2699                             && prog->offs[n].start < min)
2700                         {
2701                             min = prog->offs[n].start;
2702                         }
2703                         n++;
2704                     }
2705                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2706                         && min >  prog->offs[0].end
2707                     )
2708                         min = prog->offs[0].end;
2709
2710                 }
2711
2712                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2713                 sublen = max - min;
2714
2715                 if (RX_MATCH_COPIED(rx)) {
2716                     if (sublen > prog->sublen)
2717                         prog->subbeg =
2718                                 (char*)saferealloc(prog->subbeg, sublen+1);
2719                 }
2720                 else
2721                     prog->subbeg = (char*)safemalloc(sublen+1);
2722                 Copy(strbeg + min, prog->subbeg, sublen, char);
2723                 prog->subbeg[sublen] = '\0';
2724                 prog->suboffset = min;
2725                 prog->sublen = sublen;
2726                 RX_MATCH_COPIED_on(rx);
2727             }
2728             prog->subcoffset = prog->suboffset;
2729             if (prog->suboffset && utf8_target) {
2730                 /* Convert byte offset to chars.
2731                  * XXX ideally should only compute this if @-/@+
2732                  * has been seen, a la PL_sawampersand ??? */
2733
2734                 /* If there's a direct correspondence between the
2735                  * string which we're matching and the original SV,
2736                  * then we can use the utf8 len cache associated with
2737                  * the SV. In particular, it means that under //g,
2738                  * sv_pos_b2u() will use the previously cached
2739                  * position to speed up working out the new length of
2740                  * subcoffset, rather than counting from the start of
2741                  * the string each time. This stops
2742                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2743                  * from going quadratic */
2744                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2745                     sv_pos_b2u(sv, &(prog->subcoffset));
2746                 else
2747                     prog->subcoffset = utf8_length((U8*)strbeg,
2748                                         (U8*)(strbeg+prog->suboffset));
2749             }
2750         }
2751         else {
2752             RX_MATCH_COPY_FREE(rx);
2753             prog->subbeg = strbeg;
2754             prog->suboffset = 0;
2755             prog->subcoffset = 0;
2756             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2757         }
2758     }
2759
2760     return 1;
2761
2762 phooey:
2763     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2764                           PL_colors[4], PL_colors[5]));
2765     if (PL_reg_state.re_state_eval_setup_done)
2766         restore_pos(aTHX_ prog);
2767     if (swap) {
2768         /* we failed :-( roll it back */
2769         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2770             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2771             PTR2UV(prog),
2772             PTR2UV(prog->offs),
2773             PTR2UV(swap)
2774         ));
2775         Safefree(prog->offs);
2776         prog->offs = swap;
2777     }
2778     return 0;
2779 }
2780
2781
2782 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2783  * Do inc before dec, in case old and new rex are the same */
2784 #define SET_reg_curpm(Re2) \
2785     if (PL_reg_state.re_state_eval_setup_done) {    \
2786         (void)ReREFCNT_inc(Re2);                    \
2787         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2788         PM_SETRE((PL_reg_curpm), (Re2));            \
2789     }
2790
2791
2792 /*
2793  - regtry - try match at specific point
2794  */
2795 STATIC I32                      /* 0 failure, 1 success */
2796 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2797 {
2798     dVAR;
2799     CHECKPOINT lastcp;
2800     REGEXP *const rx = reginfo->prog;
2801     regexp *const prog = (struct regexp *)SvANY(rx);
2802     I32 result;
2803     RXi_GET_DECL(prog,progi);
2804     GET_RE_DEBUG_FLAGS_DECL;
2805
2806     PERL_ARGS_ASSERT_REGTRY;
2807
2808     reginfo->cutpoint=NULL;
2809
2810     if ((prog->extflags & RXf_EVAL_SEEN)
2811         && !PL_reg_state.re_state_eval_setup_done)
2812     {
2813         MAGIC *mg;
2814
2815         PL_reg_state.re_state_eval_setup_done = TRUE;
2816         if (reginfo->sv) {
2817             /* Make $_ available to executed code. */
2818             if (reginfo->sv != DEFSV) {
2819                 SAVE_DEFSV;
2820                 DEFSV_set(reginfo->sv);
2821             }
2822
2823             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2824                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2825                 /* prepare for quick setting of pos */
2826 #ifdef PERL_OLD_COPY_ON_WRITE
2827                 if (SvIsCOW(reginfo->sv))
2828                     sv_force_normal_flags(reginfo->sv, 0);
2829 #endif
2830                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2831                                  &PL_vtbl_mglob, NULL, 0);
2832                 mg->mg_len = -1;
2833             }
2834             PL_reg_magic    = mg;
2835             PL_reg_oldpos   = mg->mg_len;
2836             SAVEDESTRUCTOR_X(restore_pos, prog);
2837         }
2838         if (!PL_reg_curpm) {
2839             Newxz(PL_reg_curpm, 1, PMOP);
2840 #ifdef USE_ITHREADS
2841             {
2842                 SV* const repointer = &PL_sv_undef;
2843                 /* this regexp is also owned by the new PL_reg_curpm, which
2844                    will try to free it.  */
2845                 av_push(PL_regex_padav, repointer);
2846                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2847                 PL_regex_pad = AvARRAY(PL_regex_padav);
2848             }
2849 #endif
2850         }
2851         SET_reg_curpm(rx);
2852         PL_reg_oldcurpm = PL_curpm;
2853         PL_curpm = PL_reg_curpm;
2854         if (RXp_MATCH_COPIED(prog)) {
2855             /*  Here is a serious problem: we cannot rewrite subbeg,
2856                 since it may be needed if this match fails.  Thus
2857                 $` inside (?{}) could fail... */
2858             PL_reg_oldsaved = prog->subbeg;
2859             PL_reg_oldsavedlen = prog->sublen;
2860             PL_reg_oldsavedoffset = prog->suboffset;
2861             PL_reg_oldsavedcoffset = prog->suboffset;
2862 #ifdef PERL_OLD_COPY_ON_WRITE
2863             PL_nrs = prog->saved_copy;
2864 #endif
2865             RXp_MATCH_COPIED_off(prog);
2866         }
2867         else
2868             PL_reg_oldsaved = NULL;
2869         prog->subbeg = PL_bostr;
2870         prog->suboffset = 0;
2871         prog->subcoffset = 0;
2872         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2873     }
2874 #ifdef DEBUGGING
2875     PL_reg_starttry = *startposp;
2876 #endif
2877     prog->offs[0].start = *startposp - PL_bostr;
2878     prog->lastparen = 0;
2879     prog->lastcloseparen = 0;
2880     PL_regsize = 0;
2881
2882     /* XXXX What this code is doing here?!!!  There should be no need
2883        to do this again and again, prog->lastparen should take care of
2884        this!  --ilya*/
2885
2886     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2887      * Actually, the code in regcppop() (which Ilya may be meaning by
2888      * prog->lastparen), is not needed at all by the test suite
2889      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2890      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2891      * Meanwhile, this code *is* needed for the
2892      * above-mentioned test suite tests to succeed.  The common theme
2893      * on those tests seems to be returning null fields from matches.
2894      * --jhi updated by dapm */
2895 #if 1
2896     if (prog->nparens) {
2897         regexp_paren_pair *pp = prog->offs;
2898         I32 i;
2899         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2900             ++pp;
2901             pp->start = -1;
2902             pp->end = -1;
2903         }
2904     }
2905 #endif
2906     REGCP_SET(lastcp);
2907     result = regmatch(reginfo, *startposp, progi->program + 1);
2908     if (result != -1) {
2909         prog->offs[0].end = result;
2910         return 1;
2911     }
2912     if (reginfo->cutpoint)
2913         *startposp= reginfo->cutpoint;
2914     REGCP_UNWIND(lastcp);
2915     return 0;
2916 }
2917
2918
2919 #define sayYES goto yes
2920 #define sayNO goto no
2921 #define sayNO_SILENT goto no_silent
2922
2923 /* we dont use STMT_START/END here because it leads to
2924    "unreachable code" warnings, which are bogus, but distracting. */
2925 #define CACHEsayNO \
2926     if (ST.cache_mask) \
2927        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2928     sayNO
2929
2930 /* this is used to determine how far from the left messages like
2931    'failed...' are printed. It should be set such that messages
2932    are inline with the regop output that created them.
2933 */
2934 #define REPORT_CODE_OFF 32
2935
2936
2937 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2938 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2939 #define CHRTEST_NOT_A_CP_1 -999
2940 #define CHRTEST_NOT_A_CP_2 -998
2941
2942 #define SLAB_FIRST(s) (&(s)->states[0])
2943 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2944
2945 /* grab a new slab and return the first slot in it */
2946
2947 STATIC regmatch_state *
2948 S_push_slab(pTHX)
2949 {
2950 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2951     dMY_CXT;
2952 #endif
2953     regmatch_slab *s = PL_regmatch_slab->next;
2954     if (!s) {
2955         Newx(s, 1, regmatch_slab);
2956         s->prev = PL_regmatch_slab;
2957         s->next = NULL;
2958         PL_regmatch_slab->next = s;
2959     }
2960     PL_regmatch_slab = s;
2961     return SLAB_FIRST(s);
2962 }
2963
2964
2965 /* push a new state then goto it */
2966
2967 #define PUSH_STATE_GOTO(state, node, input) \
2968     pushinput = input; \
2969     scan = node; \
2970     st->resume_state = state; \
2971     goto push_state;
2972
2973 /* push a new state with success backtracking, then goto it */
2974
2975 #define PUSH_YES_STATE_GOTO(state, node, input) \
2976     pushinput = input; \
2977     scan = node; \
2978     st->resume_state = state; \
2979     goto push_yes_state;
2980
2981
2982
2983
2984 /*
2985
2986 regmatch() - main matching routine
2987
2988 This is basically one big switch statement in a loop. We execute an op,
2989 set 'next' to point the next op, and continue. If we come to a point which
2990 we may need to backtrack to on failure such as (A|B|C), we push a
2991 backtrack state onto the backtrack stack. On failure, we pop the top
2992 state, and re-enter the loop at the state indicated. If there are no more
2993 states to pop, we return failure.
2994
2995 Sometimes we also need to backtrack on success; for example /A+/, where
2996 after successfully matching one A, we need to go back and try to
2997 match another one; similarly for lookahead assertions: if the assertion
2998 completes successfully, we backtrack to the state just before the assertion
2999 and then carry on.  In these cases, the pushed state is marked as
3000 'backtrack on success too'. This marking is in fact done by a chain of
3001 pointers, each pointing to the previous 'yes' state. On success, we pop to
3002 the nearest yes state, discarding any intermediate failure-only states.
3003 Sometimes a yes state is pushed just to force some cleanup code to be
3004 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3005 it to free the inner regex.
3006
3007 Note that failure backtracking rewinds the cursor position, while
3008 success backtracking leaves it alone.
3009
3010 A pattern is complete when the END op is executed, while a subpattern
3011 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3012 ops trigger the "pop to last yes state if any, otherwise return true"
3013 behaviour.
3014
3015 A common convention in this function is to use A and B to refer to the two
3016 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3017 the subpattern to be matched possibly multiple times, while B is the entire
3018 rest of the pattern. Variable and state names reflect this convention.
3019
3020 The states in the main switch are the union of ops and failure/success of
3021 substates associated with with that op.  For example, IFMATCH is the op
3022 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3023 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3024 successfully matched A and IFMATCH_A_fail is a state saying that we have
3025 just failed to match A. Resume states always come in pairs. The backtrack
3026 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3027 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3028 on success or failure.
3029
3030 The struct that holds a backtracking state is actually a big union, with
3031 one variant for each major type of op. The variable st points to the
3032 top-most backtrack struct. To make the code clearer, within each
3033 block of code we #define ST to alias the relevant union.
3034
3035 Here's a concrete example of a (vastly oversimplified) IFMATCH
3036 implementation:
3037
3038     switch (state) {
3039     ....
3040
3041 #define ST st->u.ifmatch
3042
3043     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3044         ST.foo = ...; // some state we wish to save
3045         ...
3046         // push a yes backtrack state with a resume value of
3047         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3048         // first node of A:
3049         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3050         // NOTREACHED
3051
3052     case IFMATCH_A: // we have successfully executed A; now continue with B
3053         next = B;
3054         bar = ST.foo; // do something with the preserved value
3055         break;
3056
3057     case IFMATCH_A_fail: // A failed, so the assertion failed
3058         ...;   // do some housekeeping, then ...
3059         sayNO; // propagate the failure
3060
3061 #undef ST
3062
3063     ...
3064     }
3065
3066 For any old-timers reading this who are familiar with the old recursive
3067 approach, the code above is equivalent to:
3068
3069     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3070     {
3071         int foo = ...
3072         ...
3073         if (regmatch(A)) {
3074             next = B;
3075             bar = foo;
3076             break;
3077         }
3078         ...;   // do some housekeeping, then ...
3079         sayNO; // propagate the failure
3080     }
3081
3082 The topmost backtrack state, pointed to by st, is usually free. If you
3083 want to claim it, populate any ST.foo fields in it with values you wish to
3084 save, then do one of
3085
3086         PUSH_STATE_GOTO(resume_state, node, newinput);
3087         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3088
3089 which sets that backtrack state's resume value to 'resume_state', pushes a
3090 new free entry to the top of the backtrack stack, then goes to 'node'.
3091 On backtracking, the free slot is popped, and the saved state becomes the
3092 new free state. An ST.foo field in this new top state can be temporarily
3093 accessed to retrieve values, but once the main loop is re-entered, it
3094 becomes available for reuse.
3095
3096 Note that the depth of the backtrack stack constantly increases during the
3097 left-to-right execution of the pattern, rather than going up and down with
3098 the pattern nesting. For example the stack is at its maximum at Z at the
3099 end of the pattern, rather than at X in the following:
3100
3101     /(((X)+)+)+....(Y)+....Z/
3102
3103 The only exceptions to this are lookahead/behind assertions and the cut,
3104 (?>A), which pop all the backtrack states associated with A before
3105 continuing.
3106
3107 Backtrack state structs are allocated in slabs of about 4K in size.
3108 PL_regmatch_state and st always point to the currently active state,
3109 and PL_regmatch_slab points to the slab currently containing
3110 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3111 allocated, and is never freed until interpreter destruction. When the slab
3112 is full, a new one is allocated and chained to the end. At exit from
3113 regmatch(), slabs allocated since entry are freed.
3114
3115 */
3116
3117
3118 #define DEBUG_STATE_pp(pp)                                  \
3119     DEBUG_STATE_r({                                         \
3120         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3121         PerlIO_printf(Perl_debug_log,                       \
3122             "    %*s"pp" %s%s%s%s%s\n",                     \
3123             depth*2, "",                                    \
3124             PL_reg_name[st->resume_state],                     \
3125             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3126             ((st==yes_state) ? "Y" : ""),                   \
3127             ((st==mark_state) ? "M" : ""),                  \
3128             ((st==yes_state||st==mark_state) ? "]" : "")    \
3129         );                                                  \
3130     });
3131
3132
3133 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3134
3135 #ifdef DEBUGGING
3136
3137 STATIC void
3138 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3139     const char *start, const char *end, const char *blurb)
3140 {
3141     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3142
3143     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3144
3145     if (!PL_colorset)
3146             reginitcolors();
3147     {
3148         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3149             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3150
3151         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3152             start, end - start, 60);
3153
3154         PerlIO_printf(Perl_debug_log,
3155             "%s%s REx%s %s against %s\n",
3156                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3157
3158         if (utf8_target||utf8_pat)
3159             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3160                 utf8_pat ? "pattern" : "",
3161                 utf8_pat && utf8_target ? " and " : "",
3162                 utf8_target ? "string" : ""
3163             );
3164     }
3165 }
3166
3167 STATIC void
3168 S_dump_exec_pos(pTHX_ const char *locinput,
3169                       const regnode *scan,
3170                       const char *loc_regeol,
3171                       const char *loc_bostr,
3172                       const char *loc_reg_starttry,
3173                       const bool utf8_target)
3174 {
3175     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3176     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3177     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3178     /* The part of the string before starttry has one color
3179        (pref0_len chars), between starttry and current
3180        position another one (pref_len - pref0_len chars),
3181        after the current position the third one.
3182        We assume that pref0_len <= pref_len, otherwise we
3183        decrease pref0_len.  */
3184     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3185         ? (5 + taill) - l : locinput - loc_bostr;
3186     int pref0_len;
3187
3188     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3189
3190     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3191         pref_len++;
3192     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3193     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3194         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3195               ? (5 + taill) - pref_len : loc_regeol - locinput);
3196     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3197         l--;
3198     if (pref0_len < 0)
3199         pref0_len = 0;
3200     if (pref0_len > pref_len)
3201         pref0_len = pref_len;
3202     {
3203         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3204
3205         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3206             (locinput - pref_len),pref0_len, 60, 4, 5);
3207
3208         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3209                     (locinput - pref_len + pref0_len),
3210                     pref_len - pref0_len, 60, 2, 3);
3211
3212         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3213                     locinput, loc_regeol - locinput, 10, 0, 1);
3214
3215         const STRLEN tlen=len0+len1+len2;
3216         PerlIO_printf(Perl_debug_log,
3217                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3218                     (IV)(locinput - loc_bostr),
3219                     len0, s0,
3220                     len1, s1,
3221                     (docolor ? "" : "> <"),
3222                     len2, s2,
3223                     (int)(tlen > 19 ? 0 :  19 - tlen),
3224                     "");
3225     }
3226 }
3227
3228 #endif
3229
3230 /* reg_check_named_buff_matched()
3231  * Checks to see if a named buffer has matched. The data array of
3232  * buffer numbers corresponding to the buffer is expected to reside
3233  * in the regexp->data->data array in the slot stored in the ARG() of
3234  * node involved. Note that this routine doesn't actually care about the
3235  * name, that information is not preserved from compilation to execution.
3236  * Returns the index of the leftmost defined buffer with the given name
3237  * or 0 if non of the buffers matched.
3238  */
3239 STATIC I32
3240 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3241 {
3242     I32 n;
3243     RXi_GET_DECL(rex,rexi);
3244     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3245     I32 *nums=(I32*)SvPVX(sv_dat);
3246
3247     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3248
3249     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3250         if ((I32)rex->lastparen >= nums[n] &&
3251             rex->offs[nums[n]].end != -1)
3252         {
3253             return nums[n];
3254         }
3255     }
3256     return 0;
3257 }
3258
3259
3260 /* free all slabs above current one  - called during LEAVE_SCOPE */
3261
3262 STATIC void
3263 S_clear_backtrack_stack(pTHX_ void *p)
3264 {
3265     regmatch_slab *s = PL_regmatch_slab->next;
3266     PERL_UNUSED_ARG(p);
3267
3268     if (!s)
3269         return;
3270     PL_regmatch_slab->next = NULL;
3271     while (s) {
3272         regmatch_slab * const osl = s;
3273         s = s->next;
3274         Safefree(osl);
3275     }
3276 }
3277 static bool
3278 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3279 {
3280     /* This function determines if there are one or two characters that match
3281      * the first character of the passed-in EXACTish node <text_node>, and if
3282      * so, returns them in the passed-in pointers.
3283      *
3284      * If it determines that no possible character in the target string can
3285      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3286      * the first character in <text_node> requires UTF-8 to represent, and the
3287      * target string isn't in UTF-8.)
3288      *
3289      * If there are more than two characters that could match the beginning of
3290      * <text_node>, or if more context is required to determine a match or not,
3291      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3292      *
3293      * The motiviation behind this function is to allow the caller to set up
3294      * tight loops for matching.  If <text_node> is of type EXACT, there is
3295      * only one possible character that can match its first character, and so
3296      * the situation is quite simple.  But things get much more complicated if
3297      * folding is involved.  It may be that the first character of an EXACTFish
3298      * node doesn't participate in any possible fold, e.g., punctuation, so it
3299      * can be matched only by itself.  The vast majority of characters that are
3300      * in folds match just two things, their lower and upper-case equivalents.
3301      * But not all are like that; some have multiple possible matches, or match
3302      * sequences of more than one character.  This function sorts all that out.
3303      *
3304      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3305      * loop of trying to match A*, we know we can't exit where the thing
3306      * following it isn't a B.  And something can't be a B unless it is the
3307      * beginning of B.  By putting a quick test for that beginning in a tight
3308      * loop, we can rule out things that can't possibly be B without having to
3309      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3310      * character, we can make a tight loop matching A*, using the outputs of
3311      * this function.
3312      *
3313      * If the target string to match isn't in UTF-8, and there aren't
3314      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3315      * the one or two possible octets (which are characters in this situation)
3316      * that can match.  In all cases, if there is only one character that can
3317      * match, *<c1p> and *<c2p> will be identical.
3318      *
3319      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3320      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3321      * can match the beginning of <text_node>.  They should be declared with at
3322      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3323      * undefined what these contain.)  If one or both of the buffers are
3324      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3325      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3326      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3327      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3328      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3329
3330     const bool utf8_target = PL_reg_match_utf8;
3331
3332     UV c1, c2;
3333     bool use_chrtest_void = FALSE;
3334
3335     /* Used when we have both utf8 input and utf8 output, to avoid converting
3336      * to/from code points */
3337     bool utf8_has_been_setup = FALSE;
3338
3339     dVAR;
3340
3341     U8 *pat = (U8*)STRING(text_node);
3342
3343     if (OP(text_node) == EXACT) {
3344
3345         /* In an exact node, only one thing can be matched, that first
3346          * character.  If both the pat and the target are UTF-8, we can just
3347          * copy the input to the output, avoiding finding the code point of
3348          * that character */
3349         if (! UTF_PATTERN) {
3350             c2 = c1 = *pat;
3351         }
3352         else if (utf8_target) {
3353             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3354             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3355             utf8_has_been_setup = TRUE;
3356         }
3357         else {
3358             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3359         }
3360     }
3361     else /* an EXACTFish node */
3362          if ((UTF_PATTERN
3363                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3364                                                     pat + STR_LEN(text_node)))
3365              || (! UTF_PATTERN
3366                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3367                                                     pat + STR_LEN(text_node))))
3368     {
3369         /* Multi-character folds require more context to sort out.  Also
3370          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3371          * handled outside this routine */
3372         use_chrtest_void = TRUE;
3373     }
3374     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3375         c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3376         if (c1 > 256) {
3377             /* Load the folds hash, if not already done */
3378             SV** listp;
3379             if (! PL_utf8_foldclosures) {
3380                 if (! PL_utf8_tofold) {
3381                     U8 dummy[UTF8_MAXBYTES+1];
3382
3383                     /* Force loading this by folding an above-Latin1 char */
3384                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3385                     assert(PL_utf8_tofold); /* Verify that worked */
3386                 }
3387                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3388             }
3389
3390             /* The fold closures data structure is a hash with the keys being
3391              * the UTF-8 of every character that is folded to, like 'k', and
3392              * the values each an array of all code points that fold to its
3393              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3394              * not included */
3395             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3396                                      (char *) pat,
3397                                      UTF8SKIP(pat),
3398                                      FALSE))))
3399             {
3400                 /* Not found in the hash, therefore there are no folds
3401                  * containing it, so there is only a single character that
3402                  * could match */
3403                 c2 = c1;
3404             }
3405             else {  /* Does participate in folds */
3406                 AV* list = (AV*) *listp;
3407                 if (av_len(list) != 1) {
3408
3409                     /* If there aren't exactly two folds to this, it is outside
3410                      * the scope of this function */
3411                     use_chrtest_void = TRUE;
3412                 }
3413                 else {  /* There are two.  Get them */
3414                     SV** c_p = av_fetch(list, 0, FALSE);
3415                     if (c_p == NULL) {
3416                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3417                     }
3418                     c1 = SvUV(*c_p);
3419
3420                     c_p = av_fetch(list, 1, FALSE);
3421                     if (c_p == NULL) {
3422                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3423                     }
3424                     c2 = SvUV(*c_p);
3425
3426                     /* Folds that cross the 255/256 boundary are forbidden if
3427                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3428                      * pattern character is above 256, and its only other match
3429                      * is below 256, the only legal match will be to itself.
3430                      * We have thrown away the original, so have to compute
3431                      * which is the one above 255 */
3432                     if ((c1 < 256) != (c2 < 256)) {
3433                         if (OP(text_node) == EXACTFL
3434                             || (OP(text_node) == EXACTFA
3435                                 && (isASCII(c1) || isASCII(c2))))
3436                         {
3437                             if (c1 < 256) {
3438                                 c1 = c2;
3439                             }
3440                             else {
3441                                 c2 = c1;
3442                             }
3443                         }
3444                     }
3445                 }
3446             }
3447         }
3448         else /* Here, c1 is < 255 */
3449              if (utf8_target
3450                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3451                  && OP(text_node) != EXACTFL
3452                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3453         {
3454             /* Here, there could be something above Latin1 in the target which
3455              * folds to this character in the pattern.  All such cases except
3456              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3457              * involved in their folds, so are outside the scope of this
3458              * function */
3459             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3460                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3461             }
3462             else {
3463                 use_chrtest_void = TRUE;
3464             }
3465         }
3466         else { /* Here nothing above Latin1 can fold to the pattern character */
3467             switch (OP(text_node)) {
3468
3469                 case EXACTFL:   /* /l rules */
3470                     c2 = PL_fold_locale[c1];
3471                     break;
3472
3473                 case EXACTF:
3474                     if (! utf8_target) {    /* /d rules */
3475                         c2 = PL_fold[c1];
3476                         break;
3477                     }
3478                     /* FALLTHROUGH */
3479                     /* /u rules for all these.  This happens to work for
3480                      * EXACTFA as nothing in Latin1 folds to ASCII */
3481                 case EXACTFA:
3482                 case EXACTFU_TRICKYFOLD:
3483                 case EXACTFU_SS:
3484                 case EXACTFU:
3485                     c2 = PL_fold_latin1[c1];
3486                     break;
3487
3488                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3489             }
3490         }
3491     }
3492
3493     /* Here have figured things out.  Set up the returns */
3494     if (use_chrtest_void) {
3495         *c2p = *c1p = CHRTEST_VOID;
3496     }
3497     else if (utf8_target) {
3498         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3499             uvchr_to_utf8(c1_utf8, c1);
3500             uvchr_to_utf8(c2_utf8, c2);
3501         }
3502
3503         /* Invariants are stored in both the utf8 and byte outputs; Use
3504          * negative numbers otherwise for the byte ones.  Make sure that the
3505          * byte ones are the same iff the utf8 ones are the same */
3506         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3507         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3508                 ? *c2_utf8
3509                 : (c1 == c2)
3510                   ? CHRTEST_NOT_A_CP_1
3511                   : CHRTEST_NOT_A_CP_2;
3512     }
3513     else if (c1 > 255) {
3514        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3515                            can represent */
3516            return FALSE;
3517        }
3518
3519        *c1p = *c2p = c2;    /* c2 is the only representable value */
3520     }
3521     else {  /* c1 is representable; see about c2 */
3522        *c1p = c1;
3523        *c2p = (c2 < 256) ? c2 : c1;
3524     }
3525
3526     return TRUE;
3527 }
3528
3529 /* returns -1 on failure, $+[0] on success */
3530 STATIC I32
3531 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3532 {
3533 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3534     dMY_CXT;
3535 #endif
3536     dVAR;
3537     const bool utf8_target = PL_reg_match_utf8;
3538     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3539     REGEXP *rex_sv = reginfo->prog;
3540     regexp *rex = (struct regexp *)SvANY(rex_sv);
3541     RXi_GET_DECL(rex,rexi);
3542     I32 oldsave;
3543     /* the current state. This is a cached copy of PL_regmatch_state */
3544     regmatch_state *st;
3545     /* cache heavy used fields of st in registers */
3546     regnode *scan;
3547     regnode *next;
3548     U32 n = 0;  /* general value; init to avoid compiler warning */
3549     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3550     char *locinput = startpos;
3551     char *pushinput; /* where to continue after a PUSH */
3552     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3553
3554     bool result = 0;        /* return value of S_regmatch */
3555     int depth = 0;          /* depth of backtrack stack */
3556     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3557     const U32 max_nochange_depth =
3558         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3559         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3560     regmatch_state *yes_state = NULL; /* state to pop to on success of
3561                                                             subpattern */
3562     /* mark_state piggy backs on the yes_state logic so that when we unwind
3563        the stack on success we can update the mark_state as we go */
3564     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3565     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3566     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3567     U32 state_num;
3568     bool no_final = 0;      /* prevent failure from backtracking? */
3569     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3570     char *startpoint = locinput;
3571     SV *popmark = NULL;     /* are we looking for a mark? */
3572     SV *sv_commit = NULL;   /* last mark name seen in failure */
3573     SV *sv_yes_mark = NULL; /* last mark name we have seen
3574                                during a successful match */
3575     U32 lastopen = 0;       /* last open we saw */
3576     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3577     SV* const oreplsv = GvSV(PL_replgv);
3578     /* these three flags are set by various ops to signal information to
3579      * the very next op. They have a useful lifetime of exactly one loop
3580      * iteration, and are not preserved or restored by state pushes/pops
3581      */
3582     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3583     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3584     int logical = 0;        /* the following EVAL is:
3585                                 0: (?{...})
3586                                 1: (?(?{...})X|Y)
3587                                 2: (??{...})
3588                                or the following IFMATCH/UNLESSM is:
3589                                 false: plain (?=foo)
3590                                 true:  used as a condition: (?(?=foo))
3591                             */
3592     PAD* last_pad = NULL;
3593     dMULTICALL;
3594     I32 gimme = G_SCALAR;
3595     CV *caller_cv = NULL;       /* who called us */
3596     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3597     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3598
3599 #ifdef DEBUGGING
3600     GET_RE_DEBUG_FLAGS_DECL;
3601 #endif
3602
3603     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3604     multicall_oldcatch = 0;
3605     multicall_cv = NULL;
3606     cx = NULL;
3607     PERL_UNUSED_VAR(multicall_cop);
3608     PERL_UNUSED_VAR(newsp);
3609
3610
3611     PERL_ARGS_ASSERT_REGMATCH;
3612
3613     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3614             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3615     }));
3616     /* on first ever call to regmatch, allocate first slab */
3617     if (!PL_regmatch_slab) {
3618         Newx(PL_regmatch_slab, 1, regmatch_slab);
3619         PL_regmatch_slab->prev = NULL;
3620         PL_regmatch_slab->next = NULL;
3621         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3622     }
3623
3624     oldsave = PL_savestack_ix;
3625     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3626     SAVEVPTR(PL_regmatch_slab);
3627     SAVEVPTR(PL_regmatch_state);
3628
3629     /* grab next free state slot */
3630     st = ++PL_regmatch_state;
3631     if (st >  SLAB_LAST(PL_regmatch_slab))
3632         st = PL_regmatch_state = S_push_slab(aTHX);
3633
3634     /* Note that nextchr is a byte even in UTF */
3635     SET_nextchr;
3636     scan = prog;
3637     while (scan != NULL) {
3638
3639         DEBUG_EXECUTE_r( {
3640             SV * const prop = sv_newmortal();
3641             regnode *rnext=regnext(scan);
3642             DUMP_EXEC_POS( locinput, scan, utf8_target );
3643             regprop(rex, prop, scan);
3644
3645             PerlIO_printf(Perl_debug_log,
3646                     "%3"IVdf":%*s%s(%"IVdf")\n",
3647                     (IV)(scan - rexi->program), depth*2, "",
3648                     SvPVX_const(prop),
3649                     (PL_regkind[OP(scan)] == END || !rnext) ?
3650                         0 : (IV)(rnext - rexi->program));
3651         });
3652
3653         next = scan + NEXT_OFF(scan);
3654         if (next == scan)
3655             next = NULL;
3656         state_num = OP(scan);
3657
3658       reenter_switch:
3659
3660         SET_nextchr;
3661         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3662
3663         switch (state_num) {
3664         case BOL: /*  /^../  */
3665             if (locinput == PL_bostr)
3666             {
3667                 /* reginfo->till = reginfo->bol; */
3668                 break;
3669             }
3670             sayNO;
3671
3672         case MBOL: /*  /^../m  */
3673             if (locinput == PL_bostr ||
3674                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3675             {
3676                 break;
3677             }
3678             sayNO;
3679
3680         case SBOL: /*  /^../s  */
3681             if (locinput == PL_bostr)
3682                 break;
3683             sayNO;
3684
3685         case GPOS: /*  \G  */
3686             if (locinput == reginfo->ganch)
3687                 break;
3688             sayNO;
3689
3690         case KEEPS: /*   \K  */
3691             /* update the startpoint */
3692             st->u.keeper.val = rex->offs[0].start;
3693             rex->offs[0].start = locinput - PL_bostr;
3694             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3695             /*NOT-REACHED*/
3696         case KEEPS_next_fail:
3697             /* rollback the start point change */
3698             rex->offs[0].start = st->u.keeper.val;
3699             sayNO_SILENT;
3700             /*NOT-REACHED*/
3701
3702         case EOL: /* /..$/  */
3703                 goto seol;
3704
3705         case MEOL: /* /..$/m  */
3706             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3707                 sayNO;
3708             break;
3709
3710         case SEOL: /* /..$/s  */
3711           seol:
3712             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3713                 sayNO;
3714             if (PL_regeol - locinput > 1)
3715                 sayNO;
3716             break;
3717
3718         case EOS: /*  \z  */
3719             if (!NEXTCHR_IS_EOS)
3720                 sayNO;
3721             break;
3722
3723         case SANY: /*  /./s  */
3724             if (NEXTCHR_IS_EOS)
3725                 sayNO;
3726             goto increment_locinput;
3727
3728         case CANY: /*  \C  */
3729             if (NEXTCHR_IS_EOS)
3730                 sayNO;
3731             locinput++;
3732             break;
3733
3734         case REG_ANY: /*  /./  */
3735             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3736                 sayNO;
3737             goto increment_locinput;
3738
3739
3740 #undef  ST
3741 #define ST st->u.trie
3742         case TRIEC: /* (ab|cd) with known charclass */
3743             /* In this case the charclass data is available inline so
3744                we can fail fast without a lot of extra overhead.
3745              */
3746             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3747                 DEBUG_EXECUTE_r(
3748                     PerlIO_printf(Perl_debug_log,
3749                               "%*s  %sfailed to match trie start class...%s\n",
3750                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3751                 );
3752                 sayNO_SILENT;
3753                 assert(0); /* NOTREACHED */
3754             }
3755             /* FALL THROUGH */
3756         case TRIE:  /* (ab|cd)  */
3757             /* the basic plan of execution of the trie is:
3758              * At the beginning, run though all the states, and
3759              * find the longest-matching word. Also remember the position
3760              * of the shortest matching word. For example, this pattern:
3761              *    1  2 3 4    5
3762              *    ab|a|x|abcd|abc
3763              * when matched against the string "abcde", will generate
3764              * accept states for all words except 3, with the longest
3765              * matching word being 4, and the shortest being 2 (with
3766              * the position being after char 1 of the string).
3767              *
3768              * Then for each matching word, in word order (i.e. 1,2,4,5),
3769              * we run the remainder of the pattern; on each try setting
3770              * the current position to the character following the word,
3771              * returning to try the next word on failure.
3772              *
3773              * We avoid having to build a list of words at runtime by
3774              * using a compile-time structure, wordinfo[].prev, which
3775              * gives, for each word, the previous accepting word (if any).
3776              * In the case above it would contain the mappings 1->2, 2->0,
3777              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3778              * the longest word (4 above), a list of all words, by
3779              * following the list of prev pointers; this gives us the
3780              * unordered list 4,5,1,2. Then given the current word we have
3781              * just tried, we can go through the list and find the
3782              * next-biggest word to try (so if we just failed on word 2,
3783              * the next in the list is 4).
3784              *
3785              * Since at runtime we don't record the matching position in
3786              * the string for each word, we have to work that out for
3787              * each word we're about to process. The wordinfo table holds
3788              * the character length of each word; given that we recorded
3789              * at the start: the position of the shortest word and its
3790              * length in chars, we just need to move the pointer the
3791              * difference between the two char lengths. Depending on
3792              * Unicode status and folding, that's cheap or expensive.
3793              *
3794              * This algorithm is optimised for the case where are only a
3795              * small number of accept states, i.e. 0,1, or maybe 2.
3796              * With lots of accepts states, and having to try all of them,
3797              * it becomes quadratic on number of accept states to find all
3798              * the next words.
3799              */
3800
3801             {
3802                 /* what type of TRIE am I? (utf8 makes this contextual) */
3803                 DECL_TRIE_TYPE(scan);
3804
3805                 /* what trie are we using right now */
3806                 reg_trie_data * const trie
3807                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3808                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3809                 U32 state = trie->startstate;
3810
3811                 if (   trie->bitmap
3812                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3813                 {
3814                     if (trie->states[ state ].wordnum) {
3815                          DEBUG_EXECUTE_r(
3816                             PerlIO_printf(Perl_debug_log,
3817                                           "%*s  %smatched empty string...%s\n",
3818                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3819                         );
3820                         if (!trie->jump)
3821                             break;
3822                     } else {
3823                         DEBUG_EXECUTE_r(
3824                             PerlIO_printf(Perl_debug_log,
3825                                           "%*s  %sfailed to match trie start class...%s\n",
3826                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3827                         );
3828                         sayNO_SILENT;
3829                    }
3830                 }
3831
3832             {
3833                 U8 *uc = ( U8* )locinput;
3834
3835                 STRLEN len = 0;
3836                 STRLEN foldlen = 0;
3837                 U8 *uscan = (U8*)NULL;
3838                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3839                 U32 charcount = 0; /* how many input chars we have matched */
3840                 U32 accepted = 0; /* have we seen any accepting states? */
3841
3842                 ST.jump = trie->jump;
3843                 ST.me = scan;
3844                 ST.firstpos = NULL;
3845                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3846                 ST.nextword = 0;
3847
3848                 /* fully traverse the TRIE; note the position of the
3849                    shortest accept state and the wordnum of the longest
3850                    accept state */
3851
3852                 while ( state && uc <= (U8*)PL_regeol ) {
3853                     U32 base = trie->states[ state ].trans.base;
3854                     UV uvc = 0;
3855                     U16 charid = 0;
3856                     U16 wordnum;
3857                     wordnum = trie->states[ state ].wordnum;
3858
3859                     if (wordnum) { /* it's an accept state */
3860                         if (!accepted) {
3861                             accepted = 1;
3862                             /* record first match position */
3863                             if (ST.longfold) {
3864                                 ST.firstpos = (U8*)locinput;
3865                                 ST.firstchars = 0;
3866                             }
3867                             else {
3868                                 ST.firstpos = uc;
3869                                 ST.firstchars = charcount;
3870                             }
3871                         }
3872                         if (!ST.nextword || wordnum < ST.nextword)
3873                             ST.nextword = wordnum;
3874                         ST.topword = wordnum;
3875                     }
3876
3877                     DEBUG_TRIE_EXECUTE_r({
3878                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3879                                 PerlIO_printf( Perl_debug_log,
3880                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3881                                     2+depth * 2, "", PL_colors[4],
3882                                     (UV)state, (accepted ? 'Y' : 'N'));
3883                     });
3884
3885                     /* read a char and goto next state */
3886                     if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3887                         I32 offset;
3888                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3889                                              uscan, len, uvc, charid, foldlen,
3890                                              foldbuf, uniflags);
3891                         charcount++;
3892                         if (foldlen>0)
3893                             ST.longfold = TRUE;
3894                         if (charid &&
3895                              ( ((offset =
3896                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3897
3898                              && ((U32)offset < trie->lasttrans)
3899                              && trie->trans[offset].check == state)
3900                         {
3901                             state = trie->trans[offset].next;
3902                         }
3903                         else {
3904                             state = 0;
3905                         }
3906                         uc += len;
3907
3908                     }
3909                     else {
3910                         state = 0;
3911                     }
3912                     DEBUG_TRIE_EXECUTE_r(
3913                         PerlIO_printf( Perl_debug_log,
3914                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3915                             charid, uvc, (UV)state, PL_colors[5] );
3916                     );
3917                 }
3918                 if (!accepted)
3919                    sayNO;
3920
3921                 /* calculate total number of accept states */
3922                 {
3923                     U16 w = ST.topword;
3924                     accepted = 0;
3925                     while (w) {
3926                         w = trie->wordinfo[w].prev;
3927                         accepted++;
3928                     }
3929                     ST.accepted = accepted;
3930                 }
3931
3932                 DEBUG_EXECUTE_r(
3933                     PerlIO_printf( Perl_debug_log,
3934                         "%*s  %sgot %"IVdf" possible matches%s\n",
3935                         REPORT_CODE_OFF + depth * 2, "",
3936                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3937                 );
3938                 goto trie_first_try; /* jump into the fail handler */
3939             }}
3940             assert(0); /* NOTREACHED */
3941
3942         case TRIE_next_fail: /* we failed - try next alternative */
3943         {
3944             U8 *uc;
3945             if ( ST.jump) {
3946                 REGCP_UNWIND(ST.cp);
3947                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3948             }
3949             if (!--ST.accepted) {
3950                 DEBUG_EXECUTE_r({
3951                     PerlIO_printf( Perl_debug_log,
3952                         "%*s  %sTRIE failed...%s\n",
3953                         REPORT_CODE_OFF+depth*2, "",
3954                         PL_colors[4],
3955                         PL_colors[5] );
3956                 });
3957                 sayNO_SILENT;
3958             }
3959             {
3960                 /* Find next-highest word to process.  Note that this code
3961                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3962                 U16 min = 0;
3963                 U16 word;
3964                 U16 const nextword = ST.nextword;
3965                 reg_trie_wordinfo * const wordinfo
3966                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3967                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3968                     if (word > nextword && (!min || word < min))
3969                         min = word;
3970                 }
3971                 ST.nextword = min;
3972             }
3973
3974           trie_first_try:
3975             if (do_cutgroup) {
3976                 do_cutgroup = 0;
3977                 no_final = 0;
3978             }
3979
3980             if ( ST.jump) {
3981                 ST.lastparen = rex->lastparen;
3982                 ST.lastcloseparen = rex->lastcloseparen;
3983                 REGCP_SET(ST.cp);
3984             }
3985
3986             /* find start char of end of current word */
3987             {
3988                 U32 chars; /* how many chars to skip */
3989                 reg_trie_data * const trie
3990                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3991
3992                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3993                             >=  ST.firstchars);
3994                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3995                             - ST.firstchars;
3996                 uc = ST.firstpos;
3997
3998                 if (ST.longfold) {
3999                     /* the hard option - fold each char in turn and find
4000                      * its folded length (which may be different */
4001                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4002                     STRLEN foldlen;
4003                     STRLEN len;
4004                     UV uvc;
4005                     U8 *uscan;
4006
4007                     while (chars) {
4008                         if (utf8_target) {
4009                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4010                                                     uniflags);
4011                             uc += len;
4012                         }
4013                         else {
4014                             uvc = *uc;
4015                             uc++;
4016                         }
4017                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4018                         uscan = foldbuf;
4019                         while (foldlen) {
4020                             if (!--chars)
4021                                 break;
4022                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4023                                             uniflags);
4024                             uscan += len;
4025                             foldlen -= len;
4026                         }
4027                     }
4028                 }
4029                 else {
4030                     if (utf8_target)
4031                         while (chars--)
4032                             uc += UTF8SKIP(uc);
4033                     else
4034                         uc += chars;
4035                 }
4036             }
4037
4038             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4039                             ? ST.jump[ST.nextword]
4040                             : NEXT_OFF(ST.me));
4041
4042             DEBUG_EXECUTE_r({
4043                 PerlIO_printf( Perl_debug_log,
4044                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4045                     REPORT_CODE_OFF+depth*2, "",
4046                     PL_colors[4],
4047                     ST.nextword,
4048                     PL_colors[5]
4049                     );
4050             });
4051
4052             if (ST.accepted > 1 || has_cutgroup) {
4053                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4054                 assert(0); /* NOTREACHED */
4055             }
4056             /* only one choice left - just continue */
4057             DEBUG_EXECUTE_r({
4058                 AV *const trie_words
4059                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4060                 SV ** const tmp = av_fetch( trie_words,
4061                     ST.nextword-1, 0 );
4062                 SV *sv= tmp ? sv_newmortal() : NULL;
4063
4064                 PerlIO_printf( Perl_debug_log,
4065                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4066                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4067                     ST.nextword,
4068                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4069                             PL_colors[0], PL_colors[1],
4070                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4071                         )
4072                     : "not compiled under -Dr",
4073                     PL_colors[5] );
4074             });
4075
4076             locinput = (char*)uc;
4077             continue; /* execute rest of RE */
4078             assert(0); /* NOTREACHED */
4079         }
4080 #undef  ST
4081
4082         case EXACT: {            /*  /abc/        */
4083             char *s = STRING(scan);
4084             ln = STR_LEN(scan);
4085             if (utf8_target != UTF_PATTERN) {
4086                 /* The target and the pattern have differing utf8ness. */
4087                 char *l = locinput;
4088                 const char * const e = s + ln;
4089
4090                 if (utf8_target) {
4091                     /* The target is utf8, the pattern is not utf8.
4092                      * Above-Latin1 code points can't match the pattern;
4093                      * invariants match exactly, and the other Latin1 ones need
4094                      * to be downgraded to a single byte in order to do the
4095                      * comparison.  (If we could be confident that the target
4096                      * is not malformed, this could be refactored to have fewer
4097                      * tests by just assuming that if the first bytes match, it
4098                      * is an invariant, but there are tests in the test suite
4099                      * dealing with (??{...}) which violate this) */
4100                     while (s < e) {
4101                         if (l >= PL_regeol)
4102                              sayNO;
4103                         if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4104                             sayNO;
4105                         }
4106                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4107                             if (*l != *s) {
4108                                 sayNO;
4109                             }
4110                             l++;
4111                         }
4112                         else {
4113                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4114                                 sayNO;
4115                             }
4116                             l += 2;
4117                         }
4118                         s++;
4119                     }
4120                 }
4121                 else {
4122                     /* The target is not utf8, the pattern is utf8. */
4123                     while (s < e) {
4124                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4125                         {
4126                             sayNO;
4127                         }
4128                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4129                             if (*s != *l) {
4130                                 sayNO;
4131                             }
4132                             s++;
4133                         }
4134                         else {
4135                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4136                                 sayNO;
4137                             }
4138                             s += 2;
4139                         }
4140                         l++;
4141                     }
4142                 }
4143                 locinput = l;
4144                 break;
4145             }
4146             /* The target and the pattern have the same utf8ness. */
4147             /* Inline the first character, for speed. */
4148             if (UCHARAT(s) != nextchr)
4149                 sayNO;
4150             if (PL_regeol - locinput < ln)
4151                 sayNO;
4152             if (ln > 1 && memNE(s, locinput, ln))
4153                 sayNO;
4154             locinput += ln;
4155             break;
4156             }
4157
4158         case EXACTFL: {          /*  /abc/il      */
4159             re_fold_t folder;
4160             const U8 * fold_array;
4161             const char * s;
4162             U32 fold_utf8_flags;
4163
4164             PL_reg_flags |= RF_tainted;
4165             folder = foldEQ_locale;
4166             fold_array = PL_fold_locale;
4167             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4168             goto do_exactf;
4169
4170         case EXACTFU_SS:         /*  /\x{df}/iu   */
4171         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4172         case EXACTFU:            /*  /abc/iu      */
4173             folder = foldEQ_latin1;
4174             fold_array = PL_fold_latin1;
4175             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4176             goto do_exactf;
4177
4178         case EXACTFA:            /*  /abc/iaa     */
4179             folder = foldEQ_latin1;
4180             fold_array = PL_fold_latin1;
4181             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4182             goto do_exactf;
4183
4184         case EXACTF:             /*  /abc/i       */
4185             folder = foldEQ;
4186             fold_array = PL_fold;
4187             fold_utf8_flags = 0;
4188
4189           do_exactf:
4190             s = STRING(scan);
4191             ln = STR_LEN(scan);
4192
4193             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4194               /* Either target or the pattern are utf8, or has the issue where
4195                * the fold lengths may differ. */
4196                 const char * const l = locinput;
4197                 char *e = PL_regeol;
4198
4199                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
4200                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4201                 {
4202                     sayNO;
4203                 }
4204                 locinput = e;
4205                 break;
4206             }
4207
4208             /* Neither the target nor the pattern are utf8 */
4209             if (UCHARAT(s) != nextchr
4210                 && !NEXTCHR_IS_EOS
4211                 && UCHARAT(s) != fold_array[nextchr])
4212             {
4213                 sayNO;
4214             }
4215             if (PL_regeol - locinput < ln)
4216                 sayNO;
4217             if (ln > 1 && ! folder(s, locinput, ln))
4218                 sayNO;
4219             locinput += ln;
4220             break;
4221         }
4222
4223         /* XXX Could improve efficiency by separating these all out using a
4224          * macro or in-line function.  At that point regcomp.c would no longer
4225          * have to set the FLAGS fields of these */
4226         case BOUNDL:  /*  /\b/l  */
4227         case NBOUNDL: /*  /\B/l  */
4228             PL_reg_flags |= RF_tainted;
4229             /* FALL THROUGH */
4230         case BOUND:   /*  /\b/   */
4231         case BOUNDU:  /*  /\b/u  */
4232         case BOUNDA:  /*  /\b/a  */
4233         case NBOUND:  /*  /\B/   */
4234         case NBOUNDU: /*  /\B/u  */
4235         case NBOUNDA: /*  /\B/a  */
4236             /* was last char in word? */
4237             if (utf8_target
4238                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4239                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4240             {
4241                 if (locinput == PL_bostr)
4242                     ln = '\n';
4243                 else {
4244                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4245
4246                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4247                 }
4248                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4249                     ln = isALNUM_uni(ln);
4250                     if (NEXTCHR_IS_EOS)
4251                         n = 0;
4252                     else {
4253                         LOAD_UTF8_CHARCLASS_ALNUM();
4254                         n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4255                                                                 utf8_target);
4256                     }
4257                 }
4258                 else {
4259                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4260                     n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4261                 }
4262             }
4263             else {
4264
4265                 /* Here the string isn't utf8, or is utf8 and only ascii
4266                  * characters are to match \w.  In the latter case looking at
4267                  * the byte just prior to the current one may be just the final
4268                  * byte of a multi-byte character.  This is ok.  There are two
4269                  * cases:
4270                  * 1) it is a single byte character, and then the test is doing
4271                  *      just what it's supposed to.
4272                  * 2) it is a multi-byte character, in which case the final
4273                  *      byte is never mistakable for ASCII, and so the test
4274                  *      will say it is not a word character, which is the
4275                  *      correct answer. */
4276                 ln = (locinput != PL_bostr) ?
4277                     UCHARAT(locinput - 1) : '\n';
4278                 switch (FLAGS(scan)) {
4279                     case REGEX_UNICODE_CHARSET:
4280                         ln = isWORDCHAR_L1(ln);
4281                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4282                         break;
4283                     case REGEX_LOCALE_CHARSET:
4284                         ln = isALNUM_LC(ln);
4285                         n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4286                         break;
4287                     case REGEX_DEPENDS_CHARSET:
4288                         ln = isALNUM(ln);
4289                         n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4290                         break;
4291                     case REGEX_ASCII_RESTRICTED_CHARSET:
4292                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4293                         ln = isWORDCHAR_A(ln);
4294                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4295                         break;
4296                     default:
4297                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4298                         break;
4299                 }
4300             }
4301             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4302              * regcomp.sym */
4303             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4304                     sayNO;
4305             break;
4306
4307         case ANYOF:  /*  /[abc]/       */
4308             if (NEXTCHR_IS_EOS)
4309                 sayNO;
4310             if (utf8_target) {
4311                 STRLEN inclasslen = PL_regeol - locinput;
4312                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
4313                     sayNO;
4314                 locinput += inclasslen;
4315                 break;
4316             }
4317             else {
4318                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4319                     sayNO;
4320                 locinput++;
4321                 break;
4322             }
4323             break;
4324
4325         /* Special char classes: \d, \w etc.
4326          * The defines start on line 166 or so */
4327         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
4328                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4329                   ALNUMU, NALNUMU, isWORDCHAR_L1,
4330                   ALNUMA, NALNUMA, isWORDCHAR_A,
4331                   alnum, "a");
4332
4333         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
4334                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4335                   SPACEU, NSPACEU, isSPACE_L1,
4336                   SPACEA, NSPACEA, isSPACE_A,
4337                   space, " ");
4338
4339         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
4340                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4341                 DIGITA, NDIGITA, isDIGIT_A,
4342                 digit, "0");
4343
4344         case POSIXA: /* /[[:ascii:]]/ etc */
4345             if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4346                 sayNO;
4347             }
4348             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4349             locinput++;
4350             break;
4351
4352         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
4353             if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4354                 sayNO;
4355             }
4356             goto increment_locinput;
4357
4358         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4359                        a Unicode extended Grapheme Cluster */
4360             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4361               extended Grapheme Cluster is:
4362
4363                CR LF
4364                | Prepend* Begin Extend*
4365                | .
4366
4367                Begin is:           ( Special_Begin | ! Control )
4368                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4369                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4370                Control is:         [ GCB_Control  CR  LF ]
4371                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4372
4373                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4374                we can rewrite
4375
4376                    Begin is ( Regular_Begin + Special Begin )
4377
4378                It turns out that 98.4% of all Unicode code points match
4379                Regular_Begin.  Doing it this way eliminates a table match in
4380                the previous implementation for almost all Unicode code points.
4381
4382                There is a subtlety with Prepend* which showed up in testing.
4383                Note that the Begin, and only the Begin is required in:
4384                 | Prepend* Begin Extend*
4385                Also, Begin contains '! Control'.  A Prepend must be a
4386                '!  Control', which means it must also be a Begin.  What it
4387                comes down to is that if we match Prepend* and then find no
4388                suitable Begin afterwards, that if we backtrack the last
4389                Prepend, that one will be a suitable Begin.
4390             */
4391
4392             if (NEXTCHR_IS_EOS)
4393                 sayNO;
4394             if  (! utf8_target) {
4395
4396                 /* Match either CR LF  or '.', as all the other possibilities
4397                  * require utf8 */
4398                 locinput++;         /* Match the . or CR */
4399                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4400                                        match the LF */
4401                     && locinput < PL_regeol
4402                     && UCHARAT(locinput) == '\n') locinput++;
4403             }
4404             else {
4405
4406                 /* Utf8: See if is ( CR LF ); already know that locinput <
4407                  * PL_regeol, so locinput+1 is in bounds */
4408                 if ( nextchr == '\r' && locinput+1 < PL_regeol
4409                         && UCHARAT(locinput + 1) == '\n')
4410                 {
4411                     locinput += 2;
4412                 }
4413                 else {
4414                     STRLEN len;
4415
4416                     /* In case have to backtrack to beginning, then match '.' */
4417                     char *starting = locinput;
4418
4419                     /* In case have to backtrack the last prepend */
4420                     char *previous_prepend = 0;
4421
4422                     LOAD_UTF8_CHARCLASS_GCB();
4423
4424                     /* Match (prepend)*   */
4425                     while (locinput < PL_regeol
4426                            && (len = is_GCB_Prepend_utf8(locinput)))
4427                     {
4428                         previous_prepend = locinput;
4429                         locinput += len;
4430                     }
4431
4432                     /* As noted above, if we matched a prepend character, but
4433                      * the next thing won't match, back off the last prepend we
4434                      * matched, as it is guaranteed to match the begin */
4435                     if (previous_prepend
4436                         && (locinput >=  PL_regeol
4437                             || (! swash_fetch(PL_utf8_X_regular_begin,
4438                                              (U8*)locinput, utf8_target)
4439                                  && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4440                         )
4441                     {
4442                         locinput = previous_prepend;
4443                     }
4444
4445                     /* Note that here we know PL_regeol > locinput, as we
4446                      * tested that upon input to this switch case, and if we
4447                      * moved locinput forward, we tested the result just above
4448                      * and it either passed, or we backed off so that it will
4449                      * now pass */
4450                     if (swash_fetch(PL_utf8_X_regular_begin,
4451                                     (U8*)locinput, utf8_target)) {
4452                         locinput += UTF8SKIP(locinput);
4453                     }
4454                     else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4455
4456                         /* Here did not match the required 'Begin' in the
4457                          * second term.  So just match the very first
4458                          * character, the '.' of the final term of the regex */
4459                         locinput = starting + UTF8SKIP(starting);
4460                         goto exit_utf8;
4461                     } else {
4462
4463                         /* Here is a special begin.  It can be composed of
4464                          * several individual characters.  One possibility is
4465                          * RI+ */
4466                         if ((len = is_GCB_RI_utf8(locinput))) {
4467                             locinput += len;
4468                             while (locinput < PL_regeol
4469                                    && (len = is_GCB_RI_utf8(locinput)))
4470                             {
4471                                 locinput += len;
4472                             }
4473                         } else if ((len = is_GCB_T_utf8(locinput))) {
4474                             /* Another possibility is T+ */
4475                             locinput += len;
4476                             while (locinput < PL_regeol
4477                                 && (len = is_GCB_T_utf8(locinput)))
4478                             {
4479                                 locinput += len;
4480                             }
4481                         } else {
4482
4483                             /* Here, neither RI+ nor T+; must be some other
4484                              * Hangul.  That means it is one of the others: L,
4485                              * LV, LVT or V, and matches:
4486                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4487
4488                             /* Match L*           */
4489                             while (locinput < PL_regeol
4490                                    && (len = is_GCB_L_utf8(locinput)))
4491                             {
4492                                 locinput += len;
4493                             }
4494
4495                             /* Here, have exhausted L*.  If the next character
4496                              * is not an LV, LVT nor V, it means we had to have
4497                              * at least one L, so matches L+ in the original
4498                              * equation, we have a complete hangul syllable.
4499                              * Are done. */
4500
4501                             if (locinput < PL_regeol
4502                                 && is_GCB_LV_LVT_V_utf8(locinput))
4503                             {
4504
4505                                 /* Otherwise keep going.  Must be LV, LVT or V.
4506                                  * See if LVT */
4507                                 if (is_utf8_X_LVT((U8*)locinput)) {
4508                                     locinput += UTF8SKIP(locinput);
4509                                 } else {
4510
4511                                     /* Must be  V or LV.  Take it, then match
4512                                      * V*     */
4513                                     locinput += UTF8SKIP(locinput);
4514                                     while (locinput < PL_regeol
4515                                            && (len = is_GCB_V_utf8(locinput)))
4516                                     {
4517                                         locinput += len;
4518                                     }
4519                                 }
4520
4521                                 /* And any of LV, LVT, or V can be followed
4522                                  * by T*            */
4523                                 while (locinput < PL_regeol
4524                                        && (len = is_GCB_T_utf8(locinput)))
4525                                 {
4526                                     locinput += len;
4527                                 }
4528                             }
4529                         }
4530                     }
4531
4532                     /* Match any extender */
4533                     while (locinput < PL_regeol
4534                             && swash_fetch(PL_utf8_X_extend,
4535                                             (U8*)locinput, utf8_target))
4536                     {
4537                         locinput += UTF8SKIP(locinput);
4538                     }
4539                 }
4540             exit_utf8:
4541                 if (locinput > PL_regeol) sayNO;
4542             }
4543             break;
4544
4545         case NREFFL:  /*  /\g{name}/il  */
4546         {   /* The capture buffer cases.  The ones beginning with N for the
4547                named buffers just convert to the equivalent numbered and
4548                pretend they were called as the corresponding numbered buffer
4549                op.  */
4550             /* don't initialize these in the declaration, it makes C++
4551                unhappy */
4552             char *s;
4553             char type;
4554             re_fold_t folder;
4555             const U8 *fold_array;
4556             UV utf8_fold_flags;
4557
4558             PL_reg_flags |= RF_tainted;
4559             folder = foldEQ_locale;
4560             fold_array = PL_fold_locale;
4561             type = REFFL;
4562             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4563             goto do_nref;
4564
4565         case NREFFA:  /*  /\g{name}/iaa  */
4566             folder = foldEQ_latin1;
4567             fold_array = PL_fold_latin1;
4568             type = REFFA;
4569             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4570             goto do_nref;
4571
4572         case NREFFU:  /*  /\g{name}/iu  */
4573             folder = foldEQ_latin1;
4574             fold_array = PL_fold_latin1;
4575             type = REFFU;
4576             utf8_fold_flags = 0;
4577             goto do_nref;
4578
4579         case NREFF:  /*  /\g{name}/i  */
4580             folder = foldEQ;
4581             fold_array = PL_fold;
4582             type = REFF;
4583             utf8_fold_flags = 0;
4584             goto do_nref;
4585
4586         case NREF:  /*  /\g{name}/   */
4587             type = REF;
4588             folder = NULL;
4589             fold_array = NULL;
4590             utf8_fold_flags = 0;
4591           do_nref:
4592
4593             /* For the named back references, find the corresponding buffer
4594              * number */
4595             n = reg_check_named_buff_matched(rex,scan);
4596
4597             if ( ! n ) {
4598                 sayNO;
4599             }
4600             goto do_nref_ref_common;
4601
4602         case REFFL:  /*  /\1/il  */
4603             PL_reg_flags |= RF_tainted;
4604             folder = foldEQ_locale;
4605             fold_array = PL_fold_locale;
4606             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4607             goto do_ref;
4608
4609         case REFFA:  /*  /\1/iaa  */
4610             folder = foldEQ_latin1;
4611             fold_array = PL_fold_latin1;
4612             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4613             goto do_ref;
4614
4615         case REFFU:  /*  /\1/iu  */
4616             folder = foldEQ_latin1;
4617             fold_array = PL_fold_latin1;
4618             utf8_fold_flags = 0;
4619             goto do_ref;
4620
4621         case REFF:  /*  /\1/i  */
4622             folder = foldEQ;
4623             fold_array = PL_fold;
4624             utf8_fold_flags = 0;
4625             goto do_ref;
4626
4627         case REF:  /*  /\1/    */
4628             folder = NULL;
4629             fold_array = NULL;
4630             utf8_fold_flags = 0;
4631
4632           do_ref:
4633             type = OP(scan);
4634             n = ARG(scan);  /* which paren pair */
4635
4636           do_nref_ref_common:
4637             ln = rex->offs[n].start;
4638             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4639             if (rex->lastparen < n || ln == -1)
4640                 sayNO;                  /* Do not match unless seen CLOSEn. */
4641             if (ln == rex->offs[n].end)
4642                 break;
4643
4644             s = PL_bostr + ln;
4645             if (type != REF     /* REF can do byte comparison */
4646                 && (utf8_target || type == REFFU))
4647             { /* XXX handle REFFL better */
4648                 char * limit = PL_regeol;
4649
4650                 /* This call case insensitively compares the entire buffer
4651                     * at s, with the current input starting at locinput, but
4652                     * not going off the end given by PL_regeol, and returns in
4653                     * <limit> upon success, how much of the current input was
4654                     * matched */
4655                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4656                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4657                 {
4658                     sayNO;
4659                 }
4660                 locinput = limit;
4661                 break;
4662             }
4663
4664             /* Not utf8:  Inline the first character, for speed. */
4665             if (!NEXTCHR_IS_EOS &&
4666                 UCHARAT(s) != nextchr &&
4667                 (type == REF ||
4668                  UCHARAT(s) != fold_array[nextchr]))
4669                 sayNO;
4670             ln = rex->offs[n].end - ln;
4671             if (locinput + ln > PL_regeol)
4672                 sayNO;
4673             if (ln > 1 && (type == REF
4674                            ? memNE(s, locinput, ln)
4675                            : ! folder(s, locinput, ln)))
4676                 sayNO;
4677             locinput += ln;
4678             break;
4679         }
4680
4681         case NOTHING: /* null op; e.g. the 'nothing' following
4682                        * the '*' in m{(a+|b)*}' */
4683             break;
4684         case TAIL: /* placeholder while compiling (A|B|C) */
4685             break;
4686
4687         case BACK: /* ??? doesn't appear to be used ??? */
4688             break;
4689
4690 #undef  ST
4691 #define ST st->u.eval
4692         {
4693             SV *ret;
4694             REGEXP *re_sv;
4695             regexp *re;
4696             regexp_internal *rei;
4697             regnode *startpoint;
4698
4699         case GOSTART: /*  (?R)  */
4700         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4701             if (cur_eval && cur_eval->locinput==locinput) {
4702                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4703                     Perl_croak(aTHX_ "Infinite recursion in regex");
4704                 if ( ++nochange_depth > max_nochange_depth )
4705                     Perl_croak(aTHX_
4706                         "Pattern subroutine nesting without pos change"
4707                         " exceeded limit in regex");
4708             } else {
4709                 nochange_depth = 0;
4710             }
4711             re_sv = rex_sv;
4712             re = rex;
4713             rei = rexi;
4714             if (OP(scan)==GOSUB) {
4715                 startpoint = scan + ARG2L(scan);
4716                 ST.close_paren = ARG(scan);
4717             } else {
4718                 startpoint = rei->program+1;
4719                 ST.close_paren = 0;
4720             }
4721             goto eval_recurse_doit;
4722             assert(0); /* NOTREACHED */
4723
4724         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4725             if (cur_eval && cur_eval->locinput==locinput) {
4726                 if ( ++nochange_depth > max_nochange_depth )
4727                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4728             } else {
4729                 nochange_depth = 0;
4730             }
4731             {
4732                 /* execute the code in the {...} */
4733
4734                 dSP;
4735                 SV ** before;
4736                 OP * const oop = PL_op;
4737                 COP * const ocurcop = PL_curcop;
4738                 OP *nop;
4739                 char *saved_regeol = PL_regeol;
4740                 struct re_save_state saved_state;
4741                 CV *newcv;
4742
4743                 /* save *all* paren positions */
4744                 regcppush(rex, 0);
4745                 REGCP_SET(runops_cp);
4746
4747                 /* To not corrupt the existing regex state while executing the
4748                  * eval we would normally put it on the save stack, like with
4749                  * save_re_context. However, re-evals have a weird scoping so we
4750                  * can't just add ENTER/LEAVE here. With that, things like
4751                  *
4752                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4753                  *
4754                  * would break, as they expect the localisation to be unwound
4755                  * only when the re-engine backtracks through the bit that
4756                  * localised it.
4757                  *
4758                  * What we do instead is just saving the state in a local c
4759                  * variable.
4760                  */
4761                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4762
4763                 PL_reg_state.re_reparsing = FALSE;
4764
4765                 if (!caller_cv)
4766                     caller_cv = find_runcv(NULL);
4767
4768                 n = ARG(scan);
4769
4770                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4771                     newcv = ((struct regexp *)SvANY(
4772                                                 (REGEXP*)(rexi->data->data[n])
4773                                             ))->qr_anoncv
4774                                         ;
4775                     nop = (OP*)rexi->data->data[n+1];
4776                 }
4777                 else if (rexi->data->what[n] == 'l') { /* literal code */
4778                     newcv = caller_cv;
4779                     nop = (OP*)rexi->data->data[n];
4780                     assert(CvDEPTH(newcv));
4781                 }
4782                 else {
4783                     /* literal with own CV */
4784                     assert(rexi->data->what[n] == 'L');
4785                     newcv = rex->qr_anoncv;
4786                     nop = (OP*)rexi->data->data[n];
4787                 }
4788
4789                 /* normally if we're about to execute code from the same
4790                  * CV that we used previously, we just use the existing
4791                  * CX stack entry. However, its possible that in the
4792                  * meantime we may have backtracked, popped from the save
4793                  * stack, and undone the SAVECOMPPAD(s) associated with
4794                  * PUSH_MULTICALL; in which case PL_comppad no longer
4795                  * points to newcv's pad. */
4796                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4797                 {
4798                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4799                     if (last_pushed_cv) {
4800                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4801                     }
4802                     else {
4803                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4804                     }
4805                     last_pushed_cv = newcv;
4806                 }
4807                 last_pad = PL_comppad;
4808
4809                 /* the initial nextstate you would normally execute
4810                  * at the start of an eval (which would cause error
4811                  * messages to come from the eval), may be optimised
4812                  * away from the execution path in the regex code blocks;
4813                  * so manually set PL_curcop to it initially */
4814                 {
4815                     OP *o = cUNOPx(nop)->op_first;
4816                     assert(o->op_type == OP_NULL);
4817                     if (o->op_targ == OP_SCOPE) {
4818                         o = cUNOPo->op_first;
4819                     }
4820                     else {
4821                         assert(o->op_targ == OP_LEAVE);
4822                         o = cUNOPo->op_first;
4823                         assert(o->op_type == OP_ENTER);
4824                         o = o->op_sibling;
4825                     }
4826
4827                     if (o->op_type != OP_STUB) {
4828                         assert(    o->op_type == OP_NEXTSTATE
4829                                 || o->op_type == OP_DBSTATE
4830                                 || (o->op_type == OP_NULL
4831                                     &&  (  o->op_targ == OP_NEXTSTATE
4832                                         || o->op_targ == OP_DBSTATE
4833                                         )
4834                                     )
4835                         );
4836                         PL_curcop = (COP*)o;
4837                     }
4838                 }
4839                 nop = nop->op_next;
4840
4841                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4842                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4843
4844                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4845
4846                 if (sv_yes_mark) {
4847                     SV *sv_mrk = get_sv("REGMARK", 1);
4848                     sv_setsv(sv_mrk, sv_yes_mark);
4849                 }
4850
4851                 /* we don't use MULTICALL here as we want to call the
4852                  * first op of the block of interest, rather than the
4853                  * first op of the sub */
4854                 before = SP;
4855                 PL_op = nop;
4856                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4857                 SPAGAIN;
4858                 if (SP == before)
4859                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4860                 else {
4861                     ret = POPs;
4862                     PUTBACK;
4863                 }
4864
4865                 /* before restoring everything, evaluate the returned
4866                  * value, so that 'uninit' warnings don't use the wrong
4867                  * PL_op or pad. Also need to process any magic vars
4868                  * (e.g. $1) *before* parentheses are restored */
4869
4870                 PL_op = NULL;
4871
4872                 re_sv = NULL;
4873                 if (logical == 0)        /*   (?{})/   */
4874                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4875                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4876                     sw = cBOOL(SvTRUE(ret));
4877                     logical = 0;
4878                 }
4879                 else {                   /*  /(??{})  */
4880                     /*  if its overloaded, let the regex compiler handle
4881                      *  it; otherwise extract regex, or stringify  */
4882                     if (!SvAMAGIC(ret)) {
4883                         SV *sv = ret;
4884                         if (SvROK(sv))
4885                             sv = SvRV(sv);
4886                         if (SvTYPE(sv) == SVt_REGEXP)
4887                             re_sv = (REGEXP*) sv;
4888                         else if (SvSMAGICAL(sv)) {
4889                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4890                             if (mg)
4891                                 re_sv = (REGEXP *) mg->mg_obj;
4892                         }
4893
4894                         /* force any magic, undef warnings here */
4895                         if (!re_sv) {
4896                             ret = sv_mortalcopy(ret);
4897                             (void) SvPV_force_nolen(ret);
4898                         }
4899                     }
4900
4901                 }
4902
4903                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4904
4905                 /* *** Note that at this point we don't restore
4906                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4907                  * be used again soon. This is safe as long as nothing
4908                  * in the regexp code uses the pad ! */
4909                 PL_op = oop;
4910                 PL_curcop = ocurcop;
4911                 PL_regeol = saved_regeol;
4912                 S_regcp_restore(aTHX_ rex, runops_cp);
4913
4914                 if (logical != 2)
4915                     break;
4916             }
4917
4918                 /* only /(??{})/  from now on */
4919                 logical = 0;
4920                 {
4921                     /* extract RE object from returned value; compiling if
4922                      * necessary */
4923
4924                     if (re_sv) {
4925                         re_sv = reg_temp_copy(NULL, re_sv);
4926                     }
4927                     else {
4928                         U32 pm_flags = 0;
4929                         const I32 osize = PL_regsize;
4930
4931                         if (SvUTF8(ret) && IN_BYTES) {
4932                             /* In use 'bytes': make a copy of the octet
4933                              * sequence, but without the flag on */
4934                             STRLEN len;
4935                             const char *const p = SvPV(ret, len);
4936                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4937                         }
4938                         if (rex->intflags & PREGf_USE_RE_EVAL)
4939                             pm_flags |= PMf_USE_RE_EVAL;
4940
4941                         /* if we got here, it should be an engine which
4942                          * supports compiling code blocks and stuff */
4943                         assert(rex->engine && rex->engine->op_comp);
4944                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4945                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4946                                     rex->engine, NULL, NULL,
4947                                     /* copy /msix etc to inner pattern */
4948                                     scan->flags,
4949                                     pm_flags);
4950
4951                         if (!(SvFLAGS(ret)
4952                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4953                                  | SVs_GMG))) {
4954                             /* This isn't a first class regexp. Instead, it's
4955                                caching a regexp onto an existing, Perl visible
4956                                scalar.  */
4957                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4958                         }
4959                         PL_regsize = osize;
4960                         /* safe to do now that any $1 etc has been
4961                          * interpolated into the new pattern string and
4962                          * compiled */
4963                         S_regcp_restore(aTHX_ rex, runops_cp);
4964                     }
4965                     re = (struct regexp *)SvANY(re_sv);
4966                 }
4967                 RXp_MATCH_COPIED_off(re);
4968                 re->subbeg = rex->subbeg;
4969                 re->sublen = rex->sublen;
4970                 re->suboffset = rex->suboffset;
4971                 re->subcoffset = rex->subcoffset;
4972                 rei = RXi_GET(re);
4973                 DEBUG_EXECUTE_r(
4974                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4975                         "Matching embedded");
4976                 );
4977                 startpoint = rei->program + 1;
4978                 ST.close_paren = 0; /* only used for GOSUB */
4979
4980         eval_recurse_doit: /* Share code with GOSUB below this line */
4981                 /* run the pattern returned from (??{...}) */
4982                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4983                 REGCP_SET(ST.lastcp);
4984
4985                 re->lastparen = 0;
4986                 re->lastcloseparen = 0;
4987
4988                 PL_regsize = 0;
4989
4990                 /* XXXX This is too dramatic a measure... */
4991                 PL_reg_maxiter = 0;
4992
4993                 ST.toggle_reg_flags = PL_reg_flags;
4994                 if (RX_UTF8(re_sv))
4995                     PL_reg_flags |= RF_utf8;
4996                 else
4997                     PL_reg_flags &= ~RF_utf8;
4998                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4999
5000                 ST.prev_rex = rex_sv;
5001                 ST.prev_curlyx = cur_curlyx;
5002                 rex_sv = re_sv;
5003                 SET_reg_curpm(rex_sv);
5004                 rex = re;
5005                 rexi = rei;
5006                 cur_curlyx = NULL;
5007                 ST.B = next;
5008                 ST.prev_eval = cur_eval;
5009                 cur_eval = st;
5010                 /* now continue from first node in postoned RE */
5011                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5012                 assert(0); /* NOTREACHED */
5013         }
5014
5015         case EVAL_AB: /* cleanup after a successful (??{A})B */
5016             /* note: this is called twice; first after popping B, then A */
5017             PL_reg_flags ^= ST.toggle_reg_flags;
5018             rex_sv = ST.prev_rex;
5019             SET_reg_curpm(rex_sv);
5020             rex = (struct regexp *)SvANY(rex_sv);
5021             rexi = RXi_GET(rex);
5022             regcpblow(ST.cp);
5023             cur_eval = ST.prev_eval;
5024             cur_curlyx = ST.prev_curlyx;
5025
5026             /* XXXX This is too dramatic a measure... */
5027             PL_reg_maxiter = 0;
5028             if ( nochange_depth )
5029                 nochange_depth--;
5030             sayYES;
5031
5032
5033         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5034             /* note: this is called twice; first after popping B, then A */
5035             PL_reg_flags ^= ST.toggle_reg_flags;
5036             rex_sv = ST.prev_rex;
5037             SET_reg_curpm(rex_sv);
5038             rex = (struct regexp *)SvANY(rex_sv);
5039             rexi = RXi_GET(rex);
5040
5041             REGCP_UNWIND(ST.lastcp);
5042             regcppop(rex);
5043             cur_eval = ST.prev_eval;
5044             cur_curlyx = ST.prev_curlyx;
5045             /* XXXX This is too dramatic a measure... */
5046             PL_reg_maxiter = 0;
5047             if ( nochange_depth )
5048                 nochange_depth--;
5049             sayNO_SILENT;
5050 #undef ST
5051
5052         case OPEN: /*  (  */
5053             n = ARG(scan);  /* which paren pair */
5054             rex->offs[n].start_tmp = locinput - PL_bostr;
5055             if (n > PL_regsize)
5056                 PL_regsize = n;
5057             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5058                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
5059                 PTR2UV(rex),
5060                 PTR2UV(rex->offs),
5061                 (UV)n,
5062                 (IV)rex->offs[n].start_tmp,
5063                 (UV)PL_regsize
5064             ));
5065             lastopen = n;
5066             break;
5067
5068 /* XXX really need to log other places start/end are set too */
5069 #define CLOSE_CAPTURE \
5070     rex->offs[n].start = rex->offs[n].start_tmp; \
5071     rex->offs[n].end = locinput - PL_bostr; \
5072     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5073         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5074         PTR2UV(rex), \
5075         PTR2UV(rex->offs), \
5076         (UV)n, \
5077         (IV)rex->offs[n].start, \
5078         (IV)rex->offs[n].end \
5079     ))
5080
5081         case CLOSE:  /*  )  */
5082             n = ARG(scan);  /* which paren pair */
5083             CLOSE_CAPTURE;
5084             /*if (n > PL_regsize)
5085                 PL_regsize = n;*/
5086             if (n > rex->lastparen)
5087                 rex->lastparen = n;
5088             rex->lastcloseparen = n;
5089             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5090                 goto fake_end;
5091             }
5092             break;
5093
5094         case ACCEPT:  /*  (*ACCEPT)  */
5095             if (ARG(scan)){
5096                 regnode *cursor;
5097                 for (cursor=scan;
5098                      cursor && OP(cursor)!=END;
5099                      cursor=regnext(cursor))
5100                 {
5101                     if ( OP(cursor)==CLOSE ){
5102                         n = ARG(cursor);
5103                         if ( n <= lastopen ) {
5104                             CLOSE_CAPTURE;
5105                             /*if (n > PL_regsize)
5106                             PL_regsize = n;*/
5107                             if (n > rex->lastparen)
5108                                 rex->lastparen = n;
5109                             rex->lastcloseparen = n;
5110                             if ( n == ARG(scan) || (cur_eval &&
5111                                 cur_eval->u.eval.close_paren == n))
5112                                 break;
5113                         }
5114                     }
5115                 }
5116             }
5117             goto fake_end;
5118             /*NOTREACHED*/
5119
5120         case GROUPP:  /*  (?(1))  */
5121             n = ARG(scan);  /* which paren pair */
5122             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5123             break;
5124
5125         case NGROUPP:  /*  (?(<name>))  */
5126             /* reg_check_named_buff_matched returns 0 for no match */
5127             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5128             break;
5129
5130         case INSUBP:   /*  (?(R))  */
5131             n = ARG(scan);
5132             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5133             break;
5134
5135         case DEFINEP:  /*  (?(DEFINE))  */
5136             sw = 0;
5137             break;
5138
5139         case IFTHEN:   /*  (?(cond)A|B)  */
5140             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
5141             if (sw)
5142                 next = NEXTOPER(NEXTOPER(scan));
5143             else {
5144                 next = scan + ARG(scan);
5145                 if (OP(next) == IFTHEN) /* Fake one. */
5146                     next = NEXTOPER(NEXTOPER(next));
5147             }
5148             break;
5149
5150         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5151             logical = scan->flags;
5152             break;
5153
5154 /*******************************************************************
5155
5156 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5157 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5158 STAR/PLUS/CURLY/CURLYN are used instead.)
5159
5160 A*B is compiled as <CURLYX><A><WHILEM><B>
5161
5162 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5163 state, which contains the current count, initialised to -1. It also sets
5164 cur_curlyx to point to this state, with any previous value saved in the
5165 state block.
5166
5167 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5168 since the pattern may possibly match zero times (i.e. it's a while {} loop
5169 rather than a do {} while loop).
5170
5171 Each entry to WHILEM represents a successful match of A. The count in the
5172 CURLYX block is incremented, another WHILEM state is pushed, and execution
5173 passes to A or B depending on greediness and the current count.
5174
5175 For example, if matching against the string a1a2a3b (where the aN are
5176 substrings that match /A/), then the match progresses as follows: (the
5177 pushed states are interspersed with the bits of strings matched so far):
5178
5179     <CURLYX cnt=-1>
5180     <CURLYX cnt=0><WHILEM>
5181     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5182     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5183     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5184     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5185
5186 (Contrast this with something like CURLYM, which maintains only a single
5187 backtrack state:
5188
5189     <CURLYM cnt=0> a1
5190     a1 <CURLYM cnt=1> a2
5191     a1 a2 <CURLYM cnt=2> a3
5192     a1 a2 a3 <CURLYM cnt=3> b
5193 )
5194
5195 Each WHILEM state block marks a point to backtrack to upon partial failure
5196 of A or B, and also contains some minor state data related to that
5197 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5198 overall state, such as the count, and pointers to the A and B ops.
5199
5200 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5201 must always point to the *current* CURLYX block, the rules are:
5202
5203 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5204 and set cur_curlyx to point the new block.
5205
5206 When popping the CURLYX block after a successful or unsuccessful match,
5207 restore the previous cur_curlyx.
5208
5209 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5210 to the outer one saved in the CURLYX block.
5211
5212 When popping the WHILEM block after a successful or unsuccessful B match,
5213 restore the previous cur_curlyx.
5214
5215 Here's an example for the pattern (AI* BI)*BO
5216 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5217
5218 cur_
5219 curlyx backtrack stack
5220 ------ ---------------
5221 NULL
5222 CO     <CO prev=NULL> <WO>
5223 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5224 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5225 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5226
5227 At this point the pattern succeeds, and we work back down the stack to
5228 clean up, restoring as we go:
5229
5230 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5231 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5232 CO     <CO prev=NULL> <WO>
5233 NULL
5234
5235 *******************************************************************/
5236
5237 #define ST st->u.curlyx
5238
5239         case CURLYX:    /* start of /A*B/  (for complex A) */
5240         {
5241             /* No need to save/restore up to this paren */
5242             I32 parenfloor = scan->flags;
5243
5244             assert(next); /* keep Coverity happy */
5245             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5246                 next += ARG(next);
5247
5248             /* XXXX Probably it is better to teach regpush to support
5249                parenfloor > PL_regsize... */
5250             if (parenfloor > (I32)rex->lastparen)
5251                 parenfloor = rex->lastparen; /* Pessimization... */
5252
5253             ST.prev_curlyx= cur_curlyx;
5254             cur_curlyx = st;
5255             ST.cp = PL_savestack_ix;
5256
5257             /* these fields contain the state of the current curly.
5258              * they are accessed by subsequent WHILEMs */
5259             ST.parenfloor = parenfloor;
5260             ST.me = scan;
5261             ST.B = next;
5262             ST.minmod = minmod;
5263             minmod = 0;
5264             ST.count = -1;      /* this will be updated by WHILEM */
5265             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5266
5267             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5268             assert(0); /* NOTREACHED */
5269         }
5270
5271         case CURLYX_end: /* just finished matching all of A*B */
5272             cur_curlyx = ST.prev_curlyx;
5273             sayYES;
5274             assert(0); /* NOTREACHED */
5275
5276         case CURLYX_end_fail: /* just failed to match all of A*B */
5277             regcpblow(ST.cp);
5278             cur_curlyx = ST.prev_curlyx;
5279             sayNO;
5280             assert(0); /* NOTREACHED */
5281
5282
5283 #undef ST
5284 #define ST st->u.whilem
5285
5286         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5287         {
5288             /* see the discussion above about CURLYX/WHILEM */
5289             I32 n;
5290             int min = ARG1(cur_curlyx->u.curlyx.me);
5291             int max = ARG2(cur_curlyx->u.curlyx.me);
5292             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5293
5294             assert(cur_curlyx); /* keep Coverity happy */
5295             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5296             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5297             ST.cache_offset = 0;
5298             ST.cache_mask = 0;
5299
5300
5301             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5302                   "%*s  whilem: matched %ld out of %d..%d\n",
5303                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5304             );
5305
5306             /* First just match a string of min A's. */
5307
5308             if (n < min) {
5309                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5310                 cur_curlyx->u.curlyx.lastloc = locinput;
5311                 REGCP_SET(ST.lastcp);
5312
5313                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5314                 assert(0); /* NOTREACHED */
5315             }
5316
5317             /* If degenerate A matches "", assume A done. */
5318
5319             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5320                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5321                    "%*s  whilem: empty match detected, trying continuation...\n",
5322                    REPORT_CODE_OFF+depth*2, "")
5323                 );
5324                 goto do_whilem_B_max;
5325             }
5326
5327             /* super-linear cache processing */
5328
5329             if (scan->flags) {
5330
5331                 if (!PL_reg_maxiter) {
5332                     /* start the countdown: Postpone detection until we
5333                      * know the match is not *that* much linear. */
5334                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5335                     /* possible overflow for long strings and many CURLYX's */
5336                     if (PL_reg_maxiter < 0)
5337                         PL_reg_maxiter = I32_MAX;
5338                     PL_reg_leftiter = PL_reg_maxiter;
5339                 }
5340
5341                 if (PL_reg_leftiter-- == 0) {
5342                     /* initialise cache */
5343                     const I32 size = (PL_reg_maxiter + 7)/8;
5344                     if (PL_reg_poscache) {
5345                         if ((I32)PL_reg_poscache_size < size) {
5346                             Renew(PL_reg_poscache, size, char);
5347                             PL_reg_poscache_size = size;
5348                         }
5349                         Zero(PL_reg_poscache, size, char);
5350                     }
5351                     else {
5352                         PL_reg_poscache_size = size;
5353                         Newxz(PL_reg_poscache, size, char);
5354                     }
5355                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5356       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5357                               PL_colors[4], PL_colors[5])
5358                     );
5359                 }
5360
5361                 if (PL_reg_leftiter < 0) {
5362                     /* have we already failed at this position? */
5363                     I32 offset, mask;
5364                     offset  = (scan->flags & 0xf) - 1
5365                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5366                     mask    = 1 << (offset % 8);
5367                     offset /= 8;
5368                     if (PL_reg_poscache[offset] & mask) {
5369                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5370                             "%*s  whilem: (cache) already tried at this position...\n",
5371                             REPORT_CODE_OFF+depth*2, "")
5372                         );
5373                         sayNO; /* cache records failure */
5374                     }
5375                     ST.cache_offset = offset;
5376                     ST.cache_mask   = mask;
5377                 }
5378             }
5379
5380             /* Prefer B over A for minimal matching. */
5381
5382             if (cur_curlyx->u.curlyx.minmod) {
5383                 ST.save_curlyx = cur_curlyx;
5384                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5385                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5386                 REGCP_SET(ST.lastcp);
5387                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5388                                     locinput);
5389                 assert(0); /* NOTREACHED */
5390             }
5391
5392             /* Prefer A over B for maximal matching. */
5393
5394             if (n < max) { /* More greed allowed? */
5395                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5396                 cur_curlyx->u.curlyx.lastloc = locinput;
5397                 REGCP_SET(ST.lastcp);
5398                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5399                 assert(0); /* NOTREACHED */
5400             }
5401             goto do_whilem_B_max;
5402         }
5403         assert(0); /* NOTREACHED */
5404
5405         case WHILEM_B_min: /* just matched B in a minimal match */
5406         case WHILEM_B_max: /* just matched B in a maximal match */
5407             cur_curlyx = ST.save_curlyx;
5408             sayYES;
5409             assert(0); /* NOTREACHED */
5410
5411         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5412             cur_curlyx = ST.save_curlyx;
5413             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5414             cur_curlyx->u.curlyx.count--;
5415             CACHEsayNO;
5416             assert(0); /* NOTREACHED */
5417
5418         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5419             /* FALL THROUGH */
5420         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5421             REGCP_UNWIND(ST.lastcp);
5422             regcppop(rex);
5423             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5424             cur_curlyx->u.curlyx.count--;
5425             CACHEsayNO;
5426             assert(0); /* NOTREACHED */
5427
5428         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5429             REGCP_UNWIND(ST.lastcp);
5430             regcppop(rex);      /* Restore some previous $<digit>s? */
5431             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5432                 "%*s  whilem: failed, trying continuation...\n",
5433                 REPORT_CODE_OFF+depth*2, "")
5434             );
5435           do_whilem_B_max:
5436             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5437                 && ckWARN(WARN_REGEXP)
5438                 && !(PL_reg_flags & RF_warned))
5439             {
5440                 PL_reg_flags |= RF_warned;
5441                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5442                      "Complex regular subexpression recursion limit (%d) "
5443                      "exceeded",
5444                      REG_INFTY - 1);
5445             }
5446
5447             /* now try B */
5448             ST.save_curlyx = cur_curlyx;
5449             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5450             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5451                                 locinput);
5452             assert(0); /* NOTREACHED */
5453
5454         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5455             cur_curlyx = ST.save_curlyx;
5456             REGCP_UNWIND(ST.lastcp);
5457             regcppop(rex);
5458
5459             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5460                 /* Maximum greed exceeded */
5461                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5462                     && ckWARN(WARN_REGEXP)
5463                     && !(PL_reg_flags & RF_warned))
5464                 {
5465                     PL_reg_flags |= RF_warned;
5466                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5467                         "Complex regular subexpression recursion "
5468                         "limit (%d) exceeded",
5469                         REG_INFTY - 1);
5470                 }
5471                 cur_curlyx->u.curlyx.count--;
5472                 CACHEsayNO;
5473             }
5474
5475             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5476                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5477             );
5478             /* Try grabbing another A and see if it helps. */
5479             cur_curlyx->u.curlyx.lastloc = locinput;
5480             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5481             REGCP_SET(ST.lastcp);
5482             PUSH_STATE_GOTO(WHILEM_A_min,
5483                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5484                 locinput);
5485             assert(0); /* NOTREACHED */
5486
5487 #undef  ST
5488 #define ST st->u.branch
5489
5490         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5491             next = scan + ARG(scan);
5492             if (next == scan)
5493                 next = NULL;
5494             scan = NEXTOPER(scan);
5495             /* FALL THROUGH */
5496
5497         case BRANCH:        /*  /(...|A|...)/ */
5498             scan = NEXTOPER(scan); /* scan now points to inner node */
5499             ST.lastparen = rex->lastparen;
5500             ST.lastcloseparen = rex->lastcloseparen;
5501             ST.next_branch = next;
5502             REGCP_SET(ST.cp);
5503
5504             /* Now go into the branch */
5505             if (has_cutgroup) {
5506                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5507             } else {
5508                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5509             }
5510             assert(0); /* NOTREACHED */
5511
5512         case CUTGROUP:  /*  /(*THEN)/  */
5513             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5514                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5515             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5516             assert(0); /* NOTREACHED */
5517
5518         case CUTGROUP_next_fail:
5519             do_cutgroup = 1;
5520             no_final = 1;
5521             if (st->u.mark.mark_name)
5522                 sv_commit = st->u.mark.mark_name;
5523             sayNO;
5524             assert(0); /* NOTREACHED */
5525
5526         case BRANCH_next:
5527             sayYES;
5528             assert(0); /* NOTREACHED */
5529
5530         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5531             if (do_cutgroup) {
5532                 do_cutgroup = 0;
5533                 no_final = 0;
5534             }
5535             REGCP_UNWIND(ST.cp);
5536             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5537             scan = ST.next_branch;
5538             /* no more branches? */
5539             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5540                 DEBUG_EXECUTE_r({
5541                     PerlIO_printf( Perl_debug_log,
5542                         "%*s  %sBRANCH failed...%s\n",
5543                         REPORT_CODE_OFF+depth*2, "",
5544                         PL_colors[4],
5545                         PL_colors[5] );
5546                 });
5547                 sayNO_SILENT;
5548             }
5549             continue; /* execute next BRANCH[J] op */
5550             assert(0); /* NOTREACHED */
5551
5552         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5553             minmod = 1;
5554             break;
5555
5556 #undef  ST
5557 #define ST st->u.curlym
5558
5559         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5560
5561             /* This is an optimisation of CURLYX that enables us to push
5562              * only a single backtracking state, no matter how many matches
5563              * there are in {m,n}. It relies on the pattern being constant
5564              * length, with no parens to influence future backrefs
5565              */
5566
5567             ST.me = scan;
5568             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5569
5570             ST.lastparen      = rex->lastparen;
5571             ST.lastcloseparen = rex->lastcloseparen;
5572
5573             /* if paren positive, emulate an OPEN/CLOSE around A */
5574             if (ST.me->flags) {
5575                 U32 paren = ST.me->flags;
5576                 if (paren > PL_regsize)
5577                     PL_regsize = paren;
5578                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5579             }
5580             ST.A = scan;
5581             ST.B = next;
5582             ST.alen = 0;
5583             ST.count = 0;
5584             ST.minmod = minmod;
5585             minmod = 0;
5586             ST.c1 = CHRTEST_UNINIT;
5587             REGCP_SET(ST.cp);
5588
5589             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5590                 goto curlym_do_B;
5591
5592           curlym_do_A: /* execute the A in /A{m,n}B/  */
5593             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5594             assert(0); /* NOTREACHED */
5595
5596         case CURLYM_A: /* we've just matched an A */
5597             ST.count++;
5598             /* after first match, determine A's length: u.curlym.alen */
5599             if (ST.count == 1) {
5600                 if (PL_reg_match_utf8) {
5601                     char *s = st->locinput;
5602                     while (s < locinput) {
5603                         ST.alen++;
5604                         s += UTF8SKIP(s);
5605                     }
5606                 }
5607                 else {
5608                     ST.alen = locinput - st->locinput;
5609                 }
5610                 if (ST.alen == 0)
5611                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5612             }
5613             DEBUG_EXECUTE_r(
5614                 PerlIO_printf(Perl_debug_log,
5615                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5616                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5617                           (IV) ST.count, (IV)ST.alen)
5618             );
5619
5620             if (cur_eval && cur_eval->u.eval.close_paren &&
5621                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5622                 goto fake_end;
5623
5624             {
5625                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5626                 if ( max == REG_INFTY || ST.count < max )
5627                     goto curlym_do_A; /* try to match another A */
5628             }
5629             goto curlym_do_B; /* try to match B */
5630
5631         case CURLYM_A_fail: /* just failed to match an A */
5632             REGCP_UNWIND(ST.cp);
5633
5634             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5635                 || (cur_eval && cur_eval->u.eval.close_paren &&
5636                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5637                 sayNO;
5638
5639           curlym_do_B: /* execute the B in /A{m,n}B/  */
5640             if (ST.c1 == CHRTEST_UNINIT) {
5641                 /* calculate c1 and c2 for possible match of 1st char
5642                  * following curly */
5643                 ST.c1 = ST.c2 = CHRTEST_VOID;
5644                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5645                     regnode *text_node = ST.B;
5646                     if (! HAS_TEXT(text_node))
5647                         FIND_NEXT_IMPT(text_node);
5648                     /* this used to be
5649
5650                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5651
5652                         But the former is redundant in light of the latter.
5653
5654                         if this changes back then the macro for
5655                         IS_TEXT and friends need to change.
5656                      */
5657                     if (PL_regkind[OP(text_node)] == EXACT) {
5658                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5659                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5660                         {
5661                             sayNO;
5662                         }
5663                     }
5664                 }
5665             }
5666
5667             DEBUG_EXECUTE_r(
5668                 PerlIO_printf(Perl_debug_log,
5669                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5670                     (int)(REPORT_CODE_OFF+(depth*2)),
5671                     "", (IV)ST.count)
5672                 );
5673             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5674                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5675                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5676                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5677                     {
5678                         /* simulate B failing */
5679                         DEBUG_OPTIMISE_r(
5680                             PerlIO_printf(Perl_debug_log,
5681                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5682                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5683                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5684                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5685                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5686                         );
5687                         state_num = CURLYM_B_fail;
5688                         goto reenter_switch;
5689                     }
5690                 }
5691                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5692                     /* simulate B failing */
5693                     DEBUG_OPTIMISE_r(
5694                         PerlIO_printf(Perl_debug_log,
5695                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5696                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5697                             (int) nextchr, ST.c1, ST.c2)
5698                     );
5699                     state_num = CURLYM_B_fail;
5700                     goto reenter_switch;
5701                 }
5702             }
5703
5704             if (ST.me->flags) {
5705                 /* emulate CLOSE: mark current A as captured */
5706                 I32 paren = ST.me->flags;
5707                 if (ST.count) {
5708                     rex->offs[paren].start
5709                         = HOPc(locinput, -ST.alen) - PL_bostr;
5710                     rex->offs[paren].end = locinput - PL_bostr;
5711                     if ((U32)paren > rex->lastparen)
5712                         rex->lastparen = paren;
5713                     rex->lastcloseparen = paren;
5714                 }
5715                 else
5716                     rex->offs[paren].end = -1;
5717                 if (cur_eval && cur_eval->u.eval.close_paren &&
5718                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5719                 {
5720                     if (ST.count)
5721                         goto fake_end;
5722                     else
5723                         sayNO;
5724                 }
5725             }
5726
5727             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5728             assert(0); /* NOTREACHED */
5729
5730         case CURLYM_B_fail: /* just failed to match a B */
5731             REGCP_UNWIND(ST.cp);
5732             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5733             if (ST.minmod) {
5734                 I32 max = ARG2(ST.me);
5735                 if (max != REG_INFTY && ST.count == max)
5736                     sayNO;
5737                 goto curlym_do_A; /* try to match a further A */
5738             }
5739             /* backtrack one A */
5740             if (ST.count == ARG1(ST.me) /* min */)
5741                 sayNO;
5742             ST.count--;
5743             SET_locinput(HOPc(locinput, -ST.alen));
5744             goto curlym_do_B; /* try to match B */
5745
5746 #undef ST
5747 #define ST st->u.curly
5748
5749 #define CURLY_SETPAREN(paren, success) \
5750     if (paren) { \
5751         if (success) { \
5752             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5753             rex->offs[paren].end = locinput - PL_bostr; \
5754             if (paren > rex->lastparen) \
5755                 rex->lastparen = paren; \
5756             rex->lastcloseparen = paren; \
5757         } \
5758         else { \
5759             rex->offs[paren].end = -1; \
5760             rex->lastparen      = ST.lastparen; \
5761             rex->lastcloseparen = ST.lastcloseparen; \
5762         } \
5763     }
5764
5765         case STAR:              /*  /A*B/ where A is width 1 char */
5766             ST.paren = 0;
5767             ST.min = 0;
5768             ST.max = REG_INFTY;
5769             scan = NEXTOPER(scan);
5770             goto repeat;
5771
5772         case PLUS:              /*  /A+B/ where A is width 1 char */
5773             ST.paren = 0;
5774             ST.min = 1;
5775             ST.max = REG_INFTY;
5776             scan = NEXTOPER(scan);
5777             goto repeat;
5778
5779         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5780             ST.paren = scan->flags;     /* Which paren to set */
5781             ST.lastparen      = rex->lastparen;
5782             ST.lastcloseparen = rex->lastcloseparen;
5783             if (ST.paren > PL_regsize)
5784                 PL_regsize = ST.paren;
5785             ST.min = ARG1(scan);  /* min to match */
5786             ST.max = ARG2(scan);  /* max to match */
5787             if (cur_eval && cur_eval->u.eval.close_paren &&
5788                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5789                 ST.min=1;
5790                 ST.max=1;
5791             }
5792             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5793             goto repeat;
5794
5795         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5796             ST.paren = 0;
5797             ST.min = ARG1(scan);  /* min to match */
5798             ST.max = ARG2(scan);  /* max to match */
5799             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5800           repeat:
5801             /*
5802             * Lookahead to avoid useless match attempts
5803             * when we know what character comes next.
5804             *
5805             * Used to only do .*x and .*?x, but now it allows
5806             * for )'s, ('s and (?{ ... })'s to be in the way
5807             * of the quantifier and the EXACT-like node.  -- japhy
5808             */
5809
5810             assert(ST.min <= ST.max);
5811             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5812                 ST.c1 = ST.c2 = CHRTEST_VOID;
5813             }
5814             else {
5815                 regnode *text_node = next;
5816
5817                 if (! HAS_TEXT(text_node))
5818                     FIND_NEXT_IMPT(text_node);
5819
5820                 if (! HAS_TEXT(text_node))
5821                     ST.c1 = ST.c2 = CHRTEST_VOID;
5822                 else {
5823                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5824                         ST.c1 = ST.c2 = CHRTEST_VOID;
5825                     }
5826                     else {
5827
5828                     /*  Currently we only get here when
5829
5830                         PL_rekind[OP(text_node)] == EXACT
5831
5832                         if this changes back then the macro for IS_TEXT and
5833                         friends need to change. */
5834                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5835                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5836                         {
5837                             sayNO;
5838                         }
5839                     }
5840                 }
5841             }
5842
5843             ST.A = scan;
5844             ST.B = next;
5845             if (minmod) {
5846                 char *li = locinput;
5847                 minmod = 0;
5848                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5849                     sayNO;
5850                 SET_locinput(li);
5851                 ST.count = ST.min;
5852                 REGCP_SET(ST.cp);
5853                 if (ST.c1 == CHRTEST_VOID)
5854                     goto curly_try_B_min;
5855
5856                 ST.oldloc = locinput;
5857
5858                 /* set ST.maxpos to the furthest point along the
5859                  * string that could possibly match */
5860                 if  (ST.max == REG_INFTY) {
5861                     ST.maxpos = PL_regeol - 1;
5862                     if (utf8_target)
5863                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5864                             ST.maxpos--;
5865                 }
5866                 else if (utf8_target) {
5867                     int m = ST.max - ST.min;
5868                     for (ST.maxpos = locinput;
5869                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5870                         ST.maxpos += UTF8SKIP(ST.maxpos);
5871                 }
5872                 else {
5873                     ST.maxpos = locinput + ST.max - ST.min;
5874                     if (ST.maxpos >= PL_regeol)
5875                         ST.maxpos = PL_regeol - 1;
5876                 }
5877                 goto curly_try_B_min_known;
5878
5879             }
5880             else {
5881                 /* avoid taking address of locinput, so it can remain
5882                  * a register var */
5883                 char *li = locinput;
5884                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5885                 if (ST.count < ST.min)
5886                     sayNO;
5887                 SET_locinput(li);
5888                 if ((ST.count > ST.min)
5889                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5890                 {
5891                     /* A{m,n} must come at the end of the string, there's
5892                      * no point in backing off ... */
5893                     ST.min = ST.count;
5894                     /* ...except that $ and \Z can match before *and* after
5895                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5896                        We may back off by one in this case. */
5897                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5898                         ST.min--;
5899                 }
5900                 REGCP_SET(ST.cp);
5901                 goto curly_try_B_max;
5902             }
5903             assert(0); /* NOTREACHED */
5904
5905
5906         case CURLY_B_min_known_fail:
5907             /* failed to find B in a non-greedy match where c1,c2 valid */
5908
5909             REGCP_UNWIND(ST.cp);
5910             if (ST.paren) {
5911                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5912             }
5913             /* Couldn't or didn't -- move forward. */
5914             ST.oldloc = locinput;
5915             if (utf8_target)
5916                 locinput += UTF8SKIP(locinput);
5917             else
5918                 locinput++;
5919             ST.count++;
5920           curly_try_B_min_known:
5921              /* find the next place where 'B' could work, then call B */
5922             {
5923                 int n;
5924                 if (utf8_target) {
5925                     n = (ST.oldloc == locinput) ? 0 : 1;
5926                     if (ST.c1 == ST.c2) {
5927                         /* set n to utf8_distance(oldloc, locinput) */
5928                         while (locinput <= ST.maxpos
5929                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
5930                         {
5931                             locinput += UTF8SKIP(locinput);
5932                             n++;
5933                         }
5934                     }
5935                     else {
5936                         /* set n to utf8_distance(oldloc, locinput) */
5937                         while (locinput <= ST.maxpos
5938                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5939                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5940                         {
5941                             locinput += UTF8SKIP(locinput);
5942                             n++;
5943                         }
5944                     }
5945                 }
5946                 else {  /* Not utf8_target */
5947                     if (ST.c1 == ST.c2) {
5948                         while (locinput <= ST.maxpos &&
5949                                UCHARAT(locinput) != ST.c1)
5950                             locinput++;
5951                     }
5952                     else {
5953                         while (locinput <= ST.maxpos
5954                                && UCHARAT(locinput) != ST.c1
5955                                && UCHARAT(locinput) != ST.c2)
5956                             locinput++;
5957                     }
5958                     n = locinput - ST.oldloc;
5959                 }
5960                 if (locinput > ST.maxpos)
5961                     sayNO;
5962                 if (n) {
5963                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5964                      * at b; check that everything between oldloc and
5965                      * locinput matches */
5966                     char *li = ST.oldloc;
5967                     ST.count += n;
5968                     if (regrepeat(rex, &li, ST.A, n, depth) < n)
5969                         sayNO;
5970                     assert(n == REG_INFTY || locinput == li);
5971                 }
5972                 CURLY_SETPAREN(ST.paren, ST.count);
5973                 if (cur_eval && cur_eval->u.eval.close_paren &&
5974                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5975                     goto fake_end;
5976                 }
5977                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
5978             }
5979             assert(0); /* NOTREACHED */
5980
5981
5982         case CURLY_B_min_fail:
5983             /* failed to find B in a non-greedy match where c1,c2 invalid */
5984
5985             REGCP_UNWIND(ST.cp);
5986             if (ST.paren) {
5987                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5988             }
5989             /* failed -- move forward one */
5990             {
5991                 char *li = locinput;
5992                 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
5993                     sayNO;
5994                 }
5995                 locinput = li;
5996             }
5997             {
5998                 ST.count++;
5999                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6000                         ST.count > 0)) /* count overflow ? */
6001                 {
6002                   curly_try_B_min:
6003                     CURLY_SETPAREN(ST.paren, ST.count);
6004                     if (cur_eval && cur_eval->u.eval.close_paren &&
6005                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6006                         goto fake_end;
6007                     }
6008                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6009                 }
6010             }
6011             sayNO;
6012             assert(0); /* NOTREACHED */
6013
6014
6015         curly_try_B_max:
6016             /* a successful greedy match: now try to match B */
6017             if (cur_eval && cur_eval->u.eval.close_paren &&
6018                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6019                 goto fake_end;
6020             }
6021             {
6022                 bool could_match = locinput < PL_regeol;
6023
6024                 /* If it could work, try it. */
6025                 if (ST.c1 != CHRTEST_VOID && could_match) {
6026                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6027                     {
6028                         could_match = memEQ(locinput,
6029                                             ST.c1_utf8,
6030                                             UTF8SKIP(locinput))
6031                                     || memEQ(locinput,
6032                                              ST.c2_utf8,
6033                                              UTF8SKIP(locinput));
6034                     }
6035                     else {
6036                         could_match = UCHARAT(locinput) == ST.c1
6037                                       || UCHARAT(locinput) == ST.c2;
6038                     }
6039                 }
6040                 if (ST.c1 == CHRTEST_VOID || could_match) {
6041                     CURLY_SETPAREN(ST.paren, ST.count);
6042                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6043                     assert(0); /* NOTREACHED */
6044                 }
6045             }
6046             /* FALL THROUGH */
6047
6048         case CURLY_B_max_fail:
6049             /* failed to find B in a greedy match */
6050
6051             REGCP_UNWIND(ST.cp);
6052             if (ST.paren) {
6053                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6054             }
6055             /*  back up. */
6056             if (--ST.count < ST.min)
6057                 sayNO;
6058             locinput = HOPc(locinput, -1);
6059             goto curly_try_B_max;
6060
6061 #undef ST
6062
6063         case END: /*  last op of main pattern  */
6064             fake_end:
6065             if (cur_eval) {
6066                 /* we've just finished A in /(??{A})B/; now continue with B */
6067                 st->u.eval.toggle_reg_flags
6068                             = cur_eval->u.eval.toggle_reg_flags;
6069                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6070
6071                 st->u.eval.prev_rex = rex_sv;           /* inner */
6072                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
6073                 rex_sv = cur_eval->u.eval.prev_rex;
6074                 SET_reg_curpm(rex_sv);
6075                 rex = (struct regexp *)SvANY(rex_sv);
6076                 rexi = RXi_GET(rex);
6077                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6078
6079                 REGCP_SET(st->u.eval.lastcp);
6080
6081                 /* Restore parens of the outer rex without popping the
6082                  * savestack */
6083                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
6084
6085                 st->u.eval.prev_eval = cur_eval;
6086                 cur_eval = cur_eval->u.eval.prev_eval;
6087                 DEBUG_EXECUTE_r(
6088                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6089                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6090                 if ( nochange_depth )
6091                     nochange_depth--;
6092
6093                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6094                                     locinput); /* match B */
6095             }
6096
6097             if (locinput < reginfo->till) {
6098                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6099                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6100                                       PL_colors[4],
6101                                       (long)(locinput - PL_reg_starttry),
6102                                       (long)(reginfo->till - PL_reg_starttry),
6103                                       PL_colors[5]));
6104
6105                 sayNO_SILENT;           /* Cannot match: too short. */
6106             }
6107             sayYES;                     /* Success! */
6108
6109         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6110             DEBUG_EXECUTE_r(
6111             PerlIO_printf(Perl_debug_log,
6112                 "%*s  %ssubpattern success...%s\n",
6113                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6114             sayYES;                     /* Success! */
6115
6116 #undef  ST
6117 #define ST st->u.ifmatch
6118
6119         {
6120             char *newstart;
6121
6122         case SUSPEND:   /* (?>A) */
6123             ST.wanted = 1;
6124             newstart = locinput;
6125             goto do_ifmatch;
6126
6127         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6128             ST.wanted = 0;
6129             goto ifmatch_trivial_fail_test;
6130
6131         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6132             ST.wanted = 1;
6133           ifmatch_trivial_fail_test:
6134             if (scan->flags) {
6135                 char * const s = HOPBACKc(locinput, scan->flags);
6136                 if (!s) {
6137                     /* trivial fail */
6138                     if (logical) {
6139                         logical = 0;
6140                         sw = 1 - cBOOL(ST.wanted);
6141                     }
6142                     else if (ST.wanted)
6143                         sayNO;
6144                     next = scan + ARG(scan);
6145                     if (next == scan)
6146                         next = NULL;
6147                     break;
6148                 }
6149                 newstart = s;
6150             }
6151             else
6152                 newstart = locinput;
6153
6154           do_ifmatch:
6155             ST.me = scan;
6156             ST.logical = logical;
6157             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6158
6159             /* execute body of (?...A) */
6160             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6161             assert(0); /* NOTREACHED */
6162         }
6163
6164         case IFMATCH_A_fail: /* body of (?...A) failed */
6165             ST.wanted = !ST.wanted;
6166             /* FALL THROUGH */
6167
6168         case IFMATCH_A: /* body of (?...A) succeeded */
6169             if (ST.logical) {
6170                 sw = cBOOL(ST.wanted);
6171             }
6172             else if (!ST.wanted)
6173                 sayNO;
6174
6175             if (OP(ST.me) != SUSPEND) {
6176                 /* restore old position except for (?>...) */
6177                 locinput = st->locinput;
6178             }
6179             scan = ST.me + ARG(ST.me);
6180             if (scan == ST.me)
6181                 scan = NULL;
6182             continue; /* execute B */
6183
6184 #undef ST
6185
6186         case LONGJMP: /*  alternative with many branches compiles to
6187                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6188             next = scan + ARG(scan);
6189             if (next == scan)
6190                 next = NULL;
6191             break;
6192
6193         case COMMIT:  /*  (*COMMIT)  */
6194             reginfo->cutpoint = PL_regeol;
6195             /* FALLTHROUGH */
6196
6197         case PRUNE:   /*  (*PRUNE)   */
6198             if (!scan->flags)
6199                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6200             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6201             assert(0); /* NOTREACHED */
6202
6203         case COMMIT_next_fail:
6204             no_final = 1;
6205             /* FALLTHROUGH */
6206
6207         case OPFAIL:   /* (*FAIL)  */
6208             sayNO;
6209             assert(0); /* NOTREACHED */
6210
6211 #define ST st->u.mark
6212         case MARKPOINT: /*  (*MARK:foo)  */
6213             ST.prev_mark = mark_state;
6214             ST.mark_name = sv_commit = sv_yes_mark
6215                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6216             mark_state = st;
6217             ST.mark_loc = locinput;
6218             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6219             assert(0); /* NOTREACHED */
6220
6221         case MARKPOINT_next:
6222             mark_state = ST.prev_mark;
6223             sayYES;
6224             assert(0); /* NOTREACHED */
6225
6226         case MARKPOINT_next_fail:
6227             if (popmark && sv_eq(ST.mark_name,popmark))
6228             {
6229                 if (ST.mark_loc > startpoint)
6230                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6231                 popmark = NULL; /* we found our mark */
6232                 sv_commit = ST.mark_name;
6233
6234                 DEBUG_EXECUTE_r({
6235                         PerlIO_printf(Perl_debug_log,
6236                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6237                             REPORT_CODE_OFF+depth*2, "",
6238                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6239                 });
6240             }
6241             mark_state = ST.prev_mark;
6242             sv_yes_mark = mark_state ?
6243                 mark_state->u.mark.mark_name : NULL;
6244             sayNO;
6245             assert(0); /* NOTREACHED */
6246
6247         case SKIP:  /*  (*SKIP)  */
6248             if (scan->flags) {
6249                 /* (*SKIP) : if we fail we cut here*/
6250                 ST.mark_name = NULL;
6251                 ST.mark_loc = locinput;
6252                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6253             } else {
6254                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6255                    otherwise do nothing.  Meaning we need to scan
6256                  */
6257                 regmatch_state *cur = mark_state;
6258                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6259
6260                 while (cur) {
6261                     if ( sv_eq( cur->u.mark.mark_name,
6262                                 find ) )
6263                     {
6264                         ST.mark_name = find;
6265                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6266                     }
6267                     cur = cur->u.mark.prev_mark;
6268                 }
6269             }
6270             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6271             break;
6272
6273         case SKIP_next_fail:
6274             if (ST.mark_name) {
6275                 /* (*CUT:NAME) - Set up to search for the name as we
6276                    collapse the stack*/
6277                 popmark = ST.mark_name;
6278             } else {
6279                 /* (*CUT) - No name, we cut here.*/
6280                 if (ST.mark_loc > startpoint)
6281                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6282                 /* but we set sv_commit to latest mark_name if there
6283                    is one so they can test to see how things lead to this
6284                    cut */
6285                 if (mark_state)
6286                     sv_commit=mark_state->u.mark.mark_name;
6287             }
6288             no_final = 1;
6289             sayNO;
6290             assert(0); /* NOTREACHED */
6291 #undef ST
6292
6293         case LNBREAK: /* \R */
6294             if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6295                 locinput += n;
6296             } else
6297                 sayNO;
6298             break;
6299
6300 #define CASE_CLASS(nAmE)                              \
6301         case nAmE:                                    \
6302             if (NEXTCHR_IS_EOS)                       \
6303                 sayNO;                                \
6304             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6305                 locinput += n;                        \
6306             } else                                    \
6307                 sayNO;                                \
6308             break;                                    \
6309         case N##nAmE:                                 \
6310             if (NEXTCHR_IS_EOS)                       \
6311                 sayNO;                                \
6312             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6313                 sayNO;                                \
6314             } else {                                  \
6315                 locinput += UTF8SKIP(locinput);       \
6316             }                                         \
6317             break
6318
6319         CASE_CLASS(VERTWS);  /*  \v \V  */
6320         CASE_CLASS(HORIZWS); /*  \h \H  */
6321 #undef CASE_CLASS
6322
6323         default:
6324             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6325                           PTR2UV(scan), OP(scan));
6326             Perl_croak(aTHX_ "regexp memory corruption");
6327
6328         /* this is a point to jump to in order to increment
6329          * locinput by one character */
6330         increment_locinput:
6331             assert(!NEXTCHR_IS_EOS);
6332             if (utf8_target) {
6333                 locinput += PL_utf8skip[nextchr];
6334                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6335                 if (locinput > PL_regeol)
6336                     sayNO;
6337             }
6338             else
6339                 locinput++;
6340             break;
6341
6342         } /* end switch */
6343
6344         /* switch break jumps here */
6345         scan = next; /* prepare to execute the next op and ... */
6346         continue;    /* ... jump back to the top, reusing st */
6347         assert(0); /* NOTREACHED */
6348
6349       push_yes_state:
6350         /* push a state that backtracks on success */
6351         st->u.yes.prev_yes_state = yes_state;
6352         yes_state = st;
6353         /* FALL THROUGH */
6354       push_state:
6355         /* push a new regex state, then continue at scan  */
6356         {
6357             regmatch_state *newst;
6358
6359             DEBUG_STACK_r({
6360                 regmatch_state *cur = st;
6361                 regmatch_state *curyes = yes_state;
6362                 int curd = depth;
6363                 regmatch_slab *slab = PL_regmatch_slab;
6364                 for (;curd > -1;cur--,curd--) {
6365                     if (cur < SLAB_FIRST(slab)) {
6366                         slab = slab->prev;
6367                         cur = SLAB_LAST(slab);
6368                     }
6369                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6370                         REPORT_CODE_OFF + 2 + depth * 2,"",
6371                         curd, PL_reg_name[cur->resume_state],
6372                         (curyes == cur) ? "yes" : ""
6373                     );
6374                     if (curyes == cur)
6375                         curyes = cur->u.yes.prev_yes_state;
6376                 }
6377             } else
6378                 DEBUG_STATE_pp("push")
6379             );
6380             depth++;
6381             st->locinput = locinput;
6382             newst = st+1;
6383             if (newst >  SLAB_LAST(PL_regmatch_slab))
6384                 newst = S_push_slab(aTHX);
6385             PL_regmatch_state = newst;
6386
6387             locinput = pushinput;
6388             st = newst;
6389             continue;
6390             assert(0); /* NOTREACHED */
6391         }
6392     }
6393
6394     /*
6395     * We get here only if there's trouble -- normally "case END" is
6396     * the terminating point.
6397     */
6398     Perl_croak(aTHX_ "corrupted regexp pointers");
6399     /*NOTREACHED*/
6400     sayNO;
6401
6402 yes:
6403     if (yes_state) {
6404         /* we have successfully completed a subexpression, but we must now
6405          * pop to the state marked by yes_state and continue from there */
6406         assert(st != yes_state);
6407 #ifdef DEBUGGING
6408         while (st != yes_state) {
6409             st--;
6410             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6411                 PL_regmatch_slab = PL_regmatch_slab->prev;
6412                 st = SLAB_LAST(PL_regmatch_slab);
6413             }
6414             DEBUG_STATE_r({
6415                 if (no_final) {
6416                     DEBUG_STATE_pp("pop (no final)");
6417                 } else {
6418                     DEBUG_STATE_pp("pop (yes)");
6419                 }
6420             });
6421             depth--;
6422         }
6423 #else
6424         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6425             || yes_state > SLAB_LAST(PL_regmatch_slab))
6426         {
6427             /* not in this slab, pop slab */
6428             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6429             PL_regmatch_slab = PL_regmatch_slab->prev;
6430             st = SLAB_LAST(PL_regmatch_slab);
6431         }
6432         depth -= (st - yes_state);
6433 #endif
6434         st = yes_state;
6435         yes_state = st->u.yes.prev_yes_state;
6436         PL_regmatch_state = st;
6437
6438         if (no_final)
6439             locinput= st->locinput;
6440         state_num = st->resume_state + no_final;
6441         goto reenter_switch;
6442     }
6443
6444     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6445                           PL_colors[4], PL_colors[5]));
6446
6447     if (PL_reg_state.re_state_eval_setup_done) {
6448         /* each successfully executed (?{...}) block does the equivalent of
6449          *   local $^R = do {...}
6450          * When popping the save stack, all these locals would be undone;
6451          * bypass this by setting the outermost saved $^R to the latest
6452          * value */
6453         if (oreplsv != GvSV(PL_replgv))
6454             sv_setsv(oreplsv, GvSV(PL_replgv));
6455     }
6456     result = 1;
6457     goto final_exit;
6458
6459 no:
6460     DEBUG_EXECUTE_r(
6461         PerlIO_printf(Perl_debug_log,
6462             "%*s  %sfailed...%s\n",
6463             REPORT_CODE_OFF+depth*2, "",
6464             PL_colors[4], PL_colors[5])
6465         );
6466
6467 no_silent:
6468     if (no_final) {
6469         if (yes_state) {
6470             goto yes;
6471         } else {
6472             goto final_exit;
6473         }
6474     }
6475     if (depth) {
6476         /* there's a previous state to backtrack to */
6477         st--;
6478         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6479             PL_regmatch_slab = PL_regmatch_slab->prev;
6480             st = SLAB_LAST(PL_regmatch_slab);
6481         }
6482         PL_regmatch_state = st;
6483         locinput= st->locinput;
6484
6485         DEBUG_STATE_pp("pop");
6486         depth--;
6487         if (yes_state == st)
6488             yes_state = st->u.yes.prev_yes_state;
6489
6490         state_num = st->resume_state + 1; /* failure = success + 1 */
6491         goto reenter_switch;
6492     }
6493     result = 0;
6494
6495   final_exit:
6496     if (rex->intflags & PREGf_VERBARG_SEEN) {
6497         SV *sv_err = get_sv("REGERROR", 1);
6498         SV *sv_mrk = get_sv("REGMARK", 1);
6499         if (result) {
6500             sv_commit = &PL_sv_no;
6501             if (!sv_yes_mark)
6502                 sv_yes_mark = &PL_sv_yes;
6503         } else {
6504             if (!sv_commit)
6505                 sv_commit = &PL_sv_yes;
6506             sv_yes_mark = &PL_sv_no;
6507         }
6508         sv_setsv(sv_err, sv_commit);
6509         sv_setsv(sv_mrk, sv_yes_mark);
6510     }
6511
6512
6513     if (last_pushed_cv) {
6514         dSP;
6515         POP_MULTICALL;
6516         PERL_UNUSED_VAR(SP);
6517     }
6518
6519     /* clean up; in particular, free all slabs above current one */
6520     LEAVE_SCOPE(oldsave);
6521
6522     assert(!result ||  locinput - PL_bostr >= 0);
6523     return result ?  locinput - PL_bostr : -1;
6524 }
6525
6526 /*
6527  - regrepeat - repeatedly match something simple, report how many
6528  *
6529  * startposp - pointer a pointer to the start position.  This is updated
6530  *             to point to the byte following the highest successful
6531  *             match.
6532  * p         - the regnode to be repeatedly matched against.
6533  * max       - maximum number of characters to match.
6534  * depth     - (for debugging) backtracking depth.
6535  */
6536 STATIC I32
6537 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6538 {
6539     dVAR;
6540     char *scan;
6541     I32 c;
6542     char *loceol = PL_regeol;
6543     I32 hardcount = 0;
6544     bool utf8_target = PL_reg_match_utf8;
6545     UV utf8_flags;
6546 #ifndef DEBUGGING
6547     PERL_UNUSED_ARG(depth);
6548 #endif
6549
6550     PERL_ARGS_ASSERT_REGREPEAT;
6551
6552     scan = *startposp;
6553     if (max == REG_INFTY)
6554         max = I32_MAX;
6555     else if (max < loceol - scan)
6556         loceol = scan + max;
6557     switch (OP(p)) {
6558     case REG_ANY:
6559         if (utf8_target) {
6560             loceol = PL_regeol;
6561             while (scan < loceol && hardcount < max && *scan != '\n') {
6562                 scan += UTF8SKIP(scan);
6563                 hardcount++;
6564             }
6565         } else {
6566             while (scan < loceol && *scan != '\n')
6567                 scan++;
6568         }
6569         break;
6570     case SANY:
6571         if (utf8_target) {
6572             loceol = PL_regeol;
6573             while (scan < loceol && hardcount < max) {
6574                 scan += UTF8SKIP(scan);
6575                 hardcount++;
6576             }
6577         }
6578         else
6579             scan = loceol;
6580         break;
6581     case CANY:
6582         scan = loceol;
6583         break;
6584     case EXACT:
6585         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6586
6587         c = (U8)*STRING(p);
6588
6589         /* Can use a simple loop if the pattern char to match on is invariant
6590          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6591          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6592          * true iff it doesn't matter if the argument is in UTF-8 or not */
6593         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6594             while (scan < loceol && UCHARAT(scan) == c) {
6595                 scan++;
6596             }
6597         }
6598         else if (UTF_PATTERN) {
6599             if (utf8_target) {
6600                 STRLEN scan_char_len;
6601                 loceol = PL_regeol;
6602
6603                 /* When both target and pattern are UTF-8, we have to do s
6604                  * string EQ */
6605                 while (hardcount < max
6606                        && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6607                        && scan_char_len <= STR_LEN(p)
6608                        && memEQ(scan, STRING(p), scan_char_len))
6609                 {
6610                     scan += scan_char_len;
6611                     hardcount++;
6612                 }
6613             }
6614             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6615
6616                 /* Target isn't utf8; convert the character in the UTF-8
6617                  * pattern to non-UTF8, and do a simple loop */
6618                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6619                 while (scan < loceol && UCHARAT(scan) == c) {
6620                     scan++;
6621                 }
6622             } /* else pattern char is above Latin1, can't possibly match the
6623                  non-UTF-8 target */
6624         }
6625         else {
6626
6627             /* Here, the string must be utf8; pattern isn't, and <c> is
6628              * different in utf8 than not, so can't compare them directly.
6629              * Outside the loop, find the two utf8 bytes that represent c, and
6630              * then look for those in sequence in the utf8 string */
6631             U8 high = UTF8_TWO_BYTE_HI(c);
6632             U8 low = UTF8_TWO_BYTE_LO(c);
6633             loceol = PL_regeol;
6634
6635             while (hardcount < max
6636                     && scan + 1 < loceol
6637                     && UCHARAT(scan) == high
6638                     && UCHARAT(scan + 1) == low)
6639             {
6640                 scan += 2;
6641                 hardcount++;
6642             }
6643         }
6644         break;
6645
6646     case EXACTFA:
6647         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6648         goto do_exactf;
6649
6650     case EXACTFL:
6651         PL_reg_flags |= RF_tainted;
6652         utf8_flags = FOLDEQ_UTF8_LOCALE;
6653         goto do_exactf;
6654
6655     case EXACTF:
6656             utf8_flags = 0;
6657             goto do_exactf;
6658
6659     case EXACTFU_SS:
6660     case EXACTFU_TRICKYFOLD:
6661     case EXACTFU:
6662         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6663
6664     do_exactf: {
6665         int c1, c2;
6666         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6667
6668         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6669
6670         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6671             if (c1 == CHRTEST_VOID) {
6672                 /* Use full Unicode fold matching */
6673                 char *tmpeol = loceol;
6674                 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6675                 while (hardcount < max
6676                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6677                                              STRING(p), NULL, pat_len,
6678                                              cBOOL(UTF_PATTERN), utf8_flags))
6679                 {
6680                     scan = tmpeol;
6681                     tmpeol = loceol;
6682                     hardcount++;
6683                 }
6684             }
6685             else if (utf8_target) {
6686                 if (c1 == c2) {
6687                     while (hardcount < max
6688                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6689                     {
6690                         scan += UTF8SKIP(scan);
6691                         hardcount++;
6692                     }
6693                 }
6694                 else {
6695                     while (hardcount < max
6696                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6697                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6698                     {
6699                         scan += UTF8SKIP(scan);
6700                         hardcount++;
6701                     }
6702                 }
6703             }
6704             else if (c1 == c2) {
6705                 while (scan < loceol && UCHARAT(scan) == c1) {
6706                     scan++;
6707                 }
6708             }
6709             else {
6710                 while (scan < loceol &&
6711                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6712                 {
6713                     scan++;
6714                 }
6715             }
6716         }
6717         break;
6718     }
6719     case ANYOF:
6720         if (utf8_target) {
6721             STRLEN inclasslen;
6722             loceol = PL_regeol;
6723             inclasslen = loceol - scan;
6724             while (hardcount < max
6725                    && ((inclasslen = loceol - scan) > 0)
6726                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6727             {
6728                 scan += inclasslen;
6729                 hardcount++;
6730             }
6731         } else {
6732             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6733                 scan++;
6734         }
6735         break;
6736     case ALNUMU:
6737         if (utf8_target) {
6738     utf8_wordchar:
6739             loceol = PL_regeol;
6740             LOAD_UTF8_CHARCLASS_ALNUM();
6741             while (hardcount < max && scan < loceol &&
6742                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6743             {
6744                 scan += UTF8SKIP(scan);
6745                 hardcount++;
6746             }
6747         } else {
6748             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6749                 scan++;
6750             }
6751         }
6752         break;
6753     case ALNUM:
6754         if (utf8_target)
6755             goto utf8_wordchar;
6756         while (scan < loceol && isALNUM((U8) *scan)) {
6757             scan++;
6758         }
6759         break;
6760     case ALNUMA:
6761         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6762             scan++;
6763         }
6764         break;
6765     case ALNUML:
6766         PL_reg_flags |= RF_tainted;
6767         if (utf8_target) {
6768             loceol = PL_regeol;
6769             while (hardcount < max && scan < loceol &&
6770                    isALNUM_LC_utf8((U8*)scan)) {
6771                 scan += UTF8SKIP(scan);
6772                 hardcount++;
6773             }
6774         } else {
6775             while (scan < loceol && isALNUM_LC(*scan))
6776                 scan++;
6777         }
6778         break;
6779     case NALNUMU:
6780         if (utf8_target) {
6781
6782     utf8_Nwordchar:
6783
6784             loceol = PL_regeol;
6785             LOAD_UTF8_CHARCLASS_ALNUM();
6786             while (hardcount < max && scan < loceol &&
6787                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6788             {
6789                 scan += UTF8SKIP(scan);
6790                 hardcount++;
6791             }
6792         } else {
6793             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6794                 scan++;
6795             }
6796         }
6797         break;
6798     case NALNUM:
6799         if (utf8_target)
6800             goto utf8_Nwordchar;
6801         while (scan < loceol && ! isALNUM((U8) *scan)) {
6802             scan++;
6803         }
6804         break;
6805
6806     case POSIXA:
6807        while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6808             scan++;
6809         }
6810         break;
6811     case NPOSIXA:
6812         if (utf8_target) {
6813             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6814                 scan += UTF8SKIP(scan);
6815             }
6816         }
6817         else {
6818             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6819                 scan++;
6820             }
6821         }
6822         break;
6823     case NALNUMA:
6824         if (utf8_target) {
6825             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6826                 scan += UTF8SKIP(scan);
6827             }
6828         }
6829         else {
6830             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6831                 scan++;
6832             }
6833         }
6834         break;
6835     case NALNUML:
6836         PL_reg_flags |= RF_tainted;
6837         if (utf8_target) {
6838             loceol = PL_regeol;
6839             while (hardcount < max && scan < loceol &&
6840                    !isALNUM_LC_utf8((U8*)scan)) {
6841                 scan += UTF8SKIP(scan);
6842                 hardcount++;
6843             }
6844         } else {
6845             while (scan < loceol && !isALNUM_LC(*scan))
6846                 scan++;
6847         }
6848         break;
6849     case SPACEU:
6850         if (utf8_target) {
6851
6852     utf8_space:
6853
6854             loceol = PL_regeol;
6855             LOAD_UTF8_CHARCLASS_SPACE();
6856             while (hardcount < max && scan < loceol &&
6857                    (*scan == ' ' ||
6858                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6859             {
6860                 scan += UTF8SKIP(scan);
6861                 hardcount++;
6862             }
6863             break;
6864         }
6865         else {
6866             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6867                 scan++;
6868             }
6869             break;
6870         }
6871     case SPACE:
6872         if (utf8_target)
6873             goto utf8_space;
6874
6875         while (scan < loceol && isSPACE((U8) *scan)) {
6876             scan++;
6877         }
6878         break;
6879     case SPACEA:
6880         while (scan < loceol && isSPACE_A((U8) *scan)) {
6881             scan++;
6882         }
6883         break;
6884     case SPACEL:
6885         PL_reg_flags |= RF_tainted;
6886         if (utf8_target) {
6887             loceol = PL_regeol;
6888             while (hardcount < max && scan < loceol &&
6889                    isSPACE_LC_utf8((U8*)scan)) {
6890                 scan += UTF8SKIP(scan);
6891                 hardcount++;
6892             }
6893         } else {
6894             while (scan < loceol && isSPACE_LC(*scan))
6895                 scan++;
6896         }
6897         break;
6898     case NSPACEU:
6899         if (utf8_target) {
6900
6901     utf8_Nspace:
6902
6903             loceol = PL_regeol;
6904             LOAD_UTF8_CHARCLASS_SPACE();
6905             while (hardcount < max && scan < loceol &&
6906                    ! (*scan == ' ' ||
6907                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6908             {
6909                 scan += UTF8SKIP(scan);
6910                 hardcount++;
6911             }
6912             break;
6913         }
6914         else {
6915             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6916                 scan++;
6917             }
6918         }
6919         break;
6920     case NSPACE:
6921         if (utf8_target)
6922             goto utf8_Nspace;
6923
6924         while (scan < loceol && ! isSPACE((U8) *scan)) {
6925             scan++;
6926         }
6927         break;
6928     case NSPACEA:
6929         if (utf8_target) {
6930             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6931                 scan += UTF8SKIP(scan);
6932             }
6933         }
6934         else {
6935             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6936                 scan++;
6937             }
6938         }
6939         break;
6940     case NSPACEL:
6941         PL_reg_flags |= RF_tainted;
6942         if (utf8_target) {
6943             loceol = PL_regeol;
6944             while (hardcount < max && scan < loceol &&
6945                    !isSPACE_LC_utf8((U8*)scan)) {
6946                 scan += UTF8SKIP(scan);
6947                 hardcount++;
6948             }
6949         } else {
6950             while (scan < loceol && !isSPACE_LC(*scan))
6951                 scan++;
6952         }
6953         break;
6954     case DIGIT:
6955         if (utf8_target) {
6956             loceol = PL_regeol;
6957             LOAD_UTF8_CHARCLASS_DIGIT();
6958             while (hardcount < max && scan < loceol &&
6959                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6960                 scan += UTF8SKIP(scan);
6961                 hardcount++;
6962             }
6963         } else {
6964             while (scan < loceol && isDIGIT(*scan))
6965                 scan++;
6966         }
6967         break;
6968     case DIGITA:
6969         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6970             scan++;
6971         }
6972         break;
6973     case DIGITL:
6974         PL_reg_flags |= RF_tainted;
6975         if (utf8_target) {
6976             loceol = PL_regeol;
6977             while (hardcount < max && scan < loceol &&
6978                    isDIGIT_LC_utf8((U8*)scan)) {
6979                 scan += UTF8SKIP(scan);
6980                 hardcount++;
6981             }
6982         } else {
6983             while (scan < loceol && isDIGIT_LC(*scan))
6984                 scan++;
6985         }
6986         break;
6987     case NDIGIT:
6988         if (utf8_target) {
6989             loceol = PL_regeol;
6990             LOAD_UTF8_CHARCLASS_DIGIT();
6991             while (hardcount < max && scan < loceol &&
6992                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6993                 scan += UTF8SKIP(scan);
6994                 hardcount++;
6995             }
6996         } else {
6997             while (scan < loceol && !isDIGIT(*scan))
6998                 scan++;
6999         }
7000         break;
7001     case NDIGITA:
7002         if (utf8_target) {
7003             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7004                 scan += UTF8SKIP(scan);
7005             }
7006         }
7007         else {
7008             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7009                 scan++;
7010             }
7011         }
7012         break;
7013     case NDIGITL:
7014         PL_reg_flags |= RF_tainted;
7015         if (utf8_target) {
7016             loceol = PL_regeol;
7017             while (hardcount < max && scan < loceol &&
7018                    !isDIGIT_LC_utf8((U8*)scan)) {
7019                 scan += UTF8SKIP(scan);
7020                 hardcount++;
7021             }
7022         } else {
7023             while (scan < loceol && !isDIGIT_LC(*scan))
7024                 scan++;
7025         }
7026         break;
7027     case LNBREAK:
7028         Perl_croak(aTHX_ "panic: regrepeat() should not be called with non-simple: LNBREAK");
7029         assert(0); /* NOTREACHED */
7030     case HORIZWS:
7031         if (utf8_target) {
7032             loceol = PL_regeol;
7033             while (hardcount < max && scan < loceol &&
7034                     (c=is_HORIZWS_utf8_safe(scan, loceol)))
7035             {
7036                 scan += c;
7037                 hardcount++;
7038             }
7039         } else {
7040             while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7041                 scan++;
7042         }
7043         break;
7044     case NHORIZWS:
7045         if (utf8_target) {
7046             loceol = PL_regeol;
7047             while (hardcount < max && scan < loceol &&
7048                         !is_HORIZWS_utf8_safe(scan, loceol))
7049             {
7050                 scan += UTF8SKIP(scan);
7051                 hardcount++;
7052             }
7053         } else {
7054             while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7055                 scan++;
7056
7057         }
7058         break;
7059     case VERTWS:
7060         if (utf8_target) {
7061             loceol = PL_regeol;
7062             while (hardcount < max && scan < loceol &&
7063                             (c=is_VERTWS_utf8_safe(scan, loceol)))
7064             {
7065                 scan += c;
7066                 hardcount++;
7067             }
7068         } else {
7069             while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7070                 scan++;
7071
7072         }
7073         break;
7074     case NVERTWS:
7075         if (utf8_target) {
7076             loceol = PL_regeol;
7077             while (hardcount < max && scan < loceol &&
7078                                 !is_VERTWS_utf8_safe(scan, loceol))
7079             {
7080                 scan += UTF8SKIP(scan);
7081                 hardcount++;
7082             }
7083         } else {
7084             while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7085                 scan++;
7086
7087         }
7088         break;
7089
7090     default:            /* Called on something of 0 width. */
7091         break;          /* So match right here or not at all. */
7092     }
7093
7094     if (hardcount)
7095         c = hardcount;
7096     else
7097         c = scan - *startposp;
7098     *startposp = scan;
7099
7100     DEBUG_r({
7101         GET_RE_DEBUG_FLAGS_DECL;
7102         DEBUG_EXECUTE_r({
7103             SV * const prop = sv_newmortal();
7104             regprop(prog, prop, p);
7105             PerlIO_printf(Perl_debug_log,
7106                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7107                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7108         });
7109     });
7110
7111     return(c);
7112 }
7113
7114
7115 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7116 /*
7117 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7118 create a copy so that changes the caller makes won't change the shared one.
7119 If <altsvp> is non-null, will return NULL in it, for back-compat.
7120  */
7121 SV *
7122 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7123 {
7124     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7125
7126     if (altsvp) {
7127         *altsvp = NULL;
7128     }
7129
7130     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7131 }
7132 #endif
7133
7134 STATIC SV *
7135 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
7136 {
7137     /* Returns the swash for the input 'node' in the regex 'prog'.
7138      * If <doinit> is true, will attempt to create the swash if not already
7139      *    done.
7140      * If <listsvp> is non-null, will return the swash initialization string in
7141      *    it.
7142      * Tied intimately to how regcomp.c sets up the data structure */
7143
7144     dVAR;
7145     SV *sw  = NULL;
7146     SV *si  = NULL;
7147     SV*  invlist = NULL;
7148
7149     RXi_GET_DECL(prog,progi);
7150     const struct reg_data * const data = prog ? progi->data : NULL;
7151
7152     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7153
7154     assert(ANYOF_NONBITMAP(node));
7155
7156     if (data && data->count) {
7157         const U32 n = ARG(node);
7158
7159         if (data->what[n] == 's') {
7160             SV * const rv = MUTABLE_SV(data->data[n]);
7161             AV * const av = MUTABLE_AV(SvRV(rv));
7162             SV **const ary = AvARRAY(av);
7163             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7164
7165             si = *ary;  /* ary[0] = the string to initialize the swash with */
7166
7167             /* Elements 2 and 3 are either both present or both absent. [2] is
7168              * any inversion list generated at compile time; [3] indicates if
7169              * that inversion list has any user-defined properties in it. */
7170             if (av_len(av) >= 2) {
7171                 invlist = ary[2];
7172                 if (SvUV(ary[3])) {
7173                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7174                 }
7175             }
7176             else {
7177                 invlist = NULL;
7178             }
7179
7180             /* Element [1] is reserved for the set-up swash.  If already there,
7181              * return it; if not, create it and store it there */
7182             if (SvROK(ary[1])) {
7183                 sw = ary[1];
7184             }
7185             else if (si && doinit) {
7186
7187                 sw = _core_swash_init("utf8", /* the utf8 package */
7188                                       "", /* nameless */
7189                                       si,
7190                                       1, /* binary */
7191                                       0, /* not from tr/// */
7192                                       invlist,
7193                                       &swash_init_flags);
7194                 (void)av_store(av, 1, sw);
7195             }
7196         }
7197     }
7198
7199     if (listsvp) {
7200         SV* matches_string = newSVpvn("", 0);
7201
7202         /* Use the swash, if any, which has to have incorporated into it all
7203          * possibilities */
7204         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7205             && (si && si != &PL_sv_undef))
7206         {
7207
7208             /* If no swash, use the input initialization string, if available */
7209             sv_catsv(matches_string, si);
7210         }
7211
7212         /* Add the inversion list to whatever we have.  This may have come from
7213          * the swash, or from an input parameter */
7214         if (invlist) {
7215             sv_catsv(matches_string, _invlist_contents(invlist));
7216         }
7217         *listsvp = matches_string;
7218     }
7219
7220     return sw;
7221 }
7222
7223 /*
7224  - reginclass - determine if a character falls into a character class
7225
7226   n is the ANYOF regnode
7227   p is the target string
7228   lenp is pointer to the maximum number of bytes of how far to go in p
7229     (This is assumed wthout checking to always be at least the current
7230     character's size)
7231   utf8_target tells whether p is in UTF-8.
7232
7233   Returns true if matched; false otherwise.  If lenp is not NULL, on return
7234   from a successful match, the value it points to will be updated to how many
7235   bytes in p were matched.  If there was no match, the value is undefined,
7236   possibly changed from the input.
7237
7238   Note that this can be a synthetic start class, a combination of various
7239   nodes, so things you think might be mutually exclusive, such as locale,
7240   aren't.  It can match both locale and non-locale
7241
7242  */
7243
7244 STATIC bool
7245 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
7246 {
7247     dVAR;
7248     const char flags = ANYOF_FLAGS(n);
7249     bool match = FALSE;
7250     UV c = *p;
7251     STRLEN c_len = 0;
7252     STRLEN maxlen;
7253
7254     PERL_ARGS_ASSERT_REGINCLASS;
7255
7256     /* If c is not already the code point, get it */
7257     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
7258         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7259                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7260                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7261                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7262                  * UTF8_ALLOW_FFFF */
7263         if (c_len == (STRLEN)-1)
7264             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7265     }
7266     else {
7267         c_len = 1;
7268     }
7269
7270     /* Use passed in max length, or one character if none passed in or less
7271      * than one character.  And assume will match just one character.  This is
7272      * overwritten later if matched more. */
7273     if (lenp) {
7274         maxlen = (*lenp > c_len) ? *lenp : c_len;
7275         *lenp = c_len;
7276
7277     }
7278     else {
7279         maxlen = c_len;
7280     }
7281
7282     /* If this character is potentially in the bitmap, check it */
7283     if (c < 256) {
7284         if (ANYOF_BITMAP_TEST(n, c))
7285             match = TRUE;
7286         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7287                 && ! utf8_target
7288                 && ! isASCII(c))
7289         {
7290             match = TRUE;
7291         }
7292         else if (flags & ANYOF_LOCALE) {
7293             PL_reg_flags |= RF_tainted;
7294
7295             if ((flags & ANYOF_LOC_FOLD)
7296                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7297             {
7298                 match = TRUE;
7299             }
7300             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7301                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
7302                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
7303                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
7304                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
7305                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
7306                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
7307                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
7308                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7309                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
7310                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
7311                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
7312                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
7313                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
7314                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
7315                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
7316                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
7317                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
7318                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
7319                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
7320                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
7321                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
7322                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
7323                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
7324                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
7325                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
7326                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
7327                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
7328                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
7329                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
7330                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
7331                      ) /* How's that for a conditional? */
7332             ) {
7333                 match = TRUE;
7334             }
7335         }
7336     }
7337
7338     /* If the bitmap didn't (or couldn't) match, and something outside the
7339      * bitmap could match, try that.  Locale nodes specify completely the
7340      * behavior of code points in the bit map (otherwise, a utf8 target would
7341      * cause them to be treated as Unicode and not locale), except in
7342      * the very unlikely event when this node is a synthetic start class, which
7343      * could be a combination of locale and non-locale nodes.  So allow locale
7344      * to match for the synthetic start class, which will give a false
7345      * positive that will be resolved when the match is done again as not part
7346      * of the synthetic start class */
7347     if (!match) {
7348         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7349             match = TRUE;       /* Everything above 255 matches */
7350         }
7351         else if (ANYOF_NONBITMAP(n)
7352                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7353                      || (utf8_target
7354                          && (c >=256
7355                              || (! (flags & ANYOF_LOCALE))
7356                              || (flags & ANYOF_IS_SYNTHETIC)))))
7357         {
7358             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7359             if (sw) {
7360                 U8 * utf8_p;
7361                 if (utf8_target) {
7362                     utf8_p = (U8 *) p;
7363                 } else { /* Convert to utf8 */
7364                     STRLEN len = 1;
7365                     utf8_p = bytes_to_utf8(p, &len);
7366                 }
7367
7368                 if (swash_fetch(sw, utf8_p, TRUE)) {
7369                     match = TRUE;
7370                 }
7371
7372                 /* If we allocated a string above, free it */
7373                 if (! utf8_target) Safefree(utf8_p);
7374             }
7375         }
7376
7377         if (UNICODE_IS_SUPER(c)
7378             && (flags & ANYOF_WARN_SUPER)
7379             && ckWARN_d(WARN_NON_UNICODE))
7380         {
7381             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7382                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7383         }
7384     }
7385
7386     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7387     return cBOOL(flags & ANYOF_INVERT) ^ match;
7388 }
7389
7390 STATIC U8 *
7391 S_reghop3(U8 *s, I32 off, const U8* lim)
7392 {
7393     /* return the position 'off' UTF-8 characters away from 's', forward if
7394      * 'off' >= 0, backwards if negative.  But don't go outside of position
7395      * 'lim', which better be < s  if off < 0 */
7396
7397     dVAR;
7398
7399     PERL_ARGS_ASSERT_REGHOP3;
7400
7401     if (off >= 0) {
7402         while (off-- && s < lim) {
7403             /* XXX could check well-formedness here */
7404             s += UTF8SKIP(s);
7405         }
7406     }
7407     else {
7408         while (off++ && s > lim) {
7409             s--;
7410             if (UTF8_IS_CONTINUED(*s)) {
7411                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7412                     s--;
7413             }
7414             /* XXX could check well-formedness here */
7415         }
7416     }
7417     return s;
7418 }
7419
7420 #ifdef XXX_dmq
7421 /* there are a bunch of places where we use two reghop3's that should
7422    be replaced with this routine. but since thats not done yet
7423    we ifdef it out - dmq
7424 */
7425 STATIC U8 *
7426 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7427 {
7428     dVAR;
7429
7430     PERL_ARGS_ASSERT_REGHOP4;
7431
7432     if (off >= 0) {
7433         while (off-- && s < rlim) {
7434             /* XXX could check well-formedness here */
7435             s += UTF8SKIP(s);
7436         }
7437     }
7438     else {
7439         while (off++ && s > llim) {
7440             s--;
7441             if (UTF8_IS_CONTINUED(*s)) {
7442                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7443                     s--;
7444             }
7445             /* XXX could check well-formedness here */
7446         }
7447     }
7448     return s;
7449 }
7450 #endif
7451
7452 STATIC U8 *
7453 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7454 {
7455     dVAR;
7456
7457     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7458
7459     if (off >= 0) {
7460         while (off-- && s < lim) {
7461             /* XXX could check well-formedness here */
7462             s += UTF8SKIP(s);
7463         }
7464         if (off >= 0)
7465             return NULL;
7466     }
7467     else {
7468         while (off++ && s > lim) {
7469             s--;
7470             if (UTF8_IS_CONTINUED(*s)) {
7471                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7472                     s--;
7473             }
7474             /* XXX could check well-formedness here */
7475         }
7476         if (off <= 0)
7477             return NULL;
7478     }
7479     return s;
7480 }
7481
7482 static void
7483 restore_pos(pTHX_ void *arg)
7484 {
7485     dVAR;
7486     regexp * const rex = (regexp *)arg;
7487     if (PL_reg_state.re_state_eval_setup_done) {
7488         if (PL_reg_oldsaved) {
7489             rex->subbeg = PL_reg_oldsaved;
7490             rex->sublen = PL_reg_oldsavedlen;
7491             rex->suboffset = PL_reg_oldsavedoffset;
7492             rex->subcoffset = PL_reg_oldsavedcoffset;
7493 #ifdef PERL_OLD_COPY_ON_WRITE
7494             rex->saved_copy = PL_nrs;
7495 #endif
7496             RXp_MATCH_COPIED_on(rex);
7497         }
7498         PL_reg_magic->mg_len = PL_reg_oldpos;
7499         PL_reg_state.re_state_eval_setup_done = FALSE;
7500         PL_curpm = PL_reg_oldcurpm;
7501     }
7502 }
7503
7504 STATIC void
7505 S_to_utf8_substr(pTHX_ register regexp *prog)
7506 {
7507     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7508      * on the converted value */
7509
7510     int i = 1;
7511
7512     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7513
7514     do {
7515         if (prog->substrs->data[i].substr
7516             && !prog->substrs->data[i].utf8_substr) {
7517             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7518             prog->substrs->data[i].utf8_substr = sv;
7519             sv_utf8_upgrade(sv);
7520             if (SvVALID(prog->substrs->data[i].substr)) {
7521                 if (SvTAIL(prog->substrs->data[i].substr)) {
7522                     /* Trim the trailing \n that fbm_compile added last
7523                        time.  */
7524                     SvCUR_set(sv, SvCUR(sv) - 1);
7525                     /* Whilst this makes the SV technically "invalid" (as its
7526                        buffer is no longer followed by "\0") when fbm_compile()
7527                        adds the "\n" back, a "\0" is restored.  */
7528                     fbm_compile(sv, FBMcf_TAIL);
7529                 } else
7530                     fbm_compile(sv, 0);
7531             }
7532             if (prog->substrs->data[i].substr == prog->check_substr)
7533                 prog->check_utf8 = sv;
7534         }
7535     } while (i--);
7536 }
7537
7538 STATIC bool
7539 S_to_byte_substr(pTHX_ register regexp *prog)
7540 {
7541     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7542      * on the converted value; returns FALSE if can't be converted. */
7543
7544     dVAR;
7545     int i = 1;
7546
7547     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7548
7549     do {
7550         if (prog->substrs->data[i].utf8_substr
7551             && !prog->substrs->data[i].substr) {
7552             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7553             if (! sv_utf8_downgrade(sv, TRUE)) {
7554                 return FALSE;
7555             }
7556             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7557                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7558                     /* Trim the trailing \n that fbm_compile added last
7559                         time.  */
7560                     SvCUR_set(sv, SvCUR(sv) - 1);
7561                     fbm_compile(sv, FBMcf_TAIL);
7562                 } else
7563                     fbm_compile(sv, 0);
7564             }
7565             prog->substrs->data[i].substr = sv;
7566             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7567                 prog->check_substr = sv;
7568         }
7569     } while (i--);
7570
7571     return TRUE;
7572 }
7573
7574 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
7575  * pre-composed Hangul syllables, which are all in a contiguous block and
7576  * arranged there in such a way so as to facilitate alorithmic determination of
7577  * their characteristics.  As such, they don't need a swash, but can be
7578  * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
7579  * is a GCB=LV */
7580 #define SBASE 0xAC00    /* Start of block */
7581 #define SCount 11172    /* Length of block */
7582 #define TCount 28
7583
7584 #if 0   /* This routine is not currently used */
7585 PERL_STATIC_INLINE bool
7586 S_is_utf8_X_LV(pTHX_ const U8 *p)
7587 {
7588     /* Unlike most other similarly named routines here, this does not create a
7589      * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7590
7591     dVAR;
7592
7593     UV cp = valid_utf8_to_uvchr(p, NULL);
7594
7595     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7596
7597     /* The earliest Unicode releases did not have these precomposed Hangul
7598      * syllables.  Set to point to undef in that case, so will return false on
7599      * every call */
7600     if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
7601         PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7602         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7603             SvREFCNT_dec(PL_utf8_X_LV);
7604             PL_utf8_X_LV = &PL_sv_undef;
7605         }
7606     }
7607
7608     return (PL_utf8_X_LV != &PL_sv_undef
7609             && cp >= SBASE && cp < SBASE + SCount
7610             && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7611 }
7612 #endif
7613
7614 PERL_STATIC_INLINE bool
7615 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7616 {
7617     /* Unlike most other similarly named routines here, this does not create a
7618      * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7619
7620     dVAR;
7621
7622     UV cp = valid_utf8_to_uvchr(p, NULL);
7623
7624     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7625
7626     /* The earliest Unicode releases did not have these precomposed Hangul
7627      * syllables.  Set to point to undef in that case, so will return false on
7628      * every call */
7629     if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
7630         PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7631         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7632             SvREFCNT_dec(PL_utf8_X_LVT);
7633             PL_utf8_X_LVT = &PL_sv_undef;
7634         }
7635     }
7636
7637     return (PL_utf8_X_LVT != &PL_sv_undef
7638             && cp >= SBASE && cp < SBASE + SCount
7639             && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7640 }
7641
7642 /*
7643  * Local variables:
7644  * c-indentation-style: bsd
7645  * c-basic-offset: 4
7646  * indent-tabs-mode: nil
7647  * End:
7648  *
7649  * ex: set ts=8 sts=4 sw=4 et:
7650  */