src/5017006/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86 #include "re_defs.h"
  87
  88 #ifdef PERL_IN_XSUB_RE
  89 #  include "re_comp.h"
  90 #else
  91 #  include "regcomp.h"
  92 #endif
  93
  94 #include "inline_invlist.c"
  95 #include "unicode_constants.h"
  96
  97 #define RF_tainted      1       /* tainted information used? e.g. locale */
  98 #define RF_warned       2               /* warned about big count? */
  99
 100 #define RF_utf8         8               /* Pattern contains multibyte chars? */
 101
 102 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /* Valid for non-utf8 strings: avoids the reginclass
 111  * call if there are no complications: i.e., if everything matchable is
 112  * straight forward in the bitmap */
 113 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 114                                               : ANYOF_BITMAP_TEST(p,*(c)))
 115
 116 /*
 117  * Forwards.
 118  */
 119
 120 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 121 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 122
 123 #define HOPc(pos,off) \
 124         (char *)(PL_reg_match_utf8 \
 125             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 126             : (U8*)(pos + off))
 127 #define HOPBACKc(pos, off) \
 128         (char*)(PL_reg_match_utf8\
 129             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 130             : (pos - off >= PL_bostr)           \
 131                 ? (U8*)pos - off                \
 132                 : NULL)
 133
 134 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 135 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 136
 137
 138 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 139 #define NEXTCHR_IS_EOS (nextchr < 0)
 140
 141 #define SET_nextchr \
 142     nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
 143
 144 #define SET_locinput(p) \
 145     locinput = (p);  \
 146     SET_nextchr
 147
 148
 149 /* these are unrolled below in the CCC_TRY_XXX defined */
 150 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 151     if (!CAT2(PL_utf8_,class)) { \
 152         bool ok; \
 153         ENTER; save_re_context(); \
 154         ok=CAT2(is_utf8_,class)((const U8*)str); \
 155         PERL_UNUSED_VAR(ok); \
 156         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 157 /* Doesn't do an assert to verify that is correct */
 158 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 159     if (!CAT2(PL_utf8_,class)) { \
 160         bool throw_away PERL_UNUSED_DECL; \
 161         ENTER; save_re_context(); \
 162         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 163         PERL_UNUSED_VAR(throw_away); \
 164         LEAVE; } } STMT_END
 165
 166 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 167 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 168
 169 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 170         /* No asserts are done for some of these, in case called on a   */  \
 171         /* Unicode version in which they map to nothing */                  \
 172         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
 173         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 174
 175 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 176
 177 /* The actual code for CCC_TRY, which uses several variables from the routine
 178  * it's callable from.  It is designed to be the bulk of a case statement.
 179  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 180  *      nextchr matches the class.
 181  * UTF8_TEST is the whole test string to use for utf8 targets
 182  * LOAD is what to use to test, and if not present to load in the swash for the
 183  *      class
 184  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 185  *      UTF8_TEST test.
 186  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 187  * utf8 and a variant, load the swash if necessary and test using the utf8
 188  * test.  Advance to the next character if test is ok, otherwise fail; If not
 189  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 190  * fails, or advance to the next character */
 191
 192 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 193     if (NEXTCHR_IS_EOS) {                                              \
 194         sayNO;                                                                \
 195     }                                                                         \
 196     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 197         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 198         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 199             sayNO;                                                            \
 200         }                                                                     \
 201     }                                                                         \
 202     else if (POS_OR_NEG (FUNC(nextchr))) {                                    \
 203             sayNO;                                                            \
 204     }                                                                         \
 205     goto increment_locinput;
 206
 207 /* Handle the non-locale cases for a character class and its complement.  It
 208  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 209  * This is because that code fails when the test succeeds, so we want to have
 210  * the test fail so that the code succeeds.  The swash is stored in a
 211  * predictable PL_ place */
 212 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 213                            CLASS, STR)                                        \
 214     case NAME:                                                                \
 215         _CCC_TRY_CODE( !, FUNC,                                               \
 216                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 217                                             (U8*)locinput, TRUE)),            \
 218                           CLASS, STR)                                         \
 219     case NNAME:                                                               \
 220         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 221                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 222                                             (U8*)locinput, TRUE)),            \
 223                           CLASS, STR)                                         \
 224
 225 /* Generate the case statements for both locale and non-locale character
 226  * classes in regmatch for classes that don't have special unicode semantics.
 227  * Locales don't use an immediate swash, but an intermediary special locale
 228  * function that is called on the pointer to the current place in the input
 229  * string.  That function will resolve to needing the same swash.  One might
 230  * think that because we don't know what the locale will match, we shouldn't
 231  * check with the swash loading function that it loaded properly; ie, that we
 232  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 233  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 234  * irrelevant here */
 235 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 236                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 237                 NAMEA, NNAMEA, FUNCA,                                         \
 238                 CLASS, STR)                                                   \
 239     case NAMEL:                                                               \
 240         PL_reg_flags |= RF_tainted;                                           \
 241         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 242     case NNAMEL:                                                              \
 243         PL_reg_flags |= RF_tainted;                                           \
 244         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 245                        CLASS, STR)                                            \
 246     case NAMEA:                                                               \
 247         if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 248             sayNO;                                                            \
 249         }                                                                     \
 250         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 251         locinput++;                                        \
 252         break;                                                                \
 253     case NNAMEA:                                                              \
 254         if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 255             sayNO;                                                            \
 256         }                                                                     \
 257         goto increment_locinput;                                              \
 258     /* Generate the non-locale cases */                                       \
 259     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 260
 261 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 262  * statements to handle separate Unicode semantics nodes */
 263 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 264                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 265                   NAMEU, NNAMEU, FUNCU,                                        \
 266                   NAMEA, NNAMEA, FUNCA,                                        \
 267                   CLASS, STR)                                                  \
 268     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 269             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 270             NAMEA, NNAMEA, FUNCA,                                              \
 271             CLASS, STR)                                                        \
 272     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 273
 274 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 275
 276 /* for use after a quantifier and before an EXACT-like node -- japhy */
 277 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 278  *
 279  * NOTE that *nothing* that affects backtracking should be in here, specifically
 280  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 281  * node that is in between two EXACT like nodes when ascertaining what the required
 282  * "follow" character is. This should probably be moved to regex compile time
 283  * although it may be done at run time beause of the REF possibility - more
 284  * investigation required. -- demerphq
 285 */
 286 #define JUMPABLE(rn) (      \
 287     OP(rn) == OPEN ||       \
 288     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 289     OP(rn) == EVAL ||   \
 290     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 291     OP(rn) == PLUS || OP(rn) == MINMOD || \
 292     OP(rn) == KEEPS || \
 293     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 294 )
 295 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 296
 297 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 298
 299 #if 0
 300 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 301    we don't need this definition. */
 302 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 303 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 304 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 305
 306 #else
 307 /* ... so we use this as its faster. */
 308 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 309 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 310 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 311 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 312
 313 #endif
 314
 315 /*
 316   Search for mandatory following text node; for lookahead, the text must
 317   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 318 */
 319 #define FIND_NEXT_IMPT(rn) STMT_START { \
 320     while (JUMPABLE(rn)) { \
 321         const OPCODE type = OP(rn); \
 322         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 323             rn = NEXTOPER(NEXTOPER(rn)); \
 324         else if (type == PLUS) \
 325             rn = NEXTOPER(rn); \
 326         else if (type == IFMATCH) \
 327             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 328         else rn += NEXT_OFF(rn); \
 329     } \
 330 } STMT_END
 331
 332
 333 static void restore_pos(pTHX_ void *arg);
 334
 335 #define REGCP_PAREN_ELEMS 3
 336 #define REGCP_OTHER_ELEMS 3
 337 #define REGCP_FRAME_ELEMS 1
 338 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 339  * are needed for the regexp context stack bookkeeping. */
 340
 341 STATIC CHECKPOINT
 342 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 343 {
 344     dVAR;
 345     const int retval = PL_savestack_ix;
 346     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 347     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 348     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 349     I32 p;
 350     GET_RE_DEBUG_FLAGS_DECL;
 351
 352     PERL_ARGS_ASSERT_REGCPPUSH;
 353
 354     if (paren_elems_to_push < 0)
 355         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 356                    paren_elems_to_push);
 357
 358     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 359         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 360                    " out of range (%lu-%ld)",
 361                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 362
 363     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 364
 365     DEBUG_BUFFERS_r(
 366         if ((int)PL_regsize > (int)parenfloor)
 367             PerlIO_printf(Perl_debug_log,
 368                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 369                 PTR2UV(rex),
 370                 PTR2UV(rex->offs)
 371             );
 372     );
 373     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 374 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 375         SSPUSHINT(rex->offs[p].end);
 376         SSPUSHINT(rex->offs[p].start);
 377         SSPUSHINT(rex->offs[p].start_tmp);
 378         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 379             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 380             (UV)p,
 381             (IV)rex->offs[p].start,
 382             (IV)rex->offs[p].start_tmp,
 383             (IV)rex->offs[p].end
 384         ));
 385     }
 386 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 387     SSPUSHINT(PL_regsize);
 388     SSPUSHINT(rex->lastparen);
 389     SSPUSHINT(rex->lastcloseparen);
 390     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 391
 392     return retval;
 393 }
 394
 395 /* These are needed since we do not localize EVAL nodes: */
 396 #define REGCP_SET(cp)                                           \
 397     DEBUG_STATE_r(                                              \
 398             PerlIO_printf(Perl_debug_log,                       \
 399                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 400                 (IV)PL_savestack_ix));                          \
 401     cp = PL_savestack_ix
 402
 403 #define REGCP_UNWIND(cp)                                        \
 404     DEBUG_STATE_r(                                              \
 405         if (cp != PL_savestack_ix)                              \
 406             PerlIO_printf(Perl_debug_log,                       \
 407                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 408                 (IV)(cp), (IV)PL_savestack_ix));                \
 409     regcpblow(cp)
 410
 411 #define UNWIND_PAREN(lp, lcp)               \
 412     for (n = rex->lastparen; n > lp; n--)   \
 413         rex->offs[n].end = -1;              \
 414     rex->lastparen = n;                     \
 415     rex->lastcloseparen = lcp;
 416
 417
 418 STATIC void
 419 S_regcppop(pTHX_ regexp *rex)
 420 {
 421     dVAR;
 422     UV i;
 423     U32 paren;
 424     GET_RE_DEBUG_FLAGS_DECL;
 425
 426     PERL_ARGS_ASSERT_REGCPPOP;
 427
 428     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 429     i = SSPOPUV;
 430     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 431     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 432     rex->lastcloseparen = SSPOPINT;
 433     rex->lastparen = SSPOPINT;
 434     PL_regsize = SSPOPINT;
 435
 436     i -= REGCP_OTHER_ELEMS;
 437     /* Now restore the parentheses context. */
 438     DEBUG_BUFFERS_r(
 439         if (i || rex->lastparen + 1 <= rex->nparens)
 440             PerlIO_printf(Perl_debug_log,
 441                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 442                 PTR2UV(rex),
 443                 PTR2UV(rex->offs)
 444             );
 445     );
 446     paren = PL_regsize;
 447     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 448         I32 tmps;
 449         rex->offs[paren].start_tmp = SSPOPINT;
 450         rex->offs[paren].start = SSPOPINT;
 451         tmps = SSPOPINT;
 452         if (paren <= rex->lastparen)
 453             rex->offs[paren].end = tmps;
 454         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 455             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 456             (UV)paren,
 457             (IV)rex->offs[paren].start,
 458             (IV)rex->offs[paren].start_tmp,
 459             (IV)rex->offs[paren].end,
 460             (paren > rex->lastparen ? "(skipped)" : ""));
 461         );
 462         paren--;
 463     }
 464 #if 1
 465     /* It would seem that the similar code in regtry()
 466      * already takes care of this, and in fact it is in
 467      * a better location to since this code can #if 0-ed out
 468      * but the code in regtry() is needed or otherwise tests
 469      * requiring null fields (pat.t#187 and split.t#{13,14}
 470      * (as of patchlevel 7877)  will fail.  Then again,
 471      * this code seems to be necessary or otherwise
 472      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 473      * --jhi updated by dapm */
 474     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 475         if (i > PL_regsize)
 476             rex->offs[i].start = -1;
 477         rex->offs[i].end = -1;
 478         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 479             "    \\%"UVuf": %s   ..-1 undeffing\n",
 480             (UV)i,
 481             (i > PL_regsize) ? "-1" : "  "
 482         ));
 483     }
 484 #endif
 485 }
 486
 487 /* restore the parens and associated vars at savestack position ix,
 488  * but without popping the stack */
 489
 490 STATIC void
 491 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 492 {
 493     I32 tmpix = PL_savestack_ix;
 494     PL_savestack_ix = ix;
 495     regcppop(rex);
 496     PL_savestack_ix = tmpix;
 497 }
 498
 499 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 500
 501 /*
 502  * pregexec and friends
 503  */
 504
 505 #ifndef PERL_IN_XSUB_RE
 506 /*
 507  - pregexec - match a regexp against a string
 508  */
 509 I32
 510 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 511          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 512 /* stringarg: the point in the string at which to begin matching */
 513 /* strend:    pointer to null at end of string */
 514 /* strbeg:    real beginning of string */
 515 /* minend:    end of match must be >= minend bytes after stringarg. */
 516 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 517  *            itself is accessed via the pointers above */
 518 /* nosave:    For optimizations. */
 519 {
 520     PERL_ARGS_ASSERT_PREGEXEC;
 521
 522     return
 523         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 524                       nosave ? 0 : REXEC_COPY_STR);
 525 }
 526 #endif
 527
 528 /*
 529  * Need to implement the following flags for reg_anch:
 530  *
 531  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 532  * USE_INTUIT_ML
 533  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 534  * INTUIT_AUTORITATIVE_ML
 535  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 536  * INTUIT_ONCE_ML
 537  *
 538  * Another flag for this function: SECOND_TIME (so that float substrs
 539  * with giant delta may be not rechecked).
 540  */
 541
 542 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 543
 544 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 545    Otherwise, only SvCUR(sv) is used to get strbeg. */
 546
 547 /* XXXX We assume that strpos is strbeg unless sv. */
 548
 549 /* XXXX Some places assume that there is a fixed substring.
 550         An update may be needed if optimizer marks as "INTUITable"
 551         RExen without fixed substrings.  Similarly, it is assumed that
 552         lengths of all the strings are no more than minlen, thus they
 553         cannot come from lookahead.
 554         (Or minlen should take into account lookahead.)
 555   NOTE: Some of this comment is not correct. minlen does now take account
 556   of lookahead/behind. Further research is required. -- demerphq
 557
 558 */
 559
 560 /* A failure to find a constant substring means that there is no need to make
 561    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 562    finding a substring too deep into the string means that less calls to
 563    regtry() should be needed.
 564
 565    REx compiler's optimizer found 4 possible hints:
 566         a) Anchored substring;
 567         b) Fixed substring;
 568         c) Whether we are anchored (beginning-of-line or \G);
 569         d) First node (of those at offset 0) which may distinguish positions;
 570    We use a)b)d) and multiline-part of c), and try to find a position in the
 571    string which does not contradict any of them.
 572  */
 573
 574 /* Most of decisions we do here should have been done at compile time.
 575    The nodes of the REx which we used for the search should have been
 576    deleted from the finite automaton. */
 577
 578 char *
 579 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 580                      char *strend, const U32 flags, re_scream_pos_data *data)
 581 {
 582     dVAR;
 583     struct regexp *const prog = ReANY(rx);
 584     I32 start_shift = 0;
 585     /* Should be nonnegative! */
 586     I32 end_shift   = 0;
 587     char *s;
 588     SV *check;
 589     char *strbeg;
 590     char *t;
 591     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 592     I32 ml_anch;
 593     char *other_last = NULL;    /* other substr checked before this */
 594     char *check_at = NULL;              /* check substr found at this pos */
 595     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 596     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 597     RXi_GET_DECL(prog,progi);
 598 #ifdef DEBUGGING
 599     const char * const i_strpos = strpos;
 600 #endif
 601     GET_RE_DEBUG_FLAGS_DECL;
 602
 603     PERL_ARGS_ASSERT_RE_INTUIT_START;
 604     PERL_UNUSED_ARG(flags);
 605     PERL_UNUSED_ARG(data);
 606
 607     RX_MATCH_UTF8_set(rx,utf8_target);
 608
 609     if (RX_UTF8(rx)) {
 610         PL_reg_flags |= RF_utf8;
 611     }
 612     DEBUG_EXECUTE_r(
 613         debug_start_match(rx, utf8_target, strpos, strend,
 614             sv ? "Guessing start of match in sv for"
 615                : "Guessing start of match in string for");
 616               );
 617
 618     /* CHR_DIST() would be more correct here but it makes things slow. */
 619     if (prog->minlen > strend - strpos) {
 620         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 621                               "String too short... [re_intuit_start]\n"));
 622         goto fail;
 623     }
 624
 625     /* XXX we need to pass strbeg as a separate arg: the following is
 626      * guesswork and can be wrong... */
 627     if (sv && SvPOK(sv)) {
 628         char * p   = SvPVX(sv);
 629         STRLEN cur = SvCUR(sv);
 630         if (p <= strpos && strpos < p + cur) {
 631             strbeg = p;
 632             assert(p <= strend && strend <= p + cur);
 633         }
 634         else
 635             strbeg = strend - cur;
 636     }
 637     else
 638         strbeg = strpos;
 639
 640     PL_regeol = strend;
 641     if (utf8_target) {
 642         if (!prog->check_utf8 && prog->check_substr)
 643             to_utf8_substr(prog);
 644         check = prog->check_utf8;
 645     } else {
 646         if (!prog->check_substr && prog->check_utf8) {
 647             if (! to_byte_substr(prog)) {
 648                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 649             }
 650         }
 651         check = prog->check_substr;
 652     }
 653     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 654         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 655                      || ( (prog->extflags & RXf_ANCH_BOL)
 656                           && !multiline ) );    /* Check after \n? */
 657
 658         if (!ml_anch) {
 659           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 660                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 661                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 662                && sv && !SvROK(sv)
 663                && (strpos != strbeg)) {
 664               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 665               goto fail;
 666           }
 667           if (prog->check_offset_min == prog->check_offset_max
 668               && !(prog->extflags & RXf_CANY_SEEN)
 669               && ! multiline)   /* /m can cause \n's to match that aren't
 670                                    accounted for in the string max length.
 671                                    See [perl #115242] */
 672           {
 673             /* Substring at constant offset from beg-of-str... */
 674             I32 slen;
 675
 676             s = HOP3c(strpos, prog->check_offset_min, strend);
 677
 678             if (SvTAIL(check)) {
 679                 slen = SvCUR(check);    /* >= 1 */
 680
 681                 if ( strend - s > slen || strend - s < slen - 1
 682                      || (strend - s == slen && strend[-1] != '\n')) {
 683                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 684                     goto fail_finish;
 685                 }
 686                 /* Now should match s[0..slen-2] */
 687                 slen--;
 688                 if (slen && (*SvPVX_const(check) != *s
 689                              || (slen > 1
 690                                  && memNE(SvPVX_const(check), s, slen)))) {
 691                   report_neq:
 692                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 693                     goto fail_finish;
 694                 }
 695             }
 696             else if (*SvPVX_const(check) != *s
 697                      || ((slen = SvCUR(check)) > 1
 698                          && memNE(SvPVX_const(check), s, slen)))
 699                 goto report_neq;
 700             check_at = s;
 701             goto success_at_start;
 702           }
 703         }
 704         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 705         s = strpos;
 706         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 707         end_shift = prog->check_end_shift;
 708
 709         if (!ml_anch) {
 710             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 711                                          - (SvTAIL(check) != 0);
 712             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 713
 714             if (end_shift < eshift)
 715                 end_shift = eshift;
 716         }
 717     }
 718     else {                              /* Can match at random position */
 719         ml_anch = 0;
 720         s = strpos;
 721         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 722         end_shift = prog->check_end_shift;
 723
 724         /* end shift should be non negative here */
 725     }
 726
 727 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 728     if (end_shift < 0)
 729         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 730                    (IV)end_shift, RX_PRECOMP(prog));
 731 #endif
 732
 733   restart:
 734     /* Find a possible match in the region s..strend by looking for
 735        the "check" substring in the region corrected by start/end_shift. */
 736
 737     {
 738         I32 srch_start_shift = start_shift;
 739         I32 srch_end_shift = end_shift;
 740         U8* start_point;
 741         U8* end_point;
 742         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 743             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 744             srch_start_shift = strbeg - s;
 745         }
 746     DEBUG_OPTIMISE_MORE_r({
 747         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 748             (IV)prog->check_offset_min,
 749             (IV)srch_start_shift,
 750             (IV)srch_end_shift,
 751             (IV)prog->check_end_shift);
 752     });
 753
 754         if (prog->extflags & RXf_CANY_SEEN) {
 755             start_point= (U8*)(s + srch_start_shift);
 756             end_point= (U8*)(strend - srch_end_shift);
 757         } else {
 758             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 759             end_point= HOP3(strend, -srch_end_shift, strbeg);
 760         }
 761         DEBUG_OPTIMISE_MORE_r({
 762             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 763                 (int)(end_point - start_point),
 764                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 765                 start_point);
 766         });
 767
 768         s = fbm_instr( start_point, end_point,
 769                       check, multiline ? FBMrf_MULTILINE : 0);
 770     }
 771     /* Update the count-of-usability, remove useless subpatterns,
 772         unshift s.  */
 773
 774     DEBUG_EXECUTE_r({
 775         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 776             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 777         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 778                           (s ? "Found" : "Did not find"),
 779             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 780                 ? "anchored" : "floating"),
 781             quoted,
 782             RE_SV_TAIL(check),
 783             (s ? " at offset " : "...\n") );
 784     });
 785
 786     if (!s)
 787         goto fail_finish;
 788     /* Finish the diagnostic message */
 789     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 790
 791     /* XXX dmq: first branch is for positive lookbehind...
 792        Our check string is offset from the beginning of the pattern.
 793        So we need to do any stclass tests offset forward from that
 794        point. I think. :-(
 795      */
 796
 797
 798
 799     check_at=s;
 800
 801
 802     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 803        Start with the other substr.
 804        XXXX no SCREAM optimization yet - and a very coarse implementation
 805        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 806                 *always* match.  Probably should be marked during compile...
 807        Probably it is right to do no SCREAM here...
 808      */
 809
 810     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 811                 : (prog->float_substr && prog->anchored_substr))
 812     {
 813         /* Take into account the "other" substring. */
 814         /* XXXX May be hopelessly wrong for UTF... */
 815         if (!other_last)
 816             other_last = strpos;
 817         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 818           do_other_anchored:
 819             {
 820                 char * const last = HOP3c(s, -start_shift, strbeg);
 821                 char *last1, *last2;
 822                 char * const saved_s = s;
 823                 SV* must;
 824
 825                 t = s - prog->check_offset_max;
 826                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 827                     && (!utf8_target
 828                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 829                             && t > strpos)))
 830                     NOOP;
 831                 else
 832                     t = strpos;
 833                 t = HOP3c(t, prog->anchored_offset, strend);
 834                 if (t < other_last)     /* These positions already checked */
 835                     t = other_last;
 836                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 837                 if (last < last1)
 838                     last1 = last;
 839                 /* XXXX It is not documented what units *_offsets are in.
 840                    We assume bytes, but this is clearly wrong.
 841                    Meaning this code needs to be carefully reviewed for errors.
 842                    dmq.
 843                   */
 844
 845                 /* On end-of-str: see comment below. */
 846                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 847                 if (must == &PL_sv_undef) {
 848                     s = (char*)NULL;
 849                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 850                 }
 851                 else
 852                     s = fbm_instr(
 853                         (unsigned char*)t,
 854                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 855                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 856                         must,
 857                         multiline ? FBMrf_MULTILINE : 0
 858                     );
 859                 DEBUG_EXECUTE_r({
 860                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 861                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 862                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 863                         (s ? "Found" : "Contradicts"),
 864                         quoted, RE_SV_TAIL(must));
 865                 });
 866
 867
 868                 if (!s) {
 869                     if (last1 >= last2) {
 870                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 871                                                 ", giving up...\n"));
 872                         goto fail_finish;
 873                     }
 874                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 875                         ", trying floating at offset %ld...\n",
 876                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 877                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 878                     s = HOP3c(last, 1, strend);
 879                     goto restart;
 880                 }
 881                 else {
 882                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 883                           (long)(s - i_strpos)));
 884                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 885                     other_last = HOP3c(s, 1, strend);
 886                     s = saved_s;
 887                     if (t == strpos)
 888                         goto try_at_start;
 889                     goto try_at_offset;
 890                 }
 891             }
 892         }
 893         else {          /* Take into account the floating substring. */
 894             char *last, *last1;
 895             char * const saved_s = s;
 896             SV* must;
 897
 898             t = HOP3c(s, -start_shift, strbeg);
 899             last1 = last =
 900                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 901             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 902                 last = HOP3c(t, prog->float_max_offset, strend);
 903             s = HOP3c(t, prog->float_min_offset, strend);
 904             if (s < other_last)
 905                 s = other_last;
 906  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 907             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 908             /* fbm_instr() takes into account exact value of end-of-str
 909                if the check is SvTAIL(ed).  Since false positives are OK,
 910                and end-of-str is not later than strend we are OK. */
 911             if (must == &PL_sv_undef) {
 912                 s = (char*)NULL;
 913                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 914             }
 915             else
 916                 s = fbm_instr((unsigned char*)s,
 917                               (unsigned char*)last + SvCUR(must)
 918                                   - (SvTAIL(must)!=0),
 919                               must, multiline ? FBMrf_MULTILINE : 0);
 920             DEBUG_EXECUTE_r({
 921                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 922                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 923                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 924                     (s ? "Found" : "Contradicts"),
 925                     quoted, RE_SV_TAIL(must));
 926             });
 927             if (!s) {
 928                 if (last1 == last) {
 929                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 930                                             ", giving up...\n"));
 931                     goto fail_finish;
 932                 }
 933                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 934                     ", trying anchored starting at offset %ld...\n",
 935                     (long)(saved_s + 1 - i_strpos)));
 936                 other_last = last;
 937                 s = HOP3c(t, 1, strend);
 938                 goto restart;
 939             }
 940             else {
 941                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 942                       (long)(s - i_strpos)));
 943                 other_last = s; /* Fix this later. --Hugo */
 944                 s = saved_s;
 945                 if (t == strpos)
 946                     goto try_at_start;
 947                 goto try_at_offset;
 948             }
 949         }
 950     }
 951
 952
 953     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 954
 955     DEBUG_OPTIMISE_MORE_r(
 956         PerlIO_printf(Perl_debug_log,
 957             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 958             (IV)prog->check_offset_min,
 959             (IV)prog->check_offset_max,
 960             (IV)(s-strpos),
 961             (IV)(t-strpos),
 962             (IV)(t-s),
 963             (IV)(strend-strpos)
 964         )
 965     );
 966
 967     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 968         && (!utf8_target
 969             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 970                  && t > strpos)))
 971     {
 972         /* Fixed substring is found far enough so that the match
 973            cannot start at strpos. */
 974       try_at_offset:
 975         if (ml_anch && t[-1] != '\n') {
 976             /* Eventually fbm_*() should handle this, but often
 977                anchored_offset is not 0, so this check will not be wasted. */
 978             /* XXXX In the code below we prefer to look for "^" even in
 979                presence of anchored substrings.  And we search even
 980                beyond the found float position.  These pessimizations
 981                are historical artefacts only.  */
 982           find_anchor:
 983             while (t < strend - prog->minlen) {
 984                 if (*t == '\n') {
 985                     if (t < check_at - prog->check_offset_min) {
 986                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 987                             /* Since we moved from the found position,
 988                                we definitely contradict the found anchored
 989                                substr.  Due to the above check we do not
 990                                contradict "check" substr.
 991                                Thus we can arrive here only if check substr
 992                                is float.  Redo checking for "other"=="fixed".
 993                              */
 994                             strpos = t + 1;
 995                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 996                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 997                             goto do_other_anchored;
 998                         }
 999                         /* We don't contradict the found floating substring. */
1000                         /* XXXX Why not check for STCLASS? */
1001                         s = t + 1;
1002                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1003                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1004                         goto set_useful;
1005                     }
1006                     /* Position contradicts check-string */
1007                     /* XXXX probably better to look for check-string
1008                        than for "\n", so one should lower the limit for t? */
1009                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1010                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1011                     other_last = strpos = s = t + 1;
1012                     goto restart;
1013                 }
1014                 t++;
1015             }
1016             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1017                         PL_colors[0], PL_colors[1]));
1018             goto fail_finish;
1019         }
1020         else {
1021             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1022                         PL_colors[0], PL_colors[1]));
1023         }
1024         s = t;
1025       set_useful:
1026         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1027     }
1028     else {
1029         /* The found string does not prohibit matching at strpos,
1030            - no optimization of calling REx engine can be performed,
1031            unless it was an MBOL and we are not after MBOL,
1032            or a future STCLASS check will fail this. */
1033       try_at_start:
1034         /* Even in this situation we may use MBOL flag if strpos is offset
1035            wrt the start of the string. */
1036         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1037             && (strpos != strbeg) && strpos[-1] != '\n'
1038             /* May be due to an implicit anchor of m{.*foo}  */
1039             && !(prog->intflags & PREGf_IMPLICIT))
1040         {
1041             t = strpos;
1042             goto find_anchor;
1043         }
1044         DEBUG_EXECUTE_r( if (ml_anch)
1045             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1046                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1047         );
1048       success_at_start:
1049         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1050             && (utf8_target ? (
1051                 prog->check_utf8                /* Could be deleted already */
1052                 && --BmUSEFUL(prog->check_utf8) < 0
1053                 && (prog->check_utf8 == prog->float_utf8)
1054             ) : (
1055                 prog->check_substr              /* Could be deleted already */
1056                 && --BmUSEFUL(prog->check_substr) < 0
1057                 && (prog->check_substr == prog->float_substr)
1058             )))
1059         {
1060             /* If flags & SOMETHING - do not do it many times on the same match */
1061             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1062             /* XXX Does the destruction order has to change with utf8_target? */
1063             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1064             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1065             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1066             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1067             check = NULL;                       /* abort */
1068             s = strpos;
1069             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1070                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1071             if (prog->intflags & PREGf_IMPLICIT)
1072                 prog->extflags &= ~RXf_ANCH_MBOL;
1073             /* XXXX This is a remnant of the old implementation.  It
1074                     looks wasteful, since now INTUIT can use many
1075                     other heuristics. */
1076             prog->extflags &= ~RXf_USE_INTUIT;
1077             /* XXXX What other flags might need to be cleared in this branch? */
1078         }
1079         else
1080             s = strpos;
1081     }
1082
1083     /* Last resort... */
1084     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1085     /* trie stclasses are too expensive to use here, we are better off to
1086        leave it to regmatch itself */
1087     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1088         /* minlen == 0 is possible if regstclass is \b or \B,
1089            and the fixed substr is ''$.
1090            Since minlen is already taken into account, s+1 is before strend;
1091            accidentally, minlen >= 1 guaranties no false positives at s + 1
1092            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1093            regstclass does not come from lookahead...  */
1094         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1095            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1096         const U8* const str = (U8*)STRING(progi->regstclass);
1097         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1098                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1099                     : 1);
1100         char * endpos;
1101         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1102             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1103         else if (prog->float_substr || prog->float_utf8)
1104             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1105         else
1106             endpos= strend;
1107
1108         if (checked_upto < s)
1109            checked_upto = s;
1110         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1111                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1112
1113         t = s;
1114         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1115         if (s) {
1116             checked_upto = s;
1117         } else {
1118 #ifdef DEBUGGING
1119             const char *what = NULL;
1120 #endif
1121             if (endpos == strend) {
1122                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1123                                 "Could not match STCLASS...\n") );
1124                 goto fail;
1125             }
1126             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1127                                    "This position contradicts STCLASS...\n") );
1128             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1129                 goto fail;
1130             checked_upto = HOPBACKc(endpos, start_shift);
1131             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1132                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1133             /* Contradict one of substrings */
1134             if (prog->anchored_substr || prog->anchored_utf8) {
1135                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1136                     DEBUG_EXECUTE_r( what = "anchored" );
1137                   hop_and_restart:
1138                     s = HOP3c(t, 1, strend);
1139                     if (s + start_shift + end_shift > strend) {
1140                         /* XXXX Should be taken into account earlier? */
1141                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1142                                                "Could not match STCLASS...\n") );
1143                         goto fail;
1144                     }
1145                     if (!check)
1146                         goto giveup;
1147                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1148                                 "Looking for %s substr starting at offset %ld...\n",
1149                                  what, (long)(s + start_shift - i_strpos)) );
1150                     goto restart;
1151                 }
1152                 /* Have both, check_string is floating */
1153                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1154                     goto retry_floating_check;
1155                 /* Recheck anchored substring, but not floating... */
1156                 s = check_at;
1157                 if (!check)
1158                     goto giveup;
1159                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1160                           "Looking for anchored substr starting at offset %ld...\n",
1161                           (long)(other_last - i_strpos)) );
1162                 goto do_other_anchored;
1163             }
1164             /* Another way we could have checked stclass at the
1165                current position only: */
1166             if (ml_anch) {
1167                 s = t = t + 1;
1168                 if (!check)
1169                     goto giveup;
1170                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1171                           "Looking for /%s^%s/m starting at offset %ld...\n",
1172                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1173                 goto try_at_offset;
1174             }
1175             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1176                 goto fail;
1177             /* Check is floating substring. */
1178           retry_floating_check:
1179             t = check_at - start_shift;
1180             DEBUG_EXECUTE_r( what = "floating" );
1181             goto hop_and_restart;
1182         }
1183         if (t != s) {
1184             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1185                         "By STCLASS: moving %ld --> %ld\n",
1186                                   (long)(t - i_strpos), (long)(s - i_strpos))
1187                    );
1188         }
1189         else {
1190             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1191                                   "Does not contradict STCLASS...\n");
1192                    );
1193         }
1194     }
1195   giveup:
1196     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1197                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1198                           PL_colors[5], (long)(s - i_strpos)) );
1199     return s;
1200
1201   fail_finish:                          /* Substring not found */
1202     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1203         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1204   fail:
1205     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1206                           PL_colors[4], PL_colors[5]));
1207     return NULL;
1208 }
1209
1210 #define DECL_TRIE_TYPE(scan) \
1211     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1212                     trie_type = ((scan->flags == EXACT) \
1213                               ? (utf8_target ? trie_utf8 : trie_plain) \
1214                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1215
1216 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1217 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1218     STRLEN skiplen;                                                                 \
1219     switch (trie_type) {                                                            \
1220     case trie_utf8_fold:                                                            \
1221         if ( foldlen>0 ) {                                                          \
1222             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1223             foldlen -= len;                                                         \
1224             uscan += len;                                                           \
1225             len=0;                                                                  \
1226         } else {                                                                    \
1227             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1228             len = UTF8SKIP(uc);                                                     \
1229             skiplen = UNISKIP( uvc );                                               \
1230             foldlen -= skiplen;                                                     \
1231             uscan = foldbuf + skiplen;                                              \
1232         }                                                                           \
1233         break;                                                                      \
1234     case trie_latin_utf8_fold:                                                      \
1235         if ( foldlen>0 ) {                                                          \
1236             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1237             foldlen -= len;                                                         \
1238             uscan += len;                                                           \
1239             len=0;                                                                  \
1240         } else {                                                                    \
1241             len = 1;                                                                \
1242             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1243             skiplen = UNISKIP( uvc );                                               \
1244             foldlen -= skiplen;                                                     \
1245             uscan = foldbuf + skiplen;                                              \
1246         }                                                                           \
1247         break;                                                                      \
1248     case trie_utf8:                                                                 \
1249         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1250         break;                                                                      \
1251     case trie_plain:                                                                \
1252         uvc = (UV)*uc;                                                              \
1253         len = 1;                                                                    \
1254     }                                                                               \
1255     if (uvc < 256) {                                                                \
1256         charid = trie->charmap[ uvc ];                                              \
1257     }                                                                               \
1258     else {                                                                          \
1259         charid = 0;                                                                 \
1260         if (widecharmap) {                                                          \
1261             SV** const svpp = hv_fetch(widecharmap,                                 \
1262                         (char*)&uvc, sizeof(UV), 0);                                \
1263             if (svpp)                                                               \
1264                 charid = (U16)SvIV(*svpp);                                          \
1265         }                                                                           \
1266     }                                                                               \
1267 } STMT_END
1268
1269 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1270 STMT_START {                                              \
1271     while (s <= e) {                                      \
1272         if ( (CoNd)                                       \
1273              && (ln == 1 || folder(s, pat_string, ln))    \
1274              && (!reginfo || regtry(reginfo, &s)) )       \
1275             goto got_it;                                  \
1276         s++;                                              \
1277     }                                                     \
1278 } STMT_END
1279
1280 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1281 STMT_START {                                          \
1282     while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
1283         CoDe                                          \
1284         s += uskip;                                   \
1285     }                                                 \
1286 } STMT_END
1287
1288 #define REXEC_FBC_SCAN(CoDe)                          \
1289 STMT_START {                                          \
1290     while (s < strend) {                              \
1291         CoDe                                          \
1292         s++;                                          \
1293     }                                                 \
1294 } STMT_END
1295
1296 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1297 REXEC_FBC_UTF8_SCAN(                                  \
1298     if (CoNd) {                                       \
1299         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1300             goto got_it;                              \
1301         else                                          \
1302             tmp = doevery;                            \
1303     }                                                 \
1304     else                                              \
1305         tmp = 1;                                      \
1306 )
1307
1308 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1309 REXEC_FBC_SCAN(                                       \
1310     if (CoNd) {                                       \
1311         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1312             goto got_it;                              \
1313         else                                          \
1314             tmp = doevery;                            \
1315     }                                                 \
1316     else                                              \
1317         tmp = 1;                                      \
1318 )
1319
1320 #define REXEC_FBC_TRYIT               \
1321 if ((!reginfo || regtry(reginfo, &s))) \
1322     goto got_it
1323
1324 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1325     if (utf8_target) {                                             \
1326         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1327     }                                                          \
1328     else {                                                     \
1329         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1330     }
1331
1332 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1333     if (utf8_target) {                                             \
1334         UtFpReLoAd;                                            \
1335         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1336     }                                                          \
1337     else {                                                     \
1338         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1339     }
1340
1341 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1342     PL_reg_flags |= RF_tainted;                                \
1343     if (utf8_target) {                                             \
1344         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1345     }                                                          \
1346     else {                                                     \
1347         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1348     }
1349
1350 #define DUMP_EXEC_POS(li,s,doutf8) \
1351     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1352
1353
1354 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1355         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1356         tmp = TEST_NON_UTF8(tmp);                                              \
1357         REXEC_FBC_UTF8_SCAN(                                                   \
1358             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1359                 tmp = !tmp;                                                    \
1360                 IF_SUCCESS;                                                    \
1361             }                                                                  \
1362             else {                                                             \
1363                 IF_FAIL;                                                       \
1364             }                                                                  \
1365         );                                                                     \
1366
1367 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1368         if (s == PL_bostr) {                                                   \
1369             tmp = '\n';                                                        \
1370         }                                                                      \
1371         else {                                                                 \
1372             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1373             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1374         }                                                                      \
1375         tmp = TeSt1_UtF8;                                                      \
1376         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1377         REXEC_FBC_UTF8_SCAN(                                                   \
1378             if (tmp == ! (TeSt2_UtF8)) { \
1379                 tmp = !tmp;                                                    \
1380                 IF_SUCCESS;                                                    \
1381             }                                                                  \
1382             else {                                                             \
1383                 IF_FAIL;                                                       \
1384             }                                                                  \
1385         );                                                                     \
1386
1387 /* The only difference between the BOUND and NBOUND cases is that
1388  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1389  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1390  * with the other one being empty */
1391 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1392     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1393
1394 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1395     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1396
1397 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1398     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1399
1400 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1401     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1402
1403
1404 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1405  * be passed in completely with the variable name being tested, which isn't
1406  * such a clean interface, but this is easier to read than it was before.  We
1407  * are looking for the boundary (or non-boundary between a word and non-word
1408  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1409  * must be different.  Find the "wordness" of the character just prior to this
1410  * one, and compare it with the wordness of this one.  If they differ, we have
1411  * a boundary.  At the beginning of the string, pretend that the previous
1412  * character was a new-line */
1413 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1414     if (utf8_target) {                                                         \
1415                 UTF8_CODE \
1416     }                                                                          \
1417     else {  /* Not utf8 */                                                     \
1418         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1419         tmp = TEST_NON_UTF8(tmp);                                              \
1420         REXEC_FBC_SCAN(                                                        \
1421             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1422                 tmp = !tmp;                                                    \
1423                 IF_SUCCESS;                                                    \
1424             }                                                                  \
1425             else {                                                             \
1426                 IF_FAIL;                                                       \
1427             }                                                                  \
1428         );                                                                     \
1429     }                                                                          \
1430     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1431         goto got_it;
1432
1433 /* We know what class REx starts with.  Try to find this position... */
1434 /* if reginfo is NULL, its a dryrun */
1435 /* annoyingly all the vars in this routine have different names from their counterparts
1436    in regmatch. /grrr */
1437
1438 STATIC char *
1439 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1440     const char *strend, regmatch_info *reginfo)
1441 {
1442         dVAR;
1443         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1444         char *pat_string;   /* The pattern's exactish string */
1445         char *pat_end;      /* ptr to end char of pat_string */
1446         re_fold_t folder;       /* Function for computing non-utf8 folds */
1447         const U8 *fold_array;   /* array for folding ords < 256 */
1448         STRLEN ln;
1449         STRLEN lnc;
1450         STRLEN uskip;
1451         U8 c1;
1452         U8 c2;
1453         char *e;
1454         I32 tmp = 1;    /* Scratch variable? */
1455         const bool utf8_target = PL_reg_match_utf8;
1456         UV utf8_fold_flags = 0;
1457         RXi_GET_DECL(prog,progi);
1458
1459         PERL_ARGS_ASSERT_FIND_BYCLASS;
1460
1461         /* We know what class it must start with. */
1462         switch (OP(c)) {
1463         case ANYOF:
1464             if (utf8_target) {
1465                 REXEC_FBC_UTF8_CLASS_SCAN(
1466                           reginclass(prog, c, (U8*)s, utf8_target));
1467             }
1468             else {
1469                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1470             }
1471             break;
1472         case CANY:
1473             REXEC_FBC_SCAN(
1474                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1475                     goto got_it;
1476                 else
1477                     tmp = doevery;
1478             );
1479             break;
1480
1481         case EXACTFA:
1482             if (UTF_PATTERN || utf8_target) {
1483                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1484                 goto do_exactf_utf8;
1485             }
1486             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1487             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1488             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1489
1490         case EXACTF:
1491             if (utf8_target) {
1492
1493                 /* regcomp.c already folded this if pattern is in UTF-8 */
1494                 utf8_fold_flags = 0;
1495                 goto do_exactf_utf8;
1496             }
1497             fold_array = PL_fold;
1498             folder = foldEQ;
1499             goto do_exactf_non_utf8;
1500
1501         case EXACTFL:
1502             if (UTF_PATTERN || utf8_target) {
1503                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1504                 goto do_exactf_utf8;
1505             }
1506             fold_array = PL_fold_locale;
1507             folder = foldEQ_locale;
1508             goto do_exactf_non_utf8;
1509
1510         case EXACTFU_SS:
1511             if (UTF_PATTERN) {
1512                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1513             }
1514             goto do_exactf_utf8;
1515
1516         case EXACTFU_TRICKYFOLD:
1517         case EXACTFU:
1518             if (UTF_PATTERN || utf8_target) {
1519                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1520                 goto do_exactf_utf8;
1521             }
1522
1523             /* Any 'ss' in the pattern should have been replaced by regcomp,
1524              * so we don't have to worry here about this single special case
1525              * in the Latin1 range */
1526             fold_array = PL_fold_latin1;
1527             folder = foldEQ_latin1;
1528
1529             /* FALL THROUGH */
1530
1531         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1532                                are no glitches with fold-length differences
1533                                between the target string and pattern */
1534
1535             /* The idea in the non-utf8 EXACTF* cases is to first find the
1536              * first character of the EXACTF* node and then, if necessary,
1537              * case-insensitively compare the full text of the node.  c1 is the
1538              * first character.  c2 is its fold.  This logic will not work for
1539              * Unicode semantics and the german sharp ss, which hence should
1540              * not be compiled into a node that gets here. */
1541             pat_string = STRING(c);
1542             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1543
1544             /* We know that we have to match at least 'ln' bytes (which is the
1545              * same as characters, since not utf8).  If we have to match 3
1546              * characters, and there are only 2 availabe, we know without
1547              * trying that it will fail; so don't start a match past the
1548              * required minimum number from the far end */
1549             e = HOP3c(strend, -((I32)ln), s);
1550
1551             if (!reginfo && e < s) {
1552                 e = s;                  /* Due to minlen logic of intuit() */
1553             }
1554
1555             c1 = *pat_string;
1556             c2 = fold_array[c1];
1557             if (c1 == c2) { /* If char and fold are the same */
1558                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1559             }
1560             else {
1561                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1562             }
1563             break;
1564
1565         do_exactf_utf8:
1566         {
1567             unsigned expansion;
1568
1569
1570             /* If one of the operands is in utf8, we can't use the simpler
1571              * folding above, due to the fact that many different characters
1572              * can have the same fold, or portion of a fold, or different-
1573              * length fold */
1574             pat_string = STRING(c);
1575             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1576             pat_end = pat_string + ln;
1577             lnc = (UTF_PATTERN) /* length to match in characters */
1578                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1579                     : ln;
1580
1581             /* We have 'lnc' characters to match in the pattern, but because of
1582              * multi-character folding, each character in the target can match
1583              * up to 3 characters (Unicode guarantees it will never exceed
1584              * this) if it is utf8-encoded; and up to 2 if not (based on the
1585              * fact that the Latin 1 folds are already determined, and the
1586              * only multi-char fold in that range is the sharp-s folding to
1587              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1588              * string character.  Adjust lnc accordingly, rounding up, so that
1589              * if we need to match at least 4+1/3 chars, that really is 5. */
1590             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1591             lnc = (lnc + expansion - 1) / expansion;
1592
1593             /* As in the non-UTF8 case, if we have to match 3 characters, and
1594              * only 2 are left, it's guaranteed to fail, so don't start a
1595              * match that would require us to go beyond the end of the string
1596              */
1597             e = HOP3c(strend, -((I32)lnc), s);
1598
1599             if (!reginfo && e < s) {
1600                 e = s;                  /* Due to minlen logic of intuit() */
1601             }
1602
1603             /* XXX Note that we could recalculate e to stop the loop earlier,
1604              * as the worst case expansion above will rarely be met, and as we
1605              * go along we would usually find that e moves further to the left.
1606              * This would happen only after we reached the point in the loop
1607              * where if there were no expansion we should fail.  Unclear if
1608              * worth the expense */
1609
1610             while (s <= e) {
1611                 char *my_strend= (char *)strend;
1612                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1613                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1614                     && (!reginfo || regtry(reginfo, &s)) )
1615                 {
1616                     goto got_it;
1617                 }
1618                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1619             }
1620             break;
1621         }
1622         case BOUNDL:
1623             PL_reg_flags |= RF_tainted;
1624             FBC_BOUND(isALNUM_LC,
1625                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1626                       isALNUM_LC_utf8((U8*)s));
1627             break;
1628         case NBOUNDL:
1629             PL_reg_flags |= RF_tainted;
1630             FBC_NBOUND(isALNUM_LC,
1631                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1632                        isALNUM_LC_utf8((U8*)s));
1633             break;
1634         case BOUND:
1635             FBC_BOUND(isWORDCHAR,
1636                       isALNUM_uni(tmp),
1637                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1638             break;
1639         case BOUNDA:
1640             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1641                              isWORDCHAR_A(tmp),
1642                              isWORDCHAR_A((U8*)s));
1643             break;
1644         case NBOUND:
1645             FBC_NBOUND(isWORDCHAR,
1646                        isALNUM_uni(tmp),
1647                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1648             break;
1649         case NBOUNDA:
1650             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1651                               isWORDCHAR_A(tmp),
1652                               isWORDCHAR_A((U8*)s));
1653             break;
1654         case BOUNDU:
1655             FBC_BOUND(isWORDCHAR_L1,
1656                       isALNUM_uni(tmp),
1657                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1658             break;
1659         case NBOUNDU:
1660             FBC_NBOUND(isWORDCHAR_L1,
1661                        isALNUM_uni(tmp),
1662                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1663             break;
1664         case ALNUML:
1665             REXEC_FBC_CSCAN_TAINT(
1666                 isALNUM_LC_utf8((U8*)s),
1667                 isALNUM_LC(*s)
1668             );
1669             break;
1670         case ALNUMU:
1671             REXEC_FBC_CSCAN_PRELOAD(
1672                 LOAD_UTF8_CHARCLASS_ALNUM(),
1673                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1674                 isWORDCHAR_L1((U8) *s)
1675             );
1676             break;
1677         case ALNUM:
1678             REXEC_FBC_CSCAN_PRELOAD(
1679                 LOAD_UTF8_CHARCLASS_ALNUM(),
1680                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1681                 isWORDCHAR((U8) *s)
1682             );
1683             break;
1684         case ALNUMA:
1685             /* Don't need to worry about utf8, as it can match only a single
1686              * byte invariant character */
1687             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1688             break;
1689         case NALNUMU:
1690             REXEC_FBC_CSCAN_PRELOAD(
1691                 LOAD_UTF8_CHARCLASS_ALNUM(),
1692                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1693                 ! isWORDCHAR_L1((U8) *s)
1694             );
1695             break;
1696         case NALNUM:
1697             REXEC_FBC_CSCAN_PRELOAD(
1698                 LOAD_UTF8_CHARCLASS_ALNUM(),
1699                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1700                 ! isALNUM(*s)
1701             );
1702             break;
1703         case NALNUMA:
1704             REXEC_FBC_CSCAN(
1705                 !isWORDCHAR_A(*s),
1706                 !isWORDCHAR_A(*s)
1707             );
1708             break;
1709         case NALNUML:
1710             REXEC_FBC_CSCAN_TAINT(
1711                 !isALNUM_LC_utf8((U8*)s),
1712                 !isALNUM_LC(*s)
1713             );
1714             break;
1715         case SPACEU:
1716             REXEC_FBC_CSCAN(
1717                 is_XPERLSPACE_utf8(s),
1718                 isSPACE_L1((U8) *s)
1719             );
1720             break;
1721         case SPACE:
1722             REXEC_FBC_CSCAN(
1723                 is_XPERLSPACE_utf8(s),
1724                 isSPACE((U8) *s)
1725             );
1726             break;
1727         case SPACEA:
1728             /* Don't need to worry about utf8, as it can match only a single
1729              * byte invariant character */
1730             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1731             break;
1732         case SPACEL:
1733             REXEC_FBC_CSCAN_TAINT(
1734                 isSPACE_LC_utf8((U8*)s),
1735                 isSPACE_LC(*s)
1736             );
1737             break;
1738         case NSPACEU:
1739             REXEC_FBC_CSCAN(
1740                 ! is_XPERLSPACE_utf8(s),
1741                 ! isSPACE_L1((U8) *s)
1742             );
1743             break;
1744         case NSPACE:
1745             REXEC_FBC_CSCAN(
1746                 ! is_XPERLSPACE_utf8(s),
1747                 ! isSPACE((U8) *s)
1748             );
1749             break;
1750         case NSPACEA:
1751             REXEC_FBC_CSCAN(
1752                 !isSPACE_A(*s),
1753                 !isSPACE_A(*s)
1754             );
1755             break;
1756         case NSPACEL:
1757             REXEC_FBC_CSCAN_TAINT(
1758                 !isSPACE_LC_utf8((U8*)s),
1759                 !isSPACE_LC(*s)
1760             );
1761             break;
1762         case DIGIT:
1763             REXEC_FBC_CSCAN_PRELOAD(
1764                 LOAD_UTF8_CHARCLASS_DIGIT(),
1765                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1766                 isDIGIT(*s)
1767             );
1768             break;
1769         case DIGITA:
1770             /* Don't need to worry about utf8, as it can match only a single
1771              * byte invariant character */
1772             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1773             break;
1774         case DIGITL:
1775             REXEC_FBC_CSCAN_TAINT(
1776                 isDIGIT_LC_utf8((U8*)s),
1777                 isDIGIT_LC(*s)
1778             );
1779             break;
1780         case NDIGIT:
1781             REXEC_FBC_CSCAN_PRELOAD(
1782                 LOAD_UTF8_CHARCLASS_DIGIT(),
1783                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1784                 !isDIGIT(*s)
1785             );
1786             break;
1787         case NDIGITA:
1788             REXEC_FBC_CSCAN(
1789                 !isDIGIT_A(*s),
1790                 !isDIGIT_A(*s)
1791             );
1792             break;
1793         case NDIGITL:
1794             REXEC_FBC_CSCAN_TAINT(
1795                 !isDIGIT_LC_utf8((U8*)s),
1796                 !isDIGIT_LC(*s)
1797             );
1798             break;
1799         case LNBREAK:
1800             REXEC_FBC_CSCAN(
1801                 is_LNBREAK_utf8_safe(s, strend),
1802                 is_LNBREAK_latin1_safe(s, strend)
1803             );
1804             break;
1805         case VERTWS:
1806             REXEC_FBC_CSCAN(
1807                 is_VERTWS_utf8_safe(s, strend),
1808                 is_VERTWS_latin1_safe(s, strend)
1809             );
1810             break;
1811         case NVERTWS:
1812             REXEC_FBC_CSCAN(
1813                 !is_VERTWS_utf8_safe(s, strend),
1814                 !is_VERTWS_latin1_safe(s, strend)
1815             );
1816             break;
1817         case HORIZWS:
1818             REXEC_FBC_CSCAN(
1819                 is_HORIZWS_utf8_safe(s, strend),
1820                 is_HORIZWS_latin1_safe(s, strend)
1821             );
1822             break;
1823         case NHORIZWS:
1824             REXEC_FBC_CSCAN(
1825                 !is_HORIZWS_utf8_safe(s, strend),
1826                 !is_HORIZWS_latin1_safe(s, strend)
1827             );
1828             break;
1829         case POSIXA:
1830             /* Don't need to worry about utf8, as it can match only a single
1831             * byte invariant character.  The flag in this node type is the
1832             * class number to pass to _generic_isCC() to build a mask for
1833             * searching in PL_charclass[] */
1834             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1835             break;
1836         case NPOSIXA:
1837             REXEC_FBC_CSCAN(
1838                 !_generic_isCC_A(*s, FLAGS(c)),
1839                 !_generic_isCC_A(*s, FLAGS(c))
1840             );
1841             break;
1842
1843         case AHOCORASICKC:
1844         case AHOCORASICK:
1845             {
1846                 DECL_TRIE_TYPE(c);
1847                 /* what trie are we using right now */
1848                 reg_ac_data *aho
1849                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1850                 reg_trie_data *trie
1851                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1852                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1853
1854                 const char *last_start = strend - trie->minlen;
1855 #ifdef DEBUGGING
1856                 const char *real_start = s;
1857 #endif
1858                 STRLEN maxlen = trie->maxlen;
1859                 SV *sv_points;
1860                 U8 **points; /* map of where we were in the input string
1861                                 when reading a given char. For ASCII this
1862                                 is unnecessary overhead as the relationship
1863                                 is always 1:1, but for Unicode, especially
1864                                 case folded Unicode this is not true. */
1865                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1866                 U8 *bitmap=NULL;
1867
1868
1869                 GET_RE_DEBUG_FLAGS_DECL;
1870
1871                 /* We can't just allocate points here. We need to wrap it in
1872                  * an SV so it gets freed properly if there is a croak while
1873                  * running the match */
1874                 ENTER;
1875                 SAVETMPS;
1876                 sv_points=newSV(maxlen * sizeof(U8 *));
1877                 SvCUR_set(sv_points,
1878                     maxlen * sizeof(U8 *));
1879                 SvPOK_on(sv_points);
1880                 sv_2mortal(sv_points);
1881                 points=(U8**)SvPV_nolen(sv_points );
1882                 if ( trie_type != trie_utf8_fold
1883                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1884                 {
1885                     if (trie->bitmap)
1886                         bitmap=(U8*)trie->bitmap;
1887                     else
1888                         bitmap=(U8*)ANYOF_BITMAP(c);
1889                 }
1890                 /* this is the Aho-Corasick algorithm modified a touch
1891                    to include special handling for long "unknown char"
1892                    sequences. The basic idea being that we use AC as long
1893                    as we are dealing with a possible matching char, when
1894                    we encounter an unknown char (and we have not encountered
1895                    an accepting state) we scan forward until we find a legal
1896                    starting char.
1897                    AC matching is basically that of trie matching, except
1898                    that when we encounter a failing transition, we fall back
1899                    to the current states "fail state", and try the current char
1900                    again, a process we repeat until we reach the root state,
1901                    state 1, or a legal transition. If we fail on the root state
1902                    then we can either terminate if we have reached an accepting
1903                    state previously, or restart the entire process from the beginning
1904                    if we have not.
1905
1906                  */
1907                 while (s <= last_start) {
1908                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1909                     U8 *uc = (U8*)s;
1910                     U16 charid = 0;
1911                     U32 base = 1;
1912                     U32 state = 1;
1913                     UV uvc = 0;
1914                     STRLEN len = 0;
1915                     STRLEN foldlen = 0;
1916                     U8 *uscan = (U8*)NULL;
1917                     U8 *leftmost = NULL;
1918 #ifdef DEBUGGING
1919                     U32 accepted_word= 0;
1920 #endif
1921                     U32 pointpos = 0;
1922
1923                     while ( state && uc <= (U8*)strend ) {
1924                         int failed=0;
1925                         U32 word = aho->states[ state ].wordnum;
1926
1927                         if( state==1 ) {
1928                             if ( bitmap ) {
1929                                 DEBUG_TRIE_EXECUTE_r(
1930                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1931                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1932                                             (char *)uc, utf8_target );
1933                                         PerlIO_printf( Perl_debug_log,
1934                                             " Scanning for legal start char...\n");
1935                                     }
1936                                 );
1937                                 if (utf8_target) {
1938                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1939                                         uc += UTF8SKIP(uc);
1940                                     }
1941                                 } else {
1942                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1943                                         uc++;
1944                                     }
1945                                 }
1946                                 s= (char *)uc;
1947                             }
1948                             if (uc >(U8*)last_start) break;
1949                         }
1950
1951                         if ( word ) {
1952                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1953                             if (!leftmost || lpos < leftmost) {
1954                                 DEBUG_r(accepted_word=word);
1955                                 leftmost= lpos;
1956                             }
1957                             if (base==0) break;
1958
1959                         }
1960                         points[pointpos++ % maxlen]= uc;
1961                         if (foldlen || uc < (U8*)strend) {
1962                             REXEC_TRIE_READ_CHAR(trie_type, trie,
1963                                              widecharmap, uc,
1964                                              uscan, len, uvc, charid, foldlen,
1965                                              foldbuf, uniflags);
1966                             DEBUG_TRIE_EXECUTE_r({
1967                                 dump_exec_pos( (char *)uc, c, strend,
1968                                             real_start, s, utf8_target);
1969                                 PerlIO_printf(Perl_debug_log,
1970                                     " Charid:%3u CP:%4"UVxf" ",
1971                                      charid, uvc);
1972                             });
1973                         }
1974                         else {
1975                             len = 0;
1976                             charid = 0;
1977                         }
1978
1979
1980                         do {
1981 #ifdef DEBUGGING
1982                             word = aho->states[ state ].wordnum;
1983 #endif
1984                             base = aho->states[ state ].trans.base;
1985
1986                             DEBUG_TRIE_EXECUTE_r({
1987                                 if (failed)
1988                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1989                                         s,   utf8_target );
1990                                 PerlIO_printf( Perl_debug_log,
1991                                     "%sState: %4"UVxf", word=%"UVxf,
1992                                     failed ? " Fail transition to " : "",
1993                                     (UV)state, (UV)word);
1994                             });
1995                             if ( base ) {
1996                                 U32 tmp;
1997                                 I32 offset;
1998                                 if (charid &&
1999                                      ( ((offset = base + charid
2000                                         - 1 - trie->uniquecharcount)) >= 0)
2001                                      && ((U32)offset < trie->lasttrans)
2002                                      && trie->trans[offset].check == state
2003                                      && (tmp=trie->trans[offset].next))
2004                                 {
2005                                     DEBUG_TRIE_EXECUTE_r(
2006                                         PerlIO_printf( Perl_debug_log," - legal\n"));
2007                                     state = tmp;
2008                                     break;
2009                                 }
2010                                 else {
2011                                     DEBUG_TRIE_EXECUTE_r(
2012                                         PerlIO_printf( Perl_debug_log," - fail\n"));
2013                                     failed = 1;
2014                                     state = aho->fail[state];
2015                                 }
2016                             }
2017                             else {
2018                                 /* we must be accepting here */
2019                                 DEBUG_TRIE_EXECUTE_r(
2020                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
2021                                 failed = 1;
2022                                 break;
2023                             }
2024                         } while(state);
2025                         uc += len;
2026                         if (failed) {
2027                             if (leftmost)
2028                                 break;
2029                             if (!state) state = 1;
2030                         }
2031                     }
2032                     if ( aho->states[ state ].wordnum ) {
2033                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2034                         if (!leftmost || lpos < leftmost) {
2035                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2036                             leftmost = lpos;
2037                         }
2038                     }
2039                     if (leftmost) {
2040                         s = (char*)leftmost;
2041                         DEBUG_TRIE_EXECUTE_r({
2042                             PerlIO_printf(
2043                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2044                                 (UV)accepted_word, (IV)(s - real_start)
2045                             );
2046                         });
2047                         if (!reginfo || regtry(reginfo, &s)) {
2048                             FREETMPS;
2049                             LEAVE;
2050                             goto got_it;
2051                         }
2052                         s = HOPc(s,1);
2053                         DEBUG_TRIE_EXECUTE_r({
2054                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2055                         });
2056                     } else {
2057                         DEBUG_TRIE_EXECUTE_r(
2058                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2059                         break;
2060                     }
2061                 }
2062                 FREETMPS;
2063                 LEAVE;
2064             }
2065             break;
2066         default:
2067             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2068             break;
2069         }
2070         return 0;
2071       got_it:
2072         return s;
2073 }
2074
2075
2076 /*
2077  - regexec_flags - match a regexp against a string
2078  */
2079 I32
2080 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2081               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2082 /* stringarg: the point in the string at which to begin matching */
2083 /* strend:    pointer to null at end of string */
2084 /* strbeg:    real beginning of string */
2085 /* minend:    end of match must be >= minend bytes after stringarg. */
2086 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2087  *            itself is accessed via the pointers above */
2088 /* data:      May be used for some additional optimizations.
2089               Currently its only used, with a U32 cast, for transmitting
2090               the ganch offset when doing a /g match. This will change */
2091 /* nosave:    For optimizations. */
2092
2093 {
2094     dVAR;
2095     struct regexp *const prog = ReANY(rx);
2096     /*register*/ char *s;
2097     regnode *c;
2098     /*register*/ char *startpos = stringarg;
2099     I32 minlen;         /* must match at least this many chars */
2100     I32 dontbother = 0; /* how many characters not to try at end */
2101     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2102     I32 scream_pos = -1;                /* Internal iterator of scream. */
2103     char *scream_olds = NULL;
2104     const bool utf8_target = cBOOL(DO_UTF8(sv));
2105     I32 multiline;
2106     RXi_GET_DECL(prog,progi);
2107     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2108     regexp_paren_pair *swap = NULL;
2109     GET_RE_DEBUG_FLAGS_DECL;
2110
2111     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2112     PERL_UNUSED_ARG(data);
2113
2114     /* Be paranoid... */
2115     if (prog == NULL || startpos == NULL) {
2116         Perl_croak(aTHX_ "NULL regexp parameter");
2117         return 0;
2118     }
2119
2120     multiline = prog->extflags & RXf_PMf_MULTILINE;
2121     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2122
2123     RX_MATCH_UTF8_set(rx, utf8_target);
2124     DEBUG_EXECUTE_r(
2125         debug_start_match(rx, utf8_target, startpos, strend,
2126         "Matching");
2127     );
2128
2129     minlen = prog->minlen;
2130
2131     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2132         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2133                               "String too short [regexec_flags]...\n"));
2134         goto phooey;
2135     }
2136
2137
2138     /* Check validity of program. */
2139     if (UCHARAT(progi->program) != REG_MAGIC) {
2140         Perl_croak(aTHX_ "corrupted regexp program");
2141     }
2142
2143     PL_reg_flags = 0;
2144     PL_reg_state.re_state_eval_setup_done = FALSE;
2145     PL_reg_maxiter = 0;
2146
2147     if (RX_UTF8(rx))
2148         PL_reg_flags |= RF_utf8;
2149
2150     /* Mark beginning of line for ^ and lookbehind. */
2151     reginfo.bol = startpos; /* XXX not used ??? */
2152     PL_bostr  = strbeg;
2153     reginfo.sv = sv;
2154
2155     /* Mark end of line for $ (and such) */
2156     PL_regeol = strend;
2157
2158     /* see how far we have to get to not match where we matched before */
2159     reginfo.till = startpos+minend;
2160
2161     /* If there is a "must appear" string, look for it. */
2162     s = startpos;
2163
2164     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2165         MAGIC *mg;
2166         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2167             reginfo.ganch = startpos + prog->gofs;
2168             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2169               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2170         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2171                   && SvMAGIC(sv)
2172                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2173                   && mg->mg_len >= 0) {
2174             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2175             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2176                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2177
2178             if (prog->extflags & RXf_ANCH_GPOS) {
2179                 if (s > reginfo.ganch)
2180                     goto phooey;
2181                 s = reginfo.ganch - prog->gofs;
2182                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2183                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2184                 if (s < strbeg)
2185                     goto phooey;
2186             }
2187         }
2188         else if (data) {
2189             reginfo.ganch = strbeg + PTR2UV(data);
2190             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2191                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2192
2193         } else {                                /* pos() not defined */
2194             reginfo.ganch = strbeg;
2195             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2196                  "GPOS: reginfo.ganch = strbeg\n"));
2197         }
2198     }
2199     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2200         /* We have to be careful. If the previous successful match
2201            was from this regex we don't want a subsequent partially
2202            successful match to clobber the old results.
2203            So when we detect this possibility we add a swap buffer
2204            to the re, and switch the buffer each match. If we fail
2205            we switch it back, otherwise we leave it swapped.
2206         */
2207         swap = prog->offs;
2208         /* do we need a save destructor here for eval dies? */
2209         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2210         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2211             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2212             PTR2UV(prog),
2213             PTR2UV(swap),
2214             PTR2UV(prog->offs)
2215         ));
2216     }
2217     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2218         re_scream_pos_data d;
2219
2220         d.scream_olds = &scream_olds;
2221         d.scream_pos = &scream_pos;
2222         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2223         if (!s) {
2224             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2225             goto phooey;        /* not present */
2226         }
2227     }
2228
2229
2230
2231     /* Simplest case:  anchored match need be tried only once. */
2232     /*  [unless only anchor is BOL and multiline is set] */
2233     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2234         if (s == startpos && regtry(&reginfo, &startpos))
2235             goto got_it;
2236         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2237                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2238         {
2239             char *end;
2240
2241             if (minlen)
2242                 dontbother = minlen - 1;
2243             end = HOP3c(strend, -dontbother, strbeg) - 1;
2244             /* for multiline we only have to try after newlines */
2245             if (prog->check_substr || prog->check_utf8) {
2246                 /* because of the goto we can not easily reuse the macros for bifurcating the
2247                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2248                 if (utf8_target) {
2249                     if (s == startpos)
2250                         goto after_try_utf8;
2251                     while (1) {
2252                         if (regtry(&reginfo, &s)) {
2253                             goto got_it;
2254                         }
2255                       after_try_utf8:
2256                         if (s > end) {
2257                             goto phooey;
2258                         }
2259                         if (prog->extflags & RXf_USE_INTUIT) {
2260                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2261                             if (!s) {
2262                                 goto phooey;
2263                             }
2264                         }
2265                         else {
2266                             s += UTF8SKIP(s);
2267                         }
2268                     }
2269                 } /* end search for check string in unicode */
2270                 else {
2271                     if (s == startpos) {
2272                         goto after_try_latin;
2273                     }
2274                     while (1) {
2275                         if (regtry(&reginfo, &s)) {
2276                             goto got_it;
2277                         }
2278                       after_try_latin:
2279                         if (s > end) {
2280                             goto phooey;
2281                         }
2282                         if (prog->extflags & RXf_USE_INTUIT) {
2283                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2284                             if (!s) {
2285                                 goto phooey;
2286                             }
2287                         }
2288                         else {
2289                             s++;
2290                         }
2291                     }
2292                 } /* end search for check string in latin*/
2293             } /* end search for check string */
2294             else { /* search for newline */
2295                 if (s > startpos) {
2296                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2297                     s--;
2298                 }
2299                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2300                 while (s <= end) { /* note it could be possible to match at the end of the string */
2301                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2302                         if (regtry(&reginfo, &s))
2303                             goto got_it;
2304                     }
2305                 }
2306             } /* end search for newline */
2307         } /* end anchored/multiline check string search */
2308         goto phooey;
2309     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2310     {
2311         /* the warning about reginfo.ganch being used without initialization
2312            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2313            and we only enter this block when the same bit is set. */
2314         char *tmp_s = reginfo.ganch - prog->gofs;
2315
2316         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2317             goto got_it;
2318         goto phooey;
2319     }
2320
2321     /* Messy cases:  unanchored match. */
2322     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2323         /* we have /x+whatever/ */
2324         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2325         char ch;
2326 #ifdef DEBUGGING
2327         int did_match = 0;
2328 #endif
2329         if (utf8_target) {
2330             if (! prog->anchored_utf8) {
2331                 to_utf8_substr(prog);
2332             }
2333             ch = SvPVX_const(prog->anchored_utf8)[0];
2334             REXEC_FBC_SCAN(
2335                 if (*s == ch) {
2336                     DEBUG_EXECUTE_r( did_match = 1 );
2337                     if (regtry(&reginfo, &s)) goto got_it;
2338                     s += UTF8SKIP(s);
2339                     while (s < strend && *s == ch)
2340                         s += UTF8SKIP(s);
2341                 }
2342             );
2343
2344         }
2345         else {
2346             if (! prog->anchored_substr) {
2347                 if (! to_byte_substr(prog)) {
2348                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2349                 }
2350             }
2351             ch = SvPVX_const(prog->anchored_substr)[0];
2352             REXEC_FBC_SCAN(
2353                 if (*s == ch) {
2354                     DEBUG_EXECUTE_r( did_match = 1 );
2355                     if (regtry(&reginfo, &s)) goto got_it;
2356                     s++;
2357                     while (s < strend && *s == ch)
2358                         s++;
2359                 }
2360             );
2361         }
2362         DEBUG_EXECUTE_r(if (!did_match)
2363                 PerlIO_printf(Perl_debug_log,
2364                                   "Did not find anchored character...\n")
2365                );
2366     }
2367     else if (prog->anchored_substr != NULL
2368               || prog->anchored_utf8 != NULL
2369               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2370                   && prog->float_max_offset < strend - s)) {
2371         SV *must;
2372         I32 back_max;
2373         I32 back_min;
2374         char *last;
2375         char *last1;            /* Last position checked before */
2376 #ifdef DEBUGGING
2377         int did_match = 0;
2378 #endif
2379         if (prog->anchored_substr || prog->anchored_utf8) {
2380             if (utf8_target) {
2381                 if (! prog->anchored_utf8) {
2382                     to_utf8_substr(prog);
2383                 }
2384                 must = prog->anchored_utf8;
2385             }
2386             else {
2387                 if (! prog->anchored_substr) {
2388                     if (! to_byte_substr(prog)) {
2389                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2390                     }
2391                 }
2392                 must = prog->anchored_substr;
2393             }
2394             back_max = back_min = prog->anchored_offset;
2395         } else {
2396             if (utf8_target) {
2397                 if (! prog->float_utf8) {
2398                     to_utf8_substr(prog);
2399                 }
2400                 must = prog->float_utf8;
2401             }
2402             else {
2403                 if (! prog->float_substr) {
2404                     if (! to_byte_substr(prog)) {
2405                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2406                     }
2407                 }
2408                 must = prog->float_substr;
2409             }
2410             back_max = prog->float_max_offset;
2411             back_min = prog->float_min_offset;
2412         }
2413
2414         if (back_min<0) {
2415             last = strend;
2416         } else {
2417             last = HOP3c(strend,        /* Cannot start after this */
2418                   -(I32)(CHR_SVLEN(must)
2419                          - (SvTAIL(must) != 0) + back_min), strbeg);
2420         }
2421         if (s > PL_bostr)
2422             last1 = HOPc(s, -1);
2423         else
2424             last1 = s - 1;      /* bogus */
2425
2426         /* XXXX check_substr already used to find "s", can optimize if
2427            check_substr==must. */
2428         scream_pos = -1;
2429         dontbother = end_shift;
2430         strend = HOPc(strend, -dontbother);
2431         while ( (s <= last) &&
2432                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2433                                   (unsigned char*)strend, must,
2434                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2435             DEBUG_EXECUTE_r( did_match = 1 );
2436             if (HOPc(s, -back_max) > last1) {
2437                 last1 = HOPc(s, -back_min);
2438                 s = HOPc(s, -back_max);
2439             }
2440             else {
2441                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2442
2443                 last1 = HOPc(s, -back_min);
2444                 s = t;
2445             }
2446             if (utf8_target) {
2447                 while (s <= last1) {
2448                     if (regtry(&reginfo, &s))
2449                         goto got_it;
2450                     if (s >= last1) {
2451                         s++; /* to break out of outer loop */
2452                         break;
2453                     }
2454                     s += UTF8SKIP(s);
2455                 }
2456             }
2457             else {
2458                 while (s <= last1) {
2459                     if (regtry(&reginfo, &s))
2460                         goto got_it;
2461                     s++;
2462                 }
2463             }
2464         }
2465         DEBUG_EXECUTE_r(if (!did_match) {
2466             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2467                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2468             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2469                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2470                                ? "anchored" : "floating"),
2471                 quoted, RE_SV_TAIL(must));
2472         });
2473         goto phooey;
2474     }
2475     else if ( (c = progi->regstclass) ) {
2476         if (minlen) {
2477             const OPCODE op = OP(progi->regstclass);
2478             /* don't bother with what can't match */
2479             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2480                 strend = HOPc(strend, -(minlen - 1));
2481         }
2482         DEBUG_EXECUTE_r({
2483             SV * const prop = sv_newmortal();
2484             regprop(prog, prop, c);
2485             {
2486                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2487                     s,strend-s,60);
2488                 PerlIO_printf(Perl_debug_log,
2489                     "Matching stclass %.*s against %s (%d bytes)\n",
2490                     (int)SvCUR(prop), SvPVX_const(prop),
2491                      quoted, (int)(strend - s));
2492             }
2493         });
2494         if (find_byclass(prog, c, s, strend, &reginfo))
2495             goto got_it;
2496         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2497     }
2498     else {
2499         dontbother = 0;
2500         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2501             /* Trim the end. */
2502             char *last= NULL;
2503             SV* float_real;
2504             STRLEN len;
2505             const char *little;
2506
2507             if (utf8_target) {
2508                 if (! prog->float_utf8) {
2509                     to_utf8_substr(prog);
2510                 }
2511                 float_real = prog->float_utf8;
2512             }
2513             else {
2514                 if (! prog->float_substr) {
2515                     if (! to_byte_substr(prog)) {
2516                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2517                     }
2518                 }
2519                 float_real = prog->float_substr;
2520             }
2521
2522             little = SvPV_const(float_real, len);
2523             if (SvTAIL(float_real)) {
2524                     /* This means that float_real contains an artificial \n on
2525                      * the end due to the presence of something like this:
2526                      * /foo$/ where we can match both "foo" and "foo\n" at the
2527                      * end of the string.  So we have to compare the end of the
2528                      * string first against the float_real without the \n and
2529                      * then against the full float_real with the string.  We
2530                      * have to watch out for cases where the string might be
2531                      * smaller than the float_real or the float_real without
2532                      * the \n. */
2533                     char *checkpos= strend - len;
2534                     DEBUG_OPTIMISE_r(
2535                         PerlIO_printf(Perl_debug_log,
2536                             "%sChecking for float_real.%s\n",
2537                             PL_colors[4], PL_colors[5]));
2538                     if (checkpos + 1 < strbeg) {
2539                         /* can't match, even if we remove the trailing \n
2540                          * string is too short to match */
2541                         DEBUG_EXECUTE_r(
2542                             PerlIO_printf(Perl_debug_log,
2543                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2544                                 PL_colors[4], PL_colors[5]));
2545                         goto phooey;
2546                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2547                         /* can match, the end of the string matches without the
2548                          * "\n" */
2549                         last = checkpos + 1;
2550                     } else if (checkpos < strbeg) {
2551                         /* cant match, string is too short when the "\n" is
2552                          * included */
2553                         DEBUG_EXECUTE_r(
2554                             PerlIO_printf(Perl_debug_log,
2555                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2556                                 PL_colors[4], PL_colors[5]));
2557                         goto phooey;
2558                     } else if (!multiline) {
2559                         /* non multiline match, so compare with the "\n" at the
2560                          * end of the string */
2561                         if (memEQ(checkpos, little, len)) {
2562                             last= checkpos;
2563                         } else {
2564                             DEBUG_EXECUTE_r(
2565                                 PerlIO_printf(Perl_debug_log,
2566                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2567                                     PL_colors[4], PL_colors[5]));
2568                             goto phooey;
2569                         }
2570                     } else {
2571                         /* multiline match, so we have to search for a place
2572                          * where the full string is located */
2573                         goto find_last;
2574                     }
2575             } else {
2576                   find_last:
2577                     if (len)
2578                         last = rninstr(s, strend, little, little + len);
2579                     else
2580                         last = strend;  /* matching "$" */
2581             }
2582             if (!last) {
2583                 /* at one point this block contained a comment which was
2584                  * probably incorrect, which said that this was a "should not
2585                  * happen" case.  Even if it was true when it was written I am
2586                  * pretty sure it is not anymore, so I have removed the comment
2587                  * and replaced it with this one. Yves */
2588                 DEBUG_EXECUTE_r(
2589                     PerlIO_printf(Perl_debug_log,
2590                         "String does not contain required substring, cannot match.\n"
2591                     ));
2592                 goto phooey;
2593             }
2594             dontbother = strend - last + prog->float_min_offset;
2595         }
2596         if (minlen && (dontbother < minlen))
2597             dontbother = minlen - 1;
2598         strend -= dontbother;              /* this one's always in bytes! */
2599         /* We don't know much -- general case. */
2600         if (utf8_target) {
2601             for (;;) {
2602                 if (regtry(&reginfo, &s))
2603                     goto got_it;
2604                 if (s >= strend)
2605                     break;
2606                 s += UTF8SKIP(s);
2607             };
2608         }
2609         else {
2610             do {
2611                 if (regtry(&reginfo, &s))
2612                     goto got_it;
2613             } while (s++ < strend);
2614         }
2615     }
2616
2617     /* Failure. */
2618     goto phooey;
2619
2620 got_it:
2621     DEBUG_BUFFERS_r(
2622         if (swap)
2623             PerlIO_printf(Perl_debug_log,
2624                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2625                 PTR2UV(prog),
2626                 PTR2UV(swap)
2627             );
2628     );
2629     Safefree(swap);
2630     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2631
2632     if (PL_reg_state.re_state_eval_setup_done)
2633         restore_pos(aTHX_ prog);
2634     if (RXp_PAREN_NAMES(prog))
2635         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2636
2637     /* make sure $`, $&, $', and $digit will work later */
2638     if ( !(flags & REXEC_NOT_FIRST) ) {
2639         if (flags & REXEC_COPY_STR) {
2640 #ifdef PERL_OLD_COPY_ON_WRITE
2641             if ((SvIsCOW(sv)
2642                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2643                 if (DEBUG_C_TEST) {
2644                     PerlIO_printf(Perl_debug_log,
2645                                   "Copy on write: regexp capture, type %d\n",
2646                                   (int) SvTYPE(sv));
2647                 }
2648                 RX_MATCH_COPY_FREE(rx);
2649                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2650                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2651                 assert (SvPOKp(prog->saved_copy));
2652                 prog->sublen  = PL_regeol - strbeg;
2653                 prog->suboffset = 0;
2654                 prog->subcoffset = 0;
2655             } else
2656 #endif
2657             {
2658                 I32 min = 0;
2659                 I32 max = PL_regeol - strbeg;
2660                 I32 sublen;
2661
2662                 if (    (flags & REXEC_COPY_SKIP_POST)
2663                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2664                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2665                 ) { /* don't copy $' part of string */
2666                     U32 n = 0;
2667                     max = -1;
2668                     /* calculate the right-most part of the string covered
2669                      * by a capture. Due to look-ahead, this may be to
2670                      * the right of $&, so we have to scan all captures */
2671                     while (n <= prog->lastparen) {
2672                         if (prog->offs[n].end > max)
2673                             max = prog->offs[n].end;
2674                         n++;
2675                     }
2676                     if (max == -1)
2677                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2678                                 ? prog->offs[0].start
2679                                 : 0;
2680                     assert(max >= 0 && max <= PL_regeol - strbeg);
2681                 }
2682
2683                 if (    (flags & REXEC_COPY_SKIP_PRE)
2684                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2685                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2686                 ) { /* don't copy $` part of string */
2687                     U32 n = 0;
2688                     min = max;
2689                     /* calculate the left-most part of the string covered
2690                      * by a capture. Due to look-behind, this may be to
2691                      * the left of $&, so we have to scan all captures */
2692                     while (min && n <= prog->lastparen) {
2693                         if (   prog->offs[n].start != -1
2694                             && prog->offs[n].start < min)
2695                         {
2696                             min = prog->offs[n].start;
2697                         }
2698                         n++;
2699                     }
2700                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2701                         && min >  prog->offs[0].end
2702                     )
2703                         min = prog->offs[0].end;
2704
2705                 }
2706
2707                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2708                 sublen = max - min;
2709
2710                 if (RX_MATCH_COPIED(rx)) {
2711                     if (sublen > prog->sublen)
2712                         prog->subbeg =
2713                                 (char*)saferealloc(prog->subbeg, sublen+1);
2714                 }
2715                 else
2716                     prog->subbeg = (char*)safemalloc(sublen+1);
2717                 Copy(strbeg + min, prog->subbeg, sublen, char);
2718                 prog->subbeg[sublen] = '\0';
2719                 prog->suboffset = min;
2720                 prog->sublen = sublen;
2721                 RX_MATCH_COPIED_on(rx);
2722             }
2723             prog->subcoffset = prog->suboffset;
2724             if (prog->suboffset && utf8_target) {
2725                 /* Convert byte offset to chars.
2726                  * XXX ideally should only compute this if @-/@+
2727                  * has been seen, a la PL_sawampersand ??? */
2728
2729                 /* If there's a direct correspondence between the
2730                  * string which we're matching and the original SV,
2731                  * then we can use the utf8 len cache associated with
2732                  * the SV. In particular, it means that under //g,
2733                  * sv_pos_b2u() will use the previously cached
2734                  * position to speed up working out the new length of
2735                  * subcoffset, rather than counting from the start of
2736                  * the string each time. This stops
2737                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2738                  * from going quadratic */
2739                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2740                     sv_pos_b2u(sv, &(prog->subcoffset));
2741                 else
2742                     prog->subcoffset = utf8_length((U8*)strbeg,
2743                                         (U8*)(strbeg+prog->suboffset));
2744             }
2745         }
2746         else {
2747             RX_MATCH_COPY_FREE(rx);
2748             prog->subbeg = strbeg;
2749             prog->suboffset = 0;
2750             prog->subcoffset = 0;
2751             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2752         }
2753     }
2754
2755     return 1;
2756
2757 phooey:
2758     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2759                           PL_colors[4], PL_colors[5]));
2760     if (PL_reg_state.re_state_eval_setup_done)
2761         restore_pos(aTHX_ prog);
2762     if (swap) {
2763         /* we failed :-( roll it back */
2764         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2765             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2766             PTR2UV(prog),
2767             PTR2UV(prog->offs),
2768             PTR2UV(swap)
2769         ));
2770         Safefree(prog->offs);
2771         prog->offs = swap;
2772     }
2773     return 0;
2774 }
2775
2776
2777 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2778  * Do inc before dec, in case old and new rex are the same */
2779 #define SET_reg_curpm(Re2) \
2780     if (PL_reg_state.re_state_eval_setup_done) {    \
2781         (void)ReREFCNT_inc(Re2);                    \
2782         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2783         PM_SETRE((PL_reg_curpm), (Re2));            \
2784     }
2785
2786
2787 /*
2788  - regtry - try match at specific point
2789  */
2790 STATIC I32                      /* 0 failure, 1 success */
2791 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2792 {
2793     dVAR;
2794     CHECKPOINT lastcp;
2795     REGEXP *const rx = reginfo->prog;
2796     regexp *const prog = ReANY(rx);
2797     I32 result;
2798     RXi_GET_DECL(prog,progi);
2799     GET_RE_DEBUG_FLAGS_DECL;
2800
2801     PERL_ARGS_ASSERT_REGTRY;
2802
2803     reginfo->cutpoint=NULL;
2804
2805     if ((prog->extflags & RXf_EVAL_SEEN)
2806         && !PL_reg_state.re_state_eval_setup_done)
2807     {
2808         MAGIC *mg;
2809
2810         PL_reg_state.re_state_eval_setup_done = TRUE;
2811         if (reginfo->sv) {
2812             /* Make $_ available to executed code. */
2813             if (reginfo->sv != DEFSV) {
2814                 SAVE_DEFSV;
2815                 DEFSV_set(reginfo->sv);
2816             }
2817
2818             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2819                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2820                 /* prepare for quick setting of pos */
2821 #ifdef PERL_OLD_COPY_ON_WRITE
2822                 if (SvIsCOW(reginfo->sv))
2823                     sv_force_normal_flags(reginfo->sv, 0);
2824 #endif
2825                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2826                                  &PL_vtbl_mglob, NULL, 0);
2827                 mg->mg_len = -1;
2828             }
2829             PL_reg_magic    = mg;
2830             PL_reg_oldpos   = mg->mg_len;
2831             SAVEDESTRUCTOR_X(restore_pos, prog);
2832         }
2833         if (!PL_reg_curpm) {
2834             Newxz(PL_reg_curpm, 1, PMOP);
2835 #ifdef USE_ITHREADS
2836             {
2837                 SV* const repointer = &PL_sv_undef;
2838                 /* this regexp is also owned by the new PL_reg_curpm, which
2839                    will try to free it.  */
2840                 av_push(PL_regex_padav, repointer);
2841                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2842                 PL_regex_pad = AvARRAY(PL_regex_padav);
2843             }
2844 #endif
2845         }
2846         SET_reg_curpm(rx);
2847         PL_reg_oldcurpm = PL_curpm;
2848         PL_curpm = PL_reg_curpm;
2849         if (RXp_MATCH_COPIED(prog)) {
2850             /*  Here is a serious problem: we cannot rewrite subbeg,
2851                 since it may be needed if this match fails.  Thus
2852                 $` inside (?{}) could fail... */
2853             PL_reg_oldsaved = prog->subbeg;
2854             PL_reg_oldsavedlen = prog->sublen;
2855             PL_reg_oldsavedoffset = prog->suboffset;
2856             PL_reg_oldsavedcoffset = prog->suboffset;
2857 #ifdef PERL_OLD_COPY_ON_WRITE
2858             PL_nrs = prog->saved_copy;
2859 #endif
2860             RXp_MATCH_COPIED_off(prog);
2861         }
2862         else
2863             PL_reg_oldsaved = NULL;
2864         prog->subbeg = PL_bostr;
2865         prog->suboffset = 0;
2866         prog->subcoffset = 0;
2867         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2868     }
2869 #ifdef DEBUGGING
2870     PL_reg_starttry = *startposp;
2871 #endif
2872     prog->offs[0].start = *startposp - PL_bostr;
2873     prog->lastparen = 0;
2874     prog->lastcloseparen = 0;
2875     PL_regsize = 0;
2876
2877     /* XXXX What this code is doing here?!!!  There should be no need
2878        to do this again and again, prog->lastparen should take care of
2879        this!  --ilya*/
2880
2881     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2882      * Actually, the code in regcppop() (which Ilya may be meaning by
2883      * prog->lastparen), is not needed at all by the test suite
2884      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2885      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2886      * Meanwhile, this code *is* needed for the
2887      * above-mentioned test suite tests to succeed.  The common theme
2888      * on those tests seems to be returning null fields from matches.
2889      * --jhi updated by dapm */
2890 #if 1
2891     if (prog->nparens) {
2892         regexp_paren_pair *pp = prog->offs;
2893         I32 i;
2894         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2895             ++pp;
2896             pp->start = -1;
2897             pp->end = -1;
2898         }
2899     }
2900 #endif
2901     REGCP_SET(lastcp);
2902     result = regmatch(reginfo, *startposp, progi->program + 1);
2903     if (result != -1) {
2904         prog->offs[0].end = result;
2905         return 1;
2906     }
2907     if (reginfo->cutpoint)
2908         *startposp= reginfo->cutpoint;
2909     REGCP_UNWIND(lastcp);
2910     return 0;
2911 }
2912
2913
2914 #define sayYES goto yes
2915 #define sayNO goto no
2916 #define sayNO_SILENT goto no_silent
2917
2918 /* we dont use STMT_START/END here because it leads to
2919    "unreachable code" warnings, which are bogus, but distracting. */
2920 #define CACHEsayNO \
2921     if (ST.cache_mask) \
2922        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2923     sayNO
2924
2925 /* this is used to determine how far from the left messages like
2926    'failed...' are printed. It should be set such that messages
2927    are inline with the regop output that created them.
2928 */
2929 #define REPORT_CODE_OFF 32
2930
2931
2932 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2933 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2934 #define CHRTEST_NOT_A_CP_1 -999
2935 #define CHRTEST_NOT_A_CP_2 -998
2936
2937 #define SLAB_FIRST(s) (&(s)->states[0])
2938 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2939
2940 /* grab a new slab and return the first slot in it */
2941
2942 STATIC regmatch_state *
2943 S_push_slab(pTHX)
2944 {
2945 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2946     dMY_CXT;
2947 #endif
2948     regmatch_slab *s = PL_regmatch_slab->next;
2949     if (!s) {
2950         Newx(s, 1, regmatch_slab);
2951         s->prev = PL_regmatch_slab;
2952         s->next = NULL;
2953         PL_regmatch_slab->next = s;
2954     }
2955     PL_regmatch_slab = s;
2956     return SLAB_FIRST(s);
2957 }
2958
2959
2960 /* push a new state then goto it */
2961
2962 #define PUSH_STATE_GOTO(state, node, input) \
2963     pushinput = input; \
2964     scan = node; \
2965     st->resume_state = state; \
2966     goto push_state;
2967
2968 /* push a new state with success backtracking, then goto it */
2969
2970 #define PUSH_YES_STATE_GOTO(state, node, input) \
2971     pushinput = input; \
2972     scan = node; \
2973     st->resume_state = state; \
2974     goto push_yes_state;
2975
2976
2977
2978
2979 /*
2980
2981 regmatch() - main matching routine
2982
2983 This is basically one big switch statement in a loop. We execute an op,
2984 set 'next' to point the next op, and continue. If we come to a point which
2985 we may need to backtrack to on failure such as (A|B|C), we push a
2986 backtrack state onto the backtrack stack. On failure, we pop the top
2987 state, and re-enter the loop at the state indicated. If there are no more
2988 states to pop, we return failure.
2989
2990 Sometimes we also need to backtrack on success; for example /A+/, where
2991 after successfully matching one A, we need to go back and try to
2992 match another one; similarly for lookahead assertions: if the assertion
2993 completes successfully, we backtrack to the state just before the assertion
2994 and then carry on.  In these cases, the pushed state is marked as
2995 'backtrack on success too'. This marking is in fact done by a chain of
2996 pointers, each pointing to the previous 'yes' state. On success, we pop to
2997 the nearest yes state, discarding any intermediate failure-only states.
2998 Sometimes a yes state is pushed just to force some cleanup code to be
2999 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3000 it to free the inner regex.
3001
3002 Note that failure backtracking rewinds the cursor position, while
3003 success backtracking leaves it alone.
3004
3005 A pattern is complete when the END op is executed, while a subpattern
3006 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3007 ops trigger the "pop to last yes state if any, otherwise return true"
3008 behaviour.
3009
3010 A common convention in this function is to use A and B to refer to the two
3011 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3012 the subpattern to be matched possibly multiple times, while B is the entire
3013 rest of the pattern. Variable and state names reflect this convention.
3014
3015 The states in the main switch are the union of ops and failure/success of
3016 substates associated with with that op.  For example, IFMATCH is the op
3017 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3018 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3019 successfully matched A and IFMATCH_A_fail is a state saying that we have
3020 just failed to match A. Resume states always come in pairs. The backtrack
3021 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3022 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3023 on success or failure.
3024
3025 The struct that holds a backtracking state is actually a big union, with
3026 one variant for each major type of op. The variable st points to the
3027 top-most backtrack struct. To make the code clearer, within each
3028 block of code we #define ST to alias the relevant union.
3029
3030 Here's a concrete example of a (vastly oversimplified) IFMATCH
3031 implementation:
3032
3033     switch (state) {
3034     ....
3035
3036 #define ST st->u.ifmatch
3037
3038     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3039         ST.foo = ...; // some state we wish to save
3040         ...
3041         // push a yes backtrack state with a resume value of
3042         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3043         // first node of A:
3044         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3045         // NOTREACHED
3046
3047     case IFMATCH_A: // we have successfully executed A; now continue with B
3048         next = B;
3049         bar = ST.foo; // do something with the preserved value
3050         break;
3051
3052     case IFMATCH_A_fail: // A failed, so the assertion failed
3053         ...;   // do some housekeeping, then ...
3054         sayNO; // propagate the failure
3055
3056 #undef ST
3057
3058     ...
3059     }
3060
3061 For any old-timers reading this who are familiar with the old recursive
3062 approach, the code above is equivalent to:
3063
3064     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3065     {
3066         int foo = ...
3067         ...
3068         if (regmatch(A)) {
3069             next = B;
3070             bar = foo;
3071             break;
3072         }
3073         ...;   // do some housekeeping, then ...
3074         sayNO; // propagate the failure
3075     }
3076
3077 The topmost backtrack state, pointed to by st, is usually free. If you
3078 want to claim it, populate any ST.foo fields in it with values you wish to
3079 save, then do one of
3080
3081         PUSH_STATE_GOTO(resume_state, node, newinput);
3082         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3083
3084 which sets that backtrack state's resume value to 'resume_state', pushes a
3085 new free entry to the top of the backtrack stack, then goes to 'node'.
3086 On backtracking, the free slot is popped, and the saved state becomes the
3087 new free state. An ST.foo field in this new top state can be temporarily
3088 accessed to retrieve values, but once the main loop is re-entered, it
3089 becomes available for reuse.
3090
3091 Note that the depth of the backtrack stack constantly increases during the
3092 left-to-right execution of the pattern, rather than going up and down with
3093 the pattern nesting. For example the stack is at its maximum at Z at the
3094 end of the pattern, rather than at X in the following:
3095
3096     /(((X)+)+)+....(Y)+....Z/
3097
3098 The only exceptions to this are lookahead/behind assertions and the cut,
3099 (?>A), which pop all the backtrack states associated with A before
3100 continuing.
3101
3102 Backtrack state structs are allocated in slabs of about 4K in size.
3103 PL_regmatch_state and st always point to the currently active state,
3104 and PL_regmatch_slab points to the slab currently containing
3105 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3106 allocated, and is never freed until interpreter destruction. When the slab
3107 is full, a new one is allocated and chained to the end. At exit from
3108 regmatch(), slabs allocated since entry are freed.
3109
3110 */
3111
3112
3113 #define DEBUG_STATE_pp(pp)                                  \
3114     DEBUG_STATE_r({                                         \
3115         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3116         PerlIO_printf(Perl_debug_log,                       \
3117             "    %*s"pp" %s%s%s%s%s\n",                     \
3118             depth*2, "",                                    \
3119             PL_reg_name[st->resume_state],                     \
3120             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3121             ((st==yes_state) ? "Y" : ""),                   \
3122             ((st==mark_state) ? "M" : ""),                  \
3123             ((st==yes_state||st==mark_state) ? "]" : "")    \
3124         );                                                  \
3125     });
3126
3127
3128 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3129
3130 #ifdef DEBUGGING
3131
3132 STATIC void
3133 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3134     const char *start, const char *end, const char *blurb)
3135 {
3136     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3137
3138     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3139
3140     if (!PL_colorset)
3141             reginitcolors();
3142     {
3143         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3144             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3145
3146         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3147             start, end - start, 60);
3148
3149         PerlIO_printf(Perl_debug_log,
3150             "%s%s REx%s %s against %s\n",
3151                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3152
3153         if (utf8_target||utf8_pat)
3154             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3155                 utf8_pat ? "pattern" : "",
3156                 utf8_pat && utf8_target ? " and " : "",
3157                 utf8_target ? "string" : ""
3158             );
3159     }
3160 }
3161
3162 STATIC void
3163 S_dump_exec_pos(pTHX_ const char *locinput,
3164                       const regnode *scan,
3165                       const char *loc_regeol,
3166                       const char *loc_bostr,
3167                       const char *loc_reg_starttry,
3168                       const bool utf8_target)
3169 {
3170     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3171     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3172     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3173     /* The part of the string before starttry has one color
3174        (pref0_len chars), between starttry and current
3175        position another one (pref_len - pref0_len chars),
3176        after the current position the third one.
3177        We assume that pref0_len <= pref_len, otherwise we
3178        decrease pref0_len.  */
3179     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3180         ? (5 + taill) - l : locinput - loc_bostr;
3181     int pref0_len;
3182
3183     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3184
3185     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3186         pref_len++;
3187     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3188     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3189         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3190               ? (5 + taill) - pref_len : loc_regeol - locinput);
3191     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3192         l--;
3193     if (pref0_len < 0)
3194         pref0_len = 0;
3195     if (pref0_len > pref_len)
3196         pref0_len = pref_len;
3197     {
3198         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3199
3200         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3201             (locinput - pref_len),pref0_len, 60, 4, 5);
3202
3203         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3204                     (locinput - pref_len + pref0_len),
3205                     pref_len - pref0_len, 60, 2, 3);
3206
3207         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3208                     locinput, loc_regeol - locinput, 10, 0, 1);
3209
3210         const STRLEN tlen=len0+len1+len2;
3211         PerlIO_printf(Perl_debug_log,
3212                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3213                     (IV)(locinput - loc_bostr),
3214                     len0, s0,
3215                     len1, s1,
3216                     (docolor ? "" : "> <"),
3217                     len2, s2,
3218                     (int)(tlen > 19 ? 0 :  19 - tlen),
3219                     "");
3220     }
3221 }
3222
3223 #endif
3224
3225 /* reg_check_named_buff_matched()
3226  * Checks to see if a named buffer has matched. The data array of
3227  * buffer numbers corresponding to the buffer is expected to reside
3228  * in the regexp->data->data array in the slot stored in the ARG() of
3229  * node involved. Note that this routine doesn't actually care about the
3230  * name, that information is not preserved from compilation to execution.
3231  * Returns the index of the leftmost defined buffer with the given name
3232  * or 0 if non of the buffers matched.
3233  */
3234 STATIC I32
3235 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3236 {
3237     I32 n;
3238     RXi_GET_DECL(rex,rexi);
3239     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3240     I32 *nums=(I32*)SvPVX(sv_dat);
3241
3242     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3243
3244     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3245         if ((I32)rex->lastparen >= nums[n] &&
3246             rex->offs[nums[n]].end != -1)
3247         {
3248             return nums[n];
3249         }
3250     }
3251     return 0;
3252 }
3253
3254
3255 /* free all slabs above current one  - called during LEAVE_SCOPE */
3256
3257 STATIC void
3258 S_clear_backtrack_stack(pTHX_ void *p)
3259 {
3260     regmatch_slab *s = PL_regmatch_slab->next;
3261     PERL_UNUSED_ARG(p);
3262
3263     if (!s)
3264         return;
3265     PL_regmatch_slab->next = NULL;
3266     while (s) {
3267         regmatch_slab * const osl = s;
3268         s = s->next;
3269         Safefree(osl);
3270     }
3271 }
3272 static bool
3273 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3274 {
3275     /* This function determines if there are one or two characters that match
3276      * the first character of the passed-in EXACTish node <text_node>, and if
3277      * so, returns them in the passed-in pointers.
3278      *
3279      * If it determines that no possible character in the target string can
3280      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3281      * the first character in <text_node> requires UTF-8 to represent, and the
3282      * target string isn't in UTF-8.)
3283      *
3284      * If there are more than two characters that could match the beginning of
3285      * <text_node>, or if more context is required to determine a match or not,
3286      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3287      *
3288      * The motiviation behind this function is to allow the caller to set up
3289      * tight loops for matching.  If <text_node> is of type EXACT, there is
3290      * only one possible character that can match its first character, and so
3291      * the situation is quite simple.  But things get much more complicated if
3292      * folding is involved.  It may be that the first character of an EXACTFish
3293      * node doesn't participate in any possible fold, e.g., punctuation, so it
3294      * can be matched only by itself.  The vast majority of characters that are
3295      * in folds match just two things, their lower and upper-case equivalents.
3296      * But not all are like that; some have multiple possible matches, or match
3297      * sequences of more than one character.  This function sorts all that out.
3298      *
3299      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3300      * loop of trying to match A*, we know we can't exit where the thing
3301      * following it isn't a B.  And something can't be a B unless it is the
3302      * beginning of B.  By putting a quick test for that beginning in a tight
3303      * loop, we can rule out things that can't possibly be B without having to
3304      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3305      * character, we can make a tight loop matching A*, using the outputs of
3306      * this function.
3307      *
3308      * If the target string to match isn't in UTF-8, and there aren't
3309      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3310      * the one or two possible octets (which are characters in this situation)
3311      * that can match.  In all cases, if there is only one character that can
3312      * match, *<c1p> and *<c2p> will be identical.
3313      *
3314      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3315      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3316      * can match the beginning of <text_node>.  They should be declared with at
3317      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3318      * undefined what these contain.)  If one or both of the buffers are
3319      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3320      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3321      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3322      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3323      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3324
3325     const bool utf8_target = PL_reg_match_utf8;
3326
3327     UV c1 = CHRTEST_NOT_A_CP_1;
3328     UV c2 = CHRTEST_NOT_A_CP_2;
3329     bool use_chrtest_void = FALSE;
3330
3331     /* Used when we have both utf8 input and utf8 output, to avoid converting
3332      * to/from code points */
3333     bool utf8_has_been_setup = FALSE;
3334
3335     dVAR;
3336
3337     U8 *pat = (U8*)STRING(text_node);
3338
3339     if (OP(text_node) == EXACT) {
3340
3341         /* In an exact node, only one thing can be matched, that first
3342          * character.  If both the pat and the target are UTF-8, we can just
3343          * copy the input to the output, avoiding finding the code point of
3344          * that character */
3345         if (! UTF_PATTERN) {
3346             c2 = c1 = *pat;
3347         }
3348         else if (utf8_target) {
3349             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3350             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3351             utf8_has_been_setup = TRUE;
3352         }
3353         else {
3354             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3355         }
3356     }
3357     else /* an EXACTFish node */
3358          if ((UTF_PATTERN
3359                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3360                                                     pat + STR_LEN(text_node)))
3361              || (! UTF_PATTERN
3362                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3363                                                     pat + STR_LEN(text_node))))
3364     {
3365         /* Multi-character folds require more context to sort out.  Also
3366          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3367          * handled outside this routine */
3368         use_chrtest_void = TRUE;
3369     }
3370     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3371         c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3372         if (c1 > 256) {
3373             /* Load the folds hash, if not already done */
3374             SV** listp;
3375             if (! PL_utf8_foldclosures) {
3376                 if (! PL_utf8_tofold) {
3377                     U8 dummy[UTF8_MAXBYTES+1];
3378
3379                     /* Force loading this by folding an above-Latin1 char */
3380                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3381                     assert(PL_utf8_tofold); /* Verify that worked */
3382                 }
3383                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3384             }
3385
3386             /* The fold closures data structure is a hash with the keys being
3387              * the UTF-8 of every character that is folded to, like 'k', and
3388              * the values each an array of all code points that fold to its
3389              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3390              * not included */
3391             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3392                                      (char *) pat,
3393                                      UTF8SKIP(pat),
3394                                      FALSE))))
3395             {
3396                 /* Not found in the hash, therefore there are no folds
3397                  * containing it, so there is only a single character that
3398                  * could match */
3399                 c2 = c1;
3400             }
3401             else {  /* Does participate in folds */
3402                 AV* list = (AV*) *listp;
3403                 if (av_len(list) != 1) {
3404
3405                     /* If there aren't exactly two folds to this, it is outside
3406                      * the scope of this function */
3407                     use_chrtest_void = TRUE;
3408                 }
3409                 else {  /* There are two.  Get them */
3410                     SV** c_p = av_fetch(list, 0, FALSE);
3411                     if (c_p == NULL) {
3412                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3413                     }
3414                     c1 = SvUV(*c_p);
3415
3416                     c_p = av_fetch(list, 1, FALSE);
3417                     if (c_p == NULL) {
3418                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3419                     }
3420                     c2 = SvUV(*c_p);
3421
3422                     /* Folds that cross the 255/256 boundary are forbidden if
3423                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3424                      * pattern character is above 256, and its only other match
3425                      * is below 256, the only legal match will be to itself.
3426                      * We have thrown away the original, so have to compute
3427                      * which is the one above 255 */
3428                     if ((c1 < 256) != (c2 < 256)) {
3429                         if (OP(text_node) == EXACTFL
3430                             || (OP(text_node) == EXACTFA
3431                                 && (isASCII(c1) || isASCII(c2))))
3432                         {
3433                             if (c1 < 256) {
3434                                 c1 = c2;
3435                             }
3436                             else {
3437                                 c2 = c1;
3438                             }
3439                         }
3440                     }
3441                 }
3442             }
3443         }
3444         else /* Here, c1 is < 255 */
3445              if (utf8_target
3446                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3447                  && OP(text_node) != EXACTFL
3448                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3449         {
3450             /* Here, there could be something above Latin1 in the target which
3451              * folds to this character in the pattern.  All such cases except
3452              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3453              * involved in their folds, so are outside the scope of this
3454              * function */
3455             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3456                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3457             }
3458             else {
3459                 use_chrtest_void = TRUE;
3460             }
3461         }
3462         else { /* Here nothing above Latin1 can fold to the pattern character */
3463             switch (OP(text_node)) {
3464
3465                 case EXACTFL:   /* /l rules */
3466                     c2 = PL_fold_locale[c1];
3467                     break;
3468
3469                 case EXACTF:
3470                     if (! utf8_target) {    /* /d rules */
3471                         c2 = PL_fold[c1];
3472                         break;
3473                     }
3474                     /* FALLTHROUGH */
3475                     /* /u rules for all these.  This happens to work for
3476                      * EXACTFA as nothing in Latin1 folds to ASCII */
3477                 case EXACTFA:
3478                 case EXACTFU_TRICKYFOLD:
3479                 case EXACTFU_SS:
3480                 case EXACTFU:
3481                     c2 = PL_fold_latin1[c1];
3482                     break;
3483
3484                 default:
3485                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3486                     assert(0); /* NOTREACHED */
3487             }
3488         }
3489     }
3490
3491     /* Here have figured things out.  Set up the returns */
3492     if (use_chrtest_void) {
3493         *c2p = *c1p = CHRTEST_VOID;
3494     }
3495     else if (utf8_target) {
3496         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3497             uvchr_to_utf8(c1_utf8, c1);
3498             uvchr_to_utf8(c2_utf8, c2);
3499         }
3500
3501         /* Invariants are stored in both the utf8 and byte outputs; Use
3502          * negative numbers otherwise for the byte ones.  Make sure that the
3503          * byte ones are the same iff the utf8 ones are the same */
3504         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3505         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3506                 ? *c2_utf8
3507                 : (c1 == c2)
3508                   ? CHRTEST_NOT_A_CP_1
3509                   : CHRTEST_NOT_A_CP_2;
3510     }
3511     else if (c1 > 255) {
3512        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3513                            can represent */
3514            return FALSE;
3515        }
3516
3517        *c1p = *c2p = c2;    /* c2 is the only representable value */
3518     }
3519     else {  /* c1 is representable; see about c2 */
3520        *c1p = c1;
3521        *c2p = (c2 < 256) ? c2 : c1;
3522     }
3523
3524     return TRUE;
3525 }
3526
3527 /* returns -1 on failure, $+[0] on success */
3528 STATIC I32
3529 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3530 {
3531 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3532     dMY_CXT;
3533 #endif
3534     dVAR;
3535     const bool utf8_target = PL_reg_match_utf8;
3536     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3537     REGEXP *rex_sv = reginfo->prog;
3538     regexp *rex = ReANY(rex_sv);
3539     RXi_GET_DECL(rex,rexi);
3540     I32 oldsave;
3541     /* the current state. This is a cached copy of PL_regmatch_state */
3542     regmatch_state *st;
3543     /* cache heavy used fields of st in registers */
3544     regnode *scan;
3545     regnode *next;
3546     U32 n = 0;  /* general value; init to avoid compiler warning */
3547     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3548     char *locinput = startpos;
3549     char *pushinput; /* where to continue after a PUSH */
3550     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3551
3552     bool result = 0;        /* return value of S_regmatch */
3553     int depth = 0;          /* depth of backtrack stack */
3554     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3555     const U32 max_nochange_depth =
3556         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3557         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3558     regmatch_state *yes_state = NULL; /* state to pop to on success of
3559                                                             subpattern */
3560     /* mark_state piggy backs on the yes_state logic so that when we unwind
3561        the stack on success we can update the mark_state as we go */
3562     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3563     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3564     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3565     U32 state_num;
3566     bool no_final = 0;      /* prevent failure from backtracking? */
3567     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3568     char *startpoint = locinput;
3569     SV *popmark = NULL;     /* are we looking for a mark? */
3570     SV *sv_commit = NULL;   /* last mark name seen in failure */
3571     SV *sv_yes_mark = NULL; /* last mark name we have seen
3572                                during a successful match */
3573     U32 lastopen = 0;       /* last open we saw */
3574     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3575     SV* const oreplsv = GvSV(PL_replgv);
3576     /* these three flags are set by various ops to signal information to
3577      * the very next op. They have a useful lifetime of exactly one loop
3578      * iteration, and are not preserved or restored by state pushes/pops
3579      */
3580     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3581     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3582     int logical = 0;        /* the following EVAL is:
3583                                 0: (?{...})
3584                                 1: (?(?{...})X|Y)
3585                                 2: (??{...})
3586                                or the following IFMATCH/UNLESSM is:
3587                                 false: plain (?=foo)
3588                                 true:  used as a condition: (?(?=foo))
3589                             */
3590     PAD* last_pad = NULL;
3591     dMULTICALL;
3592     I32 gimme = G_SCALAR;
3593     CV *caller_cv = NULL;       /* who called us */
3594     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3595     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3596
3597 #ifdef DEBUGGING
3598     GET_RE_DEBUG_FLAGS_DECL;
3599 #endif
3600
3601     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3602     multicall_oldcatch = 0;
3603     multicall_cv = NULL;
3604     cx = NULL;
3605     PERL_UNUSED_VAR(multicall_cop);
3606     PERL_UNUSED_VAR(newsp);
3607
3608
3609     PERL_ARGS_ASSERT_REGMATCH;
3610
3611     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3612             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3613     }));
3614     /* on first ever call to regmatch, allocate first slab */
3615     if (!PL_regmatch_slab) {
3616         Newx(PL_regmatch_slab, 1, regmatch_slab);
3617         PL_regmatch_slab->prev = NULL;
3618         PL_regmatch_slab->next = NULL;
3619         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3620     }
3621
3622     oldsave = PL_savestack_ix;
3623     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3624     SAVEVPTR(PL_regmatch_slab);
3625     SAVEVPTR(PL_regmatch_state);
3626
3627     /* grab next free state slot */
3628     st = ++PL_regmatch_state;
3629     if (st >  SLAB_LAST(PL_regmatch_slab))
3630         st = PL_regmatch_state = S_push_slab(aTHX);
3631
3632     /* Note that nextchr is a byte even in UTF */
3633     SET_nextchr;
3634     scan = prog;
3635     while (scan != NULL) {
3636
3637         DEBUG_EXECUTE_r( {
3638             SV * const prop = sv_newmortal();
3639             regnode *rnext=regnext(scan);
3640             DUMP_EXEC_POS( locinput, scan, utf8_target );
3641             regprop(rex, prop, scan);
3642
3643             PerlIO_printf(Perl_debug_log,
3644                     "%3"IVdf":%*s%s(%"IVdf")\n",
3645                     (IV)(scan - rexi->program), depth*2, "",
3646                     SvPVX_const(prop),
3647                     (PL_regkind[OP(scan)] == END || !rnext) ?
3648                         0 : (IV)(rnext - rexi->program));
3649         });
3650
3651         next = scan + NEXT_OFF(scan);
3652         if (next == scan)
3653             next = NULL;
3654         state_num = OP(scan);
3655
3656         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3657       reenter_switch:
3658
3659         SET_nextchr;
3660         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3661
3662         switch (state_num) {
3663         case BOL: /*  /^../  */
3664             if (locinput == PL_bostr)
3665             {
3666                 /* reginfo->till = reginfo->bol; */
3667                 break;
3668             }
3669             sayNO;
3670
3671         case MBOL: /*  /^../m  */
3672             if (locinput == PL_bostr ||
3673                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3674             {
3675                 break;
3676             }
3677             sayNO;
3678
3679         case SBOL: /*  /^../s  */
3680             if (locinput == PL_bostr)
3681                 break;
3682             sayNO;
3683
3684         case GPOS: /*  \G  */
3685             if (locinput == reginfo->ganch)
3686                 break;
3687             sayNO;
3688
3689         case KEEPS: /*   \K  */
3690             /* update the startpoint */
3691             st->u.keeper.val = rex->offs[0].start;
3692             rex->offs[0].start = locinput - PL_bostr;
3693             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3694             assert(0); /*NOTREACHED*/
3695         case KEEPS_next_fail:
3696             /* rollback the start point change */
3697             rex->offs[0].start = st->u.keeper.val;
3698             sayNO_SILENT;
3699             assert(0); /*NOTREACHED*/
3700
3701         case EOL: /* /..$/  */
3702                 goto seol;
3703
3704         case MEOL: /* /..$/m  */
3705             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3706                 sayNO;
3707             break;
3708
3709         case SEOL: /* /..$/s  */
3710           seol:
3711             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3712                 sayNO;
3713             if (PL_regeol - locinput > 1)
3714                 sayNO;
3715             break;
3716
3717         case EOS: /*  \z  */
3718             if (!NEXTCHR_IS_EOS)
3719                 sayNO;
3720             break;
3721
3722         case SANY: /*  /./s  */
3723             if (NEXTCHR_IS_EOS)
3724                 sayNO;
3725             goto increment_locinput;
3726
3727         case CANY: /*  \C  */
3728             if (NEXTCHR_IS_EOS)
3729                 sayNO;
3730             locinput++;
3731             break;
3732
3733         case REG_ANY: /*  /./  */
3734             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3735                 sayNO;
3736             goto increment_locinput;
3737
3738
3739 #undef  ST
3740 #define ST st->u.trie
3741         case TRIEC: /* (ab|cd) with known charclass */
3742             /* In this case the charclass data is available inline so
3743                we can fail fast without a lot of extra overhead.
3744              */
3745             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3746                 DEBUG_EXECUTE_r(
3747                     PerlIO_printf(Perl_debug_log,
3748                               "%*s  %sfailed to match trie start class...%s\n",
3749                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3750                 );
3751                 sayNO_SILENT;
3752                 assert(0); /* NOTREACHED */
3753             }
3754             /* FALL THROUGH */
3755         case TRIE:  /* (ab|cd)  */
3756             /* the basic plan of execution of the trie is:
3757              * At the beginning, run though all the states, and
3758              * find the longest-matching word. Also remember the position
3759              * of the shortest matching word. For example, this pattern:
3760              *    1  2 3 4    5
3761              *    ab|a|x|abcd|abc
3762              * when matched against the string "abcde", will generate
3763              * accept states for all words except 3, with the longest
3764              * matching word being 4, and the shortest being 2 (with
3765              * the position being after char 1 of the string).
3766              *
3767              * Then for each matching word, in word order (i.e. 1,2,4,5),
3768              * we run the remainder of the pattern; on each try setting
3769              * the current position to the character following the word,
3770              * returning to try the next word on failure.
3771              *
3772              * We avoid having to build a list of words at runtime by
3773              * using a compile-time structure, wordinfo[].prev, which
3774              * gives, for each word, the previous accepting word (if any).
3775              * In the case above it would contain the mappings 1->2, 2->0,
3776              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3777              * the longest word (4 above), a list of all words, by
3778              * following the list of prev pointers; this gives us the
3779              * unordered list 4,5,1,2. Then given the current word we have
3780              * just tried, we can go through the list and find the
3781              * next-biggest word to try (so if we just failed on word 2,
3782              * the next in the list is 4).
3783              *
3784              * Since at runtime we don't record the matching position in
3785              * the string for each word, we have to work that out for
3786              * each word we're about to process. The wordinfo table holds
3787              * the character length of each word; given that we recorded
3788              * at the start: the position of the shortest word and its
3789              * length in chars, we just need to move the pointer the
3790              * difference between the two char lengths. Depending on
3791              * Unicode status and folding, that's cheap or expensive.
3792              *
3793              * This algorithm is optimised for the case where are only a
3794              * small number of accept states, i.e. 0,1, or maybe 2.
3795              * With lots of accepts states, and having to try all of them,
3796              * it becomes quadratic on number of accept states to find all
3797              * the next words.
3798              */
3799
3800             {
3801                 /* what type of TRIE am I? (utf8 makes this contextual) */
3802                 DECL_TRIE_TYPE(scan);
3803
3804                 /* what trie are we using right now */
3805                 reg_trie_data * const trie
3806                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3807                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3808                 U32 state = trie->startstate;
3809
3810                 if (   trie->bitmap
3811                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3812                 {
3813                     if (trie->states[ state ].wordnum) {
3814                          DEBUG_EXECUTE_r(
3815                             PerlIO_printf(Perl_debug_log,
3816                                           "%*s  %smatched empty string...%s\n",
3817                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3818                         );
3819                         if (!trie->jump)
3820                             break;
3821                     } else {
3822                         DEBUG_EXECUTE_r(
3823                             PerlIO_printf(Perl_debug_log,
3824                                           "%*s  %sfailed to match trie start class...%s\n",
3825                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3826                         );
3827                         sayNO_SILENT;
3828                    }
3829                 }
3830
3831             {
3832                 U8 *uc = ( U8* )locinput;
3833
3834                 STRLEN len = 0;
3835                 STRLEN foldlen = 0;
3836                 U8 *uscan = (U8*)NULL;
3837                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3838                 U32 charcount = 0; /* how many input chars we have matched */
3839                 U32 accepted = 0; /* have we seen any accepting states? */
3840
3841                 ST.jump = trie->jump;
3842                 ST.me = scan;
3843                 ST.firstpos = NULL;
3844                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3845                 ST.nextword = 0;
3846
3847                 /* fully traverse the TRIE; note the position of the
3848                    shortest accept state and the wordnum of the longest
3849                    accept state */
3850
3851                 while ( state && uc <= (U8*)PL_regeol ) {
3852                     U32 base = trie->states[ state ].trans.base;
3853                     UV uvc = 0;
3854                     U16 charid = 0;
3855                     U16 wordnum;
3856                     wordnum = trie->states[ state ].wordnum;
3857
3858                     if (wordnum) { /* it's an accept state */
3859                         if (!accepted) {
3860                             accepted = 1;
3861                             /* record first match position */
3862                             if (ST.longfold) {
3863                                 ST.firstpos = (U8*)locinput;
3864                                 ST.firstchars = 0;
3865                             }
3866                             else {
3867                                 ST.firstpos = uc;
3868                                 ST.firstchars = charcount;
3869                             }
3870                         }
3871                         if (!ST.nextword || wordnum < ST.nextword)
3872                             ST.nextword = wordnum;
3873                         ST.topword = wordnum;
3874                     }
3875
3876                     DEBUG_TRIE_EXECUTE_r({
3877                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3878                                 PerlIO_printf( Perl_debug_log,
3879                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3880                                     2+depth * 2, "", PL_colors[4],
3881                                     (UV)state, (accepted ? 'Y' : 'N'));
3882                     });
3883
3884                     /* read a char and goto next state */
3885                     if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3886                         I32 offset;
3887                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3888                                              uscan, len, uvc, charid, foldlen,
3889                                              foldbuf, uniflags);
3890                         charcount++;
3891                         if (foldlen>0)
3892                             ST.longfold = TRUE;
3893                         if (charid &&
3894                              ( ((offset =
3895                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3896
3897                              && ((U32)offset < trie->lasttrans)
3898                              && trie->trans[offset].check == state)
3899                         {
3900                             state = trie->trans[offset].next;
3901                         }
3902                         else {
3903                             state = 0;
3904                         }
3905                         uc += len;
3906
3907                     }
3908                     else {
3909                         state = 0;
3910                     }
3911                     DEBUG_TRIE_EXECUTE_r(
3912                         PerlIO_printf( Perl_debug_log,
3913                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3914                             charid, uvc, (UV)state, PL_colors[5] );
3915                     );
3916                 }
3917                 if (!accepted)
3918                    sayNO;
3919
3920                 /* calculate total number of accept states */
3921                 {
3922                     U16 w = ST.topword;
3923                     accepted = 0;
3924                     while (w) {
3925                         w = trie->wordinfo[w].prev;
3926                         accepted++;
3927                     }
3928                     ST.accepted = accepted;
3929                 }
3930
3931                 DEBUG_EXECUTE_r(
3932                     PerlIO_printf( Perl_debug_log,
3933                         "%*s  %sgot %"IVdf" possible matches%s\n",
3934                         REPORT_CODE_OFF + depth * 2, "",
3935                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3936                 );
3937                 goto trie_first_try; /* jump into the fail handler */
3938             }}
3939             assert(0); /* NOTREACHED */
3940
3941         case TRIE_next_fail: /* we failed - try next alternative */
3942         {
3943             U8 *uc;
3944             if ( ST.jump) {
3945                 REGCP_UNWIND(ST.cp);
3946                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3947             }
3948             if (!--ST.accepted) {
3949                 DEBUG_EXECUTE_r({
3950                     PerlIO_printf( Perl_debug_log,
3951                         "%*s  %sTRIE failed...%s\n",
3952                         REPORT_CODE_OFF+depth*2, "",
3953                         PL_colors[4],
3954                         PL_colors[5] );
3955                 });
3956                 sayNO_SILENT;
3957             }
3958             {
3959                 /* Find next-highest word to process.  Note that this code
3960                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3961                 U16 min = 0;
3962                 U16 word;
3963                 U16 const nextword = ST.nextword;
3964                 reg_trie_wordinfo * const wordinfo
3965                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3966                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3967                     if (word > nextword && (!min || word < min))
3968                         min = word;
3969                 }
3970                 ST.nextword = min;
3971             }
3972
3973           trie_first_try:
3974             if (do_cutgroup) {
3975                 do_cutgroup = 0;
3976                 no_final = 0;
3977             }
3978
3979             if ( ST.jump) {
3980                 ST.lastparen = rex->lastparen;
3981                 ST.lastcloseparen = rex->lastcloseparen;
3982                 REGCP_SET(ST.cp);
3983             }
3984
3985             /* find start char of end of current word */
3986             {
3987                 U32 chars; /* how many chars to skip */
3988                 reg_trie_data * const trie
3989                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3990
3991                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3992                             >=  ST.firstchars);
3993                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3994                             - ST.firstchars;
3995                 uc = ST.firstpos;
3996
3997                 if (ST.longfold) {
3998                     /* the hard option - fold each char in turn and find
3999                      * its folded length (which may be different */
4000                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4001                     STRLEN foldlen;
4002                     STRLEN len;
4003                     UV uvc;
4004                     U8 *uscan;
4005
4006                     while (chars) {
4007                         if (utf8_target) {
4008                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4009                                                     uniflags);
4010                             uc += len;
4011                         }
4012                         else {
4013                             uvc = *uc;
4014                             uc++;
4015                         }
4016                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4017                         uscan = foldbuf;
4018                         while (foldlen) {
4019                             if (!--chars)
4020                                 break;
4021                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4022                                             uniflags);
4023                             uscan += len;
4024                             foldlen -= len;
4025                         }
4026                     }
4027                 }
4028                 else {
4029                     if (utf8_target)
4030                         while (chars--)
4031                             uc += UTF8SKIP(uc);
4032                     else
4033                         uc += chars;
4034                 }
4035             }
4036
4037             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4038                             ? ST.jump[ST.nextword]
4039                             : NEXT_OFF(ST.me));
4040
4041             DEBUG_EXECUTE_r({
4042                 PerlIO_printf( Perl_debug_log,
4043                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4044                     REPORT_CODE_OFF+depth*2, "",
4045                     PL_colors[4],
4046                     ST.nextword,
4047                     PL_colors[5]
4048                     );
4049             });
4050
4051             if (ST.accepted > 1 || has_cutgroup) {
4052                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4053                 assert(0); /* NOTREACHED */
4054             }
4055             /* only one choice left - just continue */
4056             DEBUG_EXECUTE_r({
4057                 AV *const trie_words
4058                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4059                 SV ** const tmp = av_fetch( trie_words,
4060                     ST.nextword-1, 0 );
4061                 SV *sv= tmp ? sv_newmortal() : NULL;
4062
4063                 PerlIO_printf( Perl_debug_log,
4064                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4065                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4066                     ST.nextword,
4067                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4068                             PL_colors[0], PL_colors[1],
4069                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4070                         )
4071                     : "not compiled under -Dr",
4072                     PL_colors[5] );
4073             });
4074
4075             locinput = (char*)uc;
4076             continue; /* execute rest of RE */
4077             assert(0); /* NOTREACHED */
4078         }
4079 #undef  ST
4080
4081         case EXACT: {            /*  /abc/        */
4082             char *s = STRING(scan);
4083             ln = STR_LEN(scan);
4084             if (utf8_target != UTF_PATTERN) {
4085                 /* The target and the pattern have differing utf8ness. */
4086                 char *l = locinput;
4087                 const char * const e = s + ln;
4088
4089                 if (utf8_target) {
4090                     /* The target is utf8, the pattern is not utf8.
4091                      * Above-Latin1 code points can't match the pattern;
4092                      * invariants match exactly, and the other Latin1 ones need
4093                      * to be downgraded to a single byte in order to do the
4094                      * comparison.  (If we could be confident that the target
4095                      * is not malformed, this could be refactored to have fewer
4096                      * tests by just assuming that if the first bytes match, it
4097                      * is an invariant, but there are tests in the test suite
4098                      * dealing with (??{...}) which violate this) */
4099                     while (s < e) {
4100                         if (l >= PL_regeol)
4101                              sayNO;
4102                         if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4103                             sayNO;
4104                         }
4105                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4106                             if (*l != *s) {
4107                                 sayNO;
4108                             }
4109                             l++;
4110                         }
4111                         else {
4112                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4113                                 sayNO;
4114                             }
4115                             l += 2;
4116                         }
4117                         s++;
4118                     }
4119                 }
4120                 else {
4121                     /* The target is not utf8, the pattern is utf8. */
4122                     while (s < e) {
4123                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4124                         {
4125                             sayNO;
4126                         }
4127                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4128                             if (*s != *l) {
4129                                 sayNO;
4130                             }
4131                             s++;
4132                         }
4133                         else {
4134                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4135                                 sayNO;
4136                             }
4137                             s += 2;
4138                         }
4139                         l++;
4140                     }
4141                 }
4142                 locinput = l;
4143                 break;
4144             }
4145             /* The target and the pattern have the same utf8ness. */
4146             /* Inline the first character, for speed. */
4147             if (UCHARAT(s) != nextchr)
4148                 sayNO;
4149             if (PL_regeol - locinput < ln)
4150                 sayNO;
4151             if (ln > 1 && memNE(s, locinput, ln))
4152                 sayNO;
4153             locinput += ln;
4154             break;
4155             }
4156
4157         case EXACTFL: {          /*  /abc/il      */
4158             re_fold_t folder;
4159             const U8 * fold_array;
4160             const char * s;
4161             U32 fold_utf8_flags;
4162
4163             PL_reg_flags |= RF_tainted;
4164             folder = foldEQ_locale;
4165             fold_array = PL_fold_locale;
4166             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4167             goto do_exactf;
4168
4169         case EXACTFU_SS:         /*  /\x{df}/iu   */
4170         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4171         case EXACTFU:            /*  /abc/iu      */
4172             folder = foldEQ_latin1;
4173             fold_array = PL_fold_latin1;
4174             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4175             goto do_exactf;
4176
4177         case EXACTFA:            /*  /abc/iaa     */
4178             folder = foldEQ_latin1;
4179             fold_array = PL_fold_latin1;
4180             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4181             goto do_exactf;
4182
4183         case EXACTF:             /*  /abc/i       */
4184             folder = foldEQ;
4185             fold_array = PL_fold;
4186             fold_utf8_flags = 0;
4187
4188           do_exactf:
4189             s = STRING(scan);
4190             ln = STR_LEN(scan);
4191
4192             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4193               /* Either target or the pattern are utf8, or has the issue where
4194                * the fold lengths may differ. */
4195                 const char * const l = locinput;
4196                 char *e = PL_regeol;
4197
4198                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
4199                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4200                 {
4201                     sayNO;
4202                 }
4203                 locinput = e;
4204                 break;
4205             }
4206
4207             /* Neither the target nor the pattern are utf8 */
4208             if (UCHARAT(s) != nextchr
4209                 && !NEXTCHR_IS_EOS
4210                 && UCHARAT(s) != fold_array[nextchr])
4211             {
4212                 sayNO;
4213             }
4214             if (PL_regeol - locinput < ln)
4215                 sayNO;
4216             if (ln > 1 && ! folder(s, locinput, ln))
4217                 sayNO;
4218             locinput += ln;
4219             break;
4220         }
4221
4222         /* XXX Could improve efficiency by separating these all out using a
4223          * macro or in-line function.  At that point regcomp.c would no longer
4224          * have to set the FLAGS fields of these */
4225         case BOUNDL:  /*  /\b/l  */
4226         case NBOUNDL: /*  /\B/l  */
4227             PL_reg_flags |= RF_tainted;
4228             /* FALL THROUGH */
4229         case BOUND:   /*  /\b/   */
4230         case BOUNDU:  /*  /\b/u  */
4231         case BOUNDA:  /*  /\b/a  */
4232         case NBOUND:  /*  /\B/   */
4233         case NBOUNDU: /*  /\B/u  */
4234         case NBOUNDA: /*  /\B/a  */
4235             /* was last char in word? */
4236             if (utf8_target
4237                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4238                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4239             {
4240                 if (locinput == PL_bostr)
4241                     ln = '\n';
4242                 else {
4243                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4244
4245                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4246                 }
4247                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4248                     ln = isALNUM_uni(ln);
4249                     if (NEXTCHR_IS_EOS)
4250                         n = 0;
4251                     else {
4252                         LOAD_UTF8_CHARCLASS_ALNUM();
4253                         n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4254                                                                 utf8_target);
4255                     }
4256                 }
4257                 else {
4258                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4259                     n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4260                 }
4261             }
4262             else {
4263
4264                 /* Here the string isn't utf8, or is utf8 and only ascii
4265                  * characters are to match \w.  In the latter case looking at
4266                  * the byte just prior to the current one may be just the final
4267                  * byte of a multi-byte character.  This is ok.  There are two
4268                  * cases:
4269                  * 1) it is a single byte character, and then the test is doing
4270                  *      just what it's supposed to.
4271                  * 2) it is a multi-byte character, in which case the final
4272                  *      byte is never mistakable for ASCII, and so the test
4273                  *      will say it is not a word character, which is the
4274                  *      correct answer. */
4275                 ln = (locinput != PL_bostr) ?
4276                     UCHARAT(locinput - 1) : '\n';
4277                 switch (FLAGS(scan)) {
4278                     case REGEX_UNICODE_CHARSET:
4279                         ln = isWORDCHAR_L1(ln);
4280                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4281                         break;
4282                     case REGEX_LOCALE_CHARSET:
4283                         ln = isALNUM_LC(ln);
4284                         n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4285                         break;
4286                     case REGEX_DEPENDS_CHARSET:
4287                         ln = isALNUM(ln);
4288                         n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4289                         break;
4290                     case REGEX_ASCII_RESTRICTED_CHARSET:
4291                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4292                         ln = isWORDCHAR_A(ln);
4293                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4294                         break;
4295                     default:
4296                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4297                         break;
4298                 }
4299             }
4300             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4301              * regcomp.sym */
4302             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4303                     sayNO;
4304             break;
4305
4306         case ANYOF:  /*  /[abc]/       */
4307             if (NEXTCHR_IS_EOS)
4308                 sayNO;
4309             if (utf8_target) {
4310                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4311                     sayNO;
4312                 locinput += UTF8SKIP(locinput);
4313                 break;
4314             }
4315             else {
4316                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4317                     sayNO;
4318                 locinput++;
4319                 break;
4320             }
4321             break;
4322
4323         /* Special char classes: \d, \w etc.
4324          * The defines start on line 166 or so */
4325         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
4326                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4327                   ALNUMU, NALNUMU, isWORDCHAR_L1,
4328                   ALNUMA, NALNUMA, isWORDCHAR_A,
4329                   alnum, "a");
4330
4331         case SPACEL:
4332             PL_reg_flags |= RF_tainted;
4333             if (NEXTCHR_IS_EOS) {
4334                 sayNO;
4335             }
4336             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4337                 if (! isSPACE_LC_utf8((U8 *) locinput)) {
4338                     sayNO;
4339                 }
4340             }
4341             else if (! isSPACE_LC((U8) nextchr)) {
4342                     sayNO;
4343             }
4344             goto increment_locinput;
4345
4346         case NSPACEL:
4347             PL_reg_flags |= RF_tainted;
4348             if (NEXTCHR_IS_EOS) {
4349                 sayNO;
4350             }
4351             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4352                 if (isSPACE_LC_utf8((U8 *) locinput)) {
4353                     sayNO;
4354                 }
4355             }
4356             else if (isSPACE_LC(nextchr)) {
4357                     sayNO;
4358             }
4359             goto increment_locinput;
4360
4361         case SPACE:
4362             if (utf8_target) {
4363                 goto utf8_space;
4364             }
4365             /* FALL THROUGH */
4366         case SPACEA:
4367             if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
4368                 sayNO;
4369             }
4370             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4371             locinput++;
4372             break;
4373
4374         case NSPACE:
4375             if (utf8_target) {
4376                 goto utf8_nspace;
4377             }
4378             /* FALL THROUGH */
4379         case NSPACEA:
4380             if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
4381                 sayNO;
4382             }
4383             goto increment_locinput;
4384
4385         case SPACEU:
4386           utf8_space:
4387             if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
4388                 sayNO;
4389             }
4390             goto increment_locinput;
4391
4392         case NSPACEU:
4393           utf8_nspace:
4394             if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
4395                 sayNO;
4396             }
4397             goto increment_locinput;
4398
4399         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
4400                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4401                 DIGITA, NDIGITA, isDIGIT_A,
4402                 digit, "0");
4403
4404         case POSIXA: /* /[[:ascii:]]/ etc */
4405             if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4406                 sayNO;
4407             }
4408             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4409             locinput++;
4410             break;
4411
4412         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
4413             if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4414                 sayNO;
4415             }
4416             goto increment_locinput;
4417
4418         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4419                        a Unicode extended Grapheme Cluster */
4420             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4421               extended Grapheme Cluster is:
4422
4423                CR LF
4424                | Prepend* Begin Extend*
4425                | .
4426
4427                Begin is:           ( Special_Begin | ! Control )
4428                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4429                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4430                Control is:         [ GCB_Control  CR  LF ]
4431                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4432
4433                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4434                we can rewrite
4435
4436                    Begin is ( Regular_Begin + Special Begin )
4437
4438                It turns out that 98.4% of all Unicode code points match
4439                Regular_Begin.  Doing it this way eliminates a table match in
4440                the previous implementation for almost all Unicode code points.
4441
4442                There is a subtlety with Prepend* which showed up in testing.
4443                Note that the Begin, and only the Begin is required in:
4444                 | Prepend* Begin Extend*
4445                Also, Begin contains '! Control'.  A Prepend must be a
4446                '!  Control', which means it must also be a Begin.  What it
4447                comes down to is that if we match Prepend* and then find no
4448                suitable Begin afterwards, that if we backtrack the last
4449                Prepend, that one will be a suitable Begin.
4450             */
4451
4452             if (NEXTCHR_IS_EOS)
4453                 sayNO;
4454             if  (! utf8_target) {
4455
4456                 /* Match either CR LF  or '.', as all the other possibilities
4457                  * require utf8 */
4458                 locinput++;         /* Match the . or CR */
4459                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4460                                        match the LF */
4461                     && locinput < PL_regeol
4462                     && UCHARAT(locinput) == '\n') locinput++;
4463             }
4464             else {
4465
4466                 /* Utf8: See if is ( CR LF ); already know that locinput <
4467                  * PL_regeol, so locinput+1 is in bounds */
4468                 if ( nextchr == '\r' && locinput+1 < PL_regeol
4469                         && UCHARAT(locinput + 1) == '\n')
4470                 {
4471                     locinput += 2;
4472                 }
4473                 else {
4474                     STRLEN len;
4475
4476                     /* In case have to backtrack to beginning, then match '.' */
4477                     char *starting = locinput;
4478
4479                     /* In case have to backtrack the last prepend */
4480                     char *previous_prepend = 0;
4481
4482                     LOAD_UTF8_CHARCLASS_GCB();
4483
4484                     /* Match (prepend)*   */
4485                     while (locinput < PL_regeol
4486                            && (len = is_GCB_Prepend_utf8(locinput)))
4487                     {
4488                         previous_prepend = locinput;
4489                         locinput += len;
4490                     }
4491
4492                     /* As noted above, if we matched a prepend character, but
4493                      * the next thing won't match, back off the last prepend we
4494                      * matched, as it is guaranteed to match the begin */
4495                     if (previous_prepend
4496                         && (locinput >=  PL_regeol
4497                             || (! swash_fetch(PL_utf8_X_regular_begin,
4498                                              (U8*)locinput, utf8_target)
4499                                  && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4500                         )
4501                     {
4502                         locinput = previous_prepend;
4503                     }
4504
4505                     /* Note that here we know PL_regeol > locinput, as we
4506                      * tested that upon input to this switch case, and if we
4507                      * moved locinput forward, we tested the result just above
4508                      * and it either passed, or we backed off so that it will
4509                      * now pass */
4510                     if (swash_fetch(PL_utf8_X_regular_begin,
4511                                     (U8*)locinput, utf8_target)) {
4512                         locinput += UTF8SKIP(locinput);
4513                     }
4514                     else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4515
4516                         /* Here did not match the required 'Begin' in the
4517                          * second term.  So just match the very first
4518                          * character, the '.' of the final term of the regex */
4519                         locinput = starting + UTF8SKIP(starting);
4520                         goto exit_utf8;
4521                     } else {
4522
4523                         /* Here is a special begin.  It can be composed of
4524                          * several individual characters.  One possibility is
4525                          * RI+ */
4526                         if ((len = is_GCB_RI_utf8(locinput))) {
4527                             locinput += len;
4528                             while (locinput < PL_regeol
4529                                    && (len = is_GCB_RI_utf8(locinput)))
4530                             {
4531                                 locinput += len;
4532                             }
4533                         } else if ((len = is_GCB_T_utf8(locinput))) {
4534                             /* Another possibility is T+ */
4535                             locinput += len;
4536                             while (locinput < PL_regeol
4537                                 && (len = is_GCB_T_utf8(locinput)))
4538                             {
4539                                 locinput += len;
4540                             }
4541                         } else {
4542
4543                             /* Here, neither RI+ nor T+; must be some other
4544                              * Hangul.  That means it is one of the others: L,
4545                              * LV, LVT or V, and matches:
4546                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4547
4548                             /* Match L*           */
4549                             while (locinput < PL_regeol
4550                                    && (len = is_GCB_L_utf8(locinput)))
4551                             {
4552                                 locinput += len;
4553                             }
4554
4555                             /* Here, have exhausted L*.  If the next character
4556                              * is not an LV, LVT nor V, it means we had to have
4557                              * at least one L, so matches L+ in the original
4558                              * equation, we have a complete hangul syllable.
4559                              * Are done. */
4560
4561                             if (locinput < PL_regeol
4562                                 && is_GCB_LV_LVT_V_utf8(locinput))
4563                             {
4564
4565                                 /* Otherwise keep going.  Must be LV, LVT or V.
4566                                  * See if LVT */
4567                                 if (is_utf8_X_LVT((U8*)locinput)) {
4568                                     locinput += UTF8SKIP(locinput);
4569                                 } else {
4570
4571                                     /* Must be  V or LV.  Take it, then match
4572                                      * V*     */
4573                                     locinput += UTF8SKIP(locinput);
4574                                     while (locinput < PL_regeol
4575                                            && (len = is_GCB_V_utf8(locinput)))
4576                                     {
4577                                         locinput += len;
4578                                     }
4579                                 }
4580
4581                                 /* And any of LV, LVT, or V can be followed
4582                                  * by T*            */
4583                                 while (locinput < PL_regeol
4584                                        && (len = is_GCB_T_utf8(locinput)))
4585                                 {
4586                                     locinput += len;
4587                                 }
4588                             }
4589                         }
4590                     }
4591
4592                     /* Match any extender */
4593                     while (locinput < PL_regeol
4594                             && swash_fetch(PL_utf8_X_extend,
4595                                             (U8*)locinput, utf8_target))
4596                     {
4597                         locinput += UTF8SKIP(locinput);
4598                     }
4599                 }
4600             exit_utf8:
4601                 if (locinput > PL_regeol) sayNO;
4602             }
4603             break;
4604
4605         case NREFFL:  /*  /\g{name}/il  */
4606         {   /* The capture buffer cases.  The ones beginning with N for the
4607                named buffers just convert to the equivalent numbered and
4608                pretend they were called as the corresponding numbered buffer
4609                op.  */
4610             /* don't initialize these in the declaration, it makes C++
4611                unhappy */
4612             char *s;
4613             char type;
4614             re_fold_t folder;
4615             const U8 *fold_array;
4616             UV utf8_fold_flags;
4617
4618             PL_reg_flags |= RF_tainted;
4619             folder = foldEQ_locale;
4620             fold_array = PL_fold_locale;
4621             type = REFFL;
4622             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4623             goto do_nref;
4624
4625         case NREFFA:  /*  /\g{name}/iaa  */
4626             folder = foldEQ_latin1;
4627             fold_array = PL_fold_latin1;
4628             type = REFFA;
4629             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4630             goto do_nref;
4631
4632         case NREFFU:  /*  /\g{name}/iu  */
4633             folder = foldEQ_latin1;
4634             fold_array = PL_fold_latin1;
4635             type = REFFU;
4636             utf8_fold_flags = 0;
4637             goto do_nref;
4638
4639         case NREFF:  /*  /\g{name}/i  */
4640             folder = foldEQ;
4641             fold_array = PL_fold;
4642             type = REFF;
4643             utf8_fold_flags = 0;
4644             goto do_nref;
4645
4646         case NREF:  /*  /\g{name}/   */
4647             type = REF;
4648             folder = NULL;
4649             fold_array = NULL;
4650             utf8_fold_flags = 0;
4651           do_nref:
4652
4653             /* For the named back references, find the corresponding buffer
4654              * number */
4655             n = reg_check_named_buff_matched(rex,scan);
4656
4657             if ( ! n ) {
4658                 sayNO;
4659             }
4660             goto do_nref_ref_common;
4661
4662         case REFFL:  /*  /\1/il  */
4663             PL_reg_flags |= RF_tainted;
4664             folder = foldEQ_locale;
4665             fold_array = PL_fold_locale;
4666             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4667             goto do_ref;
4668
4669         case REFFA:  /*  /\1/iaa  */
4670             folder = foldEQ_latin1;
4671             fold_array = PL_fold_latin1;
4672             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4673             goto do_ref;
4674
4675         case REFFU:  /*  /\1/iu  */
4676             folder = foldEQ_latin1;
4677             fold_array = PL_fold_latin1;
4678             utf8_fold_flags = 0;
4679             goto do_ref;
4680
4681         case REFF:  /*  /\1/i  */
4682             folder = foldEQ;
4683             fold_array = PL_fold;
4684             utf8_fold_flags = 0;
4685             goto do_ref;
4686
4687         case REF:  /*  /\1/    */
4688             folder = NULL;
4689             fold_array = NULL;
4690             utf8_fold_flags = 0;
4691
4692           do_ref:
4693             type = OP(scan);
4694             n = ARG(scan);  /* which paren pair */
4695
4696           do_nref_ref_common:
4697             ln = rex->offs[n].start;
4698             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4699             if (rex->lastparen < n || ln == -1)
4700                 sayNO;                  /* Do not match unless seen CLOSEn. */
4701             if (ln == rex->offs[n].end)
4702                 break;
4703
4704             s = PL_bostr + ln;
4705             if (type != REF     /* REF can do byte comparison */
4706                 && (utf8_target || type == REFFU))
4707             { /* XXX handle REFFL better */
4708                 char * limit = PL_regeol;
4709
4710                 /* This call case insensitively compares the entire buffer
4711                     * at s, with the current input starting at locinput, but
4712                     * not going off the end given by PL_regeol, and returns in
4713                     * <limit> upon success, how much of the current input was
4714                     * matched */
4715                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4716                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4717                 {
4718                     sayNO;
4719                 }
4720                 locinput = limit;
4721                 break;
4722             }
4723
4724             /* Not utf8:  Inline the first character, for speed. */
4725             if (!NEXTCHR_IS_EOS &&
4726                 UCHARAT(s) != nextchr &&
4727                 (type == REF ||
4728                  UCHARAT(s) != fold_array[nextchr]))
4729                 sayNO;
4730             ln = rex->offs[n].end - ln;
4731             if (locinput + ln > PL_regeol)
4732                 sayNO;
4733             if (ln > 1 && (type == REF
4734                            ? memNE(s, locinput, ln)
4735                            : ! folder(s, locinput, ln)))
4736                 sayNO;
4737             locinput += ln;
4738             break;
4739         }
4740
4741         case NOTHING: /* null op; e.g. the 'nothing' following
4742                        * the '*' in m{(a+|b)*}' */
4743             break;
4744         case TAIL: /* placeholder while compiling (A|B|C) */
4745             break;
4746
4747         case BACK: /* ??? doesn't appear to be used ??? */
4748             break;
4749
4750 #undef  ST
4751 #define ST st->u.eval
4752         {
4753             SV *ret;
4754             REGEXP *re_sv;
4755             regexp *re;
4756             regexp_internal *rei;
4757             regnode *startpoint;
4758
4759         case GOSTART: /*  (?R)  */
4760         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4761             if (cur_eval && cur_eval->locinput==locinput) {
4762                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4763                     Perl_croak(aTHX_ "Infinite recursion in regex");
4764                 if ( ++nochange_depth > max_nochange_depth )
4765                     Perl_croak(aTHX_
4766                         "Pattern subroutine nesting without pos change"
4767                         " exceeded limit in regex");
4768             } else {
4769                 nochange_depth = 0;
4770             }
4771             re_sv = rex_sv;
4772             re = rex;
4773             rei = rexi;
4774             if (OP(scan)==GOSUB) {
4775                 startpoint = scan + ARG2L(scan);
4776                 ST.close_paren = ARG(scan);
4777             } else {
4778                 startpoint = rei->program+1;
4779                 ST.close_paren = 0;
4780             }
4781             goto eval_recurse_doit;
4782             assert(0); /* NOTREACHED */
4783
4784         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4785             if (cur_eval && cur_eval->locinput==locinput) {
4786                 if ( ++nochange_depth > max_nochange_depth )
4787                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4788             } else {
4789                 nochange_depth = 0;
4790             }
4791             {
4792                 /* execute the code in the {...} */
4793
4794                 dSP;
4795                 PADOFFSET before;
4796                 OP * const oop = PL_op;
4797                 COP * const ocurcop = PL_curcop;
4798                 OP *nop;
4799                 char *saved_regeol = PL_regeol;
4800                 struct re_save_state saved_state;
4801                 CV *newcv;
4802
4803                 /* save *all* paren positions */
4804                 regcppush(rex, 0);
4805                 REGCP_SET(runops_cp);
4806
4807                 /* To not corrupt the existing regex state while executing the
4808                  * eval we would normally put it on the save stack, like with
4809                  * save_re_context. However, re-evals have a weird scoping so we
4810                  * can't just add ENTER/LEAVE here. With that, things like
4811                  *
4812                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4813                  *
4814                  * would break, as they expect the localisation to be unwound
4815                  * only when the re-engine backtracks through the bit that
4816                  * localised it.
4817                  *
4818                  * What we do instead is just saving the state in a local c
4819                  * variable.
4820                  */
4821                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4822
4823                 PL_reg_state.re_reparsing = FALSE;
4824
4825                 if (!caller_cv)
4826                     caller_cv = find_runcv(NULL);
4827
4828                 n = ARG(scan);
4829
4830                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4831                     newcv = (ReANY(
4832                                                 (REGEXP*)(rexi->data->data[n])
4833                                             ))->qr_anoncv
4834                                         ;
4835                     nop = (OP*)rexi->data->data[n+1];
4836                 }
4837                 else if (rexi->data->what[n] == 'l') { /* literal code */
4838                     newcv = caller_cv;
4839                     nop = (OP*)rexi->data->data[n];
4840                     assert(CvDEPTH(newcv));
4841                 }
4842                 else {
4843                     /* literal with own CV */
4844                     assert(rexi->data->what[n] == 'L');
4845                     newcv = rex->qr_anoncv;
4846                     nop = (OP*)rexi->data->data[n];
4847                 }
4848
4849                 /* normally if we're about to execute code from the same
4850                  * CV that we used previously, we just use the existing
4851                  * CX stack entry. However, its possible that in the
4852                  * meantime we may have backtracked, popped from the save
4853                  * stack, and undone the SAVECOMPPAD(s) associated with
4854                  * PUSH_MULTICALL; in which case PL_comppad no longer
4855                  * points to newcv's pad. */
4856                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4857                 {
4858                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4859                     if (last_pushed_cv) {
4860                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4861                     }
4862                     else {
4863                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4864                     }
4865                     last_pushed_cv = newcv;
4866                 }
4867                 last_pad = PL_comppad;
4868
4869                 /* the initial nextstate you would normally execute
4870                  * at the start of an eval (which would cause error
4871                  * messages to come from the eval), may be optimised
4872                  * away from the execution path in the regex code blocks;
4873                  * so manually set PL_curcop to it initially */
4874                 {
4875                     OP *o = cUNOPx(nop)->op_first;
4876                     assert(o->op_type == OP_NULL);
4877                     if (o->op_targ == OP_SCOPE) {
4878                         o = cUNOPo->op_first;
4879                     }
4880                     else {
4881                         assert(o->op_targ == OP_LEAVE);
4882                         o = cUNOPo->op_first;
4883                         assert(o->op_type == OP_ENTER);
4884                         o = o->op_sibling;
4885                     }
4886
4887                     if (o->op_type != OP_STUB) {
4888                         assert(    o->op_type == OP_NEXTSTATE
4889                                 || o->op_type == OP_DBSTATE
4890                                 || (o->op_type == OP_NULL
4891                                     &&  (  o->op_targ == OP_NEXTSTATE
4892                                         || o->op_targ == OP_DBSTATE
4893                                         )
4894                                     )
4895                         );
4896                         PL_curcop = (COP*)o;
4897                     }
4898                 }
4899                 nop = nop->op_next;
4900
4901                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4902                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4903
4904                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4905
4906                 if (sv_yes_mark) {
4907                     SV *sv_mrk = get_sv("REGMARK", 1);
4908                     sv_setsv(sv_mrk, sv_yes_mark);
4909                 }
4910
4911                 /* we don't use MULTICALL here as we want to call the
4912                  * first op of the block of interest, rather than the
4913                  * first op of the sub */
4914                 before = SP-PL_stack_base;
4915                 PL_op = nop;
4916                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4917                 SPAGAIN;
4918                 if (SP-PL_stack_base == before)
4919                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4920                 else {
4921                     ret = POPs;
4922                     PUTBACK;
4923                 }
4924
4925                 /* before restoring everything, evaluate the returned
4926                  * value, so that 'uninit' warnings don't use the wrong
4927                  * PL_op or pad. Also need to process any magic vars
4928                  * (e.g. $1) *before* parentheses are restored */
4929
4930                 PL_op = NULL;
4931
4932                 re_sv = NULL;
4933                 if (logical == 0)        /*   (?{})/   */
4934                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4935                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4936                     sw = cBOOL(SvTRUE(ret));
4937                     logical = 0;
4938                 }
4939                 else {                   /*  /(??{})  */
4940                     /*  if its overloaded, let the regex compiler handle
4941                      *  it; otherwise extract regex, or stringify  */
4942                     if (!SvAMAGIC(ret)) {
4943                         SV *sv = ret;
4944                         if (SvROK(sv))
4945                             sv = SvRV(sv);
4946                         if (SvTYPE(sv) == SVt_REGEXP)
4947                             re_sv = (REGEXP*) sv;
4948                         else if (SvSMAGICAL(sv)) {
4949                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4950                             if (mg)
4951                                 re_sv = (REGEXP *) mg->mg_obj;
4952                         }
4953
4954                         /* force any magic, undef warnings here */
4955                         if (!re_sv) {
4956                             ret = sv_mortalcopy(ret);
4957                             (void) SvPV_force_nolen(ret);
4958                         }
4959                     }
4960
4961                 }
4962
4963                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4964
4965                 /* *** Note that at this point we don't restore
4966                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4967                  * be used again soon. This is safe as long as nothing
4968                  * in the regexp code uses the pad ! */
4969                 PL_op = oop;
4970                 PL_curcop = ocurcop;
4971                 PL_regeol = saved_regeol;
4972                 S_regcp_restore(aTHX_ rex, runops_cp);
4973
4974                 if (logical != 2)
4975                     break;
4976             }
4977
4978                 /* only /(??{})/  from now on */
4979                 logical = 0;
4980                 {
4981                     /* extract RE object from returned value; compiling if
4982                      * necessary */
4983
4984                     if (re_sv) {
4985                         re_sv = reg_temp_copy(NULL, re_sv);
4986                     }
4987                     else {
4988                         U32 pm_flags = 0;
4989                         const I32 osize = PL_regsize;
4990
4991                         if (SvUTF8(ret) && IN_BYTES) {
4992                             /* In use 'bytes': make a copy of the octet
4993                              * sequence, but without the flag on */
4994                             STRLEN len;
4995                             const char *const p = SvPV(ret, len);
4996                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4997                         }
4998                         if (rex->intflags & PREGf_USE_RE_EVAL)
4999                             pm_flags |= PMf_USE_RE_EVAL;
5000
5001                         /* if we got here, it should be an engine which
5002                          * supports compiling code blocks and stuff */
5003                         assert(rex->engine && rex->engine->op_comp);
5004                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5005                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5006                                     rex->engine, NULL, NULL,
5007                                     /* copy /msix etc to inner pattern */
5008                                     scan->flags,
5009                                     pm_flags);
5010
5011                         if (!(SvFLAGS(ret)
5012                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5013                                  | SVs_GMG))) {
5014                             /* This isn't a first class regexp. Instead, it's
5015                                caching a regexp onto an existing, Perl visible
5016                                scalar.  */
5017                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5018                         }
5019                         PL_regsize = osize;
5020                         /* safe to do now that any $1 etc has been
5021                          * interpolated into the new pattern string and
5022                          * compiled */
5023                         S_regcp_restore(aTHX_ rex, runops_cp);
5024                     }
5025                     SAVEFREESV(re_sv);
5026                     re = ReANY(re_sv);
5027                 }
5028                 RXp_MATCH_COPIED_off(re);
5029                 re->subbeg = rex->subbeg;
5030                 re->sublen = rex->sublen;
5031                 re->suboffset = rex->suboffset;
5032                 re->subcoffset = rex->subcoffset;
5033                 rei = RXi_GET(re);
5034                 DEBUG_EXECUTE_r(
5035                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
5036                         "Matching embedded");
5037                 );
5038                 startpoint = rei->program + 1;
5039                 ST.close_paren = 0; /* only used for GOSUB */
5040
5041         eval_recurse_doit: /* Share code with GOSUB below this line */
5042                 /* run the pattern returned from (??{...}) */
5043                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
5044                 REGCP_SET(ST.lastcp);
5045
5046                 re->lastparen = 0;
5047                 re->lastcloseparen = 0;
5048
5049                 PL_regsize = 0;
5050
5051                 /* XXXX This is too dramatic a measure... */
5052                 PL_reg_maxiter = 0;
5053
5054                 ST.toggle_reg_flags = PL_reg_flags;
5055                 if (RX_UTF8(re_sv))
5056                     PL_reg_flags |= RF_utf8;
5057                 else
5058                     PL_reg_flags &= ~RF_utf8;
5059                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5060
5061                 ST.prev_rex = rex_sv;
5062                 ST.prev_curlyx = cur_curlyx;
5063                 rex_sv = re_sv;
5064                 SET_reg_curpm(rex_sv);
5065                 rex = re;
5066                 rexi = rei;
5067                 cur_curlyx = NULL;
5068                 ST.B = next;
5069                 ST.prev_eval = cur_eval;
5070                 cur_eval = st;
5071                 /* now continue from first node in postoned RE */
5072                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5073                 assert(0); /* NOTREACHED */
5074         }
5075
5076         case EVAL_AB: /* cleanup after a successful (??{A})B */
5077             /* note: this is called twice; first after popping B, then A */
5078             PL_reg_flags ^= ST.toggle_reg_flags;
5079             rex_sv = ST.prev_rex;
5080             SET_reg_curpm(rex_sv);
5081             rex = ReANY(rex_sv);
5082             rexi = RXi_GET(rex);
5083             regcpblow(ST.cp);
5084             cur_eval = ST.prev_eval;
5085             cur_curlyx = ST.prev_curlyx;
5086
5087             /* XXXX This is too dramatic a measure... */
5088             PL_reg_maxiter = 0;
5089             if ( nochange_depth )
5090                 nochange_depth--;
5091             sayYES;
5092
5093
5094         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5095             /* note: this is called twice; first after popping B, then A */
5096             PL_reg_flags ^= ST.toggle_reg_flags;
5097             rex_sv = ST.prev_rex;
5098             SET_reg_curpm(rex_sv);
5099             rex = ReANY(rex_sv);
5100             rexi = RXi_GET(rex);
5101
5102             REGCP_UNWIND(ST.lastcp);
5103             regcppop(rex);
5104             cur_eval = ST.prev_eval;
5105             cur_curlyx = ST.prev_curlyx;
5106             /* XXXX This is too dramatic a measure... */
5107             PL_reg_maxiter = 0;
5108             if ( nochange_depth )
5109                 nochange_depth--;
5110             sayNO_SILENT;
5111 #undef ST
5112
5113         case OPEN: /*  (  */
5114             n = ARG(scan);  /* which paren pair */
5115             rex->offs[n].start_tmp = locinput - PL_bostr;
5116             if (n > PL_regsize)
5117                 PL_regsize = n;
5118             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5119                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
5120                 PTR2UV(rex),
5121                 PTR2UV(rex->offs),
5122                 (UV)n,
5123                 (IV)rex->offs[n].start_tmp,
5124                 (UV)PL_regsize
5125             ));
5126             lastopen = n;
5127             break;
5128
5129 /* XXX really need to log other places start/end are set too */
5130 #define CLOSE_CAPTURE \
5131     rex->offs[n].start = rex->offs[n].start_tmp; \
5132     rex->offs[n].end = locinput - PL_bostr; \
5133     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5134         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5135         PTR2UV(rex), \
5136         PTR2UV(rex->offs), \
5137         (UV)n, \
5138         (IV)rex->offs[n].start, \
5139         (IV)rex->offs[n].end \
5140     ))
5141
5142         case CLOSE:  /*  )  */
5143             n = ARG(scan);  /* which paren pair */
5144             CLOSE_CAPTURE;
5145             /*if (n > PL_regsize)
5146                 PL_regsize = n;*/
5147             if (n > rex->lastparen)
5148                 rex->lastparen = n;
5149             rex->lastcloseparen = n;
5150             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5151                 goto fake_end;
5152             }
5153             break;
5154
5155         case ACCEPT:  /*  (*ACCEPT)  */
5156             if (ARG(scan)){
5157                 regnode *cursor;
5158                 for (cursor=scan;
5159                      cursor && OP(cursor)!=END;
5160                      cursor=regnext(cursor))
5161                 {
5162                     if ( OP(cursor)==CLOSE ){
5163                         n = ARG(cursor);
5164                         if ( n <= lastopen ) {
5165                             CLOSE_CAPTURE;
5166                             /*if (n > PL_regsize)
5167                             PL_regsize = n;*/
5168                             if (n > rex->lastparen)
5169                                 rex->lastparen = n;
5170                             rex->lastcloseparen = n;
5171                             if ( n == ARG(scan) || (cur_eval &&
5172                                 cur_eval->u.eval.close_paren == n))
5173                                 break;
5174                         }
5175                     }
5176                 }
5177             }
5178             goto fake_end;
5179             /*NOTREACHED*/
5180
5181         case GROUPP:  /*  (?(1))  */
5182             n = ARG(scan);  /* which paren pair */
5183             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5184             break;
5185
5186         case NGROUPP:  /*  (?(<name>))  */
5187             /* reg_check_named_buff_matched returns 0 for no match */
5188             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5189             break;
5190
5191         case INSUBP:   /*  (?(R))  */
5192             n = ARG(scan);
5193             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5194             break;
5195
5196         case DEFINEP:  /*  (?(DEFINE))  */
5197             sw = 0;
5198             break;
5199
5200         case IFTHEN:   /*  (?(cond)A|B)  */
5201             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
5202             if (sw)
5203                 next = NEXTOPER(NEXTOPER(scan));
5204             else {
5205                 next = scan + ARG(scan);
5206                 if (OP(next) == IFTHEN) /* Fake one. */
5207                     next = NEXTOPER(NEXTOPER(next));
5208             }
5209             break;
5210
5211         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5212             logical = scan->flags;
5213             break;
5214
5215 /*******************************************************************
5216
5217 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5218 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5219 STAR/PLUS/CURLY/CURLYN are used instead.)
5220
5221 A*B is compiled as <CURLYX><A><WHILEM><B>
5222
5223 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5224 state, which contains the current count, initialised to -1. It also sets
5225 cur_curlyx to point to this state, with any previous value saved in the
5226 state block.
5227
5228 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5229 since the pattern may possibly match zero times (i.e. it's a while {} loop
5230 rather than a do {} while loop).
5231
5232 Each entry to WHILEM represents a successful match of A. The count in the
5233 CURLYX block is incremented, another WHILEM state is pushed, and execution
5234 passes to A or B depending on greediness and the current count.
5235
5236 For example, if matching against the string a1a2a3b (where the aN are
5237 substrings that match /A/), then the match progresses as follows: (the
5238 pushed states are interspersed with the bits of strings matched so far):
5239
5240     <CURLYX cnt=-1>
5241     <CURLYX cnt=0><WHILEM>
5242     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5243     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5244     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5245     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5246
5247 (Contrast this with something like CURLYM, which maintains only a single
5248 backtrack state:
5249
5250     <CURLYM cnt=0> a1
5251     a1 <CURLYM cnt=1> a2
5252     a1 a2 <CURLYM cnt=2> a3
5253     a1 a2 a3 <CURLYM cnt=3> b
5254 )
5255
5256 Each WHILEM state block marks a point to backtrack to upon partial failure
5257 of A or B, and also contains some minor state data related to that
5258 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5259 overall state, such as the count, and pointers to the A and B ops.
5260
5261 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5262 must always point to the *current* CURLYX block, the rules are:
5263
5264 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5265 and set cur_curlyx to point the new block.
5266
5267 When popping the CURLYX block after a successful or unsuccessful match,
5268 restore the previous cur_curlyx.
5269
5270 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5271 to the outer one saved in the CURLYX block.
5272
5273 When popping the WHILEM block after a successful or unsuccessful B match,
5274 restore the previous cur_curlyx.
5275
5276 Here's an example for the pattern (AI* BI)*BO
5277 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5278
5279 cur_
5280 curlyx backtrack stack
5281 ------ ---------------
5282 NULL
5283 CO     <CO prev=NULL> <WO>
5284 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5285 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5286 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5287
5288 At this point the pattern succeeds, and we work back down the stack to
5289 clean up, restoring as we go:
5290
5291 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5292 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5293 CO     <CO prev=NULL> <WO>
5294 NULL
5295
5296 *******************************************************************/
5297
5298 #define ST st->u.curlyx
5299
5300         case CURLYX:    /* start of /A*B/  (for complex A) */
5301         {
5302             /* No need to save/restore up to this paren */
5303             I32 parenfloor = scan->flags;
5304
5305             assert(next); /* keep Coverity happy */
5306             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5307                 next += ARG(next);
5308
5309             /* XXXX Probably it is better to teach regpush to support
5310                parenfloor > PL_regsize... */
5311             if (parenfloor > (I32)rex->lastparen)
5312                 parenfloor = rex->lastparen; /* Pessimization... */
5313
5314             ST.prev_curlyx= cur_curlyx;
5315             cur_curlyx = st;
5316             ST.cp = PL_savestack_ix;
5317
5318             /* these fields contain the state of the current curly.
5319              * they are accessed by subsequent WHILEMs */
5320             ST.parenfloor = parenfloor;
5321             ST.me = scan;
5322             ST.B = next;
5323             ST.minmod = minmod;
5324             minmod = 0;
5325             ST.count = -1;      /* this will be updated by WHILEM */
5326             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5327
5328             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5329             assert(0); /* NOTREACHED */
5330         }
5331
5332         case CURLYX_end: /* just finished matching all of A*B */
5333             cur_curlyx = ST.prev_curlyx;
5334             sayYES;
5335             assert(0); /* NOTREACHED */
5336
5337         case CURLYX_end_fail: /* just failed to match all of A*B */
5338             regcpblow(ST.cp);
5339             cur_curlyx = ST.prev_curlyx;
5340             sayNO;
5341             assert(0); /* NOTREACHED */
5342
5343
5344 #undef ST
5345 #define ST st->u.whilem
5346
5347         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5348         {
5349             /* see the discussion above about CURLYX/WHILEM */
5350             I32 n;
5351             int min = ARG1(cur_curlyx->u.curlyx.me);
5352             int max = ARG2(cur_curlyx->u.curlyx.me);
5353             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5354
5355             assert(cur_curlyx); /* keep Coverity happy */
5356             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5357             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5358             ST.cache_offset = 0;
5359             ST.cache_mask = 0;
5360
5361
5362             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5363                   "%*s  whilem: matched %ld out of %d..%d\n",
5364                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5365             );
5366
5367             /* First just match a string of min A's. */
5368
5369             if (n < min) {
5370                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5371                 cur_curlyx->u.curlyx.lastloc = locinput;
5372                 REGCP_SET(ST.lastcp);
5373
5374                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5375                 assert(0); /* NOTREACHED */
5376             }
5377
5378             /* If degenerate A matches "", assume A done. */
5379
5380             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5381                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5382                    "%*s  whilem: empty match detected, trying continuation...\n",
5383                    REPORT_CODE_OFF+depth*2, "")
5384                 );
5385                 goto do_whilem_B_max;
5386             }
5387
5388             /* super-linear cache processing */
5389
5390             if (scan->flags) {
5391
5392                 if (!PL_reg_maxiter) {
5393                     /* start the countdown: Postpone detection until we
5394                      * know the match is not *that* much linear. */
5395                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5396                     /* possible overflow for long strings and many CURLYX's */
5397                     if (PL_reg_maxiter < 0)
5398                         PL_reg_maxiter = I32_MAX;
5399                     PL_reg_leftiter = PL_reg_maxiter;
5400                 }
5401
5402                 if (PL_reg_leftiter-- == 0) {
5403                     /* initialise cache */
5404                     const I32 size = (PL_reg_maxiter + 7)/8;
5405                     if (PL_reg_poscache) {
5406                         if ((I32)PL_reg_poscache_size < size) {
5407                             Renew(PL_reg_poscache, size, char);
5408                             PL_reg_poscache_size = size;
5409                         }
5410                         Zero(PL_reg_poscache, size, char);
5411                     }
5412                     else {
5413                         PL_reg_poscache_size = size;
5414                         Newxz(PL_reg_poscache, size, char);
5415                     }
5416                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5417       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5418                               PL_colors[4], PL_colors[5])
5419                     );
5420                 }
5421
5422                 if (PL_reg_leftiter < 0) {
5423                     /* have we already failed at this position? */
5424                     I32 offset, mask;
5425                     offset  = (scan->flags & 0xf) - 1
5426                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5427                     mask    = 1 << (offset % 8);
5428                     offset /= 8;
5429                     if (PL_reg_poscache[offset] & mask) {
5430                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5431                             "%*s  whilem: (cache) already tried at this position...\n",
5432                             REPORT_CODE_OFF+depth*2, "")
5433                         );
5434                         sayNO; /* cache records failure */
5435                     }
5436                     ST.cache_offset = offset;
5437                     ST.cache_mask   = mask;
5438                 }
5439             }
5440
5441             /* Prefer B over A for minimal matching. */
5442
5443             if (cur_curlyx->u.curlyx.minmod) {
5444                 ST.save_curlyx = cur_curlyx;
5445                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5446                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5447                 REGCP_SET(ST.lastcp);
5448                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5449                                     locinput);
5450                 assert(0); /* NOTREACHED */
5451             }
5452
5453             /* Prefer A over B for maximal matching. */
5454
5455             if (n < max) { /* More greed allowed? */
5456                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5457                 cur_curlyx->u.curlyx.lastloc = locinput;
5458                 REGCP_SET(ST.lastcp);
5459                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5460                 assert(0); /* NOTREACHED */
5461             }
5462             goto do_whilem_B_max;
5463         }
5464         assert(0); /* NOTREACHED */
5465
5466         case WHILEM_B_min: /* just matched B in a minimal match */
5467         case WHILEM_B_max: /* just matched B in a maximal match */
5468             cur_curlyx = ST.save_curlyx;
5469             sayYES;
5470             assert(0); /* NOTREACHED */
5471
5472         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5473             cur_curlyx = ST.save_curlyx;
5474             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5475             cur_curlyx->u.curlyx.count--;
5476             CACHEsayNO;
5477             assert(0); /* NOTREACHED */
5478
5479         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5480             /* FALL THROUGH */
5481         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5482             REGCP_UNWIND(ST.lastcp);
5483             regcppop(rex);
5484             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5485             cur_curlyx->u.curlyx.count--;
5486             CACHEsayNO;
5487             assert(0); /* NOTREACHED */
5488
5489         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5490             REGCP_UNWIND(ST.lastcp);
5491             regcppop(rex);      /* Restore some previous $<digit>s? */
5492             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5493                 "%*s  whilem: failed, trying continuation...\n",
5494                 REPORT_CODE_OFF+depth*2, "")
5495             );
5496           do_whilem_B_max:
5497             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5498                 && ckWARN(WARN_REGEXP)
5499                 && !(PL_reg_flags & RF_warned))
5500             {
5501                 PL_reg_flags |= RF_warned;
5502                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5503                      "Complex regular subexpression recursion limit (%d) "
5504                      "exceeded",
5505                      REG_INFTY - 1);
5506             }
5507
5508             /* now try B */
5509             ST.save_curlyx = cur_curlyx;
5510             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5511             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5512                                 locinput);
5513             assert(0); /* NOTREACHED */
5514
5515         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5516             cur_curlyx = ST.save_curlyx;
5517             REGCP_UNWIND(ST.lastcp);
5518             regcppop(rex);
5519
5520             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5521                 /* Maximum greed exceeded */
5522                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5523                     && ckWARN(WARN_REGEXP)
5524                     && !(PL_reg_flags & RF_warned))
5525                 {
5526                     PL_reg_flags |= RF_warned;
5527                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5528                         "Complex regular subexpression recursion "
5529                         "limit (%d) exceeded",
5530                         REG_INFTY - 1);
5531                 }
5532                 cur_curlyx->u.curlyx.count--;
5533                 CACHEsayNO;
5534             }
5535
5536             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5537                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5538             );
5539             /* Try grabbing another A and see if it helps. */
5540             cur_curlyx->u.curlyx.lastloc = locinput;
5541             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5542             REGCP_SET(ST.lastcp);
5543             PUSH_STATE_GOTO(WHILEM_A_min,
5544                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5545                 locinput);
5546             assert(0); /* NOTREACHED */
5547
5548 #undef  ST
5549 #define ST st->u.branch
5550
5551         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5552             next = scan + ARG(scan);
5553             if (next == scan)
5554                 next = NULL;
5555             scan = NEXTOPER(scan);
5556             /* FALL THROUGH */
5557
5558         case BRANCH:        /*  /(...|A|...)/ */
5559             scan = NEXTOPER(scan); /* scan now points to inner node */
5560             ST.lastparen = rex->lastparen;
5561             ST.lastcloseparen = rex->lastcloseparen;
5562             ST.next_branch = next;
5563             REGCP_SET(ST.cp);
5564
5565             /* Now go into the branch */
5566             if (has_cutgroup) {
5567                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5568             } else {
5569                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5570             }
5571             assert(0); /* NOTREACHED */
5572
5573         case CUTGROUP:  /*  /(*THEN)/  */
5574             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5575                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5576             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5577             assert(0); /* NOTREACHED */
5578
5579         case CUTGROUP_next_fail:
5580             do_cutgroup = 1;
5581             no_final = 1;
5582             if (st->u.mark.mark_name)
5583                 sv_commit = st->u.mark.mark_name;
5584             sayNO;
5585             assert(0); /* NOTREACHED */
5586
5587         case BRANCH_next:
5588             sayYES;
5589             assert(0); /* NOTREACHED */
5590
5591         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5592             if (do_cutgroup) {
5593                 do_cutgroup = 0;
5594                 no_final = 0;
5595             }
5596             REGCP_UNWIND(ST.cp);
5597             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5598             scan = ST.next_branch;
5599             /* no more branches? */
5600             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5601                 DEBUG_EXECUTE_r({
5602                     PerlIO_printf( Perl_debug_log,
5603                         "%*s  %sBRANCH failed...%s\n",
5604                         REPORT_CODE_OFF+depth*2, "",
5605                         PL_colors[4],
5606                         PL_colors[5] );
5607                 });
5608                 sayNO_SILENT;
5609             }
5610             continue; /* execute next BRANCH[J] op */
5611             assert(0); /* NOTREACHED */
5612
5613         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5614             minmod = 1;
5615             break;
5616
5617 #undef  ST
5618 #define ST st->u.curlym
5619
5620         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5621
5622             /* This is an optimisation of CURLYX that enables us to push
5623              * only a single backtracking state, no matter how many matches
5624              * there are in {m,n}. It relies on the pattern being constant
5625              * length, with no parens to influence future backrefs
5626              */
5627
5628             ST.me = scan;
5629             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5630
5631             ST.lastparen      = rex->lastparen;
5632             ST.lastcloseparen = rex->lastcloseparen;
5633
5634             /* if paren positive, emulate an OPEN/CLOSE around A */
5635             if (ST.me->flags) {
5636                 U32 paren = ST.me->flags;
5637                 if (paren > PL_regsize)
5638                     PL_regsize = paren;
5639                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5640             }
5641             ST.A = scan;
5642             ST.B = next;
5643             ST.alen = 0;
5644             ST.count = 0;
5645             ST.minmod = minmod;
5646             minmod = 0;
5647             ST.c1 = CHRTEST_UNINIT;
5648             REGCP_SET(ST.cp);
5649
5650             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5651                 goto curlym_do_B;
5652
5653           curlym_do_A: /* execute the A in /A{m,n}B/  */
5654             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5655             assert(0); /* NOTREACHED */
5656
5657         case CURLYM_A: /* we've just matched an A */
5658             ST.count++;
5659             /* after first match, determine A's length: u.curlym.alen */
5660             if (ST.count == 1) {
5661                 if (PL_reg_match_utf8) {
5662                     char *s = st->locinput;
5663                     while (s < locinput) {
5664                         ST.alen++;
5665                         s += UTF8SKIP(s);
5666                     }
5667                 }
5668                 else {
5669                     ST.alen = locinput - st->locinput;
5670                 }
5671                 if (ST.alen == 0)
5672                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5673             }
5674             DEBUG_EXECUTE_r(
5675                 PerlIO_printf(Perl_debug_log,
5676                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5677                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5678                           (IV) ST.count, (IV)ST.alen)
5679             );
5680
5681             if (cur_eval && cur_eval->u.eval.close_paren &&
5682                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5683                 goto fake_end;
5684
5685             {
5686                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5687                 if ( max == REG_INFTY || ST.count < max )
5688                     goto curlym_do_A; /* try to match another A */
5689             }
5690             goto curlym_do_B; /* try to match B */
5691
5692         case CURLYM_A_fail: /* just failed to match an A */
5693             REGCP_UNWIND(ST.cp);
5694
5695             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5696                 || (cur_eval && cur_eval->u.eval.close_paren &&
5697                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5698                 sayNO;
5699
5700           curlym_do_B: /* execute the B in /A{m,n}B/  */
5701             if (ST.c1 == CHRTEST_UNINIT) {
5702                 /* calculate c1 and c2 for possible match of 1st char
5703                  * following curly */
5704                 ST.c1 = ST.c2 = CHRTEST_VOID;
5705                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5706                     regnode *text_node = ST.B;
5707                     if (! HAS_TEXT(text_node))
5708                         FIND_NEXT_IMPT(text_node);
5709                     /* this used to be
5710
5711                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5712
5713                         But the former is redundant in light of the latter.
5714
5715                         if this changes back then the macro for
5716                         IS_TEXT and friends need to change.
5717                      */
5718                     if (PL_regkind[OP(text_node)] == EXACT) {
5719                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5720                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5721                         {
5722                             sayNO;
5723                         }
5724                     }
5725                 }
5726             }
5727
5728             DEBUG_EXECUTE_r(
5729                 PerlIO_printf(Perl_debug_log,
5730                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5731                     (int)(REPORT_CODE_OFF+(depth*2)),
5732                     "", (IV)ST.count)
5733                 );
5734             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5735                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5736                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5737                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5738                     {
5739                         /* simulate B failing */
5740                         DEBUG_OPTIMISE_r(
5741                             PerlIO_printf(Perl_debug_log,
5742                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5743                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5744                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5745                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5746                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5747                         );
5748                         state_num = CURLYM_B_fail;
5749                         goto reenter_switch;
5750                     }
5751                 }
5752                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5753                     /* simulate B failing */
5754                     DEBUG_OPTIMISE_r(
5755                         PerlIO_printf(Perl_debug_log,
5756                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5757                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5758                             (int) nextchr, ST.c1, ST.c2)
5759                     );
5760                     state_num = CURLYM_B_fail;
5761                     goto reenter_switch;
5762                 }
5763             }
5764
5765             if (ST.me->flags) {
5766                 /* emulate CLOSE: mark current A as captured */
5767                 I32 paren = ST.me->flags;
5768                 if (ST.count) {
5769                     rex->offs[paren].start
5770                         = HOPc(locinput, -ST.alen) - PL_bostr;
5771                     rex->offs[paren].end = locinput - PL_bostr;
5772                     if ((U32)paren > rex->lastparen)
5773                         rex->lastparen = paren;
5774                     rex->lastcloseparen = paren;
5775                 }
5776                 else
5777                     rex->offs[paren].end = -1;
5778                 if (cur_eval && cur_eval->u.eval.close_paren &&
5779                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5780                 {
5781                     if (ST.count)
5782                         goto fake_end;
5783                     else
5784                         sayNO;
5785                 }
5786             }
5787
5788             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5789             assert(0); /* NOTREACHED */
5790
5791         case CURLYM_B_fail: /* just failed to match a B */
5792             REGCP_UNWIND(ST.cp);
5793             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5794             if (ST.minmod) {
5795                 I32 max = ARG2(ST.me);
5796                 if (max != REG_INFTY && ST.count == max)
5797                     sayNO;
5798                 goto curlym_do_A; /* try to match a further A */
5799             }
5800             /* backtrack one A */
5801             if (ST.count == ARG1(ST.me) /* min */)
5802                 sayNO;
5803             ST.count--;
5804             SET_locinput(HOPc(locinput, -ST.alen));
5805             goto curlym_do_B; /* try to match B */
5806
5807 #undef ST
5808 #define ST st->u.curly
5809
5810 #define CURLY_SETPAREN(paren, success) \
5811     if (paren) { \
5812         if (success) { \
5813             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5814             rex->offs[paren].end = locinput - PL_bostr; \
5815             if (paren > rex->lastparen) \
5816                 rex->lastparen = paren; \
5817             rex->lastcloseparen = paren; \
5818         } \
5819         else { \
5820             rex->offs[paren].end = -1; \
5821             rex->lastparen      = ST.lastparen; \
5822             rex->lastcloseparen = ST.lastcloseparen; \
5823         } \
5824     }
5825
5826         case STAR:              /*  /A*B/ where A is width 1 char */
5827             ST.paren = 0;
5828             ST.min = 0;
5829             ST.max = REG_INFTY;
5830             scan = NEXTOPER(scan);
5831             goto repeat;
5832
5833         case PLUS:              /*  /A+B/ where A is width 1 char */
5834             ST.paren = 0;
5835             ST.min = 1;
5836             ST.max = REG_INFTY;
5837             scan = NEXTOPER(scan);
5838             goto repeat;
5839
5840         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5841             ST.paren = scan->flags;     /* Which paren to set */
5842             ST.lastparen      = rex->lastparen;
5843             ST.lastcloseparen = rex->lastcloseparen;
5844             if (ST.paren > PL_regsize)
5845                 PL_regsize = ST.paren;
5846             ST.min = ARG1(scan);  /* min to match */
5847             ST.max = ARG2(scan);  /* max to match */
5848             if (cur_eval && cur_eval->u.eval.close_paren &&
5849                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5850                 ST.min=1;
5851                 ST.max=1;
5852             }
5853             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5854             goto repeat;
5855
5856         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5857             ST.paren = 0;
5858             ST.min = ARG1(scan);  /* min to match */
5859             ST.max = ARG2(scan);  /* max to match */
5860             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5861           repeat:
5862             /*
5863             * Lookahead to avoid useless match attempts
5864             * when we know what character comes next.
5865             *
5866             * Used to only do .*x and .*?x, but now it allows
5867             * for )'s, ('s and (?{ ... })'s to be in the way
5868             * of the quantifier and the EXACT-like node.  -- japhy
5869             */
5870
5871             assert(ST.min <= ST.max);
5872             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5873                 ST.c1 = ST.c2 = CHRTEST_VOID;
5874             }
5875             else {
5876                 regnode *text_node = next;
5877
5878                 if (! HAS_TEXT(text_node))
5879                     FIND_NEXT_IMPT(text_node);
5880
5881                 if (! HAS_TEXT(text_node))
5882                     ST.c1 = ST.c2 = CHRTEST_VOID;
5883                 else {
5884                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5885                         ST.c1 = ST.c2 = CHRTEST_VOID;
5886                     }
5887                     else {
5888
5889                     /*  Currently we only get here when
5890
5891                         PL_rekind[OP(text_node)] == EXACT
5892
5893                         if this changes back then the macro for IS_TEXT and
5894                         friends need to change. */
5895                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5896                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5897                         {
5898                             sayNO;
5899                         }
5900                     }
5901                 }
5902             }
5903
5904             ST.A = scan;
5905             ST.B = next;
5906             if (minmod) {
5907                 char *li = locinput;
5908                 minmod = 0;
5909                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5910                     sayNO;
5911                 SET_locinput(li);
5912                 ST.count = ST.min;
5913                 REGCP_SET(ST.cp);
5914                 if (ST.c1 == CHRTEST_VOID)
5915                     goto curly_try_B_min;
5916
5917                 ST.oldloc = locinput;
5918
5919                 /* set ST.maxpos to the furthest point along the
5920                  * string that could possibly match */
5921                 if  (ST.max == REG_INFTY) {
5922                     ST.maxpos = PL_regeol - 1;
5923                     if (utf8_target)
5924                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5925                             ST.maxpos--;
5926                 }
5927                 else if (utf8_target) {
5928                     int m = ST.max - ST.min;
5929                     for (ST.maxpos = locinput;
5930                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5931                         ST.maxpos += UTF8SKIP(ST.maxpos);
5932                 }
5933                 else {
5934                     ST.maxpos = locinput + ST.max - ST.min;
5935                     if (ST.maxpos >= PL_regeol)
5936                         ST.maxpos = PL_regeol - 1;
5937                 }
5938                 goto curly_try_B_min_known;
5939
5940             }
5941             else {
5942                 /* avoid taking address of locinput, so it can remain
5943                  * a register var */
5944                 char *li = locinput;
5945                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5946                 if (ST.count < ST.min)
5947                     sayNO;
5948                 SET_locinput(li);
5949                 if ((ST.count > ST.min)
5950                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5951                 {
5952                     /* A{m,n} must come at the end of the string, there's
5953                      * no point in backing off ... */
5954                     ST.min = ST.count;
5955                     /* ...except that $ and \Z can match before *and* after
5956                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5957                        We may back off by one in this case. */
5958                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5959                         ST.min--;
5960                 }
5961                 REGCP_SET(ST.cp);
5962                 goto curly_try_B_max;
5963             }
5964             assert(0); /* NOTREACHED */
5965
5966
5967         case CURLY_B_min_known_fail:
5968             /* failed to find B in a non-greedy match where c1,c2 valid */
5969
5970             REGCP_UNWIND(ST.cp);
5971             if (ST.paren) {
5972                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5973             }
5974             /* Couldn't or didn't -- move forward. */
5975             ST.oldloc = locinput;
5976             if (utf8_target)
5977                 locinput += UTF8SKIP(locinput);
5978             else
5979                 locinput++;
5980             ST.count++;
5981           curly_try_B_min_known:
5982              /* find the next place where 'B' could work, then call B */
5983             {
5984                 int n;
5985                 if (utf8_target) {
5986                     n = (ST.oldloc == locinput) ? 0 : 1;
5987                     if (ST.c1 == ST.c2) {
5988                         /* set n to utf8_distance(oldloc, locinput) */
5989                         while (locinput <= ST.maxpos
5990                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
5991                         {
5992                             locinput += UTF8SKIP(locinput);
5993                             n++;
5994                         }
5995                     }
5996                     else {
5997                         /* set n to utf8_distance(oldloc, locinput) */
5998                         while (locinput <= ST.maxpos
5999                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6000                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6001                         {
6002                             locinput += UTF8SKIP(locinput);
6003                             n++;
6004                         }
6005                     }
6006                 }
6007                 else {  /* Not utf8_target */
6008                     if (ST.c1 == ST.c2) {
6009                         while (locinput <= ST.maxpos &&
6010                                UCHARAT(locinput) != ST.c1)
6011                             locinput++;
6012                     }
6013                     else {
6014                         while (locinput <= ST.maxpos
6015                                && UCHARAT(locinput) != ST.c1
6016                                && UCHARAT(locinput) != ST.c2)
6017                             locinput++;
6018                     }
6019                     n = locinput - ST.oldloc;
6020                 }
6021                 if (locinput > ST.maxpos)
6022                     sayNO;
6023                 if (n) {
6024                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6025                      * at b; check that everything between oldloc and
6026                      * locinput matches */
6027                     char *li = ST.oldloc;
6028                     ST.count += n;
6029                     if (regrepeat(rex, &li, ST.A, n, depth) < n)
6030                         sayNO;
6031                     assert(n == REG_INFTY || locinput == li);
6032                 }
6033                 CURLY_SETPAREN(ST.paren, ST.count);
6034                 if (cur_eval && cur_eval->u.eval.close_paren &&
6035                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6036                     goto fake_end;
6037                 }
6038                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6039             }
6040             assert(0); /* NOTREACHED */
6041
6042
6043         case CURLY_B_min_fail:
6044             /* failed to find B in a non-greedy match where c1,c2 invalid */
6045
6046             REGCP_UNWIND(ST.cp);
6047             if (ST.paren) {
6048                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6049             }
6050             /* failed -- move forward one */
6051             {
6052                 char *li = locinput;
6053                 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
6054                     sayNO;
6055                 }
6056                 locinput = li;
6057             }
6058             {
6059                 ST.count++;
6060                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6061                         ST.count > 0)) /* count overflow ? */
6062                 {
6063                   curly_try_B_min:
6064                     CURLY_SETPAREN(ST.paren, ST.count);
6065                     if (cur_eval && cur_eval->u.eval.close_paren &&
6066                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6067                         goto fake_end;
6068                     }
6069                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6070                 }
6071             }
6072             sayNO;
6073             assert(0); /* NOTREACHED */
6074
6075
6076         curly_try_B_max:
6077             /* a successful greedy match: now try to match B */
6078             if (cur_eval && cur_eval->u.eval.close_paren &&
6079                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6080                 goto fake_end;
6081             }
6082             {
6083                 bool could_match = locinput < PL_regeol;
6084
6085                 /* If it could work, try it. */
6086                 if (ST.c1 != CHRTEST_VOID && could_match) {
6087                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6088                     {
6089                         could_match = memEQ(locinput,
6090                                             ST.c1_utf8,
6091                                             UTF8SKIP(locinput))
6092                                     || memEQ(locinput,
6093                                              ST.c2_utf8,
6094                                              UTF8SKIP(locinput));
6095                     }
6096                     else {
6097                         could_match = UCHARAT(locinput) == ST.c1
6098                                       || UCHARAT(locinput) == ST.c2;
6099                     }
6100                 }
6101                 if (ST.c1 == CHRTEST_VOID || could_match) {
6102                     CURLY_SETPAREN(ST.paren, ST.count);
6103                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6104                     assert(0); /* NOTREACHED */
6105                 }
6106             }
6107             /* FALL THROUGH */
6108
6109         case CURLY_B_max_fail:
6110             /* failed to find B in a greedy match */
6111
6112             REGCP_UNWIND(ST.cp);
6113             if (ST.paren) {
6114                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6115             }
6116             /*  back up. */
6117             if (--ST.count < ST.min)
6118                 sayNO;
6119             locinput = HOPc(locinput, -1);
6120             goto curly_try_B_max;
6121
6122 #undef ST
6123
6124         case END: /*  last op of main pattern  */
6125             fake_end:
6126             if (cur_eval) {
6127                 /* we've just finished A in /(??{A})B/; now continue with B */
6128                 st->u.eval.toggle_reg_flags
6129                             = cur_eval->u.eval.toggle_reg_flags;
6130                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6131
6132                 st->u.eval.prev_rex = rex_sv;           /* inner */
6133                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
6134                 rex_sv = cur_eval->u.eval.prev_rex;
6135                 SET_reg_curpm(rex_sv);
6136                 rex = ReANY(rex_sv);
6137                 rexi = RXi_GET(rex);
6138                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6139
6140                 REGCP_SET(st->u.eval.lastcp);
6141
6142                 /* Restore parens of the outer rex without popping the
6143                  * savestack */
6144                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
6145
6146                 st->u.eval.prev_eval = cur_eval;
6147                 cur_eval = cur_eval->u.eval.prev_eval;
6148                 DEBUG_EXECUTE_r(
6149                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6150                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6151                 if ( nochange_depth )
6152                     nochange_depth--;
6153
6154                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6155                                     locinput); /* match B */
6156             }
6157
6158             if (locinput < reginfo->till) {
6159                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6160                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6161                                       PL_colors[4],
6162                                       (long)(locinput - PL_reg_starttry),
6163                                       (long)(reginfo->till - PL_reg_starttry),
6164                                       PL_colors[5]));
6165
6166                 sayNO_SILENT;           /* Cannot match: too short. */
6167             }
6168             sayYES;                     /* Success! */
6169
6170         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6171             DEBUG_EXECUTE_r(
6172             PerlIO_printf(Perl_debug_log,
6173                 "%*s  %ssubpattern success...%s\n",
6174                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6175             sayYES;                     /* Success! */
6176
6177 #undef  ST
6178 #define ST st->u.ifmatch
6179
6180         {
6181             char *newstart;
6182
6183         case SUSPEND:   /* (?>A) */
6184             ST.wanted = 1;
6185             newstart = locinput;
6186             goto do_ifmatch;
6187
6188         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6189             ST.wanted = 0;
6190             goto ifmatch_trivial_fail_test;
6191
6192         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6193             ST.wanted = 1;
6194           ifmatch_trivial_fail_test:
6195             if (scan->flags) {
6196                 char * const s = HOPBACKc(locinput, scan->flags);
6197                 if (!s) {
6198                     /* trivial fail */
6199                     if (logical) {
6200                         logical = 0;
6201                         sw = 1 - cBOOL(ST.wanted);
6202                     }
6203                     else if (ST.wanted)
6204                         sayNO;
6205                     next = scan + ARG(scan);
6206                     if (next == scan)
6207                         next = NULL;
6208                     break;
6209                 }
6210                 newstart = s;
6211             }
6212             else
6213                 newstart = locinput;
6214
6215           do_ifmatch:
6216             ST.me = scan;
6217             ST.logical = logical;
6218             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6219
6220             /* execute body of (?...A) */
6221             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6222             assert(0); /* NOTREACHED */
6223         }
6224
6225         case IFMATCH_A_fail: /* body of (?...A) failed */
6226             ST.wanted = !ST.wanted;
6227             /* FALL THROUGH */
6228
6229         case IFMATCH_A: /* body of (?...A) succeeded */
6230             if (ST.logical) {
6231                 sw = cBOOL(ST.wanted);
6232             }
6233             else if (!ST.wanted)
6234                 sayNO;
6235
6236             if (OP(ST.me) != SUSPEND) {
6237                 /* restore old position except for (?>...) */
6238                 locinput = st->locinput;
6239             }
6240             scan = ST.me + ARG(ST.me);
6241             if (scan == ST.me)
6242                 scan = NULL;
6243             continue; /* execute B */
6244
6245 #undef ST
6246
6247         case LONGJMP: /*  alternative with many branches compiles to
6248                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6249             next = scan + ARG(scan);
6250             if (next == scan)
6251                 next = NULL;
6252             break;
6253
6254         case COMMIT:  /*  (*COMMIT)  */
6255             reginfo->cutpoint = PL_regeol;
6256             /* FALLTHROUGH */
6257
6258         case PRUNE:   /*  (*PRUNE)   */
6259             if (!scan->flags)
6260                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6261             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6262             assert(0); /* NOTREACHED */
6263
6264         case COMMIT_next_fail:
6265             no_final = 1;
6266             /* FALLTHROUGH */
6267
6268         case OPFAIL:   /* (*FAIL)  */
6269             sayNO;
6270             assert(0); /* NOTREACHED */
6271
6272 #define ST st->u.mark
6273         case MARKPOINT: /*  (*MARK:foo)  */
6274             ST.prev_mark = mark_state;
6275             ST.mark_name = sv_commit = sv_yes_mark
6276                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6277             mark_state = st;
6278             ST.mark_loc = locinput;
6279             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6280             assert(0); /* NOTREACHED */
6281
6282         case MARKPOINT_next:
6283             mark_state = ST.prev_mark;
6284             sayYES;
6285             assert(0); /* NOTREACHED */
6286
6287         case MARKPOINT_next_fail:
6288             if (popmark && sv_eq(ST.mark_name,popmark))
6289             {
6290                 if (ST.mark_loc > startpoint)
6291                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6292                 popmark = NULL; /* we found our mark */
6293                 sv_commit = ST.mark_name;
6294
6295                 DEBUG_EXECUTE_r({
6296                         PerlIO_printf(Perl_debug_log,
6297                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6298                             REPORT_CODE_OFF+depth*2, "",
6299                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6300                 });
6301             }
6302             mark_state = ST.prev_mark;
6303             sv_yes_mark = mark_state ?
6304                 mark_state->u.mark.mark_name : NULL;
6305             sayNO;
6306             assert(0); /* NOTREACHED */
6307
6308         case SKIP:  /*  (*SKIP)  */
6309             if (scan->flags) {
6310                 /* (*SKIP) : if we fail we cut here*/
6311                 ST.mark_name = NULL;
6312                 ST.mark_loc = locinput;
6313                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6314             } else {
6315                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6316                    otherwise do nothing.  Meaning we need to scan
6317                  */
6318                 regmatch_state *cur = mark_state;
6319                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6320
6321                 while (cur) {
6322                     if ( sv_eq( cur->u.mark.mark_name,
6323                                 find ) )
6324                     {
6325                         ST.mark_name = find;
6326                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6327                     }
6328                     cur = cur->u.mark.prev_mark;
6329                 }
6330             }
6331             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6332             break;
6333
6334         case SKIP_next_fail:
6335             if (ST.mark_name) {
6336                 /* (*CUT:NAME) - Set up to search for the name as we
6337                    collapse the stack*/
6338                 popmark = ST.mark_name;
6339             } else {
6340                 /* (*CUT) - No name, we cut here.*/
6341                 if (ST.mark_loc > startpoint)
6342                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6343                 /* but we set sv_commit to latest mark_name if there
6344                    is one so they can test to see how things lead to this
6345                    cut */
6346                 if (mark_state)
6347                     sv_commit=mark_state->u.mark.mark_name;
6348             }
6349             no_final = 1;
6350             sayNO;
6351             assert(0); /* NOTREACHED */
6352 #undef ST
6353
6354         case LNBREAK: /* \R */
6355             if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6356                 locinput += n;
6357             } else
6358                 sayNO;
6359             break;
6360
6361 #define CASE_CLASS(nAmE)                              \
6362         case nAmE:                                    \
6363             if (NEXTCHR_IS_EOS)                       \
6364                 sayNO;                                \
6365             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6366                 locinput += n;                        \
6367             } else                                    \
6368                 sayNO;                                \
6369             break;                                    \
6370         case N##nAmE:                                 \
6371             if (NEXTCHR_IS_EOS)                       \
6372                 sayNO;                                \
6373             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6374                 sayNO;                                \
6375             } else {                                  \
6376                 locinput += UTF8SKIP(locinput);       \
6377             }                                         \
6378             break
6379
6380         CASE_CLASS(VERTWS);  /*  \v \V  */
6381         CASE_CLASS(HORIZWS); /*  \h \H  */
6382 #undef CASE_CLASS
6383
6384         default:
6385             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6386                           PTR2UV(scan), OP(scan));
6387             Perl_croak(aTHX_ "regexp memory corruption");
6388
6389         /* this is a point to jump to in order to increment
6390          * locinput by one character */
6391         increment_locinput:
6392             assert(!NEXTCHR_IS_EOS);
6393             if (utf8_target) {
6394                 locinput += PL_utf8skip[nextchr];
6395                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6396                 if (locinput > PL_regeol)
6397                     sayNO;
6398             }
6399             else
6400                 locinput++;
6401             break;
6402
6403         } /* end switch */
6404
6405         /* switch break jumps here */
6406         scan = next; /* prepare to execute the next op and ... */
6407         continue;    /* ... jump back to the top, reusing st */
6408         assert(0); /* NOTREACHED */
6409
6410       push_yes_state:
6411         /* push a state that backtracks on success */
6412         st->u.yes.prev_yes_state = yes_state;
6413         yes_state = st;
6414         /* FALL THROUGH */
6415       push_state:
6416         /* push a new regex state, then continue at scan  */
6417         {
6418             regmatch_state *newst;
6419
6420             DEBUG_STACK_r({
6421                 regmatch_state *cur = st;
6422                 regmatch_state *curyes = yes_state;
6423                 int curd = depth;
6424                 regmatch_slab *slab = PL_regmatch_slab;
6425                 for (;curd > -1;cur--,curd--) {
6426                     if (cur < SLAB_FIRST(slab)) {
6427                         slab = slab->prev;
6428                         cur = SLAB_LAST(slab);
6429                     }
6430                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6431                         REPORT_CODE_OFF + 2 + depth * 2,"",
6432                         curd, PL_reg_name[cur->resume_state],
6433                         (curyes == cur) ? "yes" : ""
6434                     );
6435                     if (curyes == cur)
6436                         curyes = cur->u.yes.prev_yes_state;
6437                 }
6438             } else
6439                 DEBUG_STATE_pp("push")
6440             );
6441             depth++;
6442             st->locinput = locinput;
6443             newst = st+1;
6444             if (newst >  SLAB_LAST(PL_regmatch_slab))
6445                 newst = S_push_slab(aTHX);
6446             PL_regmatch_state = newst;
6447
6448             locinput = pushinput;
6449             st = newst;
6450             continue;
6451             assert(0); /* NOTREACHED */
6452         }
6453     }
6454
6455     /*
6456     * We get here only if there's trouble -- normally "case END" is
6457     * the terminating point.
6458     */
6459     Perl_croak(aTHX_ "corrupted regexp pointers");
6460     /*NOTREACHED*/
6461     sayNO;
6462
6463 yes:
6464     if (yes_state) {
6465         /* we have successfully completed a subexpression, but we must now
6466          * pop to the state marked by yes_state and continue from there */
6467         assert(st != yes_state);
6468 #ifdef DEBUGGING
6469         while (st != yes_state) {
6470             st--;
6471             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6472                 PL_regmatch_slab = PL_regmatch_slab->prev;
6473                 st = SLAB_LAST(PL_regmatch_slab);
6474             }
6475             DEBUG_STATE_r({
6476                 if (no_final) {
6477                     DEBUG_STATE_pp("pop (no final)");
6478                 } else {
6479                     DEBUG_STATE_pp("pop (yes)");
6480                 }
6481             });
6482             depth--;
6483         }
6484 #else
6485         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6486             || yes_state > SLAB_LAST(PL_regmatch_slab))
6487         {
6488             /* not in this slab, pop slab */
6489             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6490             PL_regmatch_slab = PL_regmatch_slab->prev;
6491             st = SLAB_LAST(PL_regmatch_slab);
6492         }
6493         depth -= (st - yes_state);
6494 #endif
6495         st = yes_state;
6496         yes_state = st->u.yes.prev_yes_state;
6497         PL_regmatch_state = st;
6498
6499         if (no_final)
6500             locinput= st->locinput;
6501         state_num = st->resume_state + no_final;
6502         goto reenter_switch;
6503     }
6504
6505     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6506                           PL_colors[4], PL_colors[5]));
6507
6508     if (PL_reg_state.re_state_eval_setup_done) {
6509         /* each successfully executed (?{...}) block does the equivalent of
6510          *   local $^R = do {...}
6511          * When popping the save stack, all these locals would be undone;
6512          * bypass this by setting the outermost saved $^R to the latest
6513          * value */
6514         if (oreplsv != GvSV(PL_replgv))
6515             sv_setsv(oreplsv, GvSV(PL_replgv));
6516     }
6517     result = 1;
6518     goto final_exit;
6519
6520 no:
6521     DEBUG_EXECUTE_r(
6522         PerlIO_printf(Perl_debug_log,
6523             "%*s  %sfailed...%s\n",
6524             REPORT_CODE_OFF+depth*2, "",
6525             PL_colors[4], PL_colors[5])
6526         );
6527
6528 no_silent:
6529     if (no_final) {
6530         if (yes_state) {
6531             goto yes;
6532         } else {
6533             goto final_exit;
6534         }
6535     }
6536     if (depth) {
6537         /* there's a previous state to backtrack to */
6538         st--;
6539         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6540             PL_regmatch_slab = PL_regmatch_slab->prev;
6541             st = SLAB_LAST(PL_regmatch_slab);
6542         }
6543         PL_regmatch_state = st;
6544         locinput= st->locinput;
6545
6546         DEBUG_STATE_pp("pop");
6547         depth--;
6548         if (yes_state == st)
6549             yes_state = st->u.yes.prev_yes_state;
6550
6551         state_num = st->resume_state + 1; /* failure = success + 1 */
6552         goto reenter_switch;
6553     }
6554     result = 0;
6555
6556   final_exit:
6557     if (rex->intflags & PREGf_VERBARG_SEEN) {
6558         SV *sv_err = get_sv("REGERROR", 1);
6559         SV *sv_mrk = get_sv("REGMARK", 1);
6560         if (result) {
6561             sv_commit = &PL_sv_no;
6562             if (!sv_yes_mark)
6563                 sv_yes_mark = &PL_sv_yes;
6564         } else {
6565             if (!sv_commit)
6566                 sv_commit = &PL_sv_yes;
6567             sv_yes_mark = &PL_sv_no;
6568         }
6569         sv_setsv(sv_err, sv_commit);
6570         sv_setsv(sv_mrk, sv_yes_mark);
6571     }
6572
6573
6574     if (last_pushed_cv) {
6575         dSP;
6576         POP_MULTICALL;
6577         PERL_UNUSED_VAR(SP);
6578     }
6579
6580     /* clean up; in particular, free all slabs above current one */
6581     LEAVE_SCOPE(oldsave);
6582
6583     assert(!result ||  locinput - PL_bostr >= 0);
6584     return result ?  locinput - PL_bostr : -1;
6585 }
6586
6587 /*
6588  - regrepeat - repeatedly match something simple, report how many
6589  *
6590  * What 'simple' means is a node which can be the operand of a quantifier like
6591  * '+', or {1,3}
6592  *
6593  * startposp - pointer a pointer to the start position.  This is updated
6594  *             to point to the byte following the highest successful
6595  *             match.
6596  * p         - the regnode to be repeatedly matched against.
6597  * max       - maximum number of things to match.
6598  * depth     - (for debugging) backtracking depth.
6599  */
6600 STATIC I32
6601 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6602 {
6603     dVAR;
6604     char *scan;     /* Pointer to current position in target string */
6605     I32 c;
6606     char *loceol = PL_regeol;   /* local version */
6607     I32 hardcount = 0;  /* How many matches so far */
6608     bool utf8_target = PL_reg_match_utf8;
6609     UV utf8_flags;
6610 #ifndef DEBUGGING
6611     PERL_UNUSED_ARG(depth);
6612 #endif
6613
6614     PERL_ARGS_ASSERT_REGREPEAT;
6615
6616     scan = *startposp;
6617     if (max == REG_INFTY)
6618         max = I32_MAX;
6619     else if (! utf8_target && scan + max < loceol)
6620         loceol = scan + max;
6621
6622     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6623      * to the maximum of how far we should go in it (leaving it set to the real
6624      * end, if the maximum permissible would take us beyond that).  This allows
6625      * us to make the loop exit condition that we haven't gone past <loceol> to
6626      * also mean that we haven't exceeded the max permissible count, saving a
6627      * test each time through the loop.  But it assumes that the OP matches a
6628      * single byte, which is true for most of the OPs below when applied to a
6629      * non-UTF-8 target.  Those relatively few OPs that don't have this
6630      * characteristic will have to compensate.
6631      *
6632      * There is no adjustment for UTF-8 targets, as the number of bytes per
6633      * character varies.  OPs will have to test both that the count is less
6634      * than the max permissible (using <hardcount> to keep track), and that we
6635      * are still within the bounds of the string (using <loceol>.  A few OPs
6636      * match a single byte no matter what the encoding.  They can omit the max
6637      * test if, for the UTF-8 case, they do the adjustment that was skipped
6638      * above.
6639      *
6640      * Thus, the code above sets things up for the common case; and exceptional
6641      * cases need extra work; the common case is to make sure <scan> doesn't
6642      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6643      * count doesn't exceed the maximum permissible */
6644
6645     switch (OP(p)) {
6646     case REG_ANY:
6647         if (utf8_target) {
6648             while (scan < loceol && hardcount < max && *scan != '\n') {
6649                 scan += UTF8SKIP(scan);
6650                 hardcount++;
6651             }
6652         } else {
6653             while (scan < loceol && *scan != '\n')
6654                 scan++;
6655         }
6656         break;
6657     case SANY:
6658         if (utf8_target) {
6659             while (scan < loceol && hardcount < max) {
6660                 scan += UTF8SKIP(scan);
6661                 hardcount++;
6662             }
6663         }
6664         else
6665             scan = loceol;
6666         break;
6667     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6668         if (utf8_target && scan + max < loceol) {
6669
6670             /* <loceol> hadn't been adjusted in the UTF-8 case */
6671             scan +=  max;
6672         }
6673         else {
6674             scan = loceol;
6675         }
6676         break;
6677     case EXACT:
6678         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6679
6680         c = (U8)*STRING(p);
6681
6682         /* Can use a simple loop if the pattern char to match on is invariant
6683          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6684          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6685          * true iff it doesn't matter if the argument is in UTF-8 or not */
6686         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6687             if (utf8_target && scan + max < loceol) {
6688                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6689                  * since here, to match at all, 1 char == 1 byte */
6690                 loceol = scan + max;
6691             }
6692             while (scan < loceol && UCHARAT(scan) == c) {
6693                 scan++;
6694             }
6695         }
6696         else if (UTF_PATTERN) {
6697             if (utf8_target) {
6698                 STRLEN scan_char_len;
6699
6700                 /* When both target and pattern are UTF-8, we have to do
6701                  * string EQ */
6702                 while (hardcount < max
6703                        && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6704                        && scan_char_len <= STR_LEN(p)
6705                        && memEQ(scan, STRING(p), scan_char_len))
6706                 {
6707                     scan += scan_char_len;
6708                     hardcount++;
6709                 }
6710             }
6711             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6712
6713                 /* Target isn't utf8; convert the character in the UTF-8
6714                  * pattern to non-UTF8, and do a simple loop */
6715                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6716                 while (scan < loceol && UCHARAT(scan) == c) {
6717                     scan++;
6718                 }
6719             } /* else pattern char is above Latin1, can't possibly match the
6720                  non-UTF-8 target */
6721         }
6722         else {
6723
6724             /* Here, the string must be utf8; pattern isn't, and <c> is
6725              * different in utf8 than not, so can't compare them directly.
6726              * Outside the loop, find the two utf8 bytes that represent c, and
6727              * then look for those in sequence in the utf8 string */
6728             U8 high = UTF8_TWO_BYTE_HI(c);
6729             U8 low = UTF8_TWO_BYTE_LO(c);
6730
6731             while (hardcount < max
6732                     && scan + 1 < loceol
6733                     && UCHARAT(scan) == high
6734                     && UCHARAT(scan + 1) == low)
6735             {
6736                 scan += 2;
6737                 hardcount++;
6738             }
6739         }
6740         break;
6741
6742     case EXACTFA:
6743         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6744         goto do_exactf;
6745
6746     case EXACTFL:
6747         PL_reg_flags |= RF_tainted;
6748         utf8_flags = FOLDEQ_UTF8_LOCALE;
6749         goto do_exactf;
6750
6751     case EXACTF:
6752             utf8_flags = 0;
6753             goto do_exactf;
6754
6755     case EXACTFU_SS:
6756     case EXACTFU_TRICKYFOLD:
6757     case EXACTFU:
6758         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6759
6760     do_exactf: {
6761         int c1, c2;
6762         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6763
6764         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6765
6766         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6767             if (c1 == CHRTEST_VOID) {
6768                 /* Use full Unicode fold matching */
6769                 char *tmpeol = PL_regeol;
6770                 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6771                 while (hardcount < max
6772                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6773                                              STRING(p), NULL, pat_len,
6774                                              cBOOL(UTF_PATTERN), utf8_flags))
6775                 {
6776                     scan = tmpeol;
6777                     tmpeol = PL_regeol;
6778                     hardcount++;
6779                 }
6780             }
6781             else if (utf8_target) {
6782                 if (c1 == c2) {
6783                     while (scan < loceol
6784                            && hardcount < max
6785                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6786                     {
6787                         scan += UTF8SKIP(scan);
6788                         hardcount++;
6789                     }
6790                 }
6791                 else {
6792                     while (scan < loceol
6793                            && hardcount < max
6794                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6795                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6796                     {
6797                         scan += UTF8SKIP(scan);
6798                         hardcount++;
6799                     }
6800                 }
6801             }
6802             else if (c1 == c2) {
6803                 while (scan < loceol && UCHARAT(scan) == c1) {
6804                     scan++;
6805                 }
6806             }
6807             else {
6808                 while (scan < loceol &&
6809                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6810                 {
6811                     scan++;
6812                 }
6813             }
6814         }
6815         break;
6816     }
6817     case ANYOF:
6818         if (utf8_target) {
6819             STRLEN inclasslen;
6820             while (hardcount < max
6821                    && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
6822                    && reginclass(prog, p, (U8*)scan, utf8_target))
6823             {
6824                 scan += inclasslen;
6825                 hardcount++;
6826             }
6827         } else {
6828             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6829                 scan++;
6830         }
6831         break;
6832     case ALNUMU:
6833         if (utf8_target) {
6834     utf8_wordchar:
6835             LOAD_UTF8_CHARCLASS_ALNUM();
6836             while (hardcount < max && scan < loceol &&
6837                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6838             {
6839                 scan += UTF8SKIP(scan);
6840                 hardcount++;
6841             }
6842         } else {
6843             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6844                 scan++;
6845             }
6846         }
6847         break;
6848     case ALNUM:
6849         if (utf8_target)
6850             goto utf8_wordchar;
6851         while (scan < loceol && isALNUM((U8) *scan)) {
6852             scan++;
6853         }
6854         break;
6855     case ALNUMA:
6856         if (utf8_target && scan + max < loceol) {
6857
6858             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6859              * since here, to match, 1 char == 1 byte */
6860             loceol = scan + max;
6861         }
6862         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6863             scan++;
6864         }
6865         break;
6866     case ALNUML:
6867         PL_reg_flags |= RF_tainted;
6868         if (utf8_target) {
6869             while (hardcount < max && scan < loceol &&
6870                    isALNUM_LC_utf8((U8*)scan)) {
6871                 scan += UTF8SKIP(scan);
6872                 hardcount++;
6873             }
6874         } else {
6875             while (scan < loceol && isALNUM_LC(*scan))
6876                 scan++;
6877         }
6878         break;
6879     case NALNUMU:
6880         if (utf8_target) {
6881
6882     utf8_Nwordchar:
6883
6884             LOAD_UTF8_CHARCLASS_ALNUM();
6885             while (hardcount < max && scan < loceol &&
6886                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6887             {
6888                 scan += UTF8SKIP(scan);
6889                 hardcount++;
6890             }
6891         } else {
6892             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6893                 scan++;
6894             }
6895         }
6896         break;
6897     case NALNUM:
6898         if (utf8_target)
6899             goto utf8_Nwordchar;
6900         while (scan < loceol && ! isALNUM((U8) *scan)) {
6901             scan++;
6902         }
6903         break;
6904
6905     case POSIXA:
6906         if (utf8_target && scan + max < loceol) {
6907
6908             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6909              * since here, to match, 1 char == 1 byte */
6910             loceol = scan + max;
6911         }
6912         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6913             scan++;
6914         }
6915         break;
6916     case NPOSIXA:
6917         if (utf8_target) {
6918             while (scan < loceol && hardcount < max
6919                    && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6920             {
6921                 scan += UTF8SKIP(scan);
6922                 hardcount++;
6923             }
6924         }
6925         else {
6926             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6927                 scan++;
6928             }
6929         }
6930         break;
6931     case NALNUMA:
6932         if (utf8_target) {
6933             while (scan < loceol && hardcount < max
6934                    && ! isWORDCHAR_A((U8) *scan))
6935             {
6936                 scan += UTF8SKIP(scan);
6937                 hardcount++;
6938             }
6939         }
6940         else {
6941             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6942                 scan++;
6943             }
6944         }
6945         break;
6946     case NALNUML:
6947         PL_reg_flags |= RF_tainted;
6948         if (utf8_target) {
6949             while (hardcount < max && scan < loceol &&
6950                    !isALNUM_LC_utf8((U8*)scan)) {
6951                 scan += UTF8SKIP(scan);
6952                 hardcount++;
6953             }
6954         } else {
6955             while (scan < loceol && !isALNUM_LC(*scan))
6956                 scan++;
6957         }
6958         break;
6959     case SPACEU:
6960         if (utf8_target) {
6961
6962     utf8_space:
6963
6964             while (hardcount < max && scan < loceol
6965                    && is_XPERLSPACE_utf8((U8*)scan))
6966             {
6967                 scan += UTF8SKIP(scan);
6968                 hardcount++;
6969             }
6970             break;
6971         }
6972         else {
6973             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6974                 scan++;
6975             }
6976             break;
6977         }
6978     case SPACE:
6979         if (utf8_target)
6980             goto utf8_space;
6981
6982         while (scan < loceol && isSPACE((U8) *scan)) {
6983             scan++;
6984         }
6985         break;
6986     case SPACEA:
6987         if (utf8_target && scan + max < loceol) {
6988
6989             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6990              * since here, to match, 1 char == 1 byte */
6991             loceol = scan + max;
6992         }
6993         while (scan < loceol && isSPACE_A((U8) *scan)) {
6994             scan++;
6995         }
6996         break;
6997     case SPACEL:
6998         PL_reg_flags |= RF_tainted;
6999         if (utf8_target) {
7000             while (hardcount < max && scan < loceol &&
7001                    isSPACE_LC_utf8((U8*)scan)) {
7002                 scan += UTF8SKIP(scan);
7003                 hardcount++;
7004             }
7005         } else {
7006             while (scan < loceol && isSPACE_LC(*scan))
7007                 scan++;
7008         }
7009         break;
7010     case NSPACEU:
7011         if (utf8_target) {
7012
7013     utf8_Nspace:
7014
7015             while (hardcount < max && scan < loceol
7016                    && ! is_XPERLSPACE_utf8((U8*)scan))
7017             {
7018                 scan += UTF8SKIP(scan);
7019                 hardcount++;
7020             }
7021             break;
7022         }
7023         else {
7024             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
7025                 scan++;
7026             }
7027         }
7028         break;
7029     case NSPACE:
7030         if (utf8_target)
7031             goto utf8_Nspace;
7032
7033         while (scan < loceol && ! isSPACE((U8) *scan)) {
7034             scan++;
7035         }
7036         break;
7037     case NSPACEA:
7038         if (utf8_target) {
7039             while (hardcount < max && scan < loceol
7040                    && ! isSPACE_A((U8) *scan))
7041             {
7042                 scan += UTF8SKIP(scan);
7043                 hardcount++;
7044             }
7045         }
7046         else {
7047             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
7048                 scan++;
7049             }
7050         }
7051         break;
7052     case NSPACEL:
7053         PL_reg_flags |= RF_tainted;
7054         if (utf8_target) {
7055             while (hardcount < max && scan < loceol &&
7056                    !isSPACE_LC_utf8((U8*)scan)) {
7057                 scan += UTF8SKIP(scan);
7058                 hardcount++;
7059             }
7060         } else {
7061             while (scan < loceol && !isSPACE_LC(*scan))
7062                 scan++;
7063         }
7064         break;
7065     case DIGIT:
7066         if (utf8_target) {
7067             LOAD_UTF8_CHARCLASS_DIGIT();
7068             while (hardcount < max && scan < loceol &&
7069                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7070                 scan += UTF8SKIP(scan);
7071                 hardcount++;
7072             }
7073         } else {
7074             while (scan < loceol && isDIGIT(*scan))
7075                 scan++;
7076         }
7077         break;
7078     case DIGITA:
7079         if (utf8_target && scan + max < loceol) {
7080
7081             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7082              * since here, to match, 1 char == 1 byte */
7083             loceol = scan + max;
7084         }
7085         while (scan < loceol && isDIGIT_A((U8) *scan)) {
7086             scan++;
7087         }
7088         break;
7089     case DIGITL:
7090         PL_reg_flags |= RF_tainted;
7091         if (utf8_target) {
7092             while (hardcount < max && scan < loceol &&
7093                    isDIGIT_LC_utf8((U8*)scan)) {
7094                 scan += UTF8SKIP(scan);
7095                 hardcount++;
7096             }
7097         } else {
7098             while (scan < loceol && isDIGIT_LC(*scan))
7099                 scan++;
7100         }
7101         break;
7102     case NDIGIT:
7103         if (utf8_target) {
7104             LOAD_UTF8_CHARCLASS_DIGIT();
7105             while (hardcount < max && scan < loceol &&
7106                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7107                 scan += UTF8SKIP(scan);
7108                 hardcount++;
7109             }
7110         } else {
7111             while (scan < loceol && !isDIGIT(*scan))
7112                 scan++;
7113         }
7114         break;
7115     case NDIGITA:
7116         if (utf8_target) {
7117             while (hardcount < max && scan < loceol
7118                    && ! isDIGIT_A((U8) *scan)) {
7119                 scan += UTF8SKIP(scan);
7120                 hardcount++;
7121             }
7122         }
7123         else {
7124             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7125                 scan++;
7126             }
7127         }
7128         break;
7129     case NDIGITL:
7130         PL_reg_flags |= RF_tainted;
7131         if (utf8_target) {
7132             while (hardcount < max && scan < loceol &&
7133                    !isDIGIT_LC_utf8((U8*)scan)) {
7134                 scan += UTF8SKIP(scan);
7135                 hardcount++;
7136             }
7137         } else {
7138             while (scan < loceol && !isDIGIT_LC(*scan))
7139                 scan++;
7140         }
7141         break;
7142     case LNBREAK:
7143         if (utf8_target) {
7144             while (hardcount < max && scan < loceol &&
7145                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7146                 scan += c;
7147                 hardcount++;
7148             }
7149         } else {
7150             /* LNBREAK can match one or two latin chars, which is ok, but we
7151              * have to use hardcount in this situation, and throw away the
7152              * adjustment to <loceol> done before the switch statement */
7153             loceol = PL_regeol;
7154             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7155                 scan+=c;
7156                 hardcount++;
7157             }
7158         }
7159         break;
7160     case HORIZWS:
7161         if (utf8_target) {
7162             while (hardcount < max && scan < loceol &&
7163                     (c=is_HORIZWS_utf8_safe(scan, loceol)))
7164             {
7165                 scan += c;
7166                 hardcount++;
7167             }
7168         } else {
7169             while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7170                 scan++;
7171         }
7172         break;
7173     case NHORIZWS:
7174         if (utf8_target) {
7175             while (hardcount < max && scan < loceol &&
7176                         !is_HORIZWS_utf8_safe(scan, loceol))
7177             {
7178                 scan += UTF8SKIP(scan);
7179                 hardcount++;
7180             }
7181         } else {
7182             while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7183                 scan++;
7184
7185         }
7186         break;
7187     case VERTWS:
7188         if (utf8_target) {
7189             while (hardcount < max && scan < loceol &&
7190                             (c=is_VERTWS_utf8_safe(scan, loceol)))
7191             {
7192                 scan += c;
7193                 hardcount++;
7194             }
7195         } else {
7196             while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7197                 scan++;
7198
7199         }
7200         break;
7201     case NVERTWS:
7202         if (utf8_target) {
7203             while (hardcount < max && scan < loceol &&
7204                                 !is_VERTWS_utf8_safe(scan, loceol))
7205             {
7206                 scan += UTF8SKIP(scan);
7207                 hardcount++;
7208             }
7209         } else {
7210             while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7211                 scan++;
7212
7213         }
7214         break;
7215
7216     case BOUND:
7217     case BOUNDA:
7218     case BOUNDL:
7219     case BOUNDU:
7220     case EOS:
7221     case GPOS:
7222     case KEEPS:
7223     case NBOUND:
7224     case NBOUNDA:
7225     case NBOUNDL:
7226     case NBOUNDU:
7227     case OPFAIL:
7228     case SBOL:
7229     case SEOL:
7230         /* These are all 0 width, so match right here or not at all. */
7231         break;
7232
7233     default:
7234         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7235         assert(0); /* NOTREACHED */
7236
7237     }
7238
7239     if (hardcount)
7240         c = hardcount;
7241     else
7242         c = scan - *startposp;
7243     *startposp = scan;
7244
7245     DEBUG_r({
7246         GET_RE_DEBUG_FLAGS_DECL;
7247         DEBUG_EXECUTE_r({
7248             SV * const prop = sv_newmortal();
7249             regprop(prog, prop, p);
7250             PerlIO_printf(Perl_debug_log,
7251                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7252                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7253         });
7254     });
7255
7256     return(c);
7257 }
7258
7259
7260 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7261 /*
7262 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7263 create a copy so that changes the caller makes won't change the shared one.
7264 If <altsvp> is non-null, will return NULL in it, for back-compat.
7265  */
7266 SV *
7267 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7268 {
7269     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7270
7271     if (altsvp) {
7272         *altsvp = NULL;
7273     }
7274
7275     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7276 }
7277 #endif
7278
7279 STATIC SV *
7280 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
7281 {
7282     /* Returns the swash for the input 'node' in the regex 'prog'.
7283      * If <doinit> is true, will attempt to create the swash if not already
7284      *    done.
7285      * If <listsvp> is non-null, will return the swash initialization string in
7286      *    it.
7287      * Tied intimately to how regcomp.c sets up the data structure */
7288
7289     dVAR;
7290     SV *sw  = NULL;
7291     SV *si  = NULL;
7292     SV*  invlist = NULL;
7293
7294     RXi_GET_DECL(prog,progi);
7295     const struct reg_data * const data = prog ? progi->data : NULL;
7296
7297     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7298
7299     assert(ANYOF_NONBITMAP(node));
7300
7301     if (data && data->count) {
7302         const U32 n = ARG(node);
7303
7304         if (data->what[n] == 's') {
7305             SV * const rv = MUTABLE_SV(data->data[n]);
7306             AV * const av = MUTABLE_AV(SvRV(rv));
7307             SV **const ary = AvARRAY(av);
7308             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7309
7310             si = *ary;  /* ary[0] = the string to initialize the swash with */
7311
7312             /* Elements 2 and 3 are either both present or both absent. [2] is
7313              * any inversion list generated at compile time; [3] indicates if
7314              * that inversion list has any user-defined properties in it. */
7315             if (av_len(av) >= 2) {
7316                 invlist = ary[2];
7317                 if (SvUV(ary[3])) {
7318                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7319                 }
7320             }
7321             else {
7322                 invlist = NULL;
7323             }
7324
7325             /* Element [1] is reserved for the set-up swash.  If already there,
7326              * return it; if not, create it and store it there */
7327             if (SvROK(ary[1])) {
7328                 sw = ary[1];
7329             }
7330             else if (si && doinit) {
7331
7332                 sw = _core_swash_init("utf8", /* the utf8 package */
7333                                       "", /* nameless */
7334                                       si,
7335                                       1, /* binary */
7336                                       0, /* not from tr/// */
7337                                       invlist,
7338                                       &swash_init_flags);
7339                 (void)av_store(av, 1, sw);
7340             }
7341         }
7342     }
7343
7344     if (listsvp) {
7345         SV* matches_string = newSVpvn("", 0);
7346
7347         /* Use the swash, if any, which has to have incorporated into it all
7348          * possibilities */
7349         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7350             && (si && si != &PL_sv_undef))
7351         {
7352
7353             /* If no swash, use the input initialization string, if available */
7354             sv_catsv(matches_string, si);
7355         }
7356
7357         /* Add the inversion list to whatever we have.  This may have come from
7358          * the swash, or from an input parameter */
7359         if (invlist) {
7360             sv_catsv(matches_string, _invlist_contents(invlist));
7361         }
7362         *listsvp = matches_string;
7363     }
7364
7365     return sw;
7366 }
7367
7368 /*
7369  - reginclass - determine if a character falls into a character class
7370
7371   n is the ANYOF regnode
7372   p is the target string
7373   utf8_target tells whether p is in UTF-8.
7374
7375   Returns true if matched; false otherwise.
7376
7377   Note that this can be a synthetic start class, a combination of various
7378   nodes, so things you think might be mutually exclusive, such as locale,
7379   aren't.  It can match both locale and non-locale
7380
7381  */
7382
7383 STATIC bool
7384 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, register const bool utf8_target)
7385 {
7386     dVAR;
7387     const char flags = ANYOF_FLAGS(n);
7388     bool match = FALSE;
7389     UV c = *p;
7390
7391     PERL_ARGS_ASSERT_REGINCLASS;
7392
7393     /* If c is not already the code point, get it.  Note that
7394      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7395     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7396         STRLEN c_len = 0;
7397         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7398                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7399                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7400                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7401                  * UTF8_ALLOW_FFFF */
7402         if (c_len == (STRLEN)-1)
7403             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7404     }
7405
7406     /* If this character is potentially in the bitmap, check it */
7407     if (c < 256) {
7408         if (ANYOF_BITMAP_TEST(n, c))
7409             match = TRUE;
7410         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7411                 && ! utf8_target
7412                 && ! isASCII(c))
7413         {
7414             match = TRUE;
7415         }
7416         else if (flags & ANYOF_LOCALE) {
7417             PL_reg_flags |= RF_tainted;
7418
7419             if ((flags & ANYOF_LOC_FOLD)
7420                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7421             {
7422                 match = TRUE;
7423             }
7424             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7425                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
7426                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
7427                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
7428                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
7429                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
7430                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
7431                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
7432                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7433                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
7434                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
7435                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
7436                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
7437                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
7438                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
7439                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
7440                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
7441                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
7442                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
7443                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
7444                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
7445                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
7446                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
7447                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
7448                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
7449                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
7450                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
7451                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
7452                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
7453                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
7454                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
7455                      ) /* How's that for a conditional? */
7456             ) {
7457                 match = TRUE;
7458             }
7459         }
7460     }
7461
7462     /* If the bitmap didn't (or couldn't) match, and something outside the
7463      * bitmap could match, try that.  Locale nodes specify completely the
7464      * behavior of code points in the bit map (otherwise, a utf8 target would
7465      * cause them to be treated as Unicode and not locale), except in
7466      * the very unlikely event when this node is a synthetic start class, which
7467      * could be a combination of locale and non-locale nodes.  So allow locale
7468      * to match for the synthetic start class, which will give a false
7469      * positive that will be resolved when the match is done again as not part
7470      * of the synthetic start class */
7471     if (!match) {
7472         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7473             match = TRUE;       /* Everything above 255 matches */
7474         }
7475         else if (ANYOF_NONBITMAP(n)
7476                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7477                      || (utf8_target
7478                          && (c >=256
7479                              || (! (flags & ANYOF_LOCALE))
7480                              || (flags & ANYOF_IS_SYNTHETIC)))))
7481         {
7482             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7483             if (sw) {
7484                 U8 * utf8_p;
7485                 if (utf8_target) {
7486                     utf8_p = (U8 *) p;
7487                 } else { /* Convert to utf8 */
7488                     STRLEN len = 1;
7489                     utf8_p = bytes_to_utf8(p, &len);
7490                 }
7491
7492                 if (swash_fetch(sw, utf8_p, TRUE)) {
7493                     match = TRUE;
7494                 }
7495
7496                 /* If we allocated a string above, free it */
7497                 if (! utf8_target) Safefree(utf8_p);
7498             }
7499         }
7500
7501         if (UNICODE_IS_SUPER(c)
7502             && (flags & ANYOF_WARN_SUPER)
7503             && ckWARN_d(WARN_NON_UNICODE))
7504         {
7505             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7506                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7507         }
7508     }
7509
7510     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7511     return cBOOL(flags & ANYOF_INVERT) ^ match;
7512 }
7513
7514 STATIC U8 *
7515 S_reghop3(U8 *s, I32 off, const U8* lim)
7516 {
7517     /* return the position 'off' UTF-8 characters away from 's', forward if
7518      * 'off' >= 0, backwards if negative.  But don't go outside of position
7519      * 'lim', which better be < s  if off < 0 */
7520
7521     dVAR;
7522
7523     PERL_ARGS_ASSERT_REGHOP3;
7524
7525     if (off >= 0) {
7526         while (off-- && s < lim) {
7527             /* XXX could check well-formedness here */
7528             s += UTF8SKIP(s);
7529         }
7530     }
7531     else {
7532         while (off++ && s > lim) {
7533             s--;
7534             if (UTF8_IS_CONTINUED(*s)) {
7535                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7536                     s--;
7537             }
7538             /* XXX could check well-formedness here */
7539         }
7540     }
7541     return s;
7542 }
7543
7544 #ifdef XXX_dmq
7545 /* there are a bunch of places where we use two reghop3's that should
7546    be replaced with this routine. but since thats not done yet
7547    we ifdef it out - dmq
7548 */
7549 STATIC U8 *
7550 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7551 {
7552     dVAR;
7553
7554     PERL_ARGS_ASSERT_REGHOP4;
7555
7556     if (off >= 0) {
7557         while (off-- && s < rlim) {
7558             /* XXX could check well-formedness here */
7559             s += UTF8SKIP(s);
7560         }
7561     }
7562     else {
7563         while (off++ && s > llim) {
7564             s--;
7565             if (UTF8_IS_CONTINUED(*s)) {
7566                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7567                     s--;
7568             }
7569             /* XXX could check well-formedness here */
7570         }
7571     }
7572     return s;
7573 }
7574 #endif
7575
7576 STATIC U8 *
7577 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7578 {
7579     dVAR;
7580
7581     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7582
7583     if (off >= 0) {
7584         while (off-- && s < lim) {
7585             /* XXX could check well-formedness here */
7586             s += UTF8SKIP(s);
7587         }
7588         if (off >= 0)
7589             return NULL;
7590     }
7591     else {
7592         while (off++ && s > lim) {
7593             s--;
7594             if (UTF8_IS_CONTINUED(*s)) {
7595                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7596                     s--;
7597             }
7598             /* XXX could check well-formedness here */
7599         }
7600         if (off <= 0)
7601             return NULL;
7602     }
7603     return s;
7604 }
7605
7606 static void
7607 restore_pos(pTHX_ void *arg)
7608 {
7609     dVAR;
7610     regexp * const rex = (regexp *)arg;
7611     if (PL_reg_state.re_state_eval_setup_done) {
7612         if (PL_reg_oldsaved) {
7613             rex->subbeg = PL_reg_oldsaved;
7614             rex->sublen = PL_reg_oldsavedlen;
7615             rex->suboffset = PL_reg_oldsavedoffset;
7616             rex->subcoffset = PL_reg_oldsavedcoffset;
7617 #ifdef PERL_OLD_COPY_ON_WRITE
7618             rex->saved_copy = PL_nrs;
7619 #endif
7620             RXp_MATCH_COPIED_on(rex);
7621         }
7622         PL_reg_magic->mg_len = PL_reg_oldpos;
7623         PL_reg_state.re_state_eval_setup_done = FALSE;
7624         PL_curpm = PL_reg_oldcurpm;
7625     }
7626 }
7627
7628 STATIC void
7629 S_to_utf8_substr(pTHX_ register regexp *prog)
7630 {
7631     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7632      * on the converted value */
7633
7634     int i = 1;
7635
7636     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7637
7638     do {
7639         if (prog->substrs->data[i].substr
7640             && !prog->substrs->data[i].utf8_substr) {
7641             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7642             prog->substrs->data[i].utf8_substr = sv;
7643             sv_utf8_upgrade(sv);
7644             if (SvVALID(prog->substrs->data[i].substr)) {
7645                 if (SvTAIL(prog->substrs->data[i].substr)) {
7646                     /* Trim the trailing \n that fbm_compile added last
7647                        time.  */
7648                     SvCUR_set(sv, SvCUR(sv) - 1);
7649                     /* Whilst this makes the SV technically "invalid" (as its
7650                        buffer is no longer followed by "\0") when fbm_compile()
7651                        adds the "\n" back, a "\0" is restored.  */
7652                     fbm_compile(sv, FBMcf_TAIL);
7653                 } else
7654                     fbm_compile(sv, 0);
7655             }
7656             if (prog->substrs->data[i].substr == prog->check_substr)
7657                 prog->check_utf8 = sv;
7658         }
7659     } while (i--);
7660 }
7661
7662 STATIC bool
7663 S_to_byte_substr(pTHX_ register regexp *prog)
7664 {
7665     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7666      * on the converted value; returns FALSE if can't be converted. */
7667
7668     dVAR;
7669     int i = 1;
7670
7671     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7672
7673     do {
7674         if (prog->substrs->data[i].utf8_substr
7675             && !prog->substrs->data[i].substr) {
7676             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7677             if (! sv_utf8_downgrade(sv, TRUE)) {
7678                 return FALSE;
7679             }
7680             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7681                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7682                     /* Trim the trailing \n that fbm_compile added last
7683                         time.  */
7684                     SvCUR_set(sv, SvCUR(sv) - 1);
7685                     fbm_compile(sv, FBMcf_TAIL);
7686                 } else
7687                     fbm_compile(sv, 0);
7688             }
7689             prog->substrs->data[i].substr = sv;
7690             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7691                 prog->check_substr = sv;
7692         }
7693     } while (i--);
7694
7695     return TRUE;
7696 }
7697
7698 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
7699  * pre-composed Hangul syllables, which are all in a contiguous block and
7700  * arranged there in such a way so as to facilitate alorithmic determination of
7701  * their characteristics.  As such, they don't need a swash, but can be
7702  * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
7703  * is a GCB=LV */
7704 #define SBASE 0xAC00    /* Start of block */
7705 #define SCount 11172    /* Length of block */
7706 #define TCount 28
7707
7708 #if 0   /* This routine is not currently used */
7709 PERL_STATIC_INLINE bool
7710 S_is_utf8_X_LV(pTHX_ const U8 *p)
7711 {
7712     /* Unlike most other similarly named routines here, this does not create a
7713      * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7714
7715     dVAR;
7716
7717     UV cp = valid_utf8_to_uvchr(p, NULL);
7718
7719     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7720
7721     /* The earliest Unicode releases did not have these precomposed Hangul
7722      * syllables.  Set to point to undef in that case, so will return false on
7723      * every call */
7724     if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
7725         PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7726         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7727             SvREFCNT_dec(PL_utf8_X_LV);
7728             PL_utf8_X_LV = &PL_sv_undef;
7729         }
7730     }
7731
7732     return (PL_utf8_X_LV != &PL_sv_undef
7733             && cp >= SBASE && cp < SBASE + SCount
7734             && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7735 }
7736 #endif
7737
7738 PERL_STATIC_INLINE bool
7739 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7740 {
7741     /* Unlike most other similarly named routines here, this does not create a
7742      * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7743
7744     dVAR;
7745
7746     UV cp = valid_utf8_to_uvchr(p, NULL);
7747
7748     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7749
7750     /* The earliest Unicode releases did not have these precomposed Hangul
7751      * syllables.  Set to point to undef in that case, so will return false on
7752      * every call */
7753     if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
7754         PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7755         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7756             SvREFCNT_dec(PL_utf8_X_LVT);
7757             PL_utf8_X_LVT = &PL_sv_undef;
7758         }
7759     }
7760
7761     return (PL_utf8_X_LVT != &PL_sv_undef
7762             && cp >= SBASE && cp < SBASE + SCount
7763             && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7764 }
7765
7766 /*
7767  * Local variables:
7768  * c-indentation-style: bsd
7769  * c-basic-offset: 4
7770  * indent-tabs-mode: nil
7771  * End:
7772  *
7773  * ex: set ts=8 sts=4 sw=4 et:
7774  */